torch-memory-saver 0.0.9rc1__tar.gz → 0.0.9rc2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. torch_memory_saver-0.0.9rc2/MANIFEST.in +1 -0
  2. {torch_memory_saver-0.0.9rc1/torch_memory_saver.egg-info → torch_memory_saver-0.0.9rc2}/PKG-INFO +1 -1
  3. torch_memory_saver-0.0.9rc2/csrc/api_forwarder.h +8 -0
  4. torch_memory_saver-0.0.9rc2/csrc/core.h +96 -0
  5. torch_memory_saver-0.0.9rc2/csrc/macro.h +40 -0
  6. torch_memory_saver-0.0.9rc2/csrc/utils.h +241 -0
  7. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/setup.py +3 -3
  8. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2/torch_memory_saver.egg-info}/PKG-INFO +1 -1
  9. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver.egg-info/SOURCES.txt +5 -0
  10. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/LICENSE +0 -0
  11. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/README.md +0 -0
  12. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/csrc/api_forwarder.cpp +0 -0
  13. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/csrc/core.cpp +0 -0
  14. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/csrc/entrypoint.cpp +0 -0
  15. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/setup.cfg +0 -0
  16. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/test/test_examples.py +0 -0
  17. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/__init__.py +0 -0
  18. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/binary_wrapper.py +0 -0
  19. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/entrypoint.py +0 -0
  20. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/hooks/__init__.py +0 -0
  21. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/hooks/base.py +0 -0
  22. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/hooks/mode_preload.py +0 -0
  23. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/hooks/mode_torch.py +0 -0
  24. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/testing_utils.py +0 -0
  25. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/utils.py +0 -0
  26. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver.egg-info/dependency_links.txt +0 -0
  27. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver.egg-info/top_level.txt +0 -0
@@ -0,0 +1 @@
+ include csrc/*.h
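
(Note on the MANIFEST.in hunk above: setuptools does not ship header files in the source distribution by default, only the .cpp sources listed in ext_modules, so this new entry is what makes the four csrc/*.h files added below available to from-source builds of rc2.)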
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: torch_memory_saver
- Version: 0.0.9rc1
+ Version: 0.0.9rc2
  Requires-Python: >=3.9
  License-File: LICENSE
  Dynamic: license-file
@@ -0,0 +1,8 @@
+ #pragma once
+ #include <dlfcn.h>
+ #include "macro.h"
+
+ namespace APIForwarder {
+     cudaError_t call_real_cuda_malloc(void **ptr, size_t size);
+     cudaError_t call_real_cuda_free(void *ptr);
+ }
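
The new csrc/api_forwarder.h only declares the forwarders; their definitions live in csrc/api_forwarder.cpp, which is unchanged in this release. For orientation, here is a minimal sketch of how such LD_PRELOAD forwarding is typically written (an illustration under assumptions, not the package's actual implementation):

// Illustrative sketch only (not csrc/api_forwarder.cpp): resolves the real
// CUDA runtime symbols that the preloaded interceptor shadows.
// Requires glibc's RTLD_NEXT (g++ defines _GNU_SOURCE by default).
#include <dlfcn.h>
#include "macro.h"

namespace APIForwarder {
    using CudaMallocFn = cudaError_t (*)(void**, size_t);
    using CudaFreeFn = cudaError_t (*)(void*);

    cudaError_t call_real_cuda_malloc(void** ptr, size_t size) {
        // RTLD_NEXT skips the preloaded library and finds the next
        // definition of cudaMalloc, i.e. the real one in libcudart.
        static auto real = (CudaMallocFn)dlsym(RTLD_NEXT, "cudaMalloc");
        return real(ptr, size);
    }

    cudaError_t call_real_cuda_free(void* ptr) {
        static auto real = (CudaFreeFn)dlsym(RTLD_NEXT, "cudaFree");
        return real(ptr);
    }
}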
@@ -0,0 +1,96 @@
+ #pragma once
+ #include <sys/types.h>
+ #include <stdio.h>
+ #include <unordered_map>
+ #include <mutex>
+ #include <string>
+ #include "utils.h"
+ #include "macro.h"
+
+ enum class AllocationState {
+     // Memory is mapped and accessible
+     ACTIVE,
+     // Memory is unmapped and inaccessible
+     PAUSED
+ };
+
+ struct AllocationMetadata {
+     size_t size;
+     CUdevice device;
+     std::string tag;
+     AllocationState state;
+     bool enable_cpu_backup;
+     void* cpu_backup;
+
+ #if defined(USE_CUDA)
+     CUmemGenericAllocationHandle allocHandle;
+ #elif defined(USE_ROCM)
+     size_t aligned_size;
+     std::vector<hipMemGenericAllocationHandle_t> allocHandles;
+     std::vector<size_t> chunk_sizes;
+ #else
+ #error "USE_PLATFORM is not set"
+ #endif
+ };
+
+ #if defined(USE_ROCM)
+ namespace DeviceUtils {
+     // Simple function to get global device ID from local device ID
+     static int get_global_device_id(hipDevice_t local_device_id) {
+         // Check for HIP_VISIBLE_DEVICES environment variable
+         const char* hip_visible = std::getenv("HIP_VISIBLE_DEVICES");
+
+         if (hip_visible && strlen(hip_visible) > 0) {
+             std::string devices_str(hip_visible);
+             std::stringstream ss(devices_str);
+             std::string device_str;
+             std::vector<int> device_list;
+
+             // Parse comma-separated device list
+             while (std::getline(ss, device_str, ',')) {
+                 if (!device_str.empty()) {
+                     device_list.push_back(std::atoi(device_str.c_str()));
+                 }
+             }
+
+             if (local_device_id < device_list.size()) {
+                 int global_device_id = device_list[local_device_id];
+ #ifdef TMS_DEBUG_LOG
+                 std::cout << "[torch_memory_saver.cpp] HIP_VISIBLE_DEVICES=" << hip_visible
+                           << " local_device_id=" << local_device_id
+                           << " -> global_device_id=" << global_device_id << std::endl;
+ #endif
+                 return global_device_id;
+             }
+         }
+
+         // Fallback: return local device ID as-is
+ #ifdef TMS_DEBUG_LOG
+         std::cout << "[torch_memory_saver.cpp] No HIP_VISIBLE_DEVICES, using local_device_id=" << local_device_id << std::endl;
+ #endif
+         return local_device_id;
+     }
+ }
+ #endif
+
+
+
+ class TorchMemorySaver {
+ public:
+     static TorchMemorySaver& instance();
+
+     cudaError_t malloc(void** ptr, CUdevice device, size_t size, const std::string& tag, bool enable_cpu_backup);
+     cudaError_t free(void* ptr);
+
+     void pause(const std::string& tag);
+     void resume(const std::string& tag);
+
+ private:
+     TorchMemorySaver();
+     ~TorchMemorySaver() = default;
+     TorchMemorySaver(const TorchMemorySaver&) = delete;
+     TorchMemorySaver& operator=(const TorchMemorySaver&) = delete;
+
+     std::mutex allocator_metadata_mutex_;
+     std::unordered_map<void*, AllocationMetadata> allocation_metadata_;
+ };
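
A hedged usage sketch of the singleton declared above, inferred from this header alone (the tag string and size are invented; in the real package, malloc/free arrive through the intercepted allocator rather than direct calls like these):

// Hypothetical caller, for orientation only.
void demo() {
    auto& saver = TorchMemorySaver::instance();

    void* ptr = nullptr;
    CUdevice device = CUDAUtils::cu_ctx_get_device();

    // Recorded in allocation_metadata_ with state = ACTIVE.
    saver.malloc(&ptr, device, 64 * 1024 * 1024, /*tag=*/"kv_cache",
                 /*enable_cpu_backup=*/false);

    // Unmaps the physical pages of every allocation carrying this tag
    // (state -> PAUSED); the virtual addresses stay reserved, so pointer
    // values remain stable but must not be dereferenced while paused.
    saver.pause("kv_cache");

    // Maps fresh physical memory back at the same addresses (state -> ACTIVE).
    saver.resume("kv_cache");

    saver.free(ptr);
}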
@@ -0,0 +1,40 @@
+ #pragma once
+
+ // Define platform macros and include the appropriate headers
+ #if defined(USE_ROCM)
+ // Look up this table to define the macros: https://rocm.docs.amd.com/projects/HIPIFY/en/latest/reference/tables/CUDA_Driver_API_functions_supported_by_HIP.html
+ // Look up this table to define the macros: https://rocm.docs.amd.com/projects/HIPIFY/en/latest/reference/tables/CUDA_Runtime_API_functions_supported_by_HIP.html
+ #include <hip/hip_runtime_api.h>
+ #include <hip/hip_runtime.h>
+ #include <sstream>
+ #include <cstdlib>
+ // Define general aliases
+ #define CUresult hipError_t
+ #define cudaError_t hipError_t
+ #define CUDA_SUCCESS hipSuccess
+ #define cudaSuccess hipSuccess
+ #define cuGetErrorString hipDrvGetErrorString
+ #define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
+ #define CUdevice hipDevice_t
+ #define cudaStream_t hipStream_t
+ #define cudaMallocHost hipHostMalloc
+ #define cudaMemcpy hipMemcpy
+ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+ #define cuGetErrorString hipDrvGetErrorString
+ #define cudaGetErrorString hipGetErrorString
+ #define cuMemUnmap hipMemUnmap
+ #define cuMemRelease hipMemRelease
+ // #define cudaMalloc hipMalloc
+ // #define cudaFree hipFree
+ // #define CUdevice hipDevice_t
+ // #define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
+ #define MEMCREATE_CHUNK_SIZE (2 * 1024 * 1024)
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+ #elif defined(USE_CUDA)
+ #include <cuda_runtime_api.h>
+ #include <cuda.h>
+
+ #else
+ #error "USE_PLATFORM is not set"
+ #endif
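
The net effect of these aliases is that the rest of csrc/ can be written once against CUDA names and compiled for either backend. A small illustration (not code from this package):

// Illustrative only: with macro.h included and USE_ROCM defined, the CUDA
// names below are rewritten by the #defines to hipMemcpy and
// hipMemcpyDeviceToHost; under USE_CUDA they are the genuine CUDA calls.
#include "macro.h"

cudaError_t copy_to_host(void* host_dst, const void* dev_src, size_t n) {
    return cudaMemcpy(host_dst, dev_src, n, cudaMemcpyDeviceToHost);
}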
@@ -0,0 +1,241 @@
+ #pragma once
+ #include <iostream>
+ #include <vector>
+ #include "macro.h"
+
+ // #define TMS_DEBUG_LOG
+
+ // Cannot use pytorch (libc10.so) since LD_PRELOAD happens earlier than `import torch`
+ // Thus these are copied from torch's Macros.h
+ #if defined(__GNUC__) || defined(__ICL) || defined(__clang__)
+ #define C10_LIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 1))
+ #define C10_UNLIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 0))
+ #else
+ #define C10_LIKELY(expr) (expr)
+ #define C10_UNLIKELY(expr) (expr)
+ #endif
+
+ #define SIMPLE_CHECK(COND, MSG) \
+     do { \
+         if (!(COND)) { \
+             std::cerr << "[torch_memory_saver.cpp] " << MSG \
+                       << " file=" << __FILE__ << " func=" << __func__ << " line=" << __LINE__ \
+                       << std::endl; \
+             exit(1); \
+         } \
+     } while (false)
+
+ #define CURESULT_CHECK(EXPR) \
+     do { \
+         CUresult __result = (EXPR); \
+         if (__result != CUDA_SUCCESS) { \
+             const char* err_str = nullptr; \
+             cuGetErrorString(__result, &err_str); \
+             std::cerr << "[torch_memory_saver.cpp] CUresult error: " \
+                       << __result << " (" << (err_str ? err_str : "Unknown error") << ") " \
+                       << " file=" << __FILE__ << " func=" << __func__ << " line=" << __LINE__ \
+                       << std::endl; \
+             exit(1); \
+         } \
+     } while (false)
+
+ #define CUDA_ERROR_CHECK(EXPR) \
+     do { \
+         cudaError_t __result = (EXPR); \
+         if (__result != cudaSuccess) { \
+             const char* err_str = cudaGetErrorString(__result); \
+             std::cerr << "[torch_memory_saver.cpp] cudaError error: " \
+                       << __result << " (" << (err_str ? err_str : "Unknown error") << ") " \
+                       << " file=" << __FILE__ << " func=" << __func__ << " line=" << __LINE__ \
+                       << std::endl; \
+             exit(1); \
+         } \
+     } while (false)
+
+
+
+ namespace CUDAUtils {
+ #if defined(USE_ROCM)
+
+ #if HIP_VERSION >= 60402000 // rocm/hip 6.4.2
+ #pragma message "Using ROCm/HIP 6.4.2+ implementation"
+     // To be implemented once the ROCm release is >= 6.4.2
+ #else
+ #pragma message "Using ROCm/HIP < 6.4.2 implementation"
+     // hipMemCreate currently has an issue in rocm-6.3.4. Once it is fixed in rocm-7.0, we can implement torch_memory_saver the same way as on the CUDA side.
+     // For now, the implementation is based on a chunk-wise method.
+     static void cu_mem_create_and_map(hipDevice_t device,
+                                       size_t aligned_size,
+                                       void* d_mem,
+                                       std::vector<hipMemGenericAllocationHandle_t>& allocHandles,
+                                       std::vector<size_t>& chunk_sizes) {
+
+         hipMemAllocationProp prop = {};
+         prop.type = hipMemAllocationTypePinned;
+         prop.location.type = hipMemLocationTypeDevice;
+         prop.location.id = device;
+
+         // // Get granularity
+         // size_t granularity;
+         // CURESULT_CHECK(hipMemGetAllocationGranularity(&granularity, &prop,
+         //                                               hipMemAllocationGranularityMinimum));
+
+         // // Make sure chunk size is aligned with hardware granularity
+         // size_t aligned_chunk_size = ((MEMCREATE_CHUNK_SIZE + granularity - 1) / granularity) * granularity;
+         // size_t num_chunks = (size + aligned_chunk_size - 1) / aligned_chunk_size;
+
+         // Get granularity; make sure the chunk size is aligned with hardware granularity
+         // size == aligned_size
+         size_t num_chunks = (aligned_size + MEMCREATE_CHUNK_SIZE - 1) / MEMCREATE_CHUNK_SIZE;
+
+         allocHandles.resize(num_chunks);
+         chunk_sizes.resize(num_chunks);
+
+         // Calculate chunk sizes
+         for (size_t i = 0; i < num_chunks; ++i) {
+             // chunk_sizes[i] = MIN(size - i * aligned_chunk_size, aligned_chunk_size);
+             chunk_sizes[i] = MIN(aligned_size - i * MEMCREATE_CHUNK_SIZE, MEMCREATE_CHUNK_SIZE);
+ #ifdef TMS_DEBUG_LOG
+             std::cout << "[torch_memory_saver.cpp] chunk_sizes[" << i << "] = " << chunk_sizes[i] << std::endl;
+ #endif
+         }
+
+         // Create memory handles for each chunk
+         for (size_t i = 0; i < num_chunks; ++i) {
+             CURESULT_CHECK(hipMemCreate(&allocHandles[i], chunk_sizes[i], &prop, 0));
+ #ifdef TMS_DEBUG_LOG
+             std::cout << "[torch_memory_saver.cpp] allocHandles[" << i << "] = " << allocHandles[i] << std::endl;
+ #endif
+         }
+
+         // Map each chunk
+         size_t allocated_size = 0;
+         for (size_t i = 0; i < num_chunks; ++i) {
+             void* map_addr = (void*)((uintptr_t)d_mem + allocated_size);
+             CURESULT_CHECK(hipMemMap((hipDeviceptr_t)map_addr, chunk_sizes[i], 0, allocHandles[i], 0));
+             allocated_size += chunk_sizes[i];
+ #ifdef TMS_DEBUG_LOG
+             std::cout << "[torch_memory_saver.cpp] mapped chunk " << i << " at offset " << allocated_size - chunk_sizes[i] << std::endl;
+ #endif
+         }
+
+         // Set access permissions
+         hipMemAccessDesc accessDesc = {};
+         accessDesc.location.type = hipMemLocationTypeDevice;
+         accessDesc.location.id = device;
+         accessDesc.flags = hipMemAccessFlagsProtReadWrite;
+         CURESULT_CHECK(hipMemSetAccess(d_mem, aligned_size, &accessDesc, 1));
+     }
+
+
+     static void cu_mem_unmap_and_release(hipDevice_t device,
+                                          size_t aligned_size,
+                                          hipDeviceptr_t d_mem,
+                                          const std::vector<hipMemGenericAllocationHandle_t>& allocHandles,
+                                          const std::vector<size_t>& chunk_sizes) {
+
+         // Unmap each chunk
+         size_t deallocated_size = 0;
+         for (size_t i = 0; i < allocHandles.size(); ++i) {
+             void* map_addr = (void*)((uintptr_t)d_mem + deallocated_size);
+             CURESULT_CHECK(hipMemUnmap((hipDeviceptr_t)map_addr, chunk_sizes[i]));
+             deallocated_size += chunk_sizes[i];
+ #ifdef TMS_DEBUG_LOG
+             std::cout << "[torch_memory_saver.cpp] unmapped chunk " << i << " at offset " << deallocated_size - chunk_sizes[i] << std::endl;
+ #endif
+         }
+
+         // Release each handle
+         for (size_t i = 0; i < allocHandles.size(); ++i) {
+             CURESULT_CHECK(hipMemRelease(allocHandles[i]));
+ #ifdef TMS_DEBUG_LOG
+             std::cout << "[torch_memory_saver.cpp] released allocHandles[" << i << "]" << std::endl;
+ #endif
+         }
+     }
+
+     static size_t cu_mem_get_granularity(hipDevice_t device) {
+         hipMemAllocationProp prop = {};
+         prop.type = hipMemAllocationTypePinned;
+         prop.location.type = hipMemLocationTypeDevice;
+         prop.location.id = device;
+
+         size_t granularity;
+         CURESULT_CHECK(hipMemGetAllocationGranularity(&granularity, &prop, hipMemAllocationGranularityMinimum));
+         return granularity;
+     }
+
+     static CUdevice cu_ctx_get_device() {
+         CUdevice ans;
+         CURESULT_CHECK(hipCtxGetDevice(&ans));
+         return ans;
+     }
+
+     static CUdevice cu_device_get(int device_ordinal) {
+         CUdevice ans;
+         CURESULT_CHECK(hipDeviceGet(&ans, device_ordinal));
+         return ans;
+     }
+ #endif
+
+ #elif defined(USE_CUDA)
+     static void cu_mem_create(CUmemGenericAllocationHandle *alloc_handle, size_t size, CUdevice device) {
+         CUmemAllocationProp prop = {};
+         prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+         prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+         prop.location.id = device;
+
+         int flag = 0;
+         CURESULT_CHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, device));
+         if (flag) { // support GPUDirect RDMA if possible
+             prop.allocFlags.gpuDirectRDMACapable = 1;
+         }
+
+         CURESULT_CHECK(cuMemCreate(alloc_handle, size, &prop, 0));
+     }
+
+     static void cu_mem_set_access(void *ptr, size_t size, CUdevice device) {
+         CUmemAccessDesc access_desc = {};
+         access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+         access_desc.location.id = device;
+         access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+         CURESULT_CHECK(cuMemSetAccess((CUdeviceptr) ptr, size, &access_desc, 1));
+     }
+
+     static CUdevice cu_ctx_get_device() {
+         CUdevice ans;
+         CURESULT_CHECK(cuCtxGetDevice(&ans));
+         return ans;
+     }
+
+     static CUdevice cu_device_get(int device_ordinal) {
+         CUdevice ans;
+         CURESULT_CHECK(cuDeviceGet(&ans, device_ordinal));
+         return ans;
+     }
+
+ #else
+ #error "USE_PLATFORM is not set"
+
+ #endif
+ }
+
+ inline bool get_bool_env_var(const char* name) {
+     const char* env_cstr = std::getenv(name);
+     if (env_cstr == nullptr) {
+         return false;
+     }
+
+     std::string env_str(env_cstr);
+     if (env_str == "1" || env_str == "true" || env_str == "TRUE" || env_str == "yes" || env_str == "YES") {
+         return true;
+     }
+     if (env_str == "0" || env_str == "false" || env_str == "FALSE" || env_str == "no" || env_str == "NO") {
+         return false;
+     }
+
+     std::cerr << "[torch_memory_saver.cpp] Unsupported environment variable value"
+               << " name=" << name << " value=" << env_str
+               << std::endl;
+     exit(1);
+ }
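
To see how the ROCm helpers above compose, here is a hedged sketch of the allocation path a caller (presumably core.cpp, unchanged in this diff) would follow; the hipMemAddressReserve call and the rounding logic are assumptions, not code from this file:

// Hypothetical allocation flow using the helpers above (ROCm < 6.4.2 branch).
static void alloc_sketch(void** out, hipDevice_t device, size_t size) {
    // Round the request up to the hardware allocation granularity.
    size_t granularity = CUDAUtils::cu_mem_get_granularity(device);
    size_t aligned_size = ((size + granularity - 1) / granularity) * granularity;

    // Reserve a contiguous virtual range, then back it with 2 MiB physical
    // chunks; pause/resume can later unmap and remap the physical backing
    // while the virtual address (and thus every tensor pointer) is unchanged.
    void* d_mem = nullptr;
    CURESULT_CHECK(hipMemAddressReserve(&d_mem, aligned_size, granularity, nullptr, 0));

    std::vector<hipMemGenericAllocationHandle_t> allocHandles;
    std::vector<size_t> chunk_sizes;
    CUDAUtils::cu_mem_create_and_map(device, aligned_size, d_mem, allocHandles, chunk_sizes);

    *out = d_mem;
}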
@@ -102,7 +102,7 @@ def _create_ext_modules(platform):
          str((platform_home / 'lib64').resolve()),
          str((platform_home / 'lib64/stubs').resolve()),
      ]
-     libraries = ['cuda']
+     libraries = ['cuda', 'cudart']
      platform_macros = [('USE_CUDA', '1')]
      extra_compile_args = ['-std=c++17', '-O3']
 
@@ -146,9 +146,9 @@ class build_ext_for_platform(build_platform_ext):
 
  setup(
      name='torch_memory_saver',
-     version='0.0.9rc1',
+     version='0.0.9rc2',
      ext_modules=ext_modules,
      cmdclass={'build_ext': build_ext_for_platform},
      python_requires=">=3.9",
      packages=setuptools.find_packages(include=["torch_memory_saver", "torch_memory_saver.*"]),
- )
+ )
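
(The new cudart entry links the CUDA runtime library in addition to the driver library; plausibly needed because the new headers call runtime-API functions such as cudaGetErrorString in CUDA_ERROR_CHECK, and macro.h aliases cudaMallocHost/cudaMemcpy for the CPU-backup path.)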
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: torch_memory_saver
- Version: 0.0.9rc1
+ Version: 0.0.9rc2
  Requires-Python: >=3.9
  License-File: LICENSE
  Dynamic: license-file
@@ -1,9 +1,14 @@
  LICENSE
+ MANIFEST.in
  README.md
  setup.py
  csrc/api_forwarder.cpp
+ csrc/api_forwarder.h
  csrc/core.cpp
+ csrc/core.h
  csrc/entrypoint.cpp
+ csrc/macro.h
+ csrc/utils.h
  test/test_examples.py
  torch_memory_saver/__init__.py
  torch_memory_saver/binary_wrapper.py