torch-memory-saver 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torch_memory_saver-0.0.1/LICENSE +21 -0
- torch_memory_saver-0.0.1/PKG-INFO +5 -0
- torch_memory_saver-0.0.1/README.md +11 -0
- torch_memory_saver-0.0.1/csrc/torch_memory_saver.cpp +288 -0
- torch_memory_saver-0.0.1/setup.cfg +4 -0
- torch_memory_saver-0.0.1/setup.py +19 -0
- torch_memory_saver-0.0.1/torch_memory_saver/__init__.py +88 -0
- torch_memory_saver-0.0.1/torch_memory_saver.egg-info/PKG-INFO +5 -0
- torch_memory_saver-0.0.1/torch_memory_saver.egg-info/SOURCES.txt +9 -0
- torch_memory_saver-0.0.1/torch_memory_saver.egg-info/dependency_links.txt +1 -0
- torch_memory_saver-0.0.1/torch_memory_saver.egg-info/top_level.txt +2 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2024 fzyzcjy
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# torch_memory_saver
|
2
|
+
|
3
|
+
Allows PyTorch tensor memory to be released and resumed later.
|
4
|
+
|
5
|
+
Please refer to https://github.com/sgl-project/sglang/issues/2542#issuecomment-2563641647 for details.
|
6
|
+
|
7
|
+
TODO:
|
8
|
+
|
9
|
+
- [x] Implementation
|
10
|
+
- [ ] More tests and infra
|
11
|
+
- [ ] Publish to PyPI
|
@@ -0,0 +1,288 @@
|
|
1
|
+
#include <sys/types.h>
|
2
|
+
#include <cuda_runtime_api.h>
|
3
|
+
#include <cuda.h>
|
4
|
+
#include <iostream>
|
5
|
+
#include <stdio.h>
|
6
|
+
#include <dlfcn.h>
|
7
|
+
#include <unordered_map>
|
8
|
+
#include <mutex>
|
9
|
+
|
10
|
+
// #define TMS_DEBUG_LOG
|
11
|
+
|
12
|
+
// ----------------------------------------------- copied code --------------------------------------------------
|
13
|
+
|
14
|
+
// Cannot use pytorch (libc10.so) since LD_PRELOAD happens earlier than `import torch`
|
15
|
+
// #include <ATen/cuda/Exceptions.h>
|
16
|
+
|
17
|
+
// torch Macros.h
|
18
|
+
#if defined(__GNUC__) || defined(__ICL) || defined(__clang__)
|
19
|
+
#define C10_LIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 1))
|
20
|
+
#define C10_UNLIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 0))
|
21
|
+
#else
|
22
|
+
#define C10_LIKELY(expr) (expr)
|
23
|
+
#define C10_UNLIKELY(expr) (expr)
|
24
|
+
#endif
|
25
|
+
|
26
|
+
// ----------------------------------------------- utils --------------------------------------------------
|
27
|
+
|
28
|
+
// Writes MSG (prefixed with the file tag) to stderr and terminates the process
// with exit code 1 whenever COND evaluates to false; does nothing otherwise.
#define SIMPLE_CHECK(COND, MSG)                                            \
    do {                                                                   \
        if (COND) {                                                        \
            break;                                                         \
        }                                                                  \
        std::cerr << "[torch_memory_saver.cpp] " << MSG << std::endl;      \
        exit(1);                                                           \
    } while (false)
|
35
|
+
|
36
|
+
// Evaluates EXPR (an expression yielding a CUresult) exactly once. When the
// result is not CUDA_SUCCESS, prints the numeric code, its symbolic name (as
// reported by cuGetErrorName) and the call site to stderr, then terminates
// the process with exit code 1.
//
// The macro-local names avoid double underscores, which are reserved for the
// implementation, and are unlikely to collide with names used inside EXPR.
#define CURESULT_CHECK(EXPR)                                                                    \
    do {                                                                                        \
        const CUresult tms_curesult_ = (EXPR);                                                  \
        if (tms_curesult_ != CUDA_SUCCESS) {                                                    \
            const char *tms_curesult_name_ = nullptr;                                           \
            if (cuGetErrorName(tms_curesult_, &tms_curesult_name_) != CUDA_SUCCESS              \
                || tms_curesult_name_ == nullptr) {                                             \
                tms_curesult_name_ = "<unknown>";                                               \
            }                                                                                   \
            std::cerr << "[torch_memory_saver.cpp] CUresult error "                             \
                      << " result=" << tms_curesult_ << " name=" << tms_curesult_name_          \
                      << " file=" << __FILE__ << " func=" << __func__ << " line=" << __LINE__   \
                      << std::endl;                                                             \
            exit(1);                                                                            \
        }                                                                                       \
    } while (false)
|
47
|
+
|
48
|
+
namespace APIForwarder {
|
49
|
+
static void *check_dlsym(void *value) {
|
50
|
+
if (nullptr == value) {
|
51
|
+
std::cerr << "[torch_memory_saver.cpp] dlsym failed dlerror=" << dlerror() << std::endl;
|
52
|
+
exit(1);
|
53
|
+
}
|
54
|
+
return value;
|
55
|
+
}
|
56
|
+
|
57
|
+
typedef cudaError_t (*CudaMallocFunc)(void **, size_t);
|
58
|
+
|
59
|
+
typedef cudaError_t (*CudaFreeFunc)(void *);
|
60
|
+
|
61
|
+
static CudaMallocFunc real_cudaMalloc = NULL;
|
62
|
+
static CudaFreeFunc real_cudaFree = NULL;
|
63
|
+
|
64
|
+
static cudaError_t call_real_cuda_malloc(void **ptr, size_t size) {
|
65
|
+
if (C10_UNLIKELY(nullptr == real_cudaMalloc)) {
|
66
|
+
real_cudaMalloc = (CudaMallocFunc) check_dlsym(dlsym(RTLD_NEXT, "cudaMalloc"));
|
67
|
+
}
|
68
|
+
|
69
|
+
cudaError_t ret = real_cudaMalloc(ptr, size);
|
70
|
+
|
71
|
+
#ifdef TMS_DEBUG_LOG
|
72
|
+
std::cout << "[torch_memory_saver.cpp] cudaMalloc [MODE NORMAL]"
|
73
|
+
<< " ptr=" << ptr << " *ptr=" << *ptr << " size=" << size << " ret=" << ret
|
74
|
+
<< std::endl;
|
75
|
+
#endif
|
76
|
+
|
77
|
+
return ret;
|
78
|
+
}
|
79
|
+
|
80
|
+
static cudaError_t call_real_cuda_free(void *ptr) {
|
81
|
+
if (C10_UNLIKELY(nullptr == real_cudaFree)) {
|
82
|
+
real_cudaFree = (CudaFreeFunc) check_dlsym(dlsym(RTLD_NEXT, "cudaFree"));
|
83
|
+
}
|
84
|
+
|
85
|
+
cudaError_t ret = real_cudaFree(ptr);
|
86
|
+
|
87
|
+
#ifdef TMS_DEBUG_LOG
|
88
|
+
std::cout << "[torch_memory_saver.cpp] cudaFree [MODE NORMAL]"
|
89
|
+
<< " ptr=" << ptr << " ret=" << ret
|
90
|
+
<< std::endl;
|
91
|
+
#endif
|
92
|
+
|
93
|
+
return ret;
|
94
|
+
}
|
95
|
+
}
|
96
|
+
|
97
|
+
namespace CUDAUtils {
|
98
|
+
static void cu_mem_create(CUmemGenericAllocationHandle *allocHandle, size_t size, CUdevice device) {
|
99
|
+
CUmemAllocationProp prop = {};
|
100
|
+
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
101
|
+
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
102
|
+
prop.location.id = device;
|
103
|
+
CURESULT_CHECK(cuMemCreate(allocHandle, size, &prop, 0));
|
104
|
+
}
|
105
|
+
|
106
|
+
static void cu_mem_set_access(void *ptr, size_t size, CUdevice device) {
|
107
|
+
CUmemAccessDesc accessDesc = {};
|
108
|
+
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
109
|
+
accessDesc.location.id = device;
|
110
|
+
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
111
|
+
CURESULT_CHECK(cuMemSetAccess((CUdeviceptr) ptr, size, &accessDesc, 1));
|
112
|
+
}
|
113
|
+
}
|
114
|
+
|
115
|
+
// ----------------------------------------------- primary class --------------------------------------------------
|
116
|
+
|
117
|
+
// Bookkeeping record for one allocation made through TorchMemorySaver::malloc.
// Field order matters: the class builds it with positional aggregate
// initialisation.
struct _AllocationMetadata {
    // Byte size requested at allocation time; reused for the later
    // unmap / map / address-free calls.
    size_t size;
    // Device obtained from cuCtxGetDevice when the allocation was made.
    CUdevice device;
    // Handle produced by cuMemCreate; overwritten with a new handle by resume().
    CUmemGenericAllocationHandle allocHandle;
};
|
122
|
+
|
123
|
+
class TorchMemorySaver {
|
124
|
+
public:
|
125
|
+
TorchMemorySaver() {}
|
126
|
+
|
127
|
+
cudaError_t malloc(void **ptr, size_t size) {
|
128
|
+
CUdevice device;
|
129
|
+
CURESULT_CHECK(cuCtxGetDevice(&device));
|
130
|
+
|
131
|
+
CUmemGenericAllocationHandle allocHandle;
|
132
|
+
CUDAUtils::cu_mem_create(&allocHandle, size, device);
|
133
|
+
|
134
|
+
CURESULT_CHECK(cuMemAddressReserve((CUdeviceptr *) ptr, size, 0, 0, 0));
|
135
|
+
CURESULT_CHECK(cuMemMap((CUdeviceptr) * ptr, size, 0, allocHandle, 0));
|
136
|
+
CUDAUtils::cu_mem_set_access(*ptr, size, device);
|
137
|
+
|
138
|
+
{
|
139
|
+
const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
|
140
|
+
allocation_metadata_.emplace(*ptr, _AllocationMetadata{size, device, allocHandle});
|
141
|
+
}
|
142
|
+
|
143
|
+
#ifdef TMS_DEBUG_LOG
|
144
|
+
std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.cuda_malloc "
|
145
|
+
<< " ptr=" << ptr << " *ptr=" << *ptr << " size=" << size
|
146
|
+
<< " allocHandle=" << allocHandle
|
147
|
+
<< std::endl;
|
148
|
+
#endif
|
149
|
+
|
150
|
+
return cudaSuccess;
|
151
|
+
}
|
152
|
+
|
153
|
+
cudaError_t free(void *ptr) {
|
154
|
+
_AllocationMetadata metadata;
|
155
|
+
{
|
156
|
+
const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
|
157
|
+
SIMPLE_CHECK(allocation_metadata_.count(ptr), "Trying to free a pointer not allocated here");
|
158
|
+
metadata = allocation_metadata_[ptr];
|
159
|
+
allocation_metadata_.erase(ptr);
|
160
|
+
}
|
161
|
+
|
162
|
+
CURESULT_CHECK(cuMemUnmap((CUdeviceptr) ptr, metadata.size));
|
163
|
+
CURESULT_CHECK(cuMemRelease(metadata.allocHandle));
|
164
|
+
CURESULT_CHECK(cuMemAddressFree((CUdeviceptr) ptr, metadata.size));
|
165
|
+
|
166
|
+
#ifdef TMS_DEBUG_LOG
|
167
|
+
std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.cuda_free "
|
168
|
+
<< " ptr=" << ptr << " metadata.size=" << metadata.size
|
169
|
+
<< " metadata.allocHandle=" << metadata.allocHandle
|
170
|
+
<< std::endl;
|
171
|
+
#endif
|
172
|
+
|
173
|
+
return cudaSuccess;
|
174
|
+
}
|
175
|
+
|
176
|
+
void pause() {
|
177
|
+
const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
|
178
|
+
|
179
|
+
for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
|
180
|
+
void *ptr = it->first;
|
181
|
+
_AllocationMetadata metadata = it->second;
|
182
|
+
|
183
|
+
CURESULT_CHECK(cuMemUnmap((CUdeviceptr) ptr, metadata.size));
|
184
|
+
CURESULT_CHECK(cuMemRelease(metadata.allocHandle));
|
185
|
+
|
186
|
+
#ifdef TMS_DEBUG_LOG
|
187
|
+
std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.pause"
|
188
|
+
<< " ptr=" << ptr << " metadata.size=" << metadata.size << " metadata.allocHandle="
|
189
|
+
<< metadata.allocHandle
|
190
|
+
<< std::endl;
|
191
|
+
#endif
|
192
|
+
}
|
193
|
+
}
|
194
|
+
|
195
|
+
void resume() {
|
196
|
+
const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
|
197
|
+
|
198
|
+
for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
|
199
|
+
void *ptr = it->first;
|
200
|
+
_AllocationMetadata &metadata = it->second;
|
201
|
+
|
202
|
+
CUmemGenericAllocationHandle newAllocHandle;
|
203
|
+
CUDAUtils::cu_mem_create(&newAllocHandle, metadata.size, metadata.device);
|
204
|
+
|
205
|
+
CURESULT_CHECK(cuMemMap((CUdeviceptr) ptr, metadata.size, 0, newAllocHandle, 0));
|
206
|
+
|
207
|
+
CUDAUtils::cu_mem_set_access(ptr, metadata.size, metadata.device);
|
208
|
+
|
209
|
+
#ifdef TMS_DEBUG_LOG
|
210
|
+
std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.resume"
|
211
|
+
<< " ptr=" << ptr << " metadata.size=" << metadata.size << " (old)metadata.allocHandle="
|
212
|
+
<< metadata.allocHandle
|
213
|
+
<< " (new)newAllocHandle=" << newAllocHandle
|
214
|
+
<< std::endl;
|
215
|
+
#endif
|
216
|
+
|
217
|
+
metadata.allocHandle = newAllocHandle;
|
218
|
+
}
|
219
|
+
}
|
220
|
+
|
221
|
+
static TorchMemorySaver &instance() {
|
222
|
+
static TorchMemorySaver instance;
|
223
|
+
return instance;
|
224
|
+
}
|
225
|
+
|
226
|
+
private:
|
227
|
+
// Similar to torch's CUDACachingAllocator and CUDAPluggableAllocator
|
228
|
+
std::mutex allocator_metadata_mutex_;
|
229
|
+
std::unordered_map<void *, _AllocationMetadata> allocation_metadata_;
|
230
|
+
};
|
231
|
+
|
232
|
+
// Per-thread switch that the cudaMalloc/cudaFree entry points consult to
// decide whether a call goes through TorchMemorySaver.
namespace RegionManager {
    // One flag per thread; starts out false.
    static thread_local bool is_interesting_region_ = false;

    // Marks the calling thread as being inside the region.
    void enter() {
#ifdef TMS_DEBUG_LOG
        std::cout << "[torch_memory_saver.cpp] tms_region_enter" << std::endl;
#endif
        is_interesting_region_ = true;
    }

    // Marks the calling thread as being outside the region.
    void leave() {
#ifdef TMS_DEBUG_LOG
        std::cout << "[torch_memory_saver.cpp] tms_region_leave" << std::endl;
#endif
        is_interesting_region_ = false;
    }

    // Reports whether the calling thread is currently inside the region.
    bool is_interesting_region() {
        const bool inside = is_interesting_region_;
        return inside;
    }
}
|
253
|
+
|
254
|
+
// ------------------------------------------------- entrypoints ------------------------------------------------
|
255
|
+
|
256
|
+
// Interposed `cudaMalloc`. Outside a region the call is forwarded to the next
// `cudaMalloc` in link order; inside a region it is served by TorchMemorySaver.
cudaError_t cudaMalloc(void **ptr, size_t size) {
    if (!RegionManager::is_interesting_region()) {
        return APIForwarder::call_real_cuda_malloc(ptr, size);
    }
    return TorchMemorySaver::instance().malloc(ptr, size);
}
|
263
|
+
|
264
|
+
// Interposed `cudaFree`. Inside a region a non-null pointer is released by
// TorchMemorySaver; every other call is forwarded to the next `cudaFree` in
// link order.
//
// A null pointer is always forwarded: freeing null is a legal no-op in the
// CUDA runtime, whereas TorchMemorySaver::free terminates the process for any
// pointer it does not track.
//
// NOTE(review): routing depends on the calling thread's region flag at free
// time, not on which allocator produced `ptr` — confirm callers always free
// region allocations from inside a region.
cudaError_t cudaFree(void *ptr) {
    if (ptr == nullptr || !RegionManager::is_interesting_region()) {
        return APIForwarder::call_real_cuda_free(ptr);
    }
    return TorchMemorySaver::instance().free(ptr);
}
|
271
|
+
|
272
|
+
extern "C" {
|
273
|
+
void tms_region_enter() {
|
274
|
+
RegionManager::enter();
|
275
|
+
}
|
276
|
+
|
277
|
+
void tms_region_leave() {
|
278
|
+
RegionManager::leave();
|
279
|
+
}
|
280
|
+
|
281
|
+
void tms_pause() {
|
282
|
+
TorchMemorySaver::instance().pause();
|
283
|
+
}
|
284
|
+
|
285
|
+
void tms_resume() {
|
286
|
+
TorchMemorySaver::instance().resume();
|
287
|
+
}
|
288
|
+
}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
"""Build script for the torch_memory_saver package and its C++ extension."""
import os

from setuptools import setup
from torch.utils import cpp_extension

# Use the CUDA toolkit location PyTorch's build helpers detected. CUDA_HOME is
# None when no toolkit was found, in which case the conventional install
# prefix is kept as the fallback.
_cuda_home = cpp_extension.CUDA_HOME or '/usr/local/cuda'
_cuda_include_dir = os.path.join(_cuda_home, 'include')

ext_module = cpp_extension.CppExtension(
    'torch_memory_saver_cpp',
    ['csrc/torch_memory_saver.cpp'],
    extra_compile_args=[f'-I{_cuda_include_dir}'],
    # The extension calls the CUDA driver API (cuMem*), hence libcuda.
    extra_link_args=['-lcuda'],
)

setup(
    name='torch_memory_saver',
    version='0.0.1',
    # https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-a-c-extension
    ext_modules=[ext_module],
    cmdclass={'build_ext': cpp_extension.BuildExtension},
    python_requires=">=3.9",
    packages=['torch_memory_saver'],
)
|
@@ -0,0 +1,88 @@
|
|
1
|
+
import ctypes
|
2
|
+
import logging
|
3
|
+
import os
|
4
|
+
from contextlib import contextmanager
|
5
|
+
from pathlib import Path
|
6
|
+
from typing import Optional
|
7
|
+
|
8
|
+
import torch
|
9
|
+
|
10
|
+
logger = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class TorchMemorySaver:
    """Python front end for the preloaded native library.

    Allocations made inside ``region()`` are routed through a dedicated
    ``torch.cuda.MemPool`` while the native region flag is switched on;
    ``pause()`` and ``resume()`` delegate to the native ``tms_pause`` and
    ``tms_resume`` entry points.
    """

    def __init__(self):
        self._mem_pool = torch.cuda.MemPool()
        self._id = _global_info.next_id()
        assert self._id == 1, 'Only support one single instance yet (multi-instance will be implemented later)'

    @contextmanager
    def region(self):
        """Context manager: use this instance's mem pool with the native region flag set."""
        with torch.cuda.use_mem_pool(self._mem_pool):
            native_lib = _global_info.cdll
            native_lib.tms_region_enter()
            try:
                yield
            finally:
                # Always switch the flag back off, even if the body raised.
                native_lib.tms_region_leave()

    def pause(self):
        """Call the native ``tms_pause`` entry point."""
        native_lib = _global_info.cdll
        native_lib.tms_pause()

    def resume(self):
        """Call the native ``tms_resume`` entry point."""
        native_lib = _global_info.cdll
        native_lib.tms_resume()
|
33
|
+
|
34
|
+
|
35
|
+
class _GlobalInfo:
    """Module-wide state: the lazily loaded native library and an id counter."""

    def __init__(self):
        self._cdll: Optional[ctypes.CDLL] = None
        self._last_id = 0

    @property
    def cdll(self):
        """Return the native library handle, loading it on first access."""
        if self._cdll is not None:
            return self._cdll

        self._cdll = _compute_cdll()
        logger.debug(f'Use cdll={self._cdll}')
        return self._cdll

    def next_id(self):
        """Return the next id; the first call returns 1."""
        allocated = self._last_id + 1
        self._last_id = allocated
        return allocated


_global_info = _GlobalInfo()
|
53
|
+
|
54
|
+
|
55
|
+
def _compute_cdll():
    """Open the preloaded torch_memory_saver library named in ``LD_PRELOAD``.

    ``LD_PRELOAD`` may list several libraries separated by colons or
    whitespace; the first entry whose path contains ``torch_memory_saver``
    is opened.

    Returns:
        A ``ctypes.CDLL`` handle for the matching entry.

    Raises:
        AssertionError: If no ``LD_PRELOAD`` entry mentions
            ``torch_memory_saver``.
        OSError: If the matching entry cannot be loaded.
    """
    env_ld_preload = os.environ.get('LD_PRELOAD', '')
    entries = env_ld_preload.replace(':', ' ').split()
    matching = [entry for entry in entries if 'torch_memory_saver' in entry]
    if not matching:
        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        raise AssertionError(f'Please specify correct LD_PRELOAD (currently: {env_ld_preload})')
    return ctypes.CDLL(matching[0])
|
59
|
+
|
60
|
+
|
61
|
+
def get_binary_path():
    """Return the path of the built ``torch_memory_saver_cpp`` shared object.

    Searches this package's directory and then its parent directory for
    files matching ``torch_memory_saver_cpp.*.so``.

    Raises:
        AssertionError: If the number of matches is not exactly one.
    """
    package_dir = Path(__file__).parent
    candidates = []
    for search_dir in (package_dir, package_dir.parent):
        candidates.extend(search_dir.glob('torch_memory_saver_cpp.*.so'))
    assert len(candidates) == 1, f'{candidates=}'
    return candidates[0]
|
70
|
+
|
71
|
+
|
72
|
+
@contextmanager
def configure_subprocess():
    """Context manager that sets ``LD_PRELOAD`` to the built shared object.

    The variable is set through ``change_env`` for the duration of the
    ``with`` body.
    """
    preload_value = str(get_binary_path())
    with change_env('LD_PRELOAD', preload_value):
        yield
|
76
|
+
|
77
|
+
|
78
|
+
@contextmanager
def change_env(key: str, value: str):
    """Temporarily set environment variable ``key`` to ``value``.

    On exit the previous state is restored exactly: a variable that was set
    gets its old value back, and a variable that did not exist is removed
    again instead of being left behind as an empty string.

    Args:
        key: Name of the environment variable.
        value: Value to use inside the ``with`` body.

    Raises:
        AssertionError: If the variable no longer equals ``value`` when the
            ``with`` body finishes.
    """
    # None means "was not set", which is distinct from "was set to ''".
    old_value = os.environ.get(key)
    os.environ[key] = value
    logger.debug(f'change_env set key={key} value={value}')
    try:
        yield
    finally:
        assert os.environ[key] == value
        if old_value is None:
            del os.environ[key]
        else:
            os.environ[key] = old_value
        logger.debug(f'change_env restore key={key} value={old_value}')
|
@@ -0,0 +1,9 @@
|
|
1
|
+
LICENSE
|
2
|
+
README.md
|
3
|
+
setup.py
|
4
|
+
csrc/torch_memory_saver.cpp
|
5
|
+
torch_memory_saver/__init__.py
|
6
|
+
torch_memory_saver.egg-info/PKG-INFO
|
7
|
+
torch_memory_saver.egg-info/SOURCES.txt
|
8
|
+
torch_memory_saver.egg-info/dependency_links.txt
|
9
|
+
torch_memory_saver.egg-info/top_level.txt
|
@@ -0,0 +1 @@
|
|
1
|
+
|