torch-memory-saver 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 fzyzcjy
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,5 @@
1
+ Metadata-Version: 2.1
2
+ Name: torch_memory_saver
3
+ Version: 0.0.1
4
+ Requires-Python: >=3.9
5
+ License-File: LICENSE
@@ -0,0 +1,11 @@
1
+ # torch_memory_saver
2
+
3
+ Allow torch tensor memory to be released and resumed later
4
+
5
+ Please refer to https://github.com/sgl-project/sglang/issues/2542#issuecomment-2563641647 for details.
6
+
7
+ TODO:
8
+
9
+ - [x] Implementation
10
+ - [ ] More tests and infra
11
+ - [ ] Publish to pypi
@@ -0,0 +1,288 @@
1
+ #include <sys/types.h>
2
+ #include <cuda_runtime_api.h>
3
+ #include <cuda.h>
4
+ #include <iostream>
5
+ #include <stdio.h>
6
+ #include <dlfcn.h>
7
+ #include <unordered_map>
8
+ #include <mutex>
9
+
10
// Uncomment to get verbose logging of every intercepted allocator call.
// #define TMS_DEBUG_LOG

// ----------------------------------------------- copied code --------------------------------------------------

// Cannot use pytorch (libc10.so) since LD_PRELOAD happens earlier than `import torch`
// #include <ATen/cuda/Exceptions.h>

// torch Macros.h
// Branch-prediction hints, same definitions as pytorch's C10 macros.
#if defined(__GNUC__) || defined(__ICL) || defined(__clang__)
#define C10_LIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 1))
#define C10_UNLIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 0))
#else
#define C10_LIKELY(expr) (expr)
#define C10_UNLIKELY(expr) (expr)
#endif

// ----------------------------------------------- utils --------------------------------------------------

// Print MSG and abort the whole process when COND is false.
#define SIMPLE_CHECK(COND, MSG) \
    do { \
        if (!(COND)) { \
            std::cerr << "[torch_memory_saver.cpp] " << MSG << std::endl; \
            exit(1); \
        } \
    } while (false)

// very naive
// Evaluate EXPR (a CUDA driver API call); on any non-CUDA_SUCCESS result,
// log the numeric CUresult plus the source location and abort the process.
#define CURESULT_CHECK(EXPR) \
    do { \
        CUresult __result = (EXPR); \
        if (__result != CUDA_SUCCESS) { \
            std::cerr << "[torch_memory_saver.cpp] CUresult error " \
                      << " result=" << __result << " file=" << __FILE__ << " func=" << __func__ << " line=" << __LINE__ \
                      << std::endl; \
            exit(1); \
        } \
    } while (false)
47
+
48
+ namespace APIForwarder {
49
+ static void *check_dlsym(void *value) {
50
+ if (nullptr == value) {
51
+ std::cerr << "[torch_memory_saver.cpp] dlsym failed dlerror=" << dlerror() << std::endl;
52
+ exit(1);
53
+ }
54
+ return value;
55
+ }
56
+
57
+ typedef cudaError_t (*CudaMallocFunc)(void **, size_t);
58
+
59
+ typedef cudaError_t (*CudaFreeFunc)(void *);
60
+
61
+ static CudaMallocFunc real_cudaMalloc = NULL;
62
+ static CudaFreeFunc real_cudaFree = NULL;
63
+
64
+ static cudaError_t call_real_cuda_malloc(void **ptr, size_t size) {
65
+ if (C10_UNLIKELY(nullptr == real_cudaMalloc)) {
66
+ real_cudaMalloc = (CudaMallocFunc) check_dlsym(dlsym(RTLD_NEXT, "cudaMalloc"));
67
+ }
68
+
69
+ cudaError_t ret = real_cudaMalloc(ptr, size);
70
+
71
+ #ifdef TMS_DEBUG_LOG
72
+ std::cout << "[torch_memory_saver.cpp] cudaMalloc [MODE NORMAL]"
73
+ << " ptr=" << ptr << " *ptr=" << *ptr << " size=" << size << " ret=" << ret
74
+ << std::endl;
75
+ #endif
76
+
77
+ return ret;
78
+ }
79
+
80
+ static cudaError_t call_real_cuda_free(void *ptr) {
81
+ if (C10_UNLIKELY(nullptr == real_cudaFree)) {
82
+ real_cudaFree = (CudaFreeFunc) check_dlsym(dlsym(RTLD_NEXT, "cudaFree"));
83
+ }
84
+
85
+ cudaError_t ret = real_cudaFree(ptr);
86
+
87
+ #ifdef TMS_DEBUG_LOG
88
+ std::cout << "[torch_memory_saver.cpp] cudaFree [MODE NORMAL]"
89
+ << " ptr=" << ptr << " ret=" << ret
90
+ << std::endl;
91
+ #endif
92
+
93
+ return ret;
94
+ }
95
+ }
96
+
97
+ namespace CUDAUtils {
98
+ static void cu_mem_create(CUmemGenericAllocationHandle *allocHandle, size_t size, CUdevice device) {
99
+ CUmemAllocationProp prop = {};
100
+ prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
101
+ prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
102
+ prop.location.id = device;
103
+ CURESULT_CHECK(cuMemCreate(allocHandle, size, &prop, 0));
104
+ }
105
+
106
+ static void cu_mem_set_access(void *ptr, size_t size, CUdevice device) {
107
+ CUmemAccessDesc accessDesc = {};
108
+ accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
109
+ accessDesc.location.id = device;
110
+ accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
111
+ CURESULT_CHECK(cuMemSetAccess((CUdeviceptr) ptr, size, &accessDesc, 1));
112
+ }
113
+ }
114
+
115
+ // ----------------------------------------------- primary class --------------------------------------------------
116
+
117
// Book-keeping for one virtual-address range handed out by TorchMemorySaver::malloc.
struct _AllocationMetadata {
    // Byte size used for cuMemCreate / cuMemMap / cuMemAddressReserve.
    size_t size;
    // Device that owns the physical memory (needed to re-create it on resume).
    CUdevice device;
    // Handle of the currently mapped physical allocation (replaced on resume).
    CUmemGenericAllocationHandle allocHandle;
};
122
+
123
+ class TorchMemorySaver {
124
+ public:
125
+ TorchMemorySaver() {}
126
+
127
+ cudaError_t malloc(void **ptr, size_t size) {
128
+ CUdevice device;
129
+ CURESULT_CHECK(cuCtxGetDevice(&device));
130
+
131
+ CUmemGenericAllocationHandle allocHandle;
132
+ CUDAUtils::cu_mem_create(&allocHandle, size, device);
133
+
134
+ CURESULT_CHECK(cuMemAddressReserve((CUdeviceptr *) ptr, size, 0, 0, 0));
135
+ CURESULT_CHECK(cuMemMap((CUdeviceptr) * ptr, size, 0, allocHandle, 0));
136
+ CUDAUtils::cu_mem_set_access(*ptr, size, device);
137
+
138
+ {
139
+ const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
140
+ allocation_metadata_.emplace(*ptr, _AllocationMetadata{size, device, allocHandle});
141
+ }
142
+
143
+ #ifdef TMS_DEBUG_LOG
144
+ std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.cuda_malloc "
145
+ << " ptr=" << ptr << " *ptr=" << *ptr << " size=" << size
146
+ << " allocHandle=" << allocHandle
147
+ << std::endl;
148
+ #endif
149
+
150
+ return cudaSuccess;
151
+ }
152
+
153
+ cudaError_t free(void *ptr) {
154
+ _AllocationMetadata metadata;
155
+ {
156
+ const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
157
+ SIMPLE_CHECK(allocation_metadata_.count(ptr), "Trying to free a pointer not allocated here");
158
+ metadata = allocation_metadata_[ptr];
159
+ allocation_metadata_.erase(ptr);
160
+ }
161
+
162
+ CURESULT_CHECK(cuMemUnmap((CUdeviceptr) ptr, metadata.size));
163
+ CURESULT_CHECK(cuMemRelease(metadata.allocHandle));
164
+ CURESULT_CHECK(cuMemAddressFree((CUdeviceptr) ptr, metadata.size));
165
+
166
+ #ifdef TMS_DEBUG_LOG
167
+ std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.cuda_free "
168
+ << " ptr=" << ptr << " metadata.size=" << metadata.size
169
+ << " metadata.allocHandle=" << metadata.allocHandle
170
+ << std::endl;
171
+ #endif
172
+
173
+ return cudaSuccess;
174
+ }
175
+
176
+ void pause() {
177
+ const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
178
+
179
+ for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
180
+ void *ptr = it->first;
181
+ _AllocationMetadata metadata = it->second;
182
+
183
+ CURESULT_CHECK(cuMemUnmap((CUdeviceptr) ptr, metadata.size));
184
+ CURESULT_CHECK(cuMemRelease(metadata.allocHandle));
185
+
186
+ #ifdef TMS_DEBUG_LOG
187
+ std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.pause"
188
+ << " ptr=" << ptr << " metadata.size=" << metadata.size << " metadata.allocHandle="
189
+ << metadata.allocHandle
190
+ << std::endl;
191
+ #endif
192
+ }
193
+ }
194
+
195
+ void resume() {
196
+ const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
197
+
198
+ for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
199
+ void *ptr = it->first;
200
+ _AllocationMetadata &metadata = it->second;
201
+
202
+ CUmemGenericAllocationHandle newAllocHandle;
203
+ CUDAUtils::cu_mem_create(&newAllocHandle, metadata.size, metadata.device);
204
+
205
+ CURESULT_CHECK(cuMemMap((CUdeviceptr) ptr, metadata.size, 0, newAllocHandle, 0));
206
+
207
+ CUDAUtils::cu_mem_set_access(ptr, metadata.size, metadata.device);
208
+
209
+ #ifdef TMS_DEBUG_LOG
210
+ std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.resume"
211
+ << " ptr=" << ptr << " metadata.size=" << metadata.size << " (old)metadata.allocHandle="
212
+ << metadata.allocHandle
213
+ << " (new)newAllocHandle=" << newAllocHandle
214
+ << std::endl;
215
+ #endif
216
+
217
+ metadata.allocHandle = newAllocHandle;
218
+ }
219
+ }
220
+
221
+ static TorchMemorySaver &instance() {
222
+ static TorchMemorySaver instance;
223
+ return instance;
224
+ }
225
+
226
+ private:
227
+ // Similar to torch's CUDACachingAllocator and CUDAPluggableAllocator
228
+ std::mutex allocator_metadata_mutex_;
229
+ std::unordered_map<void *, _AllocationMetadata> allocation_metadata_;
230
+ };
231
+
232
// Tracks whether the current thread is inside a python `region()` scope,
// i.e. whether intercepted allocations should be routed to TorchMemorySaver.
namespace RegionManager {
    // Thread-local so concurrent threads can be in/out of a region independently.
    static thread_local bool is_interesting_region_ = false;

    // True while the calling thread is inside an interesting region.
    bool is_interesting_region() {
        return is_interesting_region_;
    }

    // Mark the calling thread as inside an interesting region.
    void enter() {
#ifdef TMS_DEBUG_LOG
        std::cout << "[torch_memory_saver.cpp] tms_region_enter" << std::endl;
#endif
        is_interesting_region_ = true;
    }

    // Mark the calling thread as outside any interesting region.
    void leave() {
#ifdef TMS_DEBUG_LOG
        std::cout << "[torch_memory_saver.cpp] tms_region_leave" << std::endl;
#endif
        is_interesting_region_ = false;
    }
}
253
+
254
+ // ------------------------------------------------- entrypoints ------------------------------------------------
255
+
256
+ cudaError_t cudaMalloc(void **ptr, size_t size) {
257
+ if (RegionManager::is_interesting_region()) {
258
+ return TorchMemorySaver::instance().malloc(ptr, size);
259
+ } else {
260
+ return APIForwarder::call_real_cuda_malloc(ptr, size);
261
+ }
262
+ }
263
+
264
+ cudaError_t cudaFree(void *ptr) {
265
+ if (RegionManager::is_interesting_region()) {
266
+ return TorchMemorySaver::instance().free(ptr);
267
+ } else {
268
+ return APIForwarder::call_real_cuda_free(ptr);
269
+ }
270
+ }
271
+
272
// C ABI entry points called from python through ctypes
// (see torch_memory_saver/__init__.py: _global_info.cdll.tms_*).
extern "C" {
// Mark the calling thread as inside an interesting region.
void tms_region_enter() {
    RegionManager::enter();
}

// Mark the calling thread as outside any interesting region.
void tms_region_leave() {
    RegionManager::leave();
}

// Release physical memory of all region allocations (pointers stay valid).
void tms_pause() {
    TorchMemorySaver::instance().pause();
}

// Re-attach physical memory for all region allocations.
void tms_resume() {
    TorchMemorySaver::instance().resume();
}
}
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,19 @@
1
import os

from setuptools import setup
from torch.utils import cpp_extension

# Locate the CUDA toolkit: honor CUDA_HOME / CUDA_PATH when set (the same
# variables torch's cpp_extension consults) instead of hard-coding the
# conventional /usr/local/cuda prefix; the old default remains the fallback.
_cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH') or '/usr/local/cuda'

ext_module = cpp_extension.CppExtension(
    'torch_memory_saver_cpp',
    ['csrc/torch_memory_saver.cpp'],
    # cuda.h / cuda_runtime_api.h live in the toolkit's include directory.
    extra_compile_args=['-I' + os.path.join(_cuda_home, 'include')],
    # Link the CUDA driver API (cuMem* virtual-memory functions).
    extra_link_args=['-lcuda'],
)

setup(
    name='torch_memory_saver',
    version='0.0.1',
    # https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-a-c-extension
    ext_modules=[ext_module],
    cmdclass={'build_ext': cpp_extension.BuildExtension},
    python_requires=">=3.9",
    packages=['torch_memory_saver'],
)
@@ -0,0 +1,88 @@
1
+ import ctypes
2
+ import logging
3
+ import os
4
+ from contextlib import contextmanager
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ import torch
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class TorchMemorySaver:
    """Allocate tensors whose GPU memory can be released and later re-attached.

    Tensors created inside :meth:`region` come from a dedicated torch MemPool
    whose allocations are intercepted by the preloaded C library; ``pause``
    and ``resume`` act on all such allocations at once.
    """

    def __init__(self):
        self._mem_pool = torch.cuda.MemPool()
        self._id = _global_info.next_id()
        assert self._id == 1, 'Only support one single instance yet (multi-instance will be implemented later)'

    @contextmanager
    def region(self):
        """Context manager: tensors allocated inside it become pausable."""
        with torch.cuda.use_mem_pool(self._mem_pool):
            cdll = _global_info.cdll
            cdll.tms_region_enter()
            try:
                yield
            finally:
                cdll.tms_region_leave()

    def pause(self):
        """Release the physical GPU memory behind all region allocations."""
        _global_info.cdll.tms_pause()

    def resume(self):
        """Re-attach physical GPU memory for all region allocations."""
        _global_info.cdll.tms_resume()
33
+
34
+
35
+ class _GlobalInfo:
36
+ def __init__(self):
37
+ self._cdll: Optional[ctypes.CDLL] = None
38
+ self._last_id = 0
39
+
40
+ @property
41
+ def cdll(self):
42
+ if self._cdll is None:
43
+ self._cdll = _compute_cdll()
44
+ logger.debug(f'Use cdll={self._cdll}')
45
+ return self._cdll
46
+
47
+ def next_id(self):
48
+ self._last_id += 1
49
+ return self._last_id
50
+
51
+
52
+ _global_info = _GlobalInfo()
53
+
54
+
55
+ def _compute_cdll():
56
+ env_ld_preload = os.environ.get('LD_PRELOAD', '')
57
+ assert 'torch_memory_saver' in env_ld_preload, f'Please specify correct LD_PRELOAD (currently: {env_ld_preload})'
58
+ return ctypes.CDLL(env_ld_preload)
59
+
60
+
61
def get_binary_path():
    """Return the path of the compiled torch_memory_saver_cpp shared library.

    Looks in the package directory and its parent; asserts exactly one match.
    """
    package_dir = Path(__file__).parent
    candidates = []
    for directory in (package_dir, package_dir.parent):
        candidates.extend(directory.glob('torch_memory_saver_cpp.*.so'))
    assert len(candidates) == 1, f'{candidates=}'
    return candidates[0]
70
+
71
+
72
@contextmanager
def configure_subprocess():
    """Temporarily point LD_PRELOAD at our binary so subprocesses load it."""
    binary_path = str(get_binary_path())
    with change_env('LD_PRELOAD', binary_path):
        yield
76
+
77
+
78
@contextmanager
def change_env(key: str, value: str):
    """Temporarily set environment variable `key` to `value`.

    On exit the previous value is restored. If the variable did not exist
    before, it is removed again instead of being left behind as an empty
    string (the previous behavior). Asserts on exit that the body did not
    change the variable underneath us.
    """
    had_key = key in os.environ
    old_value = os.environ.get(key, '')
    os.environ[key] = value
    logger.debug(f'change_env set key={key} value={value}')
    try:
        yield
    finally:
        assert os.environ[key] == value
        if had_key:
            os.environ[key] = old_value
        else:
            # The key was originally absent: restoring '' would pollute the
            # environment, so delete it instead.
            del os.environ[key]
        logger.debug(f'change_env restore key={key} value={old_value}')
@@ -0,0 +1,5 @@
1
+ Metadata-Version: 2.1
2
+ Name: torch_memory_saver
3
+ Version: 0.0.1
4
+ Requires-Python: >=3.9
5
+ License-File: LICENSE
@@ -0,0 +1,9 @@
1
+ LICENSE
2
+ README.md
3
+ setup.py
4
+ csrc/torch_memory_saver.cpp
5
+ torch_memory_saver/__init__.py
6
+ torch_memory_saver.egg-info/PKG-INFO
7
+ torch_memory_saver.egg-info/SOURCES.txt
8
+ torch_memory_saver.egg-info/dependency_links.txt
9
+ torch_memory_saver.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ torch_memory_saver
2
+ torch_memory_saver_cpp