torch-memory-saver 0.0.9rc2__tar.gz → 0.0.9rc3__tar.gz

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (30)
  1. {torch_memory_saver-0.0.9rc2/torch_memory_saver.egg-info → torch_memory_saver-0.0.9rc3}/PKG-INFO +1 -1
  2. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/csrc/core.cpp +13 -156
  3. torch_memory_saver-0.0.9rc3/csrc/core.h +58 -0
  4. torch_memory_saver-0.0.9rc3/csrc/hardware_amd_support.h +58 -0
  5. torch_memory_saver-0.0.9rc3/csrc/macro.h +54 -0
  6. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/csrc/utils.h +4 -4
  7. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/setup.py +7 -2
  8. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3/torch_memory_saver.egg-info}/PKG-INFO +1 -1
  9. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/torch_memory_saver.egg-info/SOURCES.txt +1 -0
  10. torch_memory_saver-0.0.9rc2/csrc/core.h +0 -96
  11. torch_memory_saver-0.0.9rc2/csrc/macro.h +0 -40
  12. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/LICENSE +0 -0
  13. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/MANIFEST.in +0 -0
  14. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/README.md +0 -0
  15. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/csrc/api_forwarder.cpp +0 -0
  16. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/csrc/api_forwarder.h +0 -0
  17. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/csrc/entrypoint.cpp +0 -0
  18. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/setup.cfg +0 -0
  19. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/test/test_examples.py +0 -0
  20. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/__init__.py +0 -0
  21. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/binary_wrapper.py +0 -0
  22. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/entrypoint.py +0 -0
  23. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/hooks/__init__.py +0 -0
  24. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/hooks/base.py +0 -0
  25. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/hooks/mode_preload.py +0 -0
  26. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/hooks/mode_torch.py +0 -0
  27. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/testing_utils.py +0 -0
  28. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/utils.py +0 -0
  29. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/torch_memory_saver.egg-info/dependency_links.txt +0 -0
  30. {torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/torch_memory_saver.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: torch_memory_saver
3
- Version: 0.0.9rc2
3
+ Version: 0.0.9rc3
4
4
  Requires-Python: >=3.9
5
5
  License-File: LICENSE
6
6
  Dynamic: license-file
@@ -3,6 +3,10 @@
3
3
  #include "macro.h"
4
4
  #include "api_forwarder.h"
5
5
 
6
+ #if defined(USE_ROCM)
7
+ #include "hardware_amd_support.h"
8
+ #endif
9
+
6
10
  TorchMemorySaver::TorchMemorySaver() {}
7
11
 
8
12
  TorchMemorySaver &TorchMemorySaver::instance() {
@@ -12,85 +16,7 @@ TorchMemorySaver &TorchMemorySaver::instance() {
12
16
 
13
17
  cudaError_t TorchMemorySaver::malloc(void **ptr, CUdevice device, size_t size, const std::string& tag, const bool enable_cpu_backup) {
14
18
  #if defined(USE_ROCM)
15
- // hipDevice_t device;
16
- CURESULT_CHECK(hipCtxGetDevice(&device));
17
-
18
- // // Get granularity and calculate aligned size
19
- // size_t granularity = CUDAUtils::cu_mem_get_granularity(device);
20
- // size_t aligned_size = (size + granularity - 1) & ~(granularity - 1);
21
-
22
- // //// Reserve aligned memory address, rocm will check granularity
23
- // CURESULT_CHECK(hipMemAddressReserve((hipDeviceptr_t *)ptr, aligned_size, granularity, 0, 0));
24
-
25
- hipMemAllocationProp prop = {};
26
- prop.type = hipMemAllocationTypePinned;
27
- prop.location.type = hipMemLocationTypeDevice;
28
- prop.location.id = device;
29
- prop.allocFlags.compressionType = 0x0;
30
-
31
- size_t granularity;
32
- CURESULT_CHECK(hipMemGetAllocationGranularity(&granularity, &prop,
33
- hipMemAllocationGranularityMinimum));
34
- size_t aligned_size = ((size + granularity - 1) / granularity) * granularity;
35
- aligned_size = (aligned_size + MEMCREATE_CHUNK_SIZE - 1) / MEMCREATE_CHUNK_SIZE * MEMCREATE_CHUNK_SIZE;
36
-
37
- assert(MEMCREATE_CHUNK_SIZE % granularity == 0);
38
- assert(aligned_size % MEMCREATE_CHUNK_SIZE == 0);
39
- assert(aligned_size % granularity == 0);
40
-
41
-
42
- // Create allocation metadata
43
- AllocationMetadata metadata;
44
- metadata.size = size;
45
- metadata.aligned_size = aligned_size;
46
- metadata.device = device;
47
- //// Not sure (Check these parameters)
48
- metadata.tag = tag;
49
- metadata.enable_cpu_backup = enable_cpu_backup;
50
- metadata.cpu_backup = nullptr;
51
- ////
52
-
53
- // Get global device ID using our utility function
54
- int global_device_id = DeviceUtils::get_global_device_id(device);
55
-
56
- // rewrite numa node
57
- uint64_t node_id = 0;
58
- if (global_device_id > 3) {
59
- node_id = 1;
60
- }
61
-
62
- #ifdef TMS_DEBUG_LOG
63
- std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.cuda_malloc "
64
- << " ptr=" << ptr << " *ptr=" << *ptr << " size=" << size
65
- << " granularity=" << granularity
66
- << " aligned_size=" << aligned_size
67
- << " node_id=" << node_id
68
- << " device=" << device
69
- << " global_device_id=" << global_device_id
70
- << std::endl;
71
- #endif
72
-
73
- hipDeviceptr_t d_mem;
74
- // Reserve aligned memory address, rocm will check granularity
75
- CURESULT_CHECK(hipMemAddressReserve(&d_mem, aligned_size, granularity, 0, node_id));
76
- *ptr = (void*)d_mem;
77
-
78
- // Create and map chunks
79
- // CUDAUtils::cu_mem_create_and_map(device, size, (hipDeviceptr_t)*ptr,
80
- CUDAUtils::cu_mem_create_and_map(device, aligned_size, (hipDeviceptr_t)*ptr,
81
- metadata.allocHandles, metadata.chunk_sizes);
82
- size_t num_chunks = metadata.allocHandles.size();
83
- {
84
- const std::lock_guard<std::mutex> lock(allocator_metadata_mutex_);
85
- allocation_metadata_.emplace(*ptr, std::move(metadata));
86
- }
87
- #ifdef TMS_DEBUG_LOG
88
- std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.cuda_malloc "
89
- << " ptr=" << ptr << " *ptr=" << *ptr << " size=" << size
90
- << " metadata.aligned_size=" << metadata.aligned_size
91
- << " num_chunks=" << num_chunks
92
- << std::endl;
93
- #endif
19
+ return ROCmHIPImplementation::rocm_malloc(ptr, device, size, tag, enable_cpu_backup, allocation_metadata_, allocator_metadata_mutex_);
94
20
 
95
21
  #elif defined(USE_CUDA)
96
22
  CUmemGenericAllocationHandle allocHandle;
@@ -122,28 +48,8 @@ cudaError_t TorchMemorySaver::malloc(void **ptr, CUdevice device, size_t size, c
122
48
 
123
49
  cudaError_t TorchMemorySaver::free(void *ptr) {
124
50
  #if defined(USE_ROCM)
125
- AllocationMetadata metadata;
126
- {
127
- const std::lock_guard<std::mutex> lock(allocator_metadata_mutex_);
128
- SIMPLE_CHECK(allocation_metadata_.count(ptr), "Trying to free a pointer not allocated here");
129
- metadata = std::move(allocation_metadata_[ptr]);
130
- allocation_metadata_.erase(ptr);
131
- }
132
-
133
- // Unmap and release chunks
134
- CUDAUtils::cu_mem_unmap_and_release(metadata.device, metadata.size,
135
- (hipDeviceptr_t)ptr, metadata.allocHandles, metadata.chunk_sizes);
51
+ return ROCmHIPImplementation::rocm_free(ptr, allocation_metadata_, allocator_metadata_mutex_);
136
52
 
137
- // Free the reserved address using stored aligned_size
138
- CURESULT_CHECK(hipMemAddressFree((hipDeviceptr_t)ptr, metadata.aligned_size));
139
-
140
- #ifdef TMS_DEBUG_LOG
141
- std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.cuda_free "
142
- << " ptr=" << ptr << " metadata.size=" << metadata.size
143
- << " metadata.aligned_size=" << metadata.aligned_size
144
- << " num_chunks=" << metadata.allocHandles.size()
145
- << std::endl;
146
- #endif
147
53
  #elif defined(USE_CUDA)
148
54
  AllocationMetadata metadata;
149
55
  {
@@ -179,41 +85,12 @@ cudaError_t TorchMemorySaver::free(void *ptr) {
179
85
  }
180
86
 
181
87
  void TorchMemorySaver::pause(const std::string& tag) {
182
- const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
183
-
184
88
  #if defined(USE_ROCM)
185
- for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
186
- void *ptr = it->first;
187
- AllocationMetadata &metadata = it->second;
89
+ ROCmHIPImplementation::rocm_pause(tag, allocation_metadata_, allocator_metadata_mutex_);
188
90
 
189
- if (!tag.empty() && metadata.tag != tag) {
190
- continue;
191
- }
192
- // Copy CUDA's code supporting cpu_backup to here
193
- if (metadata.enable_cpu_backup) {
194
- if (nullptr == metadata.cpu_backup) {
195
- CUDA_ERROR_CHECK(hipMallocHost(&metadata.cpu_backup, metadata.aligned_size));
196
- }
197
- SIMPLE_CHECK(metadata.cpu_backup != nullptr, "cpu_backup should not be nullptr");
198
- // TODO may use cudaMemcpyAsync if needed
199
- CUDA_ERROR_CHECK(cudaMemcpy(metadata.cpu_backup, ptr, metadata.aligned_size, hipMemcpyDeviceToHost));
200
- }
201
- //
202
-
203
- // Unmap and release chunks (but keep metadata for resume)
204
- // CUDAUtils::cu_mem_unmap_and_release(metadata.device, metadata.size,
205
- CUDAUtils::cu_mem_unmap_and_release(metadata.device, metadata.aligned_size,
206
- (hipDeviceptr_t)ptr, metadata.allocHandles, metadata.chunk_sizes);
207
-
208
- #ifdef TMS_DEBUG_LOG
209
- std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.pause"
210
- << " ptr=" << ptr << " metadata.size=" << metadata.size
211
- << " metadata.aligned_size=" << metadata.aligned_size
212
- << " num_chunks=" << metadata.allocHandles.size()
213
- << std::endl;
214
- #endif
215
- }
216
91
  #elif defined(USE_CUDA)
92
+ const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
93
+
217
94
  for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
218
95
  void *ptr = it->first;
219
96
  AllocationMetadata& metadata = it->second;
@@ -258,32 +135,12 @@ void TorchMemorySaver::pause(const std::string& tag) {
258
135
  }
259
136
 
260
137
  void TorchMemorySaver::resume(const std::string& tag) {
261
- const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
262
-
263
138
  #if defined(USE_ROCM)
264
- for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
265
- void *ptr = it->first;
266
- AllocationMetadata &metadata = it->second;
267
-
268
- if (!tag.empty() && metadata.tag != tag) {
269
- continue;
270
- }
271
-
272
- // Create new handles and map chunks
273
- // CUDAUtils::cu_mem_create_and_map(metadata.device, metadata.size,
274
- CUDAUtils::cu_mem_create_and_map(metadata.device, metadata.aligned_size,
275
- (hipDeviceptr_t)ptr, metadata.allocHandles, metadata.chunk_sizes);
276
-
277
- #ifdef TMS_DEBUG_LOG
278
- std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.resume"
279
- << " ptr=" << ptr << " metadata.size=" << metadata.size
280
- << " metadata.aligned_size=" << metadata.aligned_size
281
- << " num_chunks=" << metadata.allocHandles.size()
282
- << std::endl;
283
- #endif
284
- }
139
+ ROCmHIPImplementation::rocm_resume(tag, allocation_metadata_, allocator_metadata_mutex_);
285
140
 
286
141
  #elif defined(USE_CUDA)
142
+ const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
143
+
287
144
  for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
288
145
  void *ptr = it->first;
289
146
  AllocationMetadata &metadata = it->second;
@@ -329,4 +186,4 @@ void TorchMemorySaver::resume(const std::string& tag) {
329
186
  #else
330
187
  #error "USE_PLATFORM is not set"
331
188
  #endif
332
- }
189
+ }
@@ -0,0 +1,58 @@
1
+ #pragma once
2
+ #include <sys/types.h>
3
+ #include <stdio.h>
4
+ #include <unordered_map>
5
+ #include <mutex>
6
+ #include <string>
7
+ #include "utils.h"
8
+ #include "macro.h"
9
+
10
+ #if defined(USE_ROCM)
11
+ #include "hardware_amd_support.h"
12
+ #endif
13
+
14
+ enum class AllocationState {
15
+ // Memory is mapped and accessible
16
+ ACTIVE,
17
+ // Memory is unmapped and inaccessible
18
+ PAUSED
19
+ };
20
+
21
+ struct AllocationMetadata {
22
+ size_t size;
23
+ CUdevice device;
24
+ std::string tag;
25
+ AllocationState state;
26
+ bool enable_cpu_backup;
27
+ void* cpu_backup;
28
+
29
+ #if defined(USE_CUDA)
30
+ CUmemGenericAllocationHandle allocHandle;
31
+ #elif defined(USE_ROCM)
32
+ size_t aligned_size;
33
+ std::vector<hipMemGenericAllocationHandle_t> allocHandles;
34
+ std::vector<size_t> chunk_sizes;
35
+ #else
36
+ #error "USE_PLATFORM is not set"
37
+ #endif
38
+ };
39
+
40
+ class TorchMemorySaver {
41
+ public:
42
+ static TorchMemorySaver& instance();
43
+
44
+ cudaError_t malloc(void** ptr, CUdevice device, size_t size, const std::string& tag, bool enable_cpu_backup);
45
+ cudaError_t free(void* ptr);
46
+
47
+ void pause(const std::string& tag);
48
+ void resume(const std::string& tag);
49
+
50
+ private:
51
+ TorchMemorySaver();
52
+ ~TorchMemorySaver() = default;
53
+ TorchMemorySaver(const TorchMemorySaver&) = delete;
54
+ TorchMemorySaver& operator=(const TorchMemorySaver&) = delete;
55
+
56
+ std::mutex allocator_metadata_mutex_;
57
+ std::unordered_map<void*, AllocationMetadata> allocation_metadata_;
58
+ };
@@ -0,0 +1,58 @@
1
+ #pragma once
2
+ #include "macro.h"
3
+ #include "utils.h"
4
+ #include <vector>
5
+ #include <string>
6
+ #include <sstream>
7
+ #include <cstdlib>
8
+ #include <unordered_map>
9
+ #include <mutex>
10
+
11
+ #if defined(USE_ROCM)
12
+
13
+ // Forward declaration
14
+ enum class AllocationState;
15
+ struct AllocationMetadata;
16
+
17
+ // Device utility functions for ROCm
18
+ namespace DeviceUtils {
19
+ // Get global device ID from local device ID
20
+ int get_global_device_id(hipDevice_t local_device_id);
21
+ }
22
+
23
+ // High-level ROCm implementation functions
24
+ namespace ROCmHIPImplementation {
25
+ // Malloc implementation for ROCm
26
+ cudaError_t rocm_malloc(
27
+ void **ptr,
28
+ CUdevice device,
29
+ size_t size,
30
+ const std::string& tag,
31
+ bool enable_cpu_backup,
32
+ std::unordered_map<void*, AllocationMetadata>& allocation_metadata,
33
+ std::mutex& allocator_metadata_mutex
34
+ );
35
+
36
+ // Free implementation for ROCm
37
+ cudaError_t rocm_free(
38
+ void *ptr,
39
+ std::unordered_map<void*, AllocationMetadata>& allocation_metadata,
40
+ std::mutex& allocator_metadata_mutex
41
+ );
42
+
43
+ // Pause implementation for ROCm
44
+ void rocm_pause(
45
+ const std::string& tag,
46
+ std::unordered_map<void*, AllocationMetadata>& allocation_metadata,
47
+ std::mutex& allocator_metadata_mutex
48
+ );
49
+
50
+ // Resume implementation for ROCm
51
+ void rocm_resume(
52
+ const std::string& tag,
53
+ std::unordered_map<void*, AllocationMetadata>& allocation_metadata,
54
+ std::mutex& allocator_metadata_mutex
55
+ );
56
+ }
57
+
58
+ #endif // USE_ROCM
@@ -0,0 +1,54 @@
1
+ #pragma once
2
+
3
+ // Define platform macros and include appropriate headers
4
+ #if defined(USE_ROCM)
5
+ // Include HIP runtime headers for AMD ROCm platform
6
+ #include <hip/hip_runtime_api.h>
7
+ #include <hip/hip_runtime.h>
8
+ #include <sstream>
9
+ #include <cstdlib>
10
+ #include <cstring>
11
+ #include <cassert>
12
+ /*
13
+ * ROCm API Mapping References:
14
+ * - CUDA Driver API to HIP: https://rocm.docs.amd.com/projects/HIPIFY/en/latest/reference/tables/CUDA_Driver_API_functions_supported_by_HIP.html
15
+ * - CUDA Runtime API to HIP: https://rocm.docs.amd.com/projects/HIPIFY/en/latest/reference/tables/CUDA_Runtime_API_functions_supported_by_HIP.html
16
+ */
17
+ // --- Error Handling Types and Constants ---
18
+ #define CUresult hipError_t
19
+ #define cudaError_t hipError_t
20
+ #define CUDA_SUCCESS hipSuccess
21
+ #define cudaSuccess hipSuccess
22
+ // --- Error Reporting Functions ---
23
+ #define cuGetErrorString hipDrvGetErrorString
24
+ #define cudaGetErrorString hipGetErrorString
25
+ // --- Memory Management Functions ---
26
+ #define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
27
+ #define cuMemUnmap hipMemUnmap
28
+ #define cuMemRelease hipMemRelease
29
+ #define cudaMallocHost hipHostMalloc
30
+ #define cudaMemcpy hipMemcpy
31
+ // --- Memory Copy Direction Constants ---
32
+ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
33
+ // --- Device and Stream Types ---
34
+ #define CUdevice hipDevice_t
35
+ #define cudaStream_t hipStream_t
36
+ // --- Memory Allocation Constants ---
37
+ // Chunk size for memory creation operations (2 MB)
38
+ #define MEMCREATE_CHUNK_SIZE (2 * 1024 * 1024)
39
+ // --- Utility Macros ---
40
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
41
+
42
+ // ============================================================================
43
+ // CUDA Platform Configuration (NVIDIA GPUs)
44
+ // ============================================================================
45
+ #elif defined(USE_CUDA)
46
+ #include <cuda_runtime_api.h>
47
+ #include <cuda.h>
48
+
49
+ // ============================================================================
50
+ // Error: No Platform Specified
51
+ // ============================================================================
52
+ #else
53
+ #error "USE_PLATFORM is not set"
54
+ #endif
@@ -57,11 +57,11 @@
57
57
  namespace CUDAUtils {
58
58
  #if defined(USE_ROCM)
59
59
 
60
- #if HIP_VERSION >= 60402000 // rocm/hip 6.4.2
61
- #pragma message "Using ROCm/HIP 6.4.2+ implementation"
62
- // Implement when rocm release >= 6.4.2 version
60
+ #if HIP_VERSION < 60304000 // rocm/hip 6.3.4
61
+ #pragma message "You need to implement torch_memory_saver in ROCm/HIP 6.3.4 or lower. We did not support it currently."
63
62
  #else
64
- #pragma message "Using ROCm/HIP < 6.4.2 implementation"
63
+ // From rocm-7.0 onward, torch_memory_saver can presumably be implemented the same way as on the CUDA side (TODO: needs verification).
64
+ #pragma message "Using ROCm/HIP >= 6.4.2 implementation"
65
65
  // hipMemCreate currently has an issue in rocm-6.3.4. Once it is fixed in rocm-7.0, torch_memory_saver can be implemented the same way as on the CUDA side.
66
66
  // Currently, we implement it with a chunk-wise method.
67
67
  static void cu_mem_create_and_map(hipDevice_t device,
@@ -87,11 +87,17 @@ def _create_ext_modules(platform):
87
87
 
88
88
  # Common define macros
89
89
  common_macros = [('Py_LIMITED_API', '0x03090000')]
90
+
91
+ # Common compile arguments
92
+ extra_compile_args = ['-std=c++17', '-O3']
90
93
 
91
94
  # Platform-specific configurations
92
95
  platform_home = Path(_find_platform_home(platform))
93
96
 
94
97
  if platform == "hip":
98
+ # Add ROCm-specific source file
99
+ sources.append('csrc/hardware_amd_support.cpp')
100
+
95
101
  include_dirs = [str(platform_home.resolve() / 'include')]
96
102
  library_dirs = [str(platform_home.resolve() / 'lib')]
97
103
  libraries = ['amdhip64', 'dl']
@@ -104,7 +110,6 @@ def _create_ext_modules(platform):
104
110
  ]
105
111
  libraries = ['cuda', 'cudart']
106
112
  platform_macros = [('USE_CUDA', '1')]
107
- extra_compile_args = ['-std=c++17', '-O3']
108
113
 
109
114
  # Create extensions with different hook modes
110
115
  ext_modules = [
@@ -146,7 +151,7 @@ class build_ext_for_platform(build_platform_ext):
146
151
 
147
152
  setup(
148
153
  name='torch_memory_saver',
149
- version='0.0.9rc2',
154
+ version='0.0.9rc3',
150
155
  ext_modules=ext_modules,
151
156
  cmdclass={'build_ext': build_ext_for_platform},
152
157
  python_requires=">=3.9",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: torch_memory_saver
3
- Version: 0.0.9rc2
3
+ Version: 0.0.9rc3
4
4
  Requires-Python: >=3.9
5
5
  License-File: LICENSE
6
6
  Dynamic: license-file
@@ -7,6 +7,7 @@ csrc/api_forwarder.h
7
7
  csrc/core.cpp
8
8
  csrc/core.h
9
9
  csrc/entrypoint.cpp
10
+ csrc/hardware_amd_support.h
10
11
  csrc/macro.h
11
12
  csrc/utils.h
12
13
  test/test_examples.py
@@ -1,96 +0,0 @@
1
- #pragma once
2
- #include <sys/types.h>
3
- #include <stdio.h>
4
- #include <unordered_map>
5
- #include <mutex>
6
- #include <string>
7
- #include "utils.h"
8
- #include "macro.h"
9
-
10
- enum class AllocationState {
11
- // Memory is mapped and accessible
12
- ACTIVE,
13
- // Memory is unmapped and inaccessible
14
- PAUSED
15
- };
16
-
17
- struct AllocationMetadata {
18
- size_t size;
19
- CUdevice device;
20
- std::string tag;
21
- AllocationState state;
22
- bool enable_cpu_backup;
23
- void* cpu_backup;
24
-
25
- #if defined(USE_CUDA)
26
- CUmemGenericAllocationHandle allocHandle;
27
- #elif defined(USE_ROCM)
28
- size_t aligned_size;
29
- std::vector<hipMemGenericAllocationHandle_t> allocHandles;
30
- std::vector<size_t> chunk_sizes;
31
- #else
32
- #error "USE_PLATFORM is not set"
33
- #endif
34
- };
35
-
36
- #if defined(USE_ROCM)
37
- namespace DeviceUtils {
38
- // Simple function to get global device ID from local device ID
39
- static int get_global_device_id(hipDevice_t local_device_id) {
40
- // Check for HIP_VISIBLE_DEVICES environment variable
41
- const char* hip_visible = std::getenv("HIP_VISIBLE_DEVICES");
42
-
43
- if (hip_visible && strlen(hip_visible) > 0) {
44
- std::string devices_str(hip_visible);
45
- std::stringstream ss(devices_str);
46
- std::string device_str;
47
- std::vector<int> device_list;
48
-
49
- // Parse comma-separated device list
50
- while (std::getline(ss, device_str, ',')) {
51
- if (!device_str.empty()) {
52
- device_list.push_back(std::atoi(device_str.c_str()));
53
- }
54
- }
55
-
56
- if (local_device_id < device_list.size()) {
57
- int global_device_id = device_list[local_device_id];
58
- #ifdef TMS_DEBUG_LOG
59
- std::cout << "[torch_memory_saver.cpp] HIP_VISIBLE_DEVICES=" << hip_visible
60
- << " local_device_id=" << local_device_id
61
- << " -> global_device_id=" << global_device_id << std::endl;
62
- #endif
63
- return global_device_id;
64
- }
65
- }
66
-
67
- // Fallback: return local device ID as-is
68
- #ifdef TMS_DEBUG_LOG
69
- std::cout << "[torch_memory_saver.cpp] No HIP_VISIBLE_DEVICES, using local_device_id=" << local_device_id << std::endl;
70
- #endif
71
- return local_device_id;
72
- }
73
- }
74
- #endif
75
-
76
-
77
-
78
- class TorchMemorySaver {
79
- public:
80
- static TorchMemorySaver& instance();
81
-
82
- cudaError_t malloc(void** ptr, CUdevice device, size_t size, const std::string& tag, bool enable_cpu_backup);
83
- cudaError_t free(void* ptr);
84
-
85
- void pause(const std::string& tag);
86
- void resume(const std::string& tag);
87
-
88
- private:
89
- TorchMemorySaver();
90
- ~TorchMemorySaver() = default;
91
- TorchMemorySaver(const TorchMemorySaver&) = delete;
92
- TorchMemorySaver& operator=(const TorchMemorySaver&) = delete;
93
-
94
- std::mutex allocator_metadata_mutex_;
95
- std::unordered_map<void*, AllocationMetadata> allocation_metadata_;
96
- };
@@ -1,40 +0,0 @@
1
- #pragma once
2
-
3
- // Define platform macros and include appropriate headers
4
- #if defined(USE_ROCM)
5
- // Lookup the table to define the macros: https://rocm.docs.amd.com/projects/HIPIFY/en/latest/reference/tables/CUDA_Driver_API_functions_supported_by_HIP.html
6
- // Lookup the table to define the macros: https://rocm.docs.amd.com/projects/HIPIFY/en/latest/reference/tables/CUDA_Runtime_API_functions_supported_by_HIP.html?utm_source=chatgpt.com
7
- #include <hip/hip_runtime_api.h>
8
- #include <hip/hip_runtime.h>
9
- #include <sstream>
10
- #include <cstdlib>
11
- // Define a general alias
12
- #define CUresult hipError_t
13
- #define cudaError_t hipError_t
14
- #define CUDA_SUCCESS hipSuccess
15
- #define cudaSuccess hipSuccess
16
- #define cuGetErrorString hipDrvGetErrorString
17
- #define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
18
- #define CUdevice hipDevice_t
19
- #define cudaStream_t hipStream_t
20
- #define cudaMallocHost hipHostMalloc
21
- #define cudaMemcpy hipMemcpy
22
- #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
23
- #define cuGetErrorString hipDrvGetErrorString
24
- #define cudaGetErrorString hipGetErrorString
25
- #define cuMemUnmap hipMemUnmap
26
- #define cuMemRelease hipMemRelease
27
- // #define cudaMalloc hipMalloc
28
- // #define cudaFree hipFree
29
- // #define CUdevice hipDevice_t
30
- // #define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
31
- #define MEMCREATE_CHUNK_SIZE (2 * 1024 * 1024)
32
- #define MIN(a, b) (a < b ? a : b)
33
-
34
- #elif defined(USE_CUDA)
35
- #include <cuda_runtime_api.h>
36
- #include <cuda.h>
37
-
38
- #else
39
- #error "USE_PLATFORM is not set"
40
- #endif