faiss 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +7 -7
  5. data/ext/faiss/extconf.rb +6 -3
  6. data/ext/faiss/numo.hpp +4 -4
  7. data/ext/faiss/utils.cpp +1 -1
  8. data/ext/faiss/utils.h +1 -1
  9. data/lib/faiss/version.rb +1 -1
  10. data/vendor/faiss/faiss/AutoTune.cpp +292 -291
  11. data/vendor/faiss/faiss/AutoTune.h +55 -56
  12. data/vendor/faiss/faiss/Clustering.cpp +365 -194
  13. data/vendor/faiss/faiss/Clustering.h +102 -35
  14. data/vendor/faiss/faiss/IVFlib.cpp +171 -195
  15. data/vendor/faiss/faiss/IVFlib.h +48 -51
  16. data/vendor/faiss/faiss/Index.cpp +85 -103
  17. data/vendor/faiss/faiss/Index.h +54 -48
  18. data/vendor/faiss/faiss/Index2Layer.cpp +126 -224
  19. data/vendor/faiss/faiss/Index2Layer.h +22 -36
  20. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +407 -0
  21. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +195 -0
  22. data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
  23. data/vendor/faiss/faiss/IndexBinary.h +140 -132
  24. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
  25. data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
  26. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
  27. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
  28. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
  29. data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
  30. data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
  31. data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
  32. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
  33. data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
  34. data/vendor/faiss/faiss/IndexFlat.cpp +115 -176
  35. data/vendor/faiss/faiss/IndexFlat.h +42 -59
  36. data/vendor/faiss/faiss/IndexFlatCodes.cpp +67 -0
  37. data/vendor/faiss/faiss/IndexFlatCodes.h +47 -0
  38. data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
  39. data/vendor/faiss/faiss/IndexHNSW.h +57 -41
  40. data/vendor/faiss/faiss/IndexIVF.cpp +545 -453
  41. data/vendor/faiss/faiss/IndexIVF.h +169 -118
  42. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +316 -0
  43. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +121 -0
  44. data/vendor/faiss/faiss/IndexIVFFlat.cpp +247 -252
  45. data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
  46. data/vendor/faiss/faiss/IndexIVFPQ.cpp +459 -517
  47. data/vendor/faiss/faiss/IndexIVFPQ.h +75 -67
  48. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
  49. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
  50. data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
  51. data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
  52. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +163 -150
  53. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +38 -25
  54. data/vendor/faiss/faiss/IndexLSH.cpp +66 -113
  55. data/vendor/faiss/faiss/IndexLSH.h +20 -38
  56. data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
  57. data/vendor/faiss/faiss/IndexLattice.h +11 -16
  58. data/vendor/faiss/faiss/IndexNNDescent.cpp +229 -0
  59. data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
  60. data/vendor/faiss/faiss/IndexNSG.cpp +301 -0
  61. data/vendor/faiss/faiss/IndexNSG.h +85 -0
  62. data/vendor/faiss/faiss/IndexPQ.cpp +387 -495
  63. data/vendor/faiss/faiss/IndexPQ.h +64 -82
  64. data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
  65. data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
  66. data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
  67. data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
  68. data/vendor/faiss/faiss/IndexRefine.cpp +139 -127
  69. data/vendor/faiss/faiss/IndexRefine.h +32 -23
  70. data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
  71. data/vendor/faiss/faiss/IndexReplicas.h +62 -56
  72. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +111 -172
  73. data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -59
  74. data/vendor/faiss/faiss/IndexShards.cpp +256 -240
  75. data/vendor/faiss/faiss/IndexShards.h +85 -73
  76. data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
  77. data/vendor/faiss/faiss/MatrixStats.h +7 -10
  78. data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
  79. data/vendor/faiss/faiss/MetaIndexes.h +40 -34
  80. data/vendor/faiss/faiss/MetricType.h +7 -7
  81. data/vendor/faiss/faiss/VectorTransform.cpp +654 -475
  82. data/vendor/faiss/faiss/VectorTransform.h +64 -89
  83. data/vendor/faiss/faiss/clone_index.cpp +78 -73
  84. data/vendor/faiss/faiss/clone_index.h +4 -9
  85. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
  86. data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
  87. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +198 -171
  88. data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
  89. data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
  90. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
  91. data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
  92. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
  93. data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
  94. data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
  95. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
  96. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
  97. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
  98. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
  99. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
  100. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
  101. data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
  102. data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
  103. data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
  104. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
  105. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
  106. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
  107. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
  108. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
  109. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
  110. data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
  111. data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
  112. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
  113. data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
  114. data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
  115. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
  116. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
  117. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
  118. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
  119. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
  120. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
  121. data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
  122. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
  123. data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
  124. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
  125. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
  126. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
  127. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
  128. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
  129. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
  130. data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
  131. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +503 -0
  132. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +175 -0
  133. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
  134. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
  135. data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
  136. data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
  137. data/vendor/faiss/faiss/impl/FaissException.h +41 -29
  138. data/vendor/faiss/faiss/impl/HNSW.cpp +606 -617
  139. data/vendor/faiss/faiss/impl/HNSW.h +179 -200
  140. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +855 -0
  141. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +244 -0
  142. data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
  143. data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
  144. data/vendor/faiss/faiss/impl/NSG.cpp +679 -0
  145. data/vendor/faiss/faiss/impl/NSG.h +199 -0
  146. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
  147. data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
  148. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
  149. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
  150. data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
  151. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +758 -0
  152. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +188 -0
  153. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
  154. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +647 -707
  155. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
  156. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
  157. data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
  158. data/vendor/faiss/faiss/impl/index_read.cpp +631 -480
  159. data/vendor/faiss/faiss/impl/index_write.cpp +547 -407
  160. data/vendor/faiss/faiss/impl/io.cpp +76 -95
  161. data/vendor/faiss/faiss/impl/io.h +31 -41
  162. data/vendor/faiss/faiss/impl/io_macros.h +60 -29
  163. data/vendor/faiss/faiss/impl/kmeans1d.cpp +301 -0
  164. data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
  165. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
  166. data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
  167. data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
  168. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
  169. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
  170. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
  171. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
  172. data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
  173. data/vendor/faiss/faiss/index_factory.cpp +619 -397
  174. data/vendor/faiss/faiss/index_factory.h +8 -6
  175. data/vendor/faiss/faiss/index_io.h +23 -26
  176. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
  177. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
  178. data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
  179. data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
  180. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
  181. data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
  182. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
  183. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
  184. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
  185. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
  186. data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
  187. data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
  188. data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
  189. data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
  190. data/vendor/faiss/faiss/utils/Heap.h +186 -209
  191. data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
  192. data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
  193. data/vendor/faiss/faiss/utils/distances.cpp +305 -312
  194. data/vendor/faiss/faiss/utils/distances.h +170 -122
  195. data/vendor/faiss/faiss/utils/distances_simd.cpp +498 -508
  196. data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
  197. data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
  198. data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
  199. data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
  200. data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
  201. data/vendor/faiss/faiss/utils/hamming.h +62 -85
  202. data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
  203. data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
  204. data/vendor/faiss/faiss/utils/partitioning.h +26 -21
  205. data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
  206. data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
  207. data/vendor/faiss/faiss/utils/random.cpp +39 -63
  208. data/vendor/faiss/faiss/utils/random.h +13 -16
  209. data/vendor/faiss/faiss/utils/simdlib.h +4 -2
  210. data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
  211. data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
  212. data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
  213. data/vendor/faiss/faiss/utils/utils.cpp +304 -287
  214. data/vendor/faiss/faiss/utils/utils.h +54 -49
  215. metadata +29 -4
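
Of the 215 files listed, only the diff for data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp (entry 104) is reproduced below. Note that several entries are entirely new in this release: the vendored faiss sources gain additive quantizers (AdditiveQuantizer, ResidualQuantizer, LocalSearchQuantizer), graph-based indexes (IndexNSG, IndexNNDescent), a 1-D k-means solver (kmeans1d), and a NEON port of the SIMD library (simdlib_neon.h). For orientation only, here is a minimal sketch of driving one of the newly vendored graph indexes through the common faiss C++ Index API; the dimensionality, data, and graph degree R = 32 are illustrative placeholders, not values taken from this diff:

#include <faiss/IndexNSG.h>

#include <random>
#include <vector>

int main() {
    int d = 64;    // vector dimensionality (placeholder)
    int n = 10000; // database size (placeholder)

    // Random demo data; any row-major n x d float buffer works
    std::vector<float> xb(size_t(n) * d);
    std::mt19937 rng(123);
    std::uniform_real_distribution<float> dist;
    for (auto& v : xb) {
        v = dist(rng);
    }

    // IndexNSGFlat is among the index types newly vendored in 0.2.4;
    // R = 32 is the NSG graph degree
    faiss::IndexNSGFlat index(d, 32);
    index.add(n, xb.data()); // builds the NSG graph over the added vectors

    int k = 5;
    std::vector<float> distances(k);
    std::vector<faiss::Index::idx_t> labels(k);
    index.search(1, xb.data(), k, distances.data(), labels.data());
    return 0;
}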
@@ -5,16 +5,16 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-
 #include <faiss/gpu/StandardGpuResources.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/gpu/utils/StaticUtils.h>
 #include <faiss/impl/FaissAssert.h>
-#include <limits>
 #include <iostream>
+#include <limits>
 #include <sstream>
 
-namespace faiss { namespace gpu {
+namespace faiss {
+namespace gpu {
 
 namespace {
 
@@ -22,513 +22,536 @@ namespace {
 constexpr int kNumStreams = 2;
 
 // Use 256 MiB of pinned memory for async CPU <-> GPU copies by default
-constexpr size_t kDefaultPinnedMemoryAllocation = (size_t) 256 * 1024 * 1024;
+constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024;
 
 // Default temporary memory allocation for <= 4 GiB memory GPUs
-constexpr size_t k4GiBTempMem = (size_t) 512 * 1024 * 1024;
+constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024;
 
 // Default temporary memory allocation for <= 8 GiB memory GPUs
-constexpr size_t k8GiBTempMem = (size_t) 1024 * 1024 * 1024;
+constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024;
 
 // Maximum temporary memory allocation for all GPUs
-constexpr size_t kMaxTempMem = (size_t) 1536 * 1024 * 1024;
+constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024;
 
 std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
-  // Produce a sorted list of all outstanding allocations by type
-  std::unordered_map<AllocType, std::pair<int, size_t>> stats;
-
-  for (auto& entry : map) {
-    auto& a = entry.second;
-
-    auto it = stats.find(a.type);
-    if (it != stats.end()) {
-      stats[a.type].first++;
-      stats[a.type].second += a.size;
-    } else {
-      stats[a.type] = std::make_pair(1, a.size);
+    // Produce a sorted list of all outstanding allocations by type
+    std::unordered_map<AllocType, std::pair<int, size_t>> stats;
+
+    for (auto& entry : map) {
+        auto& a = entry.second;
+
+        auto it = stats.find(a.type);
+        if (it != stats.end()) {
+            stats[a.type].first++;
+            stats[a.type].second += a.size;
+        } else {
+            stats[a.type] = std::make_pair(1, a.size);
+        }
     }
-  }
 
-  std::stringstream ss;
-  for (auto& entry : stats) {
-    ss << "Alloc type " << allocTypeToString(entry.first) << ": "
-       << entry.second.first << " allocations, "
-       << entry.second.second << " bytes\n";
-  }
+    std::stringstream ss;
+    for (auto& entry : stats) {
+        ss << "Alloc type " << allocTypeToString(entry.first) << ": "
+           << entry.second.first << " allocations, " << entry.second.second
+           << " bytes\n";
+    }
 
-  return ss.str();
+    return ss.str();
 }
 
-}
+} // namespace
 
 //
 // StandardGpuResourcesImpl
 //
 
-StandardGpuResourcesImpl::StandardGpuResourcesImpl() :
-    pinnedMemAlloc_(nullptr),
-    pinnedMemAllocSize_(0),
-    // let the adjustment function determine the memory size for us by passing
-    // in a huge value that will then be adjusted
-    tempMemSize_(getDefaultTempMemForGPU(-1,
-                                         std::numeric_limits<size_t>::max())),
-    pinnedMemSize_(kDefaultPinnedMemoryAllocation),
-    allocLogging_(false) {
-}
+StandardGpuResourcesImpl::StandardGpuResourcesImpl()
+        : pinnedMemAlloc_(nullptr),
+          pinnedMemAllocSize_(0),
+          // let the adjustment function determine the memory size for us by
+          // passing in a huge value that will then be adjusted
+          tempMemSize_(getDefaultTempMemForGPU(
+                  -1,
+                  std::numeric_limits<size_t>::max())),
+          pinnedMemSize_(kDefaultPinnedMemoryAllocation),
+          allocLogging_(false) {}
 
 StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
-  // The temporary memory allocator has allocated memory through us, so clean
-  // that up before we finish fully de-initializing ourselves
-  tempMemory_.clear();
-
-  // Make sure all allocations have been freed
-  bool allocError = false;
-
-  for (auto& entry : allocs_) {
-    auto& map = entry.second;
-
-    if (!map.empty()) {
-      std::cerr
-        << "StandardGpuResources destroyed with allocations outstanding:\n"
-        << "Device " << entry.first << " outstanding allocations:\n";
-      std::cerr << allocsToString(map);
-      allocError = true;
+    // The temporary memory allocator has allocated memory through us, so clean
+    // that up before we finish fully de-initializing ourselves
+    tempMemory_.clear();
+
+    // Make sure all allocations have been freed
+    bool allocError = false;
+
+    for (auto& entry : allocs_) {
+        auto& map = entry.second;
+
+        if (!map.empty()) {
+            std::cerr
+                    << "StandardGpuResources destroyed with allocations outstanding:\n"
+                    << "Device " << entry.first
+                    << " outstanding allocations:\n";
+            std::cerr << allocsToString(map);
+            allocError = true;
+        }
     }
-  }
 
-  FAISS_ASSERT_MSG(!allocError, "GPU memory allocations not properly cleaned up");
+    FAISS_ASSERT_MSG(
+            !allocError, "GPU memory allocations not properly cleaned up");
 
-  for (auto& entry : defaultStreams_) {
-    DeviceScope scope(entry.first);
+    for (auto& entry : defaultStreams_) {
+        DeviceScope scope(entry.first);
 
-    // We created these streams, so are responsible for destroying them
-    CUDA_VERIFY(cudaStreamDestroy(entry.second));
-  }
+        // We created these streams, so are responsible for destroying them
+        CUDA_VERIFY(cudaStreamDestroy(entry.second));
+    }
 
-  for (auto& entry : alternateStreams_) {
-    DeviceScope scope(entry.first);
+    for (auto& entry : alternateStreams_) {
+        DeviceScope scope(entry.first);
 
-    for (auto stream : entry.second) {
-      CUDA_VERIFY(cudaStreamDestroy(stream));
+        for (auto stream : entry.second) {
+            CUDA_VERIFY(cudaStreamDestroy(stream));
+        }
     }
-  }
 
-  for (auto& entry : asyncCopyStreams_) {
-    DeviceScope scope(entry.first);
+    for (auto& entry : asyncCopyStreams_) {
+        DeviceScope scope(entry.first);
 
-    CUDA_VERIFY(cudaStreamDestroy(entry.second));
-  }
+        CUDA_VERIFY(cudaStreamDestroy(entry.second));
+    }
 
-  for (auto& entry : blasHandles_) {
-    DeviceScope scope(entry.first);
+    for (auto& entry : blasHandles_) {
+        DeviceScope scope(entry.first);
 
-    auto blasStatus = cublasDestroy(entry.second);
-    FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
-  }
+        auto blasStatus = cublasDestroy(entry.second);
+        FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+    }
 
-  if (pinnedMemAlloc_) {
-    auto err = cudaFreeHost(pinnedMemAlloc_);
-    FAISS_ASSERT_FMT(err == cudaSuccess,
-                     "Failed to cudaFreeHost pointer %p (error %d %s)",
-                     pinnedMemAlloc_, (int) err, cudaGetErrorString(err));
-  }
+    if (pinnedMemAlloc_) {
+        auto err = cudaFreeHost(pinnedMemAlloc_);
+        FAISS_ASSERT_FMT(
+                err == cudaSuccess,
+                "Failed to cudaFreeHost pointer %p (error %d %s)",
+                pinnedMemAlloc_,
+                (int)err,
+                cudaGetErrorString(err));
+    }
 }
 
-size_t
-StandardGpuResourcesImpl::getDefaultTempMemForGPU(int device,
-                                                  size_t requested) {
-  auto totalMem = device != -1 ?
-    getDeviceProperties(device).totalGlobalMem :
-    std::numeric_limits<size_t>::max();
+size_t StandardGpuResourcesImpl::getDefaultTempMemForGPU(
+        int device,
+        size_t requested) {
+    auto totalMem = device != -1 ? getDeviceProperties(device).totalGlobalMem
+                                 : std::numeric_limits<size_t>::max();
 
-  if (totalMem <= (size_t) 4 * 1024 * 1024 * 1024) {
-    // If the GPU has <= 4 GiB of memory, reserve 512 MiB
+    if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) {
+        // If the GPU has <= 4 GiB of memory, reserve 512 MiB
 
-    if (requested > k4GiBTempMem) {
-      return k4GiBTempMem;
-    }
-  } else if (totalMem <= (size_t) 8 * 1024 * 1024 * 1024) {
-    // If the GPU has <= 8 GiB of memory, reserve 1 GiB
+        if (requested > k4GiBTempMem) {
+            return k4GiBTempMem;
+        }
+    } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) {
+        // If the GPU has <= 8 GiB of memory, reserve 1 GiB
 
-    if (requested > k8GiBTempMem) {
-      return k8GiBTempMem;
-    }
-  } else {
-    // Never use more than 1.5 GiB
-    if (requested > kMaxTempMem) {
-      return kMaxTempMem;
+        if (requested > k8GiBTempMem) {
+            return k8GiBTempMem;
+        }
+    } else {
+        // Never use more than 1.5 GiB
+        if (requested > kMaxTempMem) {
+            return kMaxTempMem;
+        }
     }
-  }
-
-  // use whatever lower limit the user requested
-  return requested;
-}
-
-void
-StandardGpuResourcesImpl::noTempMemory() {
-  setTempMemory(0);
-}
-
-void
-StandardGpuResourcesImpl::setTempMemory(size_t size) {
-  if (tempMemSize_ != size) {
-    // adjust based on general limits
-    tempMemSize_ = getDefaultTempMemForGPU(-1, size);
-
-    // We need to re-initialize memory resources for all current devices that
-    // have been initialized.
-    // This should be safe to do, even if we are currently running work, because
-    // the cudaFree call that this implies will force-synchronize all GPUs with
-    // the CPU
-    for (auto& p : tempMemory_) {
-      int device = p.first;
-      // Free the existing memory first
-      p.second.reset();
-
-      // Allocate new
-      p.second = std::unique_ptr<StackDeviceMemory>(
-        new StackDeviceMemory(this,
-                              p.first,
-                              // adjust for this specific device
-                              getDefaultTempMemForGPU(device, tempMemSize_)));
+
+    // use whatever lower limit the user requested
+    return requested;
+}
+
+void StandardGpuResourcesImpl::noTempMemory() {
+    setTempMemory(0);
+}
+
+void StandardGpuResourcesImpl::setTempMemory(size_t size) {
+    if (tempMemSize_ != size) {
+        // adjust based on general limits
+        tempMemSize_ = getDefaultTempMemForGPU(-1, size);
+
+        // We need to re-initialize memory resources for all current devices
+        // that have been initialized. This should be safe to do, even if we are
+        // currently running work, because the cudaFree call that this implies
+        // will force-synchronize all GPUs with the CPU
+        for (auto& p : tempMemory_) {
+            int device = p.first;
+            // Free the existing memory first
+            p.second.reset();
+
+            // Allocate new
+            p.second = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
+                    this,
+                    p.first,
+                    // adjust for this specific device
+                    getDefaultTempMemForGPU(device, tempMemSize_)));
+        }
     }
-  }
 }
 
-void
-StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
-  // Should not call this after devices have been initialized
-  FAISS_ASSERT(defaultStreams_.size() == 0);
-  FAISS_ASSERT(!pinnedMemAlloc_);
+void StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
+    // Should not call this after devices have been initialized
+    FAISS_ASSERT(defaultStreams_.size() == 0);
+    FAISS_ASSERT(!pinnedMemAlloc_);
 
-  pinnedMemSize_ = size;
+    pinnedMemSize_ = size;
 }
 
-void
-StandardGpuResourcesImpl::setDefaultStream(int device, cudaStream_t stream) {
-  if (isInitialized(device)) {
-    // A new series of calls may not be ordered with what was the previous
-    // stream, so if the stream being specified is different, then we need to
-    // ensure ordering between the two (new stream waits on old).
-    auto it = userDefaultStreams_.find(device);
-    cudaStream_t prevStream = nullptr;
+void StandardGpuResourcesImpl::setDefaultStream(
+        int device,
+        cudaStream_t stream) {
+    if (isInitialized(device)) {
+        // A new series of calls may not be ordered with what was the previous
+        // stream, so if the stream being specified is different, then we need
+        // to ensure ordering between the two (new stream waits on old).
+        auto it = userDefaultStreams_.find(device);
+        cudaStream_t prevStream = nullptr;
 
-    if (it != userDefaultStreams_.end()) {
-      prevStream = it->second;
-    } else {
-      FAISS_ASSERT(defaultStreams_.count(device));
-      prevStream = defaultStreams_[device];
-    }
+        if (it != userDefaultStreams_.end()) {
+            prevStream = it->second;
+        } else {
+            FAISS_ASSERT(defaultStreams_.count(device));
+            prevStream = defaultStreams_[device];
+        }
 
-    if (prevStream != stream) {
-      streamWait({stream}, {prevStream});
+        if (prevStream != stream) {
+            streamWait({stream}, {prevStream});
+        }
     }
-  }
 
-  userDefaultStreams_[device] = stream;
+    userDefaultStreams_[device] = stream;
 }
 
-void
-StandardGpuResourcesImpl::revertDefaultStream(int device) {
-  if (isInitialized(device)) {
-    auto it = userDefaultStreams_.find(device);
+void StandardGpuResourcesImpl::revertDefaultStream(int device) {
+    if (isInitialized(device)) {
+        auto it = userDefaultStreams_.find(device);
 
-    if (it != userDefaultStreams_.end()) {
-      // There was a user stream set that we need to synchronize against
-      cudaStream_t prevStream = userDefaultStreams_[device];
+        if (it != userDefaultStreams_.end()) {
+            // There was a user stream set that we need to synchronize against
+            cudaStream_t prevStream = userDefaultStreams_[device];
 
-      FAISS_ASSERT(defaultStreams_.count(device));
-      cudaStream_t newStream = defaultStreams_[device];
+            FAISS_ASSERT(defaultStreams_.count(device));
+            cudaStream_t newStream = defaultStreams_[device];
 
-      streamWait({newStream}, {prevStream});
+            streamWait({newStream}, {prevStream});
+        }
     }
-  }
 
-  userDefaultStreams_.erase(device);
+    userDefaultStreams_.erase(device);
 }
 
-void
-StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
-  for (int dev = 0; dev < getNumDevices(); ++dev) {
-    setDefaultStream(dev, nullptr);
-  }
+void StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
+    for (int dev = 0; dev < getNumDevices(); ++dev) {
+        setDefaultStream(dev, nullptr);
+    }
 }
 
-void
-StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
-  allocLogging_ = enable;
+void StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
+    allocLogging_ = enable;
 }
 
-bool
-StandardGpuResourcesImpl::isInitialized(int device) const {
-  // Use default streams as a marker for whether or not a certain
-  // device has been initialized
-  return defaultStreams_.count(device) != 0;
+bool StandardGpuResourcesImpl::isInitialized(int device) const {
+    // Use default streams as a marker for whether or not a certain
+    // device has been initialized
+    return defaultStreams_.count(device) != 0;
 }
 
-void
-StandardGpuResourcesImpl::initializeForDevice(int device) {
-  if (isInitialized(device)) {
-    return;
-  }
-
-  // If this is the first device that we're initializing, create our
-  // pinned memory allocation
-  if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
-    auto err =
-      cudaHostAlloc(&pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
+void StandardGpuResourcesImpl::initializeForDevice(int device) {
+    if (isInitialized(device)) {
+        return;
+    }
 
-    FAISS_THROW_IF_NOT_FMT(
-      err == cudaSuccess,
-      "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
-      "async copy buffer (error %d %s)",
-      pinnedMemSize_, (int) err, cudaGetErrorString(err));
+    // If this is the first device that we're initializing, create our
+    // pinned memory allocation
+    if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
+        auto err = cudaHostAlloc(
+                &pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
+
+        FAISS_THROW_IF_NOT_FMT(
+                err == cudaSuccess,
+                "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
+                "async copy buffer (error %d %s)",
+                pinnedMemSize_,
+                (int)err,
+                cudaGetErrorString(err));
+
+        pinnedMemAllocSize_ = pinnedMemSize_;
+    }
 
-    pinnedMemAllocSize_ = pinnedMemSize_;
-  }
+    FAISS_ASSERT(device < getNumDevices());
+    DeviceScope scope(device);
 
-  FAISS_ASSERT(device < getNumDevices());
-  DeviceScope scope(device);
+    // Make sure that device properties for all devices are cached
+    auto& prop = getDeviceProperties(device);
 
-  // Make sure that device properties for all devices are cached
-  auto& prop = getDeviceProperties(device);
+    // Also check to make sure we meet our minimum compute capability (3.0)
+    FAISS_ASSERT_FMT(
+            prop.major >= 3,
+            "Device id %d with CC %d.%d not supported, "
+            "need 3.0+ compute capability",
+            device,
+            prop.major,
+            prop.minor);
 
-  // Also check to make sure we meet our minimum compute capability (3.0)
-  FAISS_ASSERT_FMT(prop.major >= 3,
-                   "Device id %d with CC %d.%d not supported, "
-                   "need 3.0+ compute capability",
-                   device, prop.major, prop.minor);
+    // Our code is pre-built with and expects warpSize == 32, validate that
+    FAISS_ASSERT_FMT(
+            prop.warpSize == 32,
+            "Device id %d does not have expected warpSize of 32",
+            device);
 
-  // Create streams
-  cudaStream_t defaultStream = 0;
-  CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
-                                        cudaStreamNonBlocking));
+    // Create streams
+    cudaStream_t defaultStream = 0;
+    CUDA_VERIFY(
+            cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));
 
-  defaultStreams_[device] = defaultStream;
+    defaultStreams_[device] = defaultStream;
 
-  cudaStream_t asyncCopyStream = 0;
-  CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
-                                        cudaStreamNonBlocking));
+    cudaStream_t asyncCopyStream = 0;
+    CUDA_VERIFY(
+            cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));
 
-  asyncCopyStreams_[device] = asyncCopyStream;
+    asyncCopyStreams_[device] = asyncCopyStream;
 
-  std::vector<cudaStream_t> deviceStreams;
-  for (int j = 0; j < kNumStreams; ++j) {
-    cudaStream_t stream = 0;
-    CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
-                                          cudaStreamNonBlocking));
+    std::vector<cudaStream_t> deviceStreams;
+    for (int j = 0; j < kNumStreams; ++j) {
+        cudaStream_t stream = 0;
+        CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
-    deviceStreams.push_back(stream);
-  }
+        deviceStreams.push_back(stream);
+    }
 
-  alternateStreams_[device] = std::move(deviceStreams);
+    alternateStreams_[device] = std::move(deviceStreams);
 
-  // Create cuBLAS handle
-  cublasHandle_t blasHandle = 0;
-  auto blasStatus = cublasCreate(&blasHandle);
-  FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
-  blasHandles_[device] = blasHandle;
+    // Create cuBLAS handle
+    cublasHandle_t blasHandle = 0;
+    auto blasStatus = cublasCreate(&blasHandle);
+    FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+    blasHandles_[device] = blasHandle;
 
-  // For CUDA 10 on V100, enabling tensor core usage would enable automatic
-  // rounding down of inputs to f16 (though accumulate in f32) which results in
-  // unacceptable loss of precision in general.
-  // For CUDA 11 / A100, only enable tensor core support if it doesn't result in
-  // a loss of precision.
+    // For CUDA 10 on V100, enabling tensor core usage would enable automatic
+    // rounding down of inputs to f16 (though accumulate in f32) which results
+    // in unacceptable loss of precision in general. For CUDA 11 / A100, only
+    // enable tensor core support if it doesn't result in a loss of precision.
 #if CUDA_VERSION >= 11000
-  cublasSetMathMode(blasHandle,
-                    CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+    cublasSetMathMode(
+            blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
 #endif
 
-  FAISS_ASSERT(allocs_.count(device) == 0);
-  allocs_[device] = std::unordered_map<void*, AllocRequest>();
+    FAISS_ASSERT(allocs_.count(device) == 0);
+    allocs_[device] = std::unordered_map<void*, AllocRequest>();
 
-  FAISS_ASSERT(tempMemory_.count(device) == 0);
-  auto mem = std::unique_ptr<StackDeviceMemory>(
-    new StackDeviceMemory(this,
-                          device,
-                          // adjust for this specific device
-                          getDefaultTempMemForGPU(device, tempMemSize_)));
+    FAISS_ASSERT(tempMemory_.count(device) == 0);
+    auto mem = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
+            this,
+            device,
+            // adjust for this specific device
+            getDefaultTempMemForGPU(device, tempMemSize_)));
 
-  tempMemory_.emplace(device, std::move(mem));
+    tempMemory_.emplace(device, std::move(mem));
 }
 
-cublasHandle_t
-StandardGpuResourcesImpl::getBlasHandle(int device) {
-  initializeForDevice(device);
-  return blasHandles_[device];
+cublasHandle_t StandardGpuResourcesImpl::getBlasHandle(int device) {
+    initializeForDevice(device);
+    return blasHandles_[device];
 }
 
-cudaStream_t
-StandardGpuResourcesImpl::getDefaultStream(int device) {
-  initializeForDevice(device);
+cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
+    initializeForDevice(device);
 
-  auto it = userDefaultStreams_.find(device);
-  if (it != userDefaultStreams_.end()) {
-    // There is a user override stream set
-    return it->second;
-  }
+    auto it = userDefaultStreams_.find(device);
+    if (it != userDefaultStreams_.end()) {
+        // There is a user override stream set
+        return it->second;
+    }
 
-  // Otherwise, our base default stream
-  return defaultStreams_[device];
+    // Otherwise, our base default stream
+    return defaultStreams_[device];
 }
 
-std::vector<cudaStream_t>
-StandardGpuResourcesImpl::getAlternateStreams(int device) {
-  initializeForDevice(device);
-  return alternateStreams_[device];
+std::vector<cudaStream_t> StandardGpuResourcesImpl::getAlternateStreams(
+        int device) {
+    initializeForDevice(device);
+    return alternateStreams_[device];
 }
 
-std::pair<void*, size_t>
-StandardGpuResourcesImpl::getPinnedMemory() {
-  return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
+std::pair<void*, size_t> StandardGpuResourcesImpl::getPinnedMemory() {
+    return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
 }
 
-cudaStream_t
-StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
-  initializeForDevice(device);
-  return asyncCopyStreams_[device];
+cudaStream_t StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
+    initializeForDevice(device);
+    return asyncCopyStreams_[device];
 }
 
-void*
-StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
-  initializeForDevice(req.device);
-
-  // We don't allocate a placeholder for zero-sized allocations
-  if (req.size == 0) {
-    return nullptr;
-  }
-
-  // Make sure that the allocation is a multiple of 16 bytes for alignment
-  // purposes
-  auto adjReq = req;
-  adjReq.size = utils::roundUp(adjReq.size, (size_t) 16);
-
-  void* p = nullptr;
-
-  if (allocLogging_) {
-    std::cout << "StandardGpuResources: alloc " << adjReq.toString() << "\n";
-  }
-
-  if (adjReq.space == MemorySpace::Temporary) {
-    // If we don't have enough space in our temporary memory manager, we need
-    // to allocate this request separately
-    auto& tempMem = tempMemory_[adjReq.device];
-
-    if (adjReq.size > tempMem->getSizeAvailable()) {
-      // We need to allocate this ourselves
-      AllocRequest newReq = adjReq;
-      newReq.space = MemorySpace::Device;
-      newReq.type = AllocType::TemporaryMemoryOverflow;
+void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
+    initializeForDevice(req.device);
 
-      return allocMemory(newReq);
+    // We don't allocate a placeholder for zero-sized allocations
+    if (req.size == 0) {
+        return nullptr;
     }
 
-    // Otherwise, we can handle this locally
-    p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
-
-  } else if (adjReq.space == MemorySpace::Device) {
-    auto err = cudaMalloc(&p, adjReq.size);
-
-    // Throw if we fail to allocate
-    if (err != cudaSuccess) {
-      auto& map = allocs_[req.device];
-
-      std::stringstream ss;
-      ss << "Failed to cudaMalloc " << adjReq.size << " bytes "
-         << "on device " << adjReq.device << " (error "
-         << (int) err << " " << cudaGetErrorString(err)
-         << "\nOutstanding allocations:\n" << allocsToString(map);
-      auto str = ss.str();
-
-      FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+    // cudaMalloc guarantees allocation alignment to 256 bytes; do the same here
+    // for alignment purposes (to reduce memory transaction overhead etc)
+    auto adjReq = req;
+    adjReq.size = utils::roundUp(adjReq.size, (size_t)256);
+
+    void* p = nullptr;
+
+    if (adjReq.space == MemorySpace::Temporary) {
+        // If we don't have enough space in our temporary memory manager, we
+        // need to allocate this request separately
+        auto& tempMem = tempMemory_[adjReq.device];
+
+        if (adjReq.size > tempMem->getSizeAvailable()) {
+            // We need to allocate this ourselves
+            AllocRequest newReq = adjReq;
+            newReq.space = MemorySpace::Device;
+            newReq.type = AllocType::TemporaryMemoryOverflow;
+
+            if (allocLogging_) {
+                std::cout
+                        << "StandardGpuResources: alloc fail "
+                        << adjReq.toString()
+                        << " (no temp space); retrying as MemorySpace::Device\n";
+            }
+
+            return allocMemory(newReq);
+        }
+
+        // Otherwise, we can handle this locally
+        p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
+
+    } else if (adjReq.space == MemorySpace::Device) {
+        auto err = cudaMalloc(&p, adjReq.size);
+
+        // Throw if we fail to allocate
+        if (err != cudaSuccess) {
+            // FIXME: as of CUDA 11, a memory allocation error appears to be
+            // presented via cudaGetLastError as well, and needs to be cleared.
+            // Just call the function to clear it
+            cudaGetLastError();
+
+            std::stringstream ss;
+            ss << "StandardGpuResources: alloc fail " << adjReq.toString()
+               << " (cudaMalloc error " << cudaGetErrorString(err) << " ["
+               << (int)err << "])\n";
+            auto str = ss.str();
+
+            if (allocLogging_) {
+                std::cout << str;
+            }
+
+            FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+        }
+    } else if (adjReq.space == MemorySpace::Unified) {
+        auto err = cudaMallocManaged(&p, adjReq.size);
+
+        if (err != cudaSuccess) {
+            // FIXME: as of CUDA 11, a memory allocation error appears to be
+            // presented via cudaGetLastError as well, and needs to be cleared.
+            // Just call the function to clear it
+            cudaGetLastError();
+
+            std::stringstream ss;
+            ss << "StandardGpuResources: alloc fail " << adjReq.toString()
+               << " failed (cudaMallocManaged error " << cudaGetErrorString(err)
+               << " [" << (int)err << "])\n";
+            auto str = ss.str();
+
+            if (allocLogging_) {
+                std::cout << str;
+            }
+
+            FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+        }
+    } else {
+        FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space);
     }
-  } else if (adjReq.space == MemorySpace::Unified) {
-    auto err = cudaMallocManaged(&p, adjReq.size);
-
-    if (err != cudaSuccess) {
-      auto& map = allocs_[req.device];
 
-      std::stringstream ss;
-      ss << "Failed to cudaMallocManaged " << adjReq.size << " bytes "
-         << "(error " << (int) err << " " << cudaGetErrorString(err)
-         << "\nOutstanding allocations:\n" << allocsToString(map);
-      auto str = ss.str();
-
-      FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+    if (allocLogging_) {
+        std::cout << "StandardGpuResources: alloc ok " << adjReq.toString()
+                  << " ptr 0x" << p << "\n";
     }
-  } else {
-    FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) adjReq.space);
-  }
 
-  allocs_[adjReq.device][p] = adjReq;
+    allocs_[adjReq.device][p] = adjReq;
 
-  return p;
+    return p;
 }
 
-void
-StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
-  FAISS_ASSERT(isInitialized(device));
+void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
+    FAISS_ASSERT(isInitialized(device));
 
-  if (!p) {
-    return;
-  }
+    if (!p) {
+        return;
+    }
 
-  auto& a = allocs_[device];
-  auto it = a.find(p);
-  FAISS_ASSERT(it != a.end());
+    auto& a = allocs_[device];
+    auto it = a.find(p);
+    FAISS_ASSERT(it != a.end());
 
-  auto& req = it->second;
+    auto& req = it->second;
 
-  if (allocLogging_) {
-    std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
-  }
+    if (allocLogging_) {
+        std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
+    }
 
-  if (req.space == MemorySpace::Temporary) {
-    tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
+    if (req.space == MemorySpace::Temporary) {
+        tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
 
-  } else if (req.space == MemorySpace::Device ||
-             req.space == MemorySpace::Unified) {
-    auto err = cudaFree(p);
-    FAISS_ASSERT_FMT(err == cudaSuccess,
-                     "Failed to cudaFree pointer %p (error %d %s)",
-                     p, (int) err, cudaGetErrorString(err));
+    } else if (
+            req.space == MemorySpace::Device ||
+            req.space == MemorySpace::Unified) {
+        auto err = cudaFree(p);
+        FAISS_ASSERT_FMT(
+                err == cudaSuccess,
+                "Failed to cudaFree pointer %p (error %d %s)",
+                p,
+                (int)err,
+                cudaGetErrorString(err));
 
-  } else {
-    FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) req.space);
-  }
+    } else {
+        FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space);
+    }
 
-  a.erase(it);
+    a.erase(it);
 }
 
-size_t
-StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
-  FAISS_ASSERT(isInitialized(device));
+size_t StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
+    FAISS_ASSERT(isInitialized(device));
 
-  auto it = tempMemory_.find(device);
-  FAISS_ASSERT(it != tempMemory_.end());
+    auto it = tempMemory_.find(device);
+    FAISS_ASSERT(it != tempMemory_.end());
 
-  return it->second->getSizeAvailable();
+    return it->second->getSizeAvailable();
 }
 
 std::map<int, std::map<std::string, std::pair<int, size_t>>>
 StandardGpuResourcesImpl::getMemoryInfo() const {
-  using AT = std::map<std::string, std::pair<int, size_t>>;
+    using AT = std::map<std::string, std::pair<int, size_t>>;
 
-  std::map<int, AT> out;
+    std::map<int, AT> out;
 
-  for (auto& entry : allocs_) {
-    AT outDevice;
+    for (auto& entry : allocs_) {
+        AT outDevice;
 
-    for (auto& a : entry.second) {
-      auto& v = outDevice[allocTypeToString(a.second.type)];
-      v.first++;
-      v.second += a.second.size;
-    }
+        for (auto& a : entry.second) {
+            auto& v = outDevice[allocTypeToString(a.second.type)];
+            v.first++;
+            v.second += a.second.size;
+        }
 
-    out[entry.first] = std::move(outDevice);
-  }
+        out[entry.first] = std::move(outDevice);
+    }
 
-  return out;
+    return out;
 }
 
 //
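
The getMemoryInfo() method shown in the hunk above returns, per device id, a map from allocation-type name to a (count, total bytes) pair. For clarity, here is a minimal hypothetical caller; printMemoryInfo is not part of the library, only getMemoryInfo() is, and a CUDA-enabled build of the vendored sources is assumed:

#include <faiss/gpu/StandardGpuResources.h>

#include <iostream>

// Print outstanding GPU allocations grouped by type, per device.
// The result's shape mirrors the getMemoryInfo() definition above:
// device id -> (alloc type name -> (count, total bytes)).
void printMemoryInfo(faiss::gpu::StandardGpuResources& res) {
    for (auto& device : res.getMemoryInfo()) {
        std::cout << "Device " << device.first << ":\n";
        for (auto& type : device.second) {
            std::cout << "  " << type.first << ": " << type.second.first
                      << " allocations, " << type.second.second << " bytes\n";
        }
    }
}
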
@@ -536,70 +559,58 @@ StandardGpuResourcesImpl::getMemoryInfo() const {
 //
 
 StandardGpuResources::StandardGpuResources()
-  : res_(new StandardGpuResourcesImpl) {
-}
+        : res_(new StandardGpuResourcesImpl) {}
 
-StandardGpuResources::~StandardGpuResources() {
-}
+StandardGpuResources::~StandardGpuResources() {}
 
-std::shared_ptr<GpuResources>
-StandardGpuResources::getResources() {
-  return res_;
+std::shared_ptr<GpuResources> StandardGpuResources::getResources() {
+    return res_;
 }
 
-void
-StandardGpuResources::noTempMemory() {
-  res_->noTempMemory();
+void StandardGpuResources::noTempMemory() {
+    res_->noTempMemory();
 }
 
-void
-StandardGpuResources::setTempMemory(size_t size) {
-  res_->setTempMemory(size);
+void StandardGpuResources::setTempMemory(size_t size) {
+    res_->setTempMemory(size);
 }
 
-void
-StandardGpuResources::setPinnedMemory(size_t size) {
-  res_->setPinnedMemory(size);
+void StandardGpuResources::setPinnedMemory(size_t size) {
+    res_->setPinnedMemory(size);
 }
 
-void
-StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
-  res_->setDefaultStream(device, stream);
+void StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
+    res_->setDefaultStream(device, stream);
 }
 
-void
-StandardGpuResources::revertDefaultStream(int device) {
-  res_->revertDefaultStream(device);
+void StandardGpuResources::revertDefaultStream(int device) {
+    res_->revertDefaultStream(device);
 }
 
-void
-StandardGpuResources::setDefaultNullStreamAllDevices() {
-  res_->setDefaultNullStreamAllDevices();
+void StandardGpuResources::setDefaultNullStreamAllDevices() {
+    res_->setDefaultNullStreamAllDevices();
 }
 
 std::map<int, std::map<std::string, std::pair<int, size_t>>>
 StandardGpuResources::getMemoryInfo() const {
-  return res_->getMemoryInfo();
+    return res_->getMemoryInfo();
 }
 
-cudaStream_t
-StandardGpuResources::getDefaultStream(int device) {
-  return res_->getDefaultStream(device);
+cudaStream_t StandardGpuResources::getDefaultStream(int device) {
+    return res_->getDefaultStream(device);
 }
 
-size_t
-StandardGpuResources::getTempMemoryAvailable(int device) const {
-  return res_->getTempMemoryAvailable(device);
+size_t StandardGpuResources::getTempMemoryAvailable(int device) const {
+    return res_->getTempMemoryAvailable(device);
 }
 
-void
-StandardGpuResources::syncDefaultStreamCurrentDevice() {
-  res_->syncDefaultStreamCurrentDevice();
+void StandardGpuResources::syncDefaultStreamCurrentDevice() {
+    res_->syncDefaultStreamCurrentDevice();
 }
 
-void
-StandardGpuResources::setLogMemoryAllocations(bool enable) {
-  res_->setLogMemoryAllocations(enable);
+void StandardGpuResources::setLogMemoryAllocations(bool enable) {
+    res_->setLogMemoryAllocations(enable);
 }
 
-} } // namespace
+} // namespace gpu
+} // namespace faiss
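
StandardGpuResources is the public entry point for the machinery changed above: it owns the per-device streams, cuBLAS handles, pinned staging buffer, and the temporary-memory stack whose sizing and 256-byte allocation rounding this diff reworks. A minimal sketch of typical use, assuming a CUDA-enabled build; the 1 GiB cap, dimensions, and data are illustrative, not values prescribed by this release:

#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/StandardGpuResources.h>

#include <vector>

int main() {
    faiss::gpu::StandardGpuResources res;

    // Cap the temporary scratch arena at 1 GiB; getDefaultTempMemForGPU
    // (above) may clamp this further based on the device's total memory
    res.setTempMemory((size_t)1024 * 1024 * 1024);

    // Echo the per-allocation log lines emitted by allocMemory()/
    // deallocMemory() in the diff above
    res.setLogMemoryAllocations(true);

    int d = 64; // vector dimensionality (placeholder)
    faiss::gpu::GpuIndexFlatL2 index(&res, d);

    std::vector<float> xb(1000 * d, 0.5f); // placeholder data
    index.add(1000, xb.data());

    int k = 4;
    std::vector<float> distances(k);
    std::vector<faiss::Index::idx_t> labels(k);
    index.search(1, xb.data(), k, distances.data(), labels.data());
    return 0;
}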