faiss 0.1.7 → 0.2.3

Files changed (219)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +18 -0
  3. data/README.md +7 -7
  4. data/ext/faiss/ext.cpp +1 -1
  5. data/ext/faiss/extconf.rb +8 -2
  6. data/ext/faiss/index.cpp +102 -69
  7. data/ext/faiss/index_binary.cpp +24 -30
  8. data/ext/faiss/kmeans.cpp +20 -16
  9. data/ext/faiss/numo.hpp +867 -0
  10. data/ext/faiss/pca_matrix.cpp +13 -14
  11. data/ext/faiss/product_quantizer.cpp +23 -24
  12. data/ext/faiss/utils.cpp +10 -37
  13. data/ext/faiss/utils.h +2 -13
  14. data/lib/faiss/version.rb +1 -1
  15. data/lib/faiss.rb +0 -5
  16. data/vendor/faiss/faiss/AutoTune.cpp +292 -291
  17. data/vendor/faiss/faiss/AutoTune.h +55 -56
  18. data/vendor/faiss/faiss/Clustering.cpp +334 -195
  19. data/vendor/faiss/faiss/Clustering.h +88 -35
  20. data/vendor/faiss/faiss/IVFlib.cpp +171 -195
  21. data/vendor/faiss/faiss/IVFlib.h +48 -51
  22. data/vendor/faiss/faiss/Index.cpp +85 -103
  23. data/vendor/faiss/faiss/Index.h +54 -48
  24. data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
  25. data/vendor/faiss/faiss/Index2Layer.h +22 -22
  26. data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
  27. data/vendor/faiss/faiss/IndexBinary.h +140 -132
  28. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
  29. data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
  30. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
  31. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
  32. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
  33. data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
  34. data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
  35. data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
  36. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
  37. data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
  38. data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
  39. data/vendor/faiss/faiss/IndexFlat.h +35 -46
  40. data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
  41. data/vendor/faiss/faiss/IndexHNSW.h +57 -41
  42. data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
  43. data/vendor/faiss/faiss/IndexIVF.h +146 -113
  44. data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
  45. data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
  46. data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
  47. data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
  48. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
  49. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
  50. data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
  51. data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
  52. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
  53. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
  54. data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
  55. data/vendor/faiss/faiss/IndexLSH.h +21 -26
  56. data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
  57. data/vendor/faiss/faiss/IndexLattice.h +11 -16
  58. data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
  59. data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
  60. data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
  61. data/vendor/faiss/faiss/IndexNSG.h +85 -0
  62. data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
  63. data/vendor/faiss/faiss/IndexPQ.h +64 -67
  64. data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
  65. data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
  66. data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
  67. data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
  68. data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
  69. data/vendor/faiss/faiss/IndexRefine.h +22 -23
  70. data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
  71. data/vendor/faiss/faiss/IndexReplicas.h +62 -56
  72. data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
  73. data/vendor/faiss/faiss/IndexResidual.h +152 -0
  74. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
  75. data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
  76. data/vendor/faiss/faiss/IndexShards.cpp +256 -240
  77. data/vendor/faiss/faiss/IndexShards.h +85 -73
  78. data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
  79. data/vendor/faiss/faiss/MatrixStats.h +7 -10
  80. data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
  81. data/vendor/faiss/faiss/MetaIndexes.h +40 -34
  82. data/vendor/faiss/faiss/MetricType.h +7 -7
  83. data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
  84. data/vendor/faiss/faiss/VectorTransform.h +61 -89
  85. data/vendor/faiss/faiss/clone_index.cpp +77 -73
  86. data/vendor/faiss/faiss/clone_index.h +4 -9
  87. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
  88. data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
  89. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
  90. data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
  91. data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
  92. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
  93. data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
  94. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
  95. data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
  96. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
  97. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
  98. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
  99. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
  100. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
  101. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
  102. data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
  103. data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
  104. data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
  105. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
  106. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
  107. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
  108. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
  109. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
  110. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
  111. data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
  112. data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
  113. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
  114. data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
  115. data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
  116. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
  117. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
  118. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
  119. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
  120. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
  121. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
  122. data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
  123. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
  124. data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
  125. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
  126. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
  127. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
  128. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
  129. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
  130. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
  131. data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
  132. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
  133. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
  134. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
  135. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
  136. data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
  137. data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
  138. data/vendor/faiss/faiss/impl/FaissException.h +41 -29
  139. data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
  140. data/vendor/faiss/faiss/impl/HNSW.h +179 -200
  141. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
  142. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
  143. data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
  144. data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
  145. data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
  146. data/vendor/faiss/faiss/impl/NSG.h +199 -0
  147. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
  148. data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
  149. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
  150. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
  151. data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
  152. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
  153. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
  154. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
  155. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
  156. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
  157. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
  158. data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
  159. data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
  160. data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
  161. data/vendor/faiss/faiss/impl/io.cpp +75 -94
  162. data/vendor/faiss/faiss/impl/io.h +31 -41
  163. data/vendor/faiss/faiss/impl/io_macros.h +40 -29
  164. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
  165. data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
  166. data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
  167. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
  168. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
  169. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
  170. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
  171. data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
  172. data/vendor/faiss/faiss/index_factory.cpp +269 -218
  173. data/vendor/faiss/faiss/index_factory.h +6 -7
  174. data/vendor/faiss/faiss/index_io.h +23 -26
  175. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
  176. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
  177. data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
  178. data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
  179. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
  180. data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
  181. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
  182. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
  183. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
  184. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
  185. data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
  186. data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
  187. data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
  188. data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
  189. data/vendor/faiss/faiss/utils/Heap.h +186 -209
  190. data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
  191. data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
  192. data/vendor/faiss/faiss/utils/distances.cpp +301 -310
  193. data/vendor/faiss/faiss/utils/distances.h +133 -118
  194. data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
  195. data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
  196. data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
  197. data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
  198. data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
  199. data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
  200. data/vendor/faiss/faiss/utils/hamming.h +62 -85
  201. data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
  202. data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
  203. data/vendor/faiss/faiss/utils/partitioning.h +26 -21
  204. data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
  205. data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
  206. data/vendor/faiss/faiss/utils/random.cpp +39 -63
  207. data/vendor/faiss/faiss/utils/random.h +13 -16
  208. data/vendor/faiss/faiss/utils/simdlib.h +4 -2
  209. data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
  210. data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
  211. data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
  212. data/vendor/faiss/faiss/utils/utils.cpp +304 -287
  213. data/vendor/faiss/faiss/utils/utils.h +53 -48
  214. metadata +26 -12
  215. data/lib/faiss/index.rb +0 -20
  216. data/lib/faiss/index_binary.rb +0 -20
  217. data/lib/faiss/kmeans.rb +0 -15
  218. data/lib/faiss/pca_matrix.rb +0 -15
  219. data/lib/faiss/product_quantizer.rb +0 -22
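The diff below is from data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp (entry 105 in the list above). Most of it is the clang-format re-indentation applied across the vendored FAISS sources between these releases; the behavioral changes visible here are: allocation requests are now rounded up to 256 bytes (matching cudaMalloc's alignment guarantee) rather than 16, allocation failures clear the sticky CUDA error via cudaGetLastError() and report through the allocLogging_ path, and device initialization gains a warpSize == 32 assertion. A usage sketch of this API follows the diff.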
@@ -5,16 +5,16 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-
 #include <faiss/gpu/StandardGpuResources.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/gpu/utils/StaticUtils.h>
 #include <faiss/impl/FaissAssert.h>
-#include <limits>
 #include <iostream>
+#include <limits>
 #include <sstream>
 
-namespace faiss { namespace gpu {
+namespace faiss {
+namespace gpu {
 
 namespace {
 
@@ -22,513 +22,536 @@ namespace {
 constexpr int kNumStreams = 2;
 
 // Use 256 MiB of pinned memory for async CPU <-> GPU copies by default
-constexpr size_t kDefaultPinnedMemoryAllocation = (size_t) 256 * 1024 * 1024;
+constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024;
 
 // Default temporary memory allocation for <= 4 GiB memory GPUs
-constexpr size_t k4GiBTempMem = (size_t) 512 * 1024 * 1024;
+constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024;
 
 // Default temporary memory allocation for <= 8 GiB memory GPUs
-constexpr size_t k8GiBTempMem = (size_t) 1024 * 1024 * 1024;
+constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024;
 
 // Maximum temporary memory allocation for all GPUs
-constexpr size_t kMaxTempMem = (size_t) 1536 * 1024 * 1024;
+constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024;
 
 std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
-  // Produce a sorted list of all outstanding allocations by type
-  std::unordered_map<AllocType, std::pair<int, size_t>> stats;
-
-  for (auto& entry : map) {
-    auto& a = entry.second;
-
-    auto it = stats.find(a.type);
-    if (it != stats.end()) {
-      stats[a.type].first++;
-      stats[a.type].second += a.size;
-    } else {
-      stats[a.type] = std::make_pair(1, a.size);
+    // Produce a sorted list of all outstanding allocations by type
+    std::unordered_map<AllocType, std::pair<int, size_t>> stats;
+
+    for (auto& entry : map) {
+        auto& a = entry.second;
+
+        auto it = stats.find(a.type);
+        if (it != stats.end()) {
+            stats[a.type].first++;
+            stats[a.type].second += a.size;
+        } else {
+            stats[a.type] = std::make_pair(1, a.size);
+        }
     }
-  }
 
-  std::stringstream ss;
-  for (auto& entry : stats) {
-    ss << "Alloc type " << allocTypeToString(entry.first) << ": "
-       << entry.second.first << " allocations, "
-       << entry.second.second << " bytes\n";
-  }
+    std::stringstream ss;
+    for (auto& entry : stats) {
+        ss << "Alloc type " << allocTypeToString(entry.first) << ": "
+           << entry.second.first << " allocations, " << entry.second.second
+           << " bytes\n";
+    }
 
-  return ss.str();
+    return ss.str();
 }
 
-}
+} // namespace
 
 //
 // StandardGpuResourcesImpl
 //
 
-StandardGpuResourcesImpl::StandardGpuResourcesImpl() :
-    pinnedMemAlloc_(nullptr),
-    pinnedMemAllocSize_(0),
-    // let the adjustment function determine the memory size for us by passing
-    // in a huge value that will then be adjusted
-    tempMemSize_(getDefaultTempMemForGPU(-1,
-                                         std::numeric_limits<size_t>::max())),
-    pinnedMemSize_(kDefaultPinnedMemoryAllocation),
-    allocLogging_(false) {
-}
+StandardGpuResourcesImpl::StandardGpuResourcesImpl()
+        : pinnedMemAlloc_(nullptr),
+          pinnedMemAllocSize_(0),
+          // let the adjustment function determine the memory size for us by
+          // passing in a huge value that will then be adjusted
+          tempMemSize_(getDefaultTempMemForGPU(
+                  -1,
+                  std::numeric_limits<size_t>::max())),
+          pinnedMemSize_(kDefaultPinnedMemoryAllocation),
+          allocLogging_(false) {}
 
 StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
-  // The temporary memory allocator has allocated memory through us, so clean
-  // that up before we finish fully de-initializing ourselves
-  tempMemory_.clear();
-
-  // Make sure all allocations have been freed
-  bool allocError = false;
-
-  for (auto& entry : allocs_) {
-    auto& map = entry.second;
-
-    if (!map.empty()) {
-      std::cerr
-        << "StandardGpuResources destroyed with allocations outstanding:\n"
-        << "Device " << entry.first << " outstanding allocations:\n";
-      std::cerr << allocsToString(map);
-      allocError = true;
+    // The temporary memory allocator has allocated memory through us, so clean
+    // that up before we finish fully de-initializing ourselves
+    tempMemory_.clear();
+
+    // Make sure all allocations have been freed
+    bool allocError = false;
+
+    for (auto& entry : allocs_) {
+        auto& map = entry.second;
+
+        if (!map.empty()) {
+            std::cerr
+                    << "StandardGpuResources destroyed with allocations outstanding:\n"
+                    << "Device " << entry.first
+                    << " outstanding allocations:\n";
+            std::cerr << allocsToString(map);
+            allocError = true;
+        }
     }
-  }
 
-  FAISS_ASSERT_MSG(!allocError, "GPU memory allocations not properly cleaned up");
+    FAISS_ASSERT_MSG(
+            !allocError, "GPU memory allocations not properly cleaned up");
 
-  for (auto& entry : defaultStreams_) {
-    DeviceScope scope(entry.first);
+    for (auto& entry : defaultStreams_) {
+        DeviceScope scope(entry.first);
 
-    // We created these streams, so are responsible for destroying them
-    CUDA_VERIFY(cudaStreamDestroy(entry.second));
-  }
+        // We created these streams, so are responsible for destroying them
+        CUDA_VERIFY(cudaStreamDestroy(entry.second));
+    }
 
-  for (auto& entry : alternateStreams_) {
-    DeviceScope scope(entry.first);
+    for (auto& entry : alternateStreams_) {
+        DeviceScope scope(entry.first);
 
-    for (auto stream : entry.second) {
-      CUDA_VERIFY(cudaStreamDestroy(stream));
+        for (auto stream : entry.second) {
+            CUDA_VERIFY(cudaStreamDestroy(stream));
+        }
     }
-  }
 
-  for (auto& entry : asyncCopyStreams_) {
-    DeviceScope scope(entry.first);
+    for (auto& entry : asyncCopyStreams_) {
+        DeviceScope scope(entry.first);
 
-    CUDA_VERIFY(cudaStreamDestroy(entry.second));
-  }
+        CUDA_VERIFY(cudaStreamDestroy(entry.second));
+    }
 
-  for (auto& entry : blasHandles_) {
-    DeviceScope scope(entry.first);
+    for (auto& entry : blasHandles_) {
+        DeviceScope scope(entry.first);
 
-    auto blasStatus = cublasDestroy(entry.second);
-    FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
-  }
+        auto blasStatus = cublasDestroy(entry.second);
+        FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+    }
 
-  if (pinnedMemAlloc_) {
-    auto err = cudaFreeHost(pinnedMemAlloc_);
-    FAISS_ASSERT_FMT(err == cudaSuccess,
-                     "Failed to cudaFreeHost pointer %p (error %d %s)",
-                     pinnedMemAlloc_, (int) err, cudaGetErrorString(err));
-  }
+    if (pinnedMemAlloc_) {
+        auto err = cudaFreeHost(pinnedMemAlloc_);
+        FAISS_ASSERT_FMT(
+                err == cudaSuccess,
+                "Failed to cudaFreeHost pointer %p (error %d %s)",
+                pinnedMemAlloc_,
+                (int)err,
+                cudaGetErrorString(err));
+    }
 }
 
-size_t
-StandardGpuResourcesImpl::getDefaultTempMemForGPU(int device,
-                                                  size_t requested) {
-  auto totalMem = device != -1 ?
-    getDeviceProperties(device).totalGlobalMem :
-    std::numeric_limits<size_t>::max();
+size_t StandardGpuResourcesImpl::getDefaultTempMemForGPU(
+        int device,
+        size_t requested) {
+    auto totalMem = device != -1 ? getDeviceProperties(device).totalGlobalMem
+                                 : std::numeric_limits<size_t>::max();
 
-  if (totalMem <= (size_t) 4 * 1024 * 1024 * 1024) {
-    // If the GPU has <= 4 GiB of memory, reserve 512 MiB
+    if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) {
+        // If the GPU has <= 4 GiB of memory, reserve 512 MiB
 
-    if (requested > k4GiBTempMem) {
-      return k4GiBTempMem;
-    }
-  } else if (totalMem <= (size_t) 8 * 1024 * 1024 * 1024) {
-    // If the GPU has <= 8 GiB of memory, reserve 1 GiB
+        if (requested > k4GiBTempMem) {
+            return k4GiBTempMem;
+        }
+    } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) {
+        // If the GPU has <= 8 GiB of memory, reserve 1 GiB
 
-    if (requested > k8GiBTempMem) {
-      return k8GiBTempMem;
-    }
-  } else {
-    // Never use more than 1.5 GiB
-    if (requested > kMaxTempMem) {
-      return kMaxTempMem;
+        if (requested > k8GiBTempMem) {
+            return k8GiBTempMem;
+        }
+    } else {
+        // Never use more than 1.5 GiB
+        if (requested > kMaxTempMem) {
+            return kMaxTempMem;
+        }
     }
-  }
-
-  // use whatever lower limit the user requested
-  return requested;
-}
-
-void
-StandardGpuResourcesImpl::noTempMemory() {
-  setTempMemory(0);
-}
-
-void
-StandardGpuResourcesImpl::setTempMemory(size_t size) {
-  if (tempMemSize_ != size) {
-    // adjust based on general limits
-    tempMemSize_ = getDefaultTempMemForGPU(-1, size);
-
-    // We need to re-initialize memory resources for all current devices that
-    // have been initialized.
-    // This should be safe to do, even if we are currently running work, because
-    // the cudaFree call that this implies will force-synchronize all GPUs with
-    // the CPU
-    for (auto& p : tempMemory_) {
-      int device = p.first;
-      // Free the existing memory first
-      p.second.reset();
-
-      // Allocate new
-      p.second = std::unique_ptr<StackDeviceMemory>(
-        new StackDeviceMemory(this,
-                              p.first,
-                              // adjust for this specific device
-                              getDefaultTempMemForGPU(device, tempMemSize_)));
+
+    // use whatever lower limit the user requested
+    return requested;
+}
+
+void StandardGpuResourcesImpl::noTempMemory() {
+    setTempMemory(0);
+}
+
+void StandardGpuResourcesImpl::setTempMemory(size_t size) {
+    if (tempMemSize_ != size) {
+        // adjust based on general limits
+        tempMemSize_ = getDefaultTempMemForGPU(-1, size);
+
+        // We need to re-initialize memory resources for all current devices
+        // that have been initialized. This should be safe to do, even if we are
+        // currently running work, because the cudaFree call that this implies
+        // will force-synchronize all GPUs with the CPU
+        for (auto& p : tempMemory_) {
+            int device = p.first;
+            // Free the existing memory first
+            p.second.reset();
+
+            // Allocate new
+            p.second = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
+                    this,
+                    p.first,
+                    // adjust for this specific device
+                    getDefaultTempMemForGPU(device, tempMemSize_)));
+        }
     }
-  }
 }
 
-void
-StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
-  // Should not call this after devices have been initialized
-  FAISS_ASSERT(defaultStreams_.size() == 0);
-  FAISS_ASSERT(!pinnedMemAlloc_);
+void StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
+    // Should not call this after devices have been initialized
+    FAISS_ASSERT(defaultStreams_.size() == 0);
+    FAISS_ASSERT(!pinnedMemAlloc_);
 
-  pinnedMemSize_ = size;
+    pinnedMemSize_ = size;
 }
 
-void
-StandardGpuResourcesImpl::setDefaultStream(int device, cudaStream_t stream) {
-  if (isInitialized(device)) {
-    // A new series of calls may not be ordered with what was the previous
-    // stream, so if the stream being specified is different, then we need to
-    // ensure ordering between the two (new stream waits on old).
-    auto it = userDefaultStreams_.find(device);
-    cudaStream_t prevStream = nullptr;
+void StandardGpuResourcesImpl::setDefaultStream(
+        int device,
+        cudaStream_t stream) {
+    if (isInitialized(device)) {
+        // A new series of calls may not be ordered with what was the previous
+        // stream, so if the stream being specified is different, then we need
+        // to ensure ordering between the two (new stream waits on old).
+        auto it = userDefaultStreams_.find(device);
+        cudaStream_t prevStream = nullptr;
 
-    if (it != userDefaultStreams_.end()) {
-      prevStream = it->second;
-    } else {
-      FAISS_ASSERT(defaultStreams_.count(device));
-      prevStream = defaultStreams_[device];
-    }
+        if (it != userDefaultStreams_.end()) {
+            prevStream = it->second;
+        } else {
+            FAISS_ASSERT(defaultStreams_.count(device));
+            prevStream = defaultStreams_[device];
+        }
 
-    if (prevStream != stream) {
-      streamWait({stream}, {prevStream});
+        if (prevStream != stream) {
+            streamWait({stream}, {prevStream});
+        }
     }
-  }
 
-  userDefaultStreams_[device] = stream;
+    userDefaultStreams_[device] = stream;
 }
 
-void
-StandardGpuResourcesImpl::revertDefaultStream(int device) {
-  if (isInitialized(device)) {
-    auto it = userDefaultStreams_.find(device);
+void StandardGpuResourcesImpl::revertDefaultStream(int device) {
+    if (isInitialized(device)) {
+        auto it = userDefaultStreams_.find(device);
 
-    if (it != userDefaultStreams_.end()) {
-      // There was a user stream set that we need to synchronize against
-      cudaStream_t prevStream = userDefaultStreams_[device];
+        if (it != userDefaultStreams_.end()) {
+            // There was a user stream set that we need to synchronize against
+            cudaStream_t prevStream = userDefaultStreams_[device];
 
-      FAISS_ASSERT(defaultStreams_.count(device));
-      cudaStream_t newStream = defaultStreams_[device];
+            FAISS_ASSERT(defaultStreams_.count(device));
+            cudaStream_t newStream = defaultStreams_[device];
 
-      streamWait({newStream}, {prevStream});
+            streamWait({newStream}, {prevStream});
+        }
     }
-  }
 
-  userDefaultStreams_.erase(device);
+    userDefaultStreams_.erase(device);
 }
 
-void
-StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
-  for (int dev = 0; dev < getNumDevices(); ++dev) {
-    setDefaultStream(dev, nullptr);
-  }
+void StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
+    for (int dev = 0; dev < getNumDevices(); ++dev) {
+        setDefaultStream(dev, nullptr);
+    }
 }
 
-void
-StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
-  allocLogging_ = enable;
+void StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
+    allocLogging_ = enable;
 }
 
-bool
-StandardGpuResourcesImpl::isInitialized(int device) const {
-  // Use default streams as a marker for whether or not a certain
-  // device has been initialized
-  return defaultStreams_.count(device) != 0;
+bool StandardGpuResourcesImpl::isInitialized(int device) const {
+    // Use default streams as a marker for whether or not a certain
+    // device has been initialized
+    return defaultStreams_.count(device) != 0;
 }
 
-void
-StandardGpuResourcesImpl::initializeForDevice(int device) {
-  if (isInitialized(device)) {
-    return;
-  }
-
-  // If this is the first device that we're initializing, create our
-  // pinned memory allocation
-  if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
-    auto err =
-      cudaHostAlloc(&pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
+void StandardGpuResourcesImpl::initializeForDevice(int device) {
+    if (isInitialized(device)) {
+        return;
+    }
 
-    FAISS_THROW_IF_NOT_FMT(
-      err == cudaSuccess,
-      "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
-      "async copy buffer (error %d %s)",
-      pinnedMemSize_, (int) err, cudaGetErrorString(err));
+    // If this is the first device that we're initializing, create our
+    // pinned memory allocation
+    if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
+        auto err = cudaHostAlloc(
+                &pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
+
+        FAISS_THROW_IF_NOT_FMT(
+                err == cudaSuccess,
+                "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
+                "async copy buffer (error %d %s)",
+                pinnedMemSize_,
+                (int)err,
+                cudaGetErrorString(err));
+
+        pinnedMemAllocSize_ = pinnedMemSize_;
+    }
 
-    pinnedMemAllocSize_ = pinnedMemSize_;
-  }
+    FAISS_ASSERT(device < getNumDevices());
+    DeviceScope scope(device);
 
-  FAISS_ASSERT(device < getNumDevices());
-  DeviceScope scope(device);
+    // Make sure that device properties for all devices are cached
+    auto& prop = getDeviceProperties(device);
 
-  // Make sure that device properties for all devices are cached
-  auto& prop = getDeviceProperties(device);
+    // Also check to make sure we meet our minimum compute capability (3.0)
+    FAISS_ASSERT_FMT(
+            prop.major >= 3,
+            "Device id %d with CC %d.%d not supported, "
+            "need 3.0+ compute capability",
+            device,
+            prop.major,
+            prop.minor);
 
-  // Also check to make sure we meet our minimum compute capability (3.0)
-  FAISS_ASSERT_FMT(prop.major >= 3,
-                   "Device id %d with CC %d.%d not supported, "
-                   "need 3.0+ compute capability",
-                   device, prop.major, prop.minor);
+    // Our code is pre-built with and expects warpSize == 32, validate that
+    FAISS_ASSERT_FMT(
+            prop.warpSize == 32,
+            "Device id %d does not have expected warpSize of 32",
+            device);
 
-  // Create streams
-  cudaStream_t defaultStream = 0;
-  CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
-                                        cudaStreamNonBlocking));
+    // Create streams
+    cudaStream_t defaultStream = 0;
+    CUDA_VERIFY(
+            cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));
 
-  defaultStreams_[device] = defaultStream;
+    defaultStreams_[device] = defaultStream;
 
-  cudaStream_t asyncCopyStream = 0;
-  CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
-                                        cudaStreamNonBlocking));
+    cudaStream_t asyncCopyStream = 0;
+    CUDA_VERIFY(
+            cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));
 
-  asyncCopyStreams_[device] = asyncCopyStream;
+    asyncCopyStreams_[device] = asyncCopyStream;
 
-  std::vector<cudaStream_t> deviceStreams;
-  for (int j = 0; j < kNumStreams; ++j) {
-    cudaStream_t stream = 0;
-    CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
-                                          cudaStreamNonBlocking));
+    std::vector<cudaStream_t> deviceStreams;
+    for (int j = 0; j < kNumStreams; ++j) {
+        cudaStream_t stream = 0;
+        CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
-    deviceStreams.push_back(stream);
-  }
+        deviceStreams.push_back(stream);
+    }
 
-  alternateStreams_[device] = std::move(deviceStreams);
+    alternateStreams_[device] = std::move(deviceStreams);
 
-  // Create cuBLAS handle
-  cublasHandle_t blasHandle = 0;
-  auto blasStatus = cublasCreate(&blasHandle);
-  FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
-  blasHandles_[device] = blasHandle;
+    // Create cuBLAS handle
+    cublasHandle_t blasHandle = 0;
+    auto blasStatus = cublasCreate(&blasHandle);
+    FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+    blasHandles_[device] = blasHandle;
 
-  // For CUDA 10 on V100, enabling tensor core usage would enable automatic
-  // rounding down of inputs to f16 (though accumulate in f32) which results in
-  // unacceptable loss of precision in general.
-  // For CUDA 11 / A100, only enable tensor core support if it doesn't result in
-  // a loss of precision.
+    // For CUDA 10 on V100, enabling tensor core usage would enable automatic
+    // rounding down of inputs to f16 (though accumulate in f32) which results
+    // in unacceptable loss of precision in general. For CUDA 11 / A100, only
+    // enable tensor core support if it doesn't result in a loss of precision.
 #if CUDA_VERSION >= 11000
-  cublasSetMathMode(blasHandle,
-                    CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+    cublasSetMathMode(
+            blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
 #endif
 
-  FAISS_ASSERT(allocs_.count(device) == 0);
-  allocs_[device] = std::unordered_map<void*, AllocRequest>();
+    FAISS_ASSERT(allocs_.count(device) == 0);
+    allocs_[device] = std::unordered_map<void*, AllocRequest>();
 
-  FAISS_ASSERT(tempMemory_.count(device) == 0);
-  auto mem = std::unique_ptr<StackDeviceMemory>(
-    new StackDeviceMemory(this,
-                          device,
-                          // adjust for this specific device
-                          getDefaultTempMemForGPU(device, tempMemSize_)));
+    FAISS_ASSERT(tempMemory_.count(device) == 0);
+    auto mem = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
+            this,
+            device,
+            // adjust for this specific device
+            getDefaultTempMemForGPU(device, tempMemSize_)));
 
-  tempMemory_.emplace(device, std::move(mem));
+    tempMemory_.emplace(device, std::move(mem));
 }
 
-cublasHandle_t
-StandardGpuResourcesImpl::getBlasHandle(int device) {
-  initializeForDevice(device);
-  return blasHandles_[device];
+cublasHandle_t StandardGpuResourcesImpl::getBlasHandle(int device) {
+    initializeForDevice(device);
+    return blasHandles_[device];
 }
 
-cudaStream_t
-StandardGpuResourcesImpl::getDefaultStream(int device) {
-  initializeForDevice(device);
+cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
+    initializeForDevice(device);
 
-  auto it = userDefaultStreams_.find(device);
-  if (it != userDefaultStreams_.end()) {
-    // There is a user override stream set
-    return it->second;
-  }
+    auto it = userDefaultStreams_.find(device);
+    if (it != userDefaultStreams_.end()) {
+        // There is a user override stream set
+        return it->second;
+    }
 
-  // Otherwise, our base default stream
-  return defaultStreams_[device];
+    // Otherwise, our base default stream
+    return defaultStreams_[device];
 }
 
-std::vector<cudaStream_t>
-StandardGpuResourcesImpl::getAlternateStreams(int device) {
-  initializeForDevice(device);
-  return alternateStreams_[device];
+std::vector<cudaStream_t> StandardGpuResourcesImpl::getAlternateStreams(
+        int device) {
+    initializeForDevice(device);
+    return alternateStreams_[device];
 }
 
-std::pair<void*, size_t>
-StandardGpuResourcesImpl::getPinnedMemory() {
-  return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
+std::pair<void*, size_t> StandardGpuResourcesImpl::getPinnedMemory() {
+    return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
 }
 
-cudaStream_t
-StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
-  initializeForDevice(device);
-  return asyncCopyStreams_[device];
+cudaStream_t StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
+    initializeForDevice(device);
+    return asyncCopyStreams_[device];
 }
 
-void*
-StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
-  initializeForDevice(req.device);
-
-  // We don't allocate a placeholder for zero-sized allocations
-  if (req.size == 0) {
-    return nullptr;
-  }
-
-  // Make sure that the allocation is a multiple of 16 bytes for alignment
-  // purposes
-  auto adjReq = req;
-  adjReq.size = utils::roundUp(adjReq.size, (size_t) 16);
-
-  void* p = nullptr;
-
-  if (allocLogging_) {
-    std::cout << "StandardGpuResources: alloc " << adjReq.toString() << "\n";
-  }
-
-  if (adjReq.space == MemorySpace::Temporary) {
-    // If we don't have enough space in our temporary memory manager, we need
-    // to allocate this request separately
-    auto& tempMem = tempMemory_[adjReq.device];
-
-    if (adjReq.size > tempMem->getSizeAvailable()) {
-      // We need to allocate this ourselves
-      AllocRequest newReq = adjReq;
-      newReq.space = MemorySpace::Device;
-      newReq.type = AllocType::TemporaryMemoryOverflow;
+void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
+    initializeForDevice(req.device);
 
-      return allocMemory(newReq);
+    // We don't allocate a placeholder for zero-sized allocations
+    if (req.size == 0) {
+        return nullptr;
     }
 
-    // Otherwise, we can handle this locally
-    p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
-
-  } else if (adjReq.space == MemorySpace::Device) {
-    auto err = cudaMalloc(&p, adjReq.size);
-
-    // Throw if we fail to allocate
-    if (err != cudaSuccess) {
-      auto& map = allocs_[req.device];
-
-      std::stringstream ss;
-      ss << "Failed to cudaMalloc " << adjReq.size << " bytes "
-         << "on device " << adjReq.device << " (error "
-         << (int) err << " " << cudaGetErrorString(err)
-         << "\nOutstanding allocations:\n" << allocsToString(map);
-      auto str = ss.str();
-
-      FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+    // cudaMalloc guarantees allocation alignment to 256 bytes; do the same here
+    // for alignment purposes (to reduce memory transaction overhead etc)
+    auto adjReq = req;
+    adjReq.size = utils::roundUp(adjReq.size, (size_t)256);
+
+    void* p = nullptr;
+
+    if (adjReq.space == MemorySpace::Temporary) {
+        // If we don't have enough space in our temporary memory manager, we
+        // need to allocate this request separately
+        auto& tempMem = tempMemory_[adjReq.device];
+
+        if (adjReq.size > tempMem->getSizeAvailable()) {
+            // We need to allocate this ourselves
+            AllocRequest newReq = adjReq;
+            newReq.space = MemorySpace::Device;
+            newReq.type = AllocType::TemporaryMemoryOverflow;
+
+            if (allocLogging_) {
+                std::cout
+                        << "StandardGpuResources: alloc fail "
+                        << adjReq.toString()
+                        << " (no temp space); retrying as MemorySpace::Device\n";
+            }
+
+            return allocMemory(newReq);
+        }
+
+        // Otherwise, we can handle this locally
+        p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
+
+    } else if (adjReq.space == MemorySpace::Device) {
+        auto err = cudaMalloc(&p, adjReq.size);
+
+        // Throw if we fail to allocate
+        if (err != cudaSuccess) {
+            // FIXME: as of CUDA 11, a memory allocation error appears to be
+            // presented via cudaGetLastError as well, and needs to be cleared.
+            // Just call the function to clear it
+            cudaGetLastError();
+
+            std::stringstream ss;
+            ss << "StandardGpuResources: alloc fail " << adjReq.toString()
+               << " (cudaMalloc error " << cudaGetErrorString(err) << " ["
+               << (int)err << "])\n";
+            auto str = ss.str();
+
+            if (allocLogging_) {
+                std::cout << str;
+            }
+
+            FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+        }
+    } else if (adjReq.space == MemorySpace::Unified) {
+        auto err = cudaMallocManaged(&p, adjReq.size);
+
+        if (err != cudaSuccess) {
+            // FIXME: as of CUDA 11, a memory allocation error appears to be
+            // presented via cudaGetLastError as well, and needs to be cleared.
+            // Just call the function to clear it
+            cudaGetLastError();
+
+            std::stringstream ss;
+            ss << "StandardGpuResources: alloc fail " << adjReq.toString()
+               << " failed (cudaMallocManaged error " << cudaGetErrorString(err)
+               << " [" << (int)err << "])\n";
+            auto str = ss.str();
+
+            if (allocLogging_) {
+                std::cout << str;
+            }
+
+            FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+        }
+    } else {
+        FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space);
     }
-  } else if (adjReq.space == MemorySpace::Unified) {
-    auto err = cudaMallocManaged(&p, adjReq.size);
-
-    if (err != cudaSuccess) {
-      auto& map = allocs_[req.device];
 
-      std::stringstream ss;
-      ss << "Failed to cudaMallocManaged " << adjReq.size << " bytes "
-         << "(error " << (int) err << " " << cudaGetErrorString(err)
-         << "\nOutstanding allocations:\n" << allocsToString(map);
-      auto str = ss.str();
-
-      FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+    if (allocLogging_) {
+        std::cout << "StandardGpuResources: alloc ok " << adjReq.toString()
+                  << " ptr 0x" << p << "\n";
     }
-  } else {
-    FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) adjReq.space);
-  }
 
-  allocs_[adjReq.device][p] = adjReq;
+    allocs_[adjReq.device][p] = adjReq;
 
-  return p;
+    return p;
 }
 
-void
-StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
-  FAISS_ASSERT(isInitialized(device));
+void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
+    FAISS_ASSERT(isInitialized(device));
 
-  if (!p) {
-    return;
-  }
+    if (!p) {
+        return;
+    }
 
-  auto& a = allocs_[device];
-  auto it = a.find(p);
-  FAISS_ASSERT(it != a.end());
+    auto& a = allocs_[device];
+    auto it = a.find(p);
+    FAISS_ASSERT(it != a.end());
 
-  auto& req = it->second;
+    auto& req = it->second;
 
-  if (allocLogging_) {
-    std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
-  }
+    if (allocLogging_) {
+        std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
+    }
 
-  if (req.space == MemorySpace::Temporary) {
-    tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
+    if (req.space == MemorySpace::Temporary) {
+        tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
 
-  } else if (req.space == MemorySpace::Device ||
-             req.space == MemorySpace::Unified) {
-    auto err = cudaFree(p);
-    FAISS_ASSERT_FMT(err == cudaSuccess,
-                     "Failed to cudaFree pointer %p (error %d %s)",
-                     p, (int) err, cudaGetErrorString(err));
+    } else if (
+            req.space == MemorySpace::Device ||
+            req.space == MemorySpace::Unified) {
+        auto err = cudaFree(p);
+        FAISS_ASSERT_FMT(
+                err == cudaSuccess,
+                "Failed to cudaFree pointer %p (error %d %s)",
+                p,
+                (int)err,
+                cudaGetErrorString(err));
 
-  } else {
-    FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) req.space);
-  }
+    } else {
+        FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space);
+    }
 
-  a.erase(it);
+    a.erase(it);
 }
 
-size_t
-StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
-  FAISS_ASSERT(isInitialized(device));
+size_t StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
+    FAISS_ASSERT(isInitialized(device));
 
-  auto it = tempMemory_.find(device);
-  FAISS_ASSERT(it != tempMemory_.end());
+    auto it = tempMemory_.find(device);
+    FAISS_ASSERT(it != tempMemory_.end());
 
-  return it->second->getSizeAvailable();
+    return it->second->getSizeAvailable();
 }
 
 std::map<int, std::map<std::string, std::pair<int, size_t>>>
 StandardGpuResourcesImpl::getMemoryInfo() const {
-  using AT = std::map<std::string, std::pair<int, size_t>>;
+    using AT = std::map<std::string, std::pair<int, size_t>>;
 
-  std::map<int, AT> out;
+    std::map<int, AT> out;
 
-  for (auto& entry : allocs_) {
-    AT outDevice;
+    for (auto& entry : allocs_) {
+        AT outDevice;
 
-    for (auto& a : entry.second) {
-      auto& v = outDevice[allocTypeToString(a.second.type)];
-      v.first++;
-      v.second += a.second.size;
-    }
+        for (auto& a : entry.second) {
+            auto& v = outDevice[allocTypeToString(a.second.type)];
+            v.first++;
+            v.second += a.second.size;
+        }
 
-    out[entry.first] = std::move(outDevice);
-  }
+        out[entry.first] = std::move(outDevice);
+    }
 
-  return out;
+    return out;
 }
 
 //
@@ -536,70 +559,58 @@ StandardGpuResourcesImpl::getMemoryInfo() const {
 //
 
 StandardGpuResources::StandardGpuResources()
-  : res_(new StandardGpuResourcesImpl) {
-}
+        : res_(new StandardGpuResourcesImpl) {}
 
-StandardGpuResources::~StandardGpuResources() {
-}
+StandardGpuResources::~StandardGpuResources() {}
 
-std::shared_ptr<GpuResources>
-StandardGpuResources::getResources() {
-  return res_;
+std::shared_ptr<GpuResources> StandardGpuResources::getResources() {
+    return res_;
 }
 
-void
-StandardGpuResources::noTempMemory() {
-  res_->noTempMemory();
+void StandardGpuResources::noTempMemory() {
+    res_->noTempMemory();
}
 
-void
-StandardGpuResources::setTempMemory(size_t size) {
-  res_->setTempMemory(size);
+void StandardGpuResources::setTempMemory(size_t size) {
+    res_->setTempMemory(size);
 }
 
-void
-StandardGpuResources::setPinnedMemory(size_t size) {
-  res_->setPinnedMemory(size);
+void StandardGpuResources::setPinnedMemory(size_t size) {
+    res_->setPinnedMemory(size);
 }
 
-void
-StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
-  res_->setDefaultStream(device, stream);
+void StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
+    res_->setDefaultStream(device, stream);
 }
 
-void
-StandardGpuResources::revertDefaultStream(int device) {
-  res_->revertDefaultStream(device);
+void StandardGpuResources::revertDefaultStream(int device) {
+    res_->revertDefaultStream(device);
 }
 
-void
-StandardGpuResources::setDefaultNullStreamAllDevices() {
-  res_->setDefaultNullStreamAllDevices();
+void StandardGpuResources::setDefaultNullStreamAllDevices() {
+    res_->setDefaultNullStreamAllDevices();
 }
 
 std::map<int, std::map<std::string, std::pair<int, size_t>>>
 StandardGpuResources::getMemoryInfo() const {
-  return res_->getMemoryInfo();
+    return res_->getMemoryInfo();
 }
 
-cudaStream_t
-StandardGpuResources::getDefaultStream(int device) {
-  return res_->getDefaultStream(device);
+cudaStream_t StandardGpuResources::getDefaultStream(int device) {
+    return res_->getDefaultStream(device);
 }
 
-size_t
-StandardGpuResources::getTempMemoryAvailable(int device) const {
-  return res_->getTempMemoryAvailable(device);
+size_t StandardGpuResources::getTempMemoryAvailable(int device) const {
+    return res_->getTempMemoryAvailable(device);
 }
 
-void
-StandardGpuResources::syncDefaultStreamCurrentDevice() {
-  res_->syncDefaultStreamCurrentDevice();
+void StandardGpuResources::syncDefaultStreamCurrentDevice() {
+    res_->syncDefaultStreamCurrentDevice();
 }
 
-void
-StandardGpuResources::setLogMemoryAllocations(bool enable) {
-  res_->setLogMemoryAllocations(enable);
+void StandardGpuResources::setLogMemoryAllocations(bool enable) {
+    res_->setLogMemoryAllocations(enable);
 }
 
-} } // namespace
+} // namespace gpu
+} // namespace faiss
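
For orientation, here is a minimal usage sketch of the StandardGpuResources API changed above. It is not part of the gem diff; it assumes a CUDA-capable device and a FAISS GPU build, and it uses GpuIndexFlatL2 only as a convenient consumer of the resources object (all sizes are arbitrary).

#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/StandardGpuResources.h>

#include <cstdio>
#include <random>
#include <vector>

int main() {
    faiss::gpu::StandardGpuResources res;

    // Cap the per-device temporary scratch space. Requests larger than what
    // remains in this stack allocator fall back to a plain cudaMalloc
    // (AllocType::TemporaryMemoryOverflow in the diff above).
    res.setTempMemory((size_t)256 * 1024 * 1024);
    res.setLogMemoryAllocations(true); // print the "alloc ok"/"dealloc" lines

    int d = 64, nb = 10000, nq = 5, k = 4;
    std::mt19937 rng(123);
    std::uniform_real_distribution<float> u(0.0f, 1.0f);
    std::vector<float> xb((size_t)nb * d), xq((size_t)nq * d);
    for (auto& v : xb) v = u(rng);
    for (auto& v : xq) v = u(rng);

    // Constructing a GPU index triggers initializeForDevice(): streams, the
    // cuBLAS handle, pinned memory and the temp-memory stack are created here.
    faiss::gpu::GpuIndexFlatL2 index(&res, d);
    index.add(nb, xb.data());

    std::vector<float> dist((size_t)nq * k);
    std::vector<faiss::Index::idx_t> ids((size_t)nq * k);
    index.search(nq, xq.data(), k, dist.data(), ids.data());

    // Per-device, per-AllocType view of outstanding allocations, as
    // assembled by getMemoryInfo() in the diff above.
    for (const auto& dev : res.getMemoryInfo()) {
        for (const auto& t : dev.second) {
            std::printf("device %d: %s: %d allocations, %zu bytes\n",
                        dev.first, t.first.c_str(), t.second.first,
                        t.second.second);
        }
    }
    return 0;
}

On a small GPU, getDefaultTempMemForGPU() silently clamps oversized requests to the 512 MiB / 1 GiB / 1.5 GiB tiers shown in the diff; the 256 MiB requested here is below every tier, so it is used as given.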