faiss 0.1.5 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +24 -0
  3. data/README.md +12 -0
  4. data/ext/faiss/ext.cpp +1 -1
  5. data/ext/faiss/extconf.rb +6 -2
  6. data/ext/faiss/index.cpp +114 -43
  7. data/ext/faiss/index_binary.cpp +24 -30
  8. data/ext/faiss/kmeans.cpp +20 -16
  9. data/ext/faiss/numo.hpp +867 -0
  10. data/ext/faiss/pca_matrix.cpp +13 -14
  11. data/ext/faiss/product_quantizer.cpp +23 -24
  12. data/ext/faiss/utils.cpp +10 -37
  13. data/ext/faiss/utils.h +2 -13
  14. data/lib/faiss.rb +0 -5
  15. data/lib/faiss/version.rb +1 -1
  16. data/vendor/faiss/faiss/AutoTune.cpp +292 -291
  17. data/vendor/faiss/faiss/AutoTune.h +55 -56
  18. data/vendor/faiss/faiss/Clustering.cpp +334 -195
  19. data/vendor/faiss/faiss/Clustering.h +88 -35
  20. data/vendor/faiss/faiss/IVFlib.cpp +171 -195
  21. data/vendor/faiss/faiss/IVFlib.h +48 -51
  22. data/vendor/faiss/faiss/Index.cpp +85 -103
  23. data/vendor/faiss/faiss/Index.h +54 -48
  24. data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
  25. data/vendor/faiss/faiss/Index2Layer.h +22 -22
  26. data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
  27. data/vendor/faiss/faiss/IndexBinary.h +140 -132
  28. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
  29. data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
  30. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
  31. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
  32. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
  33. data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
  34. data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
  35. data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
  36. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
  37. data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
  38. data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
  39. data/vendor/faiss/faiss/IndexFlat.h +35 -46
  40. data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
  41. data/vendor/faiss/faiss/IndexHNSW.h +57 -41
  42. data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
  43. data/vendor/faiss/faiss/IndexIVF.h +146 -113
  44. data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
  45. data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
  46. data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
  47. data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
  48. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
  49. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
  50. data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
  51. data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
  52. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
  53. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
  54. data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
  55. data/vendor/faiss/faiss/IndexLSH.h +21 -26
  56. data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
  57. data/vendor/faiss/faiss/IndexLattice.h +11 -16
  58. data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
  59. data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
  60. data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
  61. data/vendor/faiss/faiss/IndexNSG.h +85 -0
  62. data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
  63. data/vendor/faiss/faiss/IndexPQ.h +64 -67
  64. data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
  65. data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
  66. data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
  67. data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
  68. data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
  69. data/vendor/faiss/faiss/IndexRefine.h +22 -23
  70. data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
  71. data/vendor/faiss/faiss/IndexReplicas.h +62 -56
  72. data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
  73. data/vendor/faiss/faiss/IndexResidual.h +152 -0
  74. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
  75. data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
  76. data/vendor/faiss/faiss/IndexShards.cpp +256 -240
  77. data/vendor/faiss/faiss/IndexShards.h +85 -73
  78. data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
  79. data/vendor/faiss/faiss/MatrixStats.h +7 -10
  80. data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
  81. data/vendor/faiss/faiss/MetaIndexes.h +40 -34
  82. data/vendor/faiss/faiss/MetricType.h +7 -7
  83. data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
  84. data/vendor/faiss/faiss/VectorTransform.h +61 -89
  85. data/vendor/faiss/faiss/clone_index.cpp +77 -73
  86. data/vendor/faiss/faiss/clone_index.h +4 -9
  87. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
  88. data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
  89. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
  90. data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
  91. data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
  92. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
  93. data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
  94. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
  95. data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
  96. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
  97. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
  98. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
  99. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
  100. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
  101. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
  102. data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
  103. data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
  104. data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
  105. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
  106. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
  107. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
  108. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
  109. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
  110. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
  111. data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
  112. data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
  113. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
  114. data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
  115. data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
  116. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
  117. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
  118. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
  119. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
  120. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
  121. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
  122. data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
  123. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
  124. data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
  125. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
  126. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
  127. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
  128. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
  129. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
  130. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
  131. data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
  132. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
  133. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
  134. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
  135. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
  136. data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
  137. data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
  138. data/vendor/faiss/faiss/impl/FaissException.h +41 -29
  139. data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
  140. data/vendor/faiss/faiss/impl/HNSW.h +179 -200
  141. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
  142. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
  143. data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
  144. data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
  145. data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
  146. data/vendor/faiss/faiss/impl/NSG.h +199 -0
  147. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
  148. data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
  149. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
  150. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
  151. data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
  152. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
  153. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
  154. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
  155. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
  156. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
  157. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
  158. data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
  159. data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
  160. data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
  161. data/vendor/faiss/faiss/impl/io.cpp +75 -94
  162. data/vendor/faiss/faiss/impl/io.h +31 -41
  163. data/vendor/faiss/faiss/impl/io_macros.h +40 -29
  164. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
  165. data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
  166. data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
  167. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
  168. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
  169. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
  170. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
  171. data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
  172. data/vendor/faiss/faiss/index_factory.cpp +269 -218
  173. data/vendor/faiss/faiss/index_factory.h +6 -7
  174. data/vendor/faiss/faiss/index_io.h +23 -26
  175. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
  176. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
  177. data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
  178. data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
  179. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
  180. data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
  181. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
  182. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
  183. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
  184. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
  185. data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
  186. data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
  187. data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
  188. data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
  189. data/vendor/faiss/faiss/utils/Heap.h +186 -209
  190. data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
  191. data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
  192. data/vendor/faiss/faiss/utils/distances.cpp +301 -310
  193. data/vendor/faiss/faiss/utils/distances.h +133 -118
  194. data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
  195. data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
  196. data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
  197. data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
  198. data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
  199. data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
  200. data/vendor/faiss/faiss/utils/hamming.h +62 -85
  201. data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
  202. data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
  203. data/vendor/faiss/faiss/utils/partitioning.h +26 -21
  204. data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
  205. data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
  206. data/vendor/faiss/faiss/utils/random.cpp +39 -63
  207. data/vendor/faiss/faiss/utils/random.h +13 -16
  208. data/vendor/faiss/faiss/utils/simdlib.h +4 -2
  209. data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
  210. data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
  211. data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
  212. data/vendor/faiss/faiss/utils/utils.cpp +304 -287
  213. data/vendor/faiss/faiss/utils/utils.h +53 -48
  214. metadata +24 -10
  215. data/lib/faiss/index.rb +0 -20
  216. data/lib/faiss/index_binary.rb +0 -20
  217. data/lib/faiss/kmeans.rb +0 -15
  218. data/lib/faiss/pca_matrix.rb +0 -15
  219. data/lib/faiss/product_quantizer.rb +0 -22
data/vendor/faiss/faiss/gpu/GpuResources.h

@@ -5,55 +5,59 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-
 #pragma once
 
-#include <faiss/impl/FaissAssert.h>
-#include <cuda_runtime.h>
 #include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <faiss/impl/FaissAssert.h>
 #include <memory>
 #include <utility>
 #include <vector>
 
-namespace faiss { namespace gpu {
+namespace faiss {
+namespace gpu {
 
 class GpuResources;
 
 enum AllocType {
-  /// Unknown allocation type or miscellaneous (not currently categorized)
-  Other = 0,
-
-  /// Primary data storage for GpuIndexFlat (the raw matrix of vectors and
-  /// vector norms if needed)
-  FlatData = 1,
-
-  /// Primary data storage for GpuIndexIVF* (the storage for each individual IVF
-  /// list)
-  IVFLists = 2,
-
-  /// Quantizer (PQ, SQ) dictionary information
-  Quantizer = 3,
-
-  /// For GpuIndexIVFPQ, "precomputed codes" for more efficient PQ lookup
-  /// require the use of possibly large tables. These are marked separately from
-  /// Quantizer as these can frequently be 100s - 1000s of MiB in size
-  QuantizerPrecomputedCodes = 4,
-
-  ///
-  /// StandardGpuResources implementation specific types
-  ///
-
-  /// When using StandardGpuResources, temporary memory allocations
-  /// (MemorySpace::Temporary) come out of a stack region of memory that is
-  /// allocated up front for each gpu (e.g., 1.5 GiB upon initialization). This
-  /// allocation by StandardGpuResources is marked with this AllocType.
-  TemporaryMemoryBuffer = 10,
-
-  /// When using StandardGpuResources, any MemorySpace::Temporary allocations
-  /// that cannot be satisfied within the TemporaryMemoryBuffer region fall back
-  /// to calling cudaMalloc which are sized to just the request at hand. These
-  /// "overflow" temporary allocations are marked with this AllocType.
-  TemporaryMemoryOverflow = 11,
+    /// Unknown allocation type or miscellaneous (not currently categorized)
+    Other = 0,
+
+    /// Primary data storage for GpuIndexFlat (the raw matrix of vectors and
+    /// vector norms if needed)
+    FlatData = 1,
+
+    /// Primary data storage for GpuIndexIVF* (the storage for each individual
+    /// IVF
+    /// list)
+    IVFLists = 2,
+
+    /// Quantizer (PQ, SQ) dictionary information
+    Quantizer = 3,
+
+    /// For GpuIndexIVFPQ, "precomputed codes" for more efficient PQ lookup
+    /// require the use of possibly large tables. These are marked separately
+    /// from
+    /// Quantizer as these can frequently be 100s - 1000s of MiB in size
+    QuantizerPrecomputedCodes = 4,
+
+    ///
+    /// StandardGpuResources implementation specific types
+    ///
+
+    /// When using StandardGpuResources, temporary memory allocations
+    /// (MemorySpace::Temporary) come out of a stack region of memory that is
+    /// allocated up front for each gpu (e.g., 1.5 GiB upon initialization).
+    /// This
+    /// allocation by StandardGpuResources is marked with this AllocType.
+    TemporaryMemoryBuffer = 10,
+
+    /// When using StandardGpuResources, any MemorySpace::Temporary allocations
+    /// that cannot be satisfied within the TemporaryMemoryBuffer region fall
+    /// back
+    /// to calling cudaMalloc which are sized to just the request at hand. These
+    /// "overflow" temporary allocations are marked with this AllocType.
+    TemporaryMemoryOverflow = 11,
 };
 
 /// Convert an AllocType to string
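Aside: the TemporaryMemoryBuffer / TemporaryMemoryOverflow comments above describe a stack-with-overflow policy: temporary requests are served from a region reserved up front, and anything that does not fit falls back to a one-off device allocation. A minimal standalone sketch of that decision rule (toy C++ for illustration, not from the faiss sources):

#include <cstddef>

// Toy model of the policy the AllocType comments describe. A request that
// fits in the pre-reserved stack is tagged TemporaryMemoryBuffer; one that
// does not fit falls back to a dedicated cudaMalloc-style allocation and is
// tagged TemporaryMemoryOverflow.
enum class TempAllocKind { Buffer, Overflow };

struct TempStackModel {
    size_t capacity; // reserved up front, e.g. 1.5 GiB on initialization
    size_t used = 0;

    TempAllocKind request(size_t bytes) {
        if (used + bytes <= capacity) {
            used += bytes;
            return TempAllocKind::Buffer;   // ~ AllocType::TemporaryMemoryBuffer
        }
        return TempAllocKind::Overflow;     // ~ AllocType::TemporaryMemoryOverflow
    }
};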
@@ -61,16 +65,17 @@ std::string allocTypeToString(AllocType t);
 
 /// Memory regions accessible to the GPU
 enum MemorySpace {
-  /// Temporary device memory (guaranteed to no longer be used upon exit of a
-  /// top-level index call, and where the streams using it have completed GPU
-  /// work). Typically backed by Device memory (cudaMalloc/cudaFree).
-  Temporary = 0,
+    /// Temporary device memory (guaranteed to no longer be used upon exit of a
+    /// top-level index call, and where the streams using it have completed GPU
+    /// work). Typically backed by Device memory (cudaMalloc/cudaFree).
+    Temporary = 0,
 
-  /// Managed using cudaMalloc/cudaFree (typical GPU device memory)
-  Device = 1,
+    /// Managed using cudaMalloc/cudaFree (typical GPU device memory)
+    Device = 1,
 
-  /// Managed using cudaMallocManaged/cudaFree (typical Unified CPU/GPU memory)
-  Unified = 2,
+    /// Managed using cudaMallocManaged/cudaFree (typical Unified CPU/GPU
+    /// memory)
+    Unified = 2,
 };
 
 /// Convert a MemorySpace to string
@@ -78,44 +83,36 @@ std::string memorySpaceToString(MemorySpace s);
 
 /// Information on what/where an allocation is
 struct AllocInfo {
-  inline AllocInfo()
-      : type(AllocType::Other),
-        device(0),
-        space(MemorySpace::Device),
-        stream(nullptr) {
-  }
-
-  inline AllocInfo(AllocType at,
-                   int dev,
-                   MemorySpace sp,
-                   cudaStream_t st)
-      : type(at),
-        device(dev),
-        space(sp),
-        stream(st) {
-  }
-
-  /// Returns a string representation of this info
-  std::string toString() const;
-
-  /// The internal category of the allocation
-  AllocType type;
-
-  /// The device on which the allocation is happening
-  int device;
-
-  /// The memory space of the allocation
-  MemorySpace space;
-
-  /// The stream on which new work on the memory will be ordered (e.g., if a
-  /// piece of memory cached and to be returned for this call was last used on
-  /// stream 3 and a new memory request is for stream 4, the memory manager will
-  /// synchronize stream 4 to wait for the completion of stream 3 via events or
-  /// other stream synchronization.
-  ///
-  /// The memory manager guarantees that the returned memory is free to use
-  /// without data races on this stream specified.
-  cudaStream_t stream;
+    inline AllocInfo()
+            : type(AllocType::Other),
+              device(0),
+              space(MemorySpace::Device),
+              stream(nullptr) {}
+
+    inline AllocInfo(AllocType at, int dev, MemorySpace sp, cudaStream_t st)
+            : type(at), device(dev), space(sp), stream(st) {}
+
+    /// Returns a string representation of this info
+    std::string toString() const;
+
+    /// The internal category of the allocation
+    AllocType type;
+
+    /// The device on which the allocation is happening
+    int device;
+
+    /// The memory space of the allocation
+    MemorySpace space;
+
+    /// The stream on which new work on the memory will be ordered (e.g., if a
+    /// piece of memory cached and to be returned for this call was last used on
+    /// stream 3 and a new memory request is for stream 4, the memory manager
+    /// will synchronize stream 4 to wait for the completion of stream 3 via
+    /// events or other stream synchronization.
+    ///
+    /// The memory manager guarantees that the returned memory is free to use
+    /// without data races on this stream specified.
+    cudaStream_t stream;
 };
 
 /// Create an AllocInfo for the current device with MemorySpace::Device
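Aside: AllocInfo is a plain descriptor; it records what an allocation is for, where it lives, and which stream orders the first use of the memory, but it does not itself allocate anything. A hedged usage sketch (assumes a CUDA-enabled faiss build; the values are illustrative only):

#include <faiss/gpu/GpuResources.h>

int main() {
    // Describe "vector data for a flat index, in device memory on GPU 0,
    // first touched on the null stream". Nothing is allocated here.
    cudaStream_t stream = nullptr; // null stream, for illustration only
    faiss::gpu::AllocInfo info(
            faiss::gpu::AllocType::FlatData,
            /*dev=*/0,
            faiss::gpu::MemorySpace::Device,
            stream);

    // info.toString() renders the descriptor, which is handy for the
    // allocation logging shown later in StandardGpuResources.cpp.
    return 0;
}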
@@ -129,140 +126,139 @@ AllocInfo makeSpaceAlloc(AllocType at, MemorySpace sp, cudaStream_t st);
 
 /// Information on what/where an allocation is, along with how big it should be
 struct AllocRequest : public AllocInfo {
-  inline AllocRequest()
-      : AllocInfo(),
-        size(0) {
-  }
-
-  inline AllocRequest(const AllocInfo& info,
-                      size_t sz)
-      : AllocInfo(info),
-        size(sz) {
-  }
-
-  inline AllocRequest(AllocType at,
-                      int dev,
-                      MemorySpace sp,
-                      cudaStream_t st,
-                      size_t sz)
-      : AllocInfo(at, dev, sp, st),
-        size(sz) {
-  }
-
-  /// Returns a string representation of this request
-  std::string toString() const;
-
-  /// The size in bytes of the allocation
-  size_t size;
+    inline AllocRequest() : AllocInfo(), size(0) {}
+
+    inline AllocRequest(const AllocInfo& info, size_t sz)
+            : AllocInfo(info), size(sz) {}
+
+    inline AllocRequest(
+            AllocType at,
+            int dev,
+            MemorySpace sp,
+            cudaStream_t st,
+            size_t sz)
+            : AllocInfo(at, dev, sp, st), size(sz) {}
+
+    /// Returns a string representation of this request
+    std::string toString() const;
+
+    /// The size in bytes of the allocation
+    size_t size;
 };
 
 /// A RAII object that manages a temporary memory request
 struct GpuMemoryReservation {
-  GpuMemoryReservation();
-  GpuMemoryReservation(GpuResources* r,
-                       int dev,
-                       cudaStream_t str,
-                       void* p,
-                       size_t sz);
-  GpuMemoryReservation(GpuMemoryReservation&& m) noexcept;
-  ~GpuMemoryReservation();
-
-  GpuMemoryReservation& operator=(GpuMemoryReservation&& m);
-
-  inline void* get() { return data; }
-
-  void release();
-
-  GpuResources* res;
-  int device;
-  cudaStream_t stream;
-  void* data;
-  size_t size;
+    GpuMemoryReservation();
+    GpuMemoryReservation(
+            GpuResources* r,
+            int dev,
+            cudaStream_t str,
+            void* p,
+            size_t sz);
+    GpuMemoryReservation(GpuMemoryReservation&& m) noexcept;
+    ~GpuMemoryReservation();
+
+    GpuMemoryReservation& operator=(GpuMemoryReservation&& m);
+
+    inline void* get() {
+        return data;
+    }
+
+    void release();
+
+    GpuResources* res;
+    int device;
+    cudaStream_t stream;
+    void* data;
+    size_t size;
 };
 
 /// Base class of GPU-side resource provider; hides provision of
 /// cuBLAS handles, CUDA streams and all device memory allocation performed
 class GpuResources {
- public:
-  virtual ~GpuResources();
+   public:
+    virtual ~GpuResources();
 
-  /// Call to pre-allocate resources for a particular device. If this is
-  /// not called, then resources will be allocated at the first time
-  /// of demand
-  virtual void initializeForDevice(int device) = 0;
+    /// Call to pre-allocate resources for a particular device. If this is
+    /// not called, then resources will be allocated at the first time
+    /// of demand
+    virtual void initializeForDevice(int device) = 0;
 
-  /// Returns the cuBLAS handle that we use for the given device
-  virtual cublasHandle_t getBlasHandle(int device) = 0;
+    /// Returns the cuBLAS handle that we use for the given device
+    virtual cublasHandle_t getBlasHandle(int device) = 0;
 
-  /// Returns the stream that we order all computation on for the
-  /// given device
-  virtual cudaStream_t getDefaultStream(int device) = 0;
+    /// Returns the stream that we order all computation on for the
+    /// given device
+    virtual cudaStream_t getDefaultStream(int device) = 0;
 
-  /// Overrides the default stream for a device to the user-supplied stream. The
-  /// resources object does not own this stream (i.e., it will not destroy it).
-  virtual void setDefaultStream(int device, cudaStream_t stream) = 0;
+    /// Overrides the default stream for a device to the user-supplied stream.
+    /// The resources object does not own this stream (i.e., it will not destroy
+    /// it).
+    virtual void setDefaultStream(int device, cudaStream_t stream) = 0;
 
-  /// Returns the set of alternative streams that we use for the given device
-  virtual std::vector<cudaStream_t> getAlternateStreams(int device) = 0;
+    /// Returns the set of alternative streams that we use for the given device
+    virtual std::vector<cudaStream_t> getAlternateStreams(int device) = 0;
 
-  /// Memory management
-  /// Returns an allocation from the given memory space, ordered with respect to
-  /// the given stream (i.e., the first user will be a kernel in this stream).
-  /// All allocations are sized internally to be the next highest multiple of 16
-  /// bytes, and all allocations returned are guaranteed to be 16 byte aligned.
-  virtual void* allocMemory(const AllocRequest& req) = 0;
+    /// Memory management
+    /// Returns an allocation from the given memory space, ordered with respect
+    /// to the given stream (i.e., the first user will be a kernel in this
+    /// stream). All allocations are sized internally to be the next highest
+    /// multiple of 16 bytes, and all allocations returned are guaranteed to be
+    /// 16 byte aligned.
+    virtual void* allocMemory(const AllocRequest& req) = 0;
 
-  /// Returns a previous allocation
-  virtual void deallocMemory(int device, void* in) = 0;
+    /// Returns a previous allocation
+    virtual void deallocMemory(int device, void* in) = 0;
 
-  /// For MemorySpace::Temporary, how much space is immediately available
-  /// without cudaMalloc allocation?
-  virtual size_t getTempMemoryAvailable(int device) const = 0;
+    /// For MemorySpace::Temporary, how much space is immediately available
+    /// without cudaMalloc allocation?
+    virtual size_t getTempMemoryAvailable(int device) const = 0;
 
-  /// Returns the available CPU pinned memory buffer
-  virtual std::pair<void*, size_t> getPinnedMemory() = 0;
+    /// Returns the available CPU pinned memory buffer
+    virtual std::pair<void*, size_t> getPinnedMemory() = 0;
 
-  /// Returns the stream on which we perform async CPU <-> GPU copies
-  virtual cudaStream_t getAsyncCopyStream(int device) = 0;
+    /// Returns the stream on which we perform async CPU <-> GPU copies
+    virtual cudaStream_t getAsyncCopyStream(int device) = 0;
 
-  ///
-  /// Functions provided by default
-  ///
+    ///
+    /// Functions provided by default
+    ///
 
-  /// Calls getBlasHandle with the current device
-  cublasHandle_t getBlasHandleCurrentDevice();
+    /// Calls getBlasHandle with the current device
+    cublasHandle_t getBlasHandleCurrentDevice();
 
-  /// Calls getDefaultStream with the current device
-  cudaStream_t getDefaultStreamCurrentDevice();
+    /// Calls getDefaultStream with the current device
+    cudaStream_t getDefaultStreamCurrentDevice();
 
-  /// Calls getTempMemoryAvailable with the current device
-  size_t getTempMemoryAvailableCurrentDevice() const;
+    /// Calls getTempMemoryAvailable with the current device
+    size_t getTempMemoryAvailableCurrentDevice() const;
 
-  /// Returns a temporary memory allocation via a RAII object
-  GpuMemoryReservation allocMemoryHandle(const AllocRequest& req);
+    /// Returns a temporary memory allocation via a RAII object
+    GpuMemoryReservation allocMemoryHandle(const AllocRequest& req);
 
-  /// Synchronizes the CPU with respect to the default stream for the
-  /// given device
-  // equivalent to cudaDeviceSynchronize(getDefaultStream(device))
-  void syncDefaultStream(int device);
+    /// Synchronizes the CPU with respect to the default stream for the
+    /// given device
+    // equivalent to cudaDeviceSynchronize(getDefaultStream(device))
+    void syncDefaultStream(int device);
 
-  /// Calls syncDefaultStream for the current device
-  void syncDefaultStreamCurrentDevice();
+    /// Calls syncDefaultStream for the current device
+    void syncDefaultStreamCurrentDevice();
 
-  /// Calls getAlternateStreams for the current device
-  std::vector<cudaStream_t> getAlternateStreamsCurrentDevice();
+    /// Calls getAlternateStreams for the current device
+    std::vector<cudaStream_t> getAlternateStreamsCurrentDevice();
 
-  /// Calls getAsyncCopyStream for the current device
-  cudaStream_t getAsyncCopyStreamCurrentDevice();
+    /// Calls getAsyncCopyStream for the current device
+    cudaStream_t getAsyncCopyStreamCurrentDevice();
 };
 
 /// Interface for a provider of a shared resources object
 class GpuResourcesProvider {
- public:
-  virtual ~GpuResourcesProvider();
+   public:
+    virtual ~GpuResourcesProvider();
 
-  /// Returns the shared resources object
-  virtual std::shared_ptr<GpuResources> getResources() = 0;
+    /// Returns the shared resources object
+    virtual std::shared_ptr<GpuResources> getResources() = 0;
 };
 
-} } // namespace
+} // namespace gpu
+} // namespace faiss
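Aside: GpuMemoryReservation plus allocMemoryHandle give scoped scratch memory on top of this interface. A sketch of the intended pattern (illustrative only; assumes a CUDA-enabled build and a live resources object — the function name useScratch is hypothetical):

#include <faiss/gpu/GpuResources.h>

// Borrow 1 MiB of temporary scratch, ordered on the device's default
// stream; the reservation hands it back to the allocator automatically.
void useScratch(faiss::gpu::GpuResourcesProvider& provider) {
    auto res = provider.getResources();
    cudaStream_t stream = res->getDefaultStreamCurrentDevice();

    faiss::gpu::AllocRequest req(
            faiss::gpu::AllocType::Other,
            /*dev=*/0,
            faiss::gpu::MemorySpace::Temporary,
            stream,
            /*sz=*/1024 * 1024);

    faiss::gpu::GpuMemoryReservation scratch = res->allocMemoryHandle(req);
    void* p = scratch.get(); // aligned per the allocMemory contract above
    (void)p; // a kernel launched on `stream` would use p here
} // scratch's destructor releases the memory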
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp

@@ -5,16 +5,16 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-
 #include <faiss/gpu/StandardGpuResources.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/gpu/utils/StaticUtils.h>
 #include <faiss/impl/FaissAssert.h>
-#include <limits>
 #include <iostream>
+#include <limits>
 #include <sstream>
 
-namespace faiss { namespace gpu {
+namespace faiss {
+namespace gpu {
 
 namespace {
 
@@ -22,513 +22,536 @@ namespace {
22
22
  constexpr int kNumStreams = 2;
23
23
 
24
24
  // Use 256 MiB of pinned memory for async CPU <-> GPU copies by default
25
- constexpr size_t kDefaultPinnedMemoryAllocation = (size_t) 256 * 1024 * 1024;
25
+ constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024;
26
26
 
27
27
  // Default temporary memory allocation for <= 4 GiB memory GPUs
28
- constexpr size_t k4GiBTempMem = (size_t) 512 * 1024 * 1024;
28
+ constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024;
29
29
 
30
30
  // Default temporary memory allocation for <= 8 GiB memory GPUs
31
- constexpr size_t k8GiBTempMem = (size_t) 1024 * 1024 * 1024;
31
+ constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024;
32
32
 
33
33
  // Maximum temporary memory allocation for all GPUs
34
- constexpr size_t kMaxTempMem = (size_t) 1536 * 1024 * 1024;
34
+ constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024;
35
35
 
36
36
  std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
37
- // Produce a sorted list of all outstanding allocations by type
38
- std::unordered_map<AllocType, std::pair<int, size_t>> stats;
39
-
40
- for (auto& entry : map) {
41
- auto& a = entry.second;
42
-
43
- auto it = stats.find(a.type);
44
- if (it != stats.end()) {
45
- stats[a.type].first++;
46
- stats[a.type].second += a.size;
47
- } else {
48
- stats[a.type] = std::make_pair(1, a.size);
37
+ // Produce a sorted list of all outstanding allocations by type
38
+ std::unordered_map<AllocType, std::pair<int, size_t>> stats;
39
+
40
+ for (auto& entry : map) {
41
+ auto& a = entry.second;
42
+
43
+ auto it = stats.find(a.type);
44
+ if (it != stats.end()) {
45
+ stats[a.type].first++;
46
+ stats[a.type].second += a.size;
47
+ } else {
48
+ stats[a.type] = std::make_pair(1, a.size);
49
+ }
49
50
  }
50
- }
51
51
 
52
- std::stringstream ss;
53
- for (auto& entry : stats) {
54
- ss << "Alloc type " << allocTypeToString(entry.first) << ": "
55
- << entry.second.first << " allocations, "
56
- << entry.second.second << " bytes\n";
57
- }
52
+ std::stringstream ss;
53
+ for (auto& entry : stats) {
54
+ ss << "Alloc type " << allocTypeToString(entry.first) << ": "
55
+ << entry.second.first << " allocations, " << entry.second.second
56
+ << " bytes\n";
57
+ }
58
58
 
59
- return ss.str();
59
+ return ss.str();
60
60
  }
61
61
 
62
- }
62
+ } // namespace
63
63
 
64
64
  //
65
65
  // StandardGpuResourcesImpl
66
66
  //
67
67
 
68
- StandardGpuResourcesImpl::StandardGpuResourcesImpl() :
69
- pinnedMemAlloc_(nullptr),
70
- pinnedMemAllocSize_(0),
71
- // let the adjustment function determine the memory size for us by passing
72
- // in a huge value that will then be adjusted
73
- tempMemSize_(getDefaultTempMemForGPU(-1,
74
- std::numeric_limits<size_t>::max())),
75
- pinnedMemSize_(kDefaultPinnedMemoryAllocation),
76
- allocLogging_(false) {
77
- }
68
+ StandardGpuResourcesImpl::StandardGpuResourcesImpl()
69
+ : pinnedMemAlloc_(nullptr),
70
+ pinnedMemAllocSize_(0),
71
+ // let the adjustment function determine the memory size for us by
72
+ // passing in a huge value that will then be adjusted
73
+ tempMemSize_(getDefaultTempMemForGPU(
74
+ -1,
75
+ std::numeric_limits<size_t>::max())),
76
+ pinnedMemSize_(kDefaultPinnedMemoryAllocation),
77
+ allocLogging_(false) {}
78
78
 
79
79
  StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
80
- // The temporary memory allocator has allocated memory through us, so clean
81
- // that up before we finish fully de-initializing ourselves
82
- tempMemory_.clear();
83
-
84
- // Make sure all allocations have been freed
85
- bool allocError = false;
86
-
87
- for (auto& entry : allocs_) {
88
- auto& map = entry.second;
89
-
90
- if (!map.empty()) {
91
- std::cerr
92
- << "StandardGpuResources destroyed with allocations outstanding:\n"
93
- << "Device " << entry.first << " outstanding allocations:\n";
94
- std::cerr << allocsToString(map);
95
- allocError = true;
80
+ // The temporary memory allocator has allocated memory through us, so clean
81
+ // that up before we finish fully de-initializing ourselves
82
+ tempMemory_.clear();
83
+
84
+ // Make sure all allocations have been freed
85
+ bool allocError = false;
86
+
87
+ for (auto& entry : allocs_) {
88
+ auto& map = entry.second;
89
+
90
+ if (!map.empty()) {
91
+ std::cerr
92
+ << "StandardGpuResources destroyed with allocations outstanding:\n"
93
+ << "Device " << entry.first
94
+ << " outstanding allocations:\n";
95
+ std::cerr << allocsToString(map);
96
+ allocError = true;
97
+ }
96
98
  }
97
- }
98
99
 
99
- FAISS_ASSERT_MSG(!allocError, "GPU memory allocations not properly cleaned up");
100
+ FAISS_ASSERT_MSG(
101
+ !allocError, "GPU memory allocations not properly cleaned up");
100
102
 
101
- for (auto& entry : defaultStreams_) {
102
- DeviceScope scope(entry.first);
103
+ for (auto& entry : defaultStreams_) {
104
+ DeviceScope scope(entry.first);
103
105
 
104
- // We created these streams, so are responsible for destroying them
105
- CUDA_VERIFY(cudaStreamDestroy(entry.second));
106
- }
106
+ // We created these streams, so are responsible for destroying them
107
+ CUDA_VERIFY(cudaStreamDestroy(entry.second));
108
+ }
107
109
 
108
- for (auto& entry : alternateStreams_) {
109
- DeviceScope scope(entry.first);
110
+ for (auto& entry : alternateStreams_) {
111
+ DeviceScope scope(entry.first);
110
112
 
111
- for (auto stream : entry.second) {
112
- CUDA_VERIFY(cudaStreamDestroy(stream));
113
+ for (auto stream : entry.second) {
114
+ CUDA_VERIFY(cudaStreamDestroy(stream));
115
+ }
113
116
  }
114
- }
115
117
 
116
- for (auto& entry : asyncCopyStreams_) {
117
- DeviceScope scope(entry.first);
118
+ for (auto& entry : asyncCopyStreams_) {
119
+ DeviceScope scope(entry.first);
118
120
 
119
- CUDA_VERIFY(cudaStreamDestroy(entry.second));
120
- }
121
+ CUDA_VERIFY(cudaStreamDestroy(entry.second));
122
+ }
121
123
 
122
- for (auto& entry : blasHandles_) {
123
- DeviceScope scope(entry.first);
124
+ for (auto& entry : blasHandles_) {
125
+ DeviceScope scope(entry.first);
124
126
 
125
- auto blasStatus = cublasDestroy(entry.second);
126
- FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
127
- }
127
+ auto blasStatus = cublasDestroy(entry.second);
128
+ FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
129
+ }
128
130
 
129
- if (pinnedMemAlloc_) {
130
- auto err = cudaFreeHost(pinnedMemAlloc_);
131
- FAISS_ASSERT_FMT(err == cudaSuccess,
132
- "Failed to cudaFreeHost pointer %p (error %d %s)",
133
- pinnedMemAlloc_, (int) err, cudaGetErrorString(err));
134
- }
131
+ if (pinnedMemAlloc_) {
132
+ auto err = cudaFreeHost(pinnedMemAlloc_);
133
+ FAISS_ASSERT_FMT(
134
+ err == cudaSuccess,
135
+ "Failed to cudaFreeHost pointer %p (error %d %s)",
136
+ pinnedMemAlloc_,
137
+ (int)err,
138
+ cudaGetErrorString(err));
139
+ }
135
140
  }
136
141
 
137
- size_t
138
- StandardGpuResourcesImpl::getDefaultTempMemForGPU(int device,
139
- size_t requested) {
140
- auto totalMem = device != -1 ?
141
- getDeviceProperties(device).totalGlobalMem :
142
- std::numeric_limits<size_t>::max();
142
+ size_t StandardGpuResourcesImpl::getDefaultTempMemForGPU(
143
+ int device,
144
+ size_t requested) {
145
+ auto totalMem = device != -1 ? getDeviceProperties(device).totalGlobalMem
146
+ : std::numeric_limits<size_t>::max();
143
147
 
144
- if (totalMem <= (size_t) 4 * 1024 * 1024 * 1024) {
145
- // If the GPU has <= 4 GiB of memory, reserve 512 MiB
148
+ if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) {
149
+ // If the GPU has <= 4 GiB of memory, reserve 512 MiB
146
150
 
147
- if (requested > k4GiBTempMem) {
148
- return k4GiBTempMem;
149
- }
150
- } else if (totalMem <= (size_t) 8 * 1024 * 1024 * 1024) {
151
- // If the GPU has <= 8 GiB of memory, reserve 1 GiB
151
+ if (requested > k4GiBTempMem) {
152
+ return k4GiBTempMem;
153
+ }
154
+ } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) {
155
+ // If the GPU has <= 8 GiB of memory, reserve 1 GiB
152
156
 
153
- if (requested > k8GiBTempMem) {
154
- return k8GiBTempMem;
155
- }
156
- } else {
157
- // Never use more than 1.5 GiB
158
- if (requested > kMaxTempMem) {
159
- return kMaxTempMem;
157
+ if (requested > k8GiBTempMem) {
158
+ return k8GiBTempMem;
159
+ }
160
+ } else {
161
+ // Never use more than 1.5 GiB
162
+ if (requested > kMaxTempMem) {
163
+ return kMaxTempMem;
164
+ }
160
165
  }
161
- }
162
-
163
- // use whatever lower limit the user requested
164
- return requested;
165
- }
166
-
167
- void
168
- StandardGpuResourcesImpl::noTempMemory() {
169
- setTempMemory(0);
170
- }
171
-
172
- void
173
- StandardGpuResourcesImpl::setTempMemory(size_t size) {
174
- if (tempMemSize_ != size) {
175
- // adjust based on general limits
176
- tempMemSize_ = getDefaultTempMemForGPU(-1, size);
177
-
178
- // We need to re-initialize memory resources for all current devices that
179
- // have been initialized.
180
- // This should be safe to do, even if we are currently running work, because
181
- // the cudaFree call that this implies will force-synchronize all GPUs with
182
- // the CPU
183
- for (auto& p : tempMemory_) {
184
- int device = p.first;
185
- // Free the existing memory first
186
- p.second.reset();
187
-
188
- // Allocate new
189
- p.second = std::unique_ptr<StackDeviceMemory>(
190
- new StackDeviceMemory(this,
191
- p.first,
192
- // adjust for this specific device
193
- getDefaultTempMemForGPU(device, tempMemSize_)));
166
+
167
+ // use whatever lower limit the user requested
168
+ return requested;
169
+ }
170
+
171
+ void StandardGpuResourcesImpl::noTempMemory() {
172
+ setTempMemory(0);
173
+ }
174
+
175
+ void StandardGpuResourcesImpl::setTempMemory(size_t size) {
176
+ if (tempMemSize_ != size) {
177
+ // adjust based on general limits
178
+ tempMemSize_ = getDefaultTempMemForGPU(-1, size);
179
+
180
+ // We need to re-initialize memory resources for all current devices
181
+ // that have been initialized. This should be safe to do, even if we are
182
+ // currently running work, because the cudaFree call that this implies
183
+ // will force-synchronize all GPUs with the CPU
184
+ for (auto& p : tempMemory_) {
185
+ int device = p.first;
186
+ // Free the existing memory first
187
+ p.second.reset();
188
+
189
+ // Allocate new
190
+ p.second = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
191
+ this,
192
+ p.first,
193
+ // adjust for this specific device
194
+ getDefaultTempMemForGPU(device, tempMemSize_)));
195
+ }
194
196
  }
195
- }
196
197
  }
197
198
 
198
- void
199
- StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
200
- // Should not call this after devices have been initialized
201
- FAISS_ASSERT(defaultStreams_.size() == 0);
202
- FAISS_ASSERT(!pinnedMemAlloc_);
199
+ void StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
200
+ // Should not call this after devices have been initialized
201
+ FAISS_ASSERT(defaultStreams_.size() == 0);
202
+ FAISS_ASSERT(!pinnedMemAlloc_);
203
203
 
204
- pinnedMemSize_ = size;
204
+ pinnedMemSize_ = size;
205
205
  }
206
206
 
207
- void
208
- StandardGpuResourcesImpl::setDefaultStream(int device, cudaStream_t stream) {
209
- if (isInitialized(device)) {
210
- // A new series of calls may not be ordered with what was the previous
211
- // stream, so if the stream being specified is different, then we need to
212
- // ensure ordering between the two (new stream waits on old).
213
- auto it = userDefaultStreams_.find(device);
214
- cudaStream_t prevStream = nullptr;
207
+ void StandardGpuResourcesImpl::setDefaultStream(
208
+ int device,
209
+ cudaStream_t stream) {
210
+ if (isInitialized(device)) {
211
+ // A new series of calls may not be ordered with what was the previous
212
+ // stream, so if the stream being specified is different, then we need
213
+ // to ensure ordering between the two (new stream waits on old).
214
+ auto it = userDefaultStreams_.find(device);
215
+ cudaStream_t prevStream = nullptr;
215
216
 
216
- if (it != userDefaultStreams_.end()) {
217
- prevStream = it->second;
218
- } else {
219
- FAISS_ASSERT(defaultStreams_.count(device));
220
- prevStream = defaultStreams_[device];
221
- }
217
+ if (it != userDefaultStreams_.end()) {
218
+ prevStream = it->second;
219
+ } else {
220
+ FAISS_ASSERT(defaultStreams_.count(device));
221
+ prevStream = defaultStreams_[device];
222
+ }
222
223
 
223
- if (prevStream != stream) {
224
- streamWait({stream}, {prevStream});
224
+ if (prevStream != stream) {
225
+ streamWait({stream}, {prevStream});
226
+ }
225
227
  }
226
- }
227
228
 
228
- userDefaultStreams_[device] = stream;
229
+ userDefaultStreams_[device] = stream;
229
230
  }
230
231
 
231
- void
232
- StandardGpuResourcesImpl::revertDefaultStream(int device) {
233
- if (isInitialized(device)) {
234
- auto it = userDefaultStreams_.find(device);
232
+ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
233
+ if (isInitialized(device)) {
234
+ auto it = userDefaultStreams_.find(device);
235
235
 
236
- if (it != userDefaultStreams_.end()) {
237
- // There was a user stream set that we need to synchronize against
238
- cudaStream_t prevStream = userDefaultStreams_[device];
236
+ if (it != userDefaultStreams_.end()) {
237
+ // There was a user stream set that we need to synchronize against
238
+ cudaStream_t prevStream = userDefaultStreams_[device];
239
239
 
240
- FAISS_ASSERT(defaultStreams_.count(device));
241
- cudaStream_t newStream = defaultStreams_[device];
240
+ FAISS_ASSERT(defaultStreams_.count(device));
241
+ cudaStream_t newStream = defaultStreams_[device];
242
242
 
243
- streamWait({newStream}, {prevStream});
243
+ streamWait({newStream}, {prevStream});
244
+ }
244
245
  }
245
- }
246
246
 
247
- userDefaultStreams_.erase(device);
247
+ userDefaultStreams_.erase(device);
248
248
  }
249
249
 
250
- void
251
- StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
252
- for (int dev = 0; dev < getNumDevices(); ++dev) {
253
- setDefaultStream(dev, nullptr);
254
- }
250
+ void StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
251
+ for (int dev = 0; dev < getNumDevices(); ++dev) {
252
+ setDefaultStream(dev, nullptr);
253
+ }
255
254
  }
256
255
 
257
- void
258
- StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
259
- allocLogging_ = enable;
256
+ void StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
257
+ allocLogging_ = enable;
260
258
  }
261
259
 
262
- bool
263
- StandardGpuResourcesImpl::isInitialized(int device) const {
264
- // Use default streams as a marker for whether or not a certain
265
- // device has been initialized
266
- return defaultStreams_.count(device) != 0;
260
+ bool StandardGpuResourcesImpl::isInitialized(int device) const {
261
+ // Use default streams as a marker for whether or not a certain
262
+ // device has been initialized
263
+ return defaultStreams_.count(device) != 0;
267
264
  }
268
265
 
269
- void
270
- StandardGpuResourcesImpl::initializeForDevice(int device) {
271
- if (isInitialized(device)) {
272
- return;
273
- }
274
-
275
- // If this is the first device that we're initializing, create our
276
- // pinned memory allocation
277
- if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
278
- auto err =
279
- cudaHostAlloc(&pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
266
+ void StandardGpuResourcesImpl::initializeForDevice(int device) {
267
+ if (isInitialized(device)) {
268
+ return;
269
+ }
280
270
 
281
- FAISS_THROW_IF_NOT_FMT(
282
- err == cudaSuccess,
283
- "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
284
- "async copy buffer (error %d %s)",
285
- pinnedMemSize_, (int) err, cudaGetErrorString(err));
271
+ // If this is the first device that we're initializing, create our
272
+ // pinned memory allocation
273
+ if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
274
+ auto err = cudaHostAlloc(
275
+ &pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
276
+
277
+ FAISS_THROW_IF_NOT_FMT(
278
+ err == cudaSuccess,
279
+ "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
280
+ "async copy buffer (error %d %s)",
281
+ pinnedMemSize_,
282
+ (int)err,
283
+ cudaGetErrorString(err));
284
+
285
+ pinnedMemAllocSize_ = pinnedMemSize_;
286
+ }
286
287
 
287
- pinnedMemAllocSize_ = pinnedMemSize_;
288
- }
288
+ FAISS_ASSERT(device < getNumDevices());
289
+ DeviceScope scope(device);
289
290
 
290
- FAISS_ASSERT(device < getNumDevices());
291
- DeviceScope scope(device);
291
+ // Make sure that device properties for all devices are cached
292
+ auto& prop = getDeviceProperties(device);
292
293
 
293
- // Make sure that device properties for all devices are cached
294
- auto& prop = getDeviceProperties(device);
294
+ // Also check to make sure we meet our minimum compute capability (3.0)
295
+ FAISS_ASSERT_FMT(
296
+ prop.major >= 3,
297
+ "Device id %d with CC %d.%d not supported, "
298
+ "need 3.0+ compute capability",
299
+ device,
300
+ prop.major,
301
+ prop.minor);
295
302
 
296
- // Also check to make sure we meet our minimum compute capability (3.0)
297
- FAISS_ASSERT_FMT(prop.major >= 3,
298
- "Device id %d with CC %d.%d not supported, "
299
- "need 3.0+ compute capability",
300
- device, prop.major, prop.minor);
303
+ // Our code is pre-built with and expects warpSize == 32, validate that
304
+ FAISS_ASSERT_FMT(
305
+ prop.warpSize == 32,
306
+ "Device id %d does not have expected warpSize of 32",
307
+ device);
301
308
 
302
- // Create streams
303
- cudaStream_t defaultStream = 0;
304
- CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
305
- cudaStreamNonBlocking));
309
+ // Create streams
310
+ cudaStream_t defaultStream = 0;
311
+ CUDA_VERIFY(
312
+ cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));
306
313
 
307
- defaultStreams_[device] = defaultStream;
314
+ defaultStreams_[device] = defaultStream;
308
315
 
309
- cudaStream_t asyncCopyStream = 0;
310
- CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
311
- cudaStreamNonBlocking));
316
+ cudaStream_t asyncCopyStream = 0;
317
+ CUDA_VERIFY(
318
+ cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));
312
319
 
313
- asyncCopyStreams_[device] = asyncCopyStream;
320
+ asyncCopyStreams_[device] = asyncCopyStream;
314
321
 
315
- std::vector<cudaStream_t> deviceStreams;
316
- for (int j = 0; j < kNumStreams; ++j) {
317
- cudaStream_t stream = 0;
318
- CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
319
- cudaStreamNonBlocking));
322
+ std::vector<cudaStream_t> deviceStreams;
323
+ for (int j = 0; j < kNumStreams; ++j) {
324
+ cudaStream_t stream = 0;
325
+ CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
320
326
 
321
- deviceStreams.push_back(stream);
322
- }
327
+ deviceStreams.push_back(stream);
328
+ }
323
329
 
324
- alternateStreams_[device] = std::move(deviceStreams);
330
+ alternateStreams_[device] = std::move(deviceStreams);
325
331
 
326
- // Create cuBLAS handle
327
- cublasHandle_t blasHandle = 0;
328
- auto blasStatus = cublasCreate(&blasHandle);
329
- FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
330
- blasHandles_[device] = blasHandle;
332
+ // Create cuBLAS handle
333
+ cublasHandle_t blasHandle = 0;
334
+ auto blasStatus = cublasCreate(&blasHandle);
335
+ FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
336
+ blasHandles_[device] = blasHandle;
331
337
 
332
- // For CUDA 10 on V100, enabling tensor core usage would enable automatic
333
- // rounding down of inputs to f16 (though accumulate in f32) which results in
334
- // unacceptable loss of precision in general.
335
- // For CUDA 11 / A100, only enable tensor core support if it doesn't result in
336
- // a loss of precision.
338
+ // For CUDA 10 on V100, enabling tensor core usage would enable automatic
339
+ // rounding down of inputs to f16 (though accumulate in f32) which results
340
+ // in unacceptable loss of precision in general. For CUDA 11 / A100, only
341
+ // enable tensor core support if it doesn't result in a loss of precision.
337
342
  #if CUDA_VERSION >= 11000
338
- cublasSetMathMode(blasHandle,
339
- CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
343
+ cublasSetMathMode(
344
+ blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
340
345
  #endif
341
346
 
342
- FAISS_ASSERT(allocs_.count(device) == 0);
343
- allocs_[device] = std::unordered_map<void*, AllocRequest>();
347
+ FAISS_ASSERT(allocs_.count(device) == 0);
348
+ allocs_[device] = std::unordered_map<void*, AllocRequest>();
344
349
 
345
- FAISS_ASSERT(tempMemory_.count(device) == 0);
346
- auto mem = std::unique_ptr<StackDeviceMemory>(
347
- new StackDeviceMemory(this,
348
- device,
349
- // adjust for this specific device
350
- getDefaultTempMemForGPU(device, tempMemSize_)));
350
+ FAISS_ASSERT(tempMemory_.count(device) == 0);
351
+ auto mem = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
352
+ this,
353
+ device,
354
+ // adjust for this specific device
355
+ getDefaultTempMemForGPU(device, tempMemSize_)));
351
356
 
352
- tempMemory_.emplace(device, std::move(mem));
357
+ tempMemory_.emplace(device, std::move(mem));
353
358
  }
354
359
 
355
- cublasHandle_t
356
- StandardGpuResourcesImpl::getBlasHandle(int device) {
357
- initializeForDevice(device);
358
- return blasHandles_[device];
360
+ cublasHandle_t StandardGpuResourcesImpl::getBlasHandle(int device) {
361
+ initializeForDevice(device);
362
+ return blasHandles_[device];
359
363
  }
360
364
 
361
- cudaStream_t
362
- StandardGpuResourcesImpl::getDefaultStream(int device) {
363
- initializeForDevice(device);
365
+ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
366
+ initializeForDevice(device);
364
367
 
365
- auto it = userDefaultStreams_.find(device);
366
- if (it != userDefaultStreams_.end()) {
367
- // There is a user override stream set
368
- return it->second;
369
- }
368
+ auto it = userDefaultStreams_.find(device);
369
+ if (it != userDefaultStreams_.end()) {
370
+ // There is a user override stream set
371
+ return it->second;
372
+ }
370
373
 
371
- // Otherwise, our base default stream
372
- return defaultStreams_[device];
374
+ // Otherwise, our base default stream
375
+ return defaultStreams_[device];
373
376
  }
374
377
 
375
- std::vector<cudaStream_t>
376
- StandardGpuResourcesImpl::getAlternateStreams(int device) {
377
- initializeForDevice(device);
378
- return alternateStreams_[device];
378
+ std::vector<cudaStream_t> StandardGpuResourcesImpl::getAlternateStreams(
379
+ int device) {
380
+ initializeForDevice(device);
381
+ return alternateStreams_[device];
379
382
  }
380
383
 
381
- std::pair<void*, size_t>
382
- StandardGpuResourcesImpl::getPinnedMemory() {
383
- return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
384
+ std::pair<void*, size_t> StandardGpuResourcesImpl::getPinnedMemory() {
385
+ return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
384
386
  }
385
387
 
386
- cudaStream_t
387
- StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
388
- initializeForDevice(device);
389
- return asyncCopyStreams_[device];
388
+ cudaStream_t StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
389
+ initializeForDevice(device);
390
+ return asyncCopyStreams_[device];
390
391
  }
391
392
 
392
-void*
-StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
-  initializeForDevice(req.device);
-
-  // We don't allocate a placeholder for zero-sized allocations
-  if (req.size == 0) {
-    return nullptr;
-  }
-
-  // Make sure that the allocation is a multiple of 16 bytes for alignment
-  // purposes
-  auto adjReq = req;
-  adjReq.size = utils::roundUp(adjReq.size, (size_t) 16);
-
-  void* p = nullptr;
-
-  if (allocLogging_) {
-    std::cout << "StandardGpuResources: alloc " << adjReq.toString() << "\n";
-  }
-
-  if (adjReq.space == MemorySpace::Temporary) {
-    // If we don't have enough space in our temporary memory manager, we need
-    // to allocate this request separately
-    auto& tempMem = tempMemory_[adjReq.device];
-
-    if (adjReq.size > tempMem->getSizeAvailable()) {
-      // We need to allocate this ourselves
-      AllocRequest newReq = adjReq;
-      newReq.space = MemorySpace::Device;
-      newReq.type = AllocType::TemporaryMemoryOverflow;
+void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
+    initializeForDevice(req.device);
 
-      return allocMemory(newReq);
+    // We don't allocate a placeholder for zero-sized allocations
+    if (req.size == 0) {
+        return nullptr;
     }
 
-    // Otherwise, we can handle this locally
-    p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
-
-  } else if (adjReq.space == MemorySpace::Device) {
-    auto err = cudaMalloc(&p, adjReq.size);
-
-    // Throw if we fail to allocate
-    if (err != cudaSuccess) {
-      auto& map = allocs_[req.device];
-
-      std::stringstream ss;
-      ss << "Failed to cudaMalloc " << adjReq.size << " bytes "
-         << "on device " << adjReq.device << " (error "
-         << (int) err << " " << cudaGetErrorString(err)
-         << "\nOutstanding allocations:\n" << allocsToString(map);
-      auto str = ss.str();
-
-      FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+    // cudaMalloc guarantees allocation alignment to 256 bytes; do the same
+    // here for alignment purposes (to reduce memory transaction overhead etc)
+    auto adjReq = req;
+    adjReq.size = utils::roundUp(adjReq.size, (size_t)256);
+
+    void* p = nullptr;
+
+    if (adjReq.space == MemorySpace::Temporary) {
+        // If we don't have enough space in our temporary memory manager, we
+        // need to allocate this request separately
+        auto& tempMem = tempMemory_[adjReq.device];
+
+        if (adjReq.size > tempMem->getSizeAvailable()) {
+            // We need to allocate this ourselves
+            AllocRequest newReq = adjReq;
+            newReq.space = MemorySpace::Device;
+            newReq.type = AllocType::TemporaryMemoryOverflow;
+
+            if (allocLogging_) {
+                std::cout
+                        << "StandardGpuResources: alloc fail "
+                        << adjReq.toString()
+                        << " (no temp space); retrying as MemorySpace::Device\n";
+            }
+
+            return allocMemory(newReq);
+        }
+
+        // Otherwise, we can handle this locally
+        p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
+
+    } else if (adjReq.space == MemorySpace::Device) {
+        auto err = cudaMalloc(&p, adjReq.size);
+
+        // Throw if we fail to allocate
+        if (err != cudaSuccess) {
+            // FIXME: as of CUDA 11, a memory allocation error appears to be
+            // presented via cudaGetLastError as well, and needs to be cleared.
+            // Just call the function to clear it
+            cudaGetLastError();
+
+            std::stringstream ss;
+            ss << "StandardGpuResources: alloc fail " << adjReq.toString()
+               << " (cudaMalloc error " << cudaGetErrorString(err) << " ["
+               << (int)err << "])\n";
+            auto str = ss.str();
+
+            if (allocLogging_) {
+                std::cout << str;
+            }
+
+            FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+        }
+    } else if (adjReq.space == MemorySpace::Unified) {
+        auto err = cudaMallocManaged(&p, adjReq.size);
+
+        if (err != cudaSuccess) {
+            // FIXME: as of CUDA 11, a memory allocation error appears to be
+            // presented via cudaGetLastError as well, and needs to be cleared.
+            // Just call the function to clear it
+            cudaGetLastError();
+
+            std::stringstream ss;
+            ss << "StandardGpuResources: alloc fail " << adjReq.toString()
+               << " (cudaMallocManaged error " << cudaGetErrorString(err)
+               << " [" << (int)err << "])\n";
+            auto str = ss.str();
+
+            if (allocLogging_) {
+                std::cout << str;
+            }
+
+            FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+        }
+    } else {
+        FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space);
     }
-  } else if (adjReq.space == MemorySpace::Unified) {
-    auto err = cudaMallocManaged(&p, adjReq.size);
-
-    if (err != cudaSuccess) {
-      auto& map = allocs_[req.device];
 
-      std::stringstream ss;
-      ss << "Failed to cudaMallocManaged " << adjReq.size << " bytes "
-         << "(error " << (int) err << " " << cudaGetErrorString(err)
-         << "\nOutstanding allocations:\n" << allocsToString(map);
-      auto str = ss.str();
-
-      FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+    if (allocLogging_) {
+        std::cout << "StandardGpuResources: alloc ok " << adjReq.toString()
+                  << " ptr 0x" << p << "\n";
     }
-  } else {
-    FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) adjReq.space);
-  }
 
-  allocs_[adjReq.device][p] = adjReq;
+    allocs_[adjReq.device][p] = adjReq;
 
-  return p;
+    return p;
 }
 
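Beyond the reformatting, two behavioral changes are visible in this hunk: allocation requests are now rounded up to a multiple of 256 bytes (matching cudaMalloc's alignment guarantee) rather than 16, and a failed cudaMalloc/cudaMallocManaged now clears CUDA's sticky error state with cudaGetLastError() before throwing, so a later unrelated call does not re-report the allocation failure. A minimal sketch of the round-up arithmetic (standalone; the utils::roundUp implementation itself is not shown in this diff):

#include <cassert>
#include <cstddef>

// Round `size` up to the next multiple of `align` (align > 0).
std::size_t roundUp(std::size_t size, std::size_t align) {
    return ((size + align - 1) / align) * align;
}

int main() {
    // With align = 256, a 1000-byte request becomes 1024 bytes, so
    // suballocations keep cudaMalloc-style 256-byte alignment.
    assert(roundUp(1000, 256) == 1024);
    assert(roundUp(1024, 256) == 1024);
    assert(roundUp(1, 256) == 256);
    return 0;
}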
-void
-StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
-  FAISS_ASSERT(isInitialized(device));
+void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
+    FAISS_ASSERT(isInitialized(device));
 
-  if (!p) {
-    return;
-  }
+    if (!p) {
+        return;
+    }
 
-  auto& a = allocs_[device];
-  auto it = a.find(p);
-  FAISS_ASSERT(it != a.end());
+    auto& a = allocs_[device];
+    auto it = a.find(p);
+    FAISS_ASSERT(it != a.end());
 
-  auto& req = it->second;
+    auto& req = it->second;
 
-  if (allocLogging_) {
-    std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
-  }
+    if (allocLogging_) {
+        std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
+    }
 
-  if (req.space == MemorySpace::Temporary) {
-    tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
+    if (req.space == MemorySpace::Temporary) {
+        tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
 
-  } else if (req.space == MemorySpace::Device ||
-             req.space == MemorySpace::Unified) {
-    auto err = cudaFree(p);
-    FAISS_ASSERT_FMT(err == cudaSuccess,
-                     "Failed to cudaFree pointer %p (error %d %s)",
-                     p, (int) err, cudaGetErrorString(err));
+    } else if (
+            req.space == MemorySpace::Device ||
+            req.space == MemorySpace::Unified) {
+        auto err = cudaFree(p);
+        FAISS_ASSERT_FMT(
+                err == cudaSuccess,
+                "Failed to cudaFree pointer %p (error %d %s)",
+                p,
+                (int)err,
+                cudaGetErrorString(err));
 
-  } else {
-    FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) req.space);
-  }
+    } else {
+        FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space);
+    }
 
-  a.erase(it);
+    a.erase(it);
 }
 
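deallocMemory can route a bare pointer to the right release path (temporary-memory stack vs. cudaFree) only because allocMemory records every pointer it hands out in allocs_[device]. A stripped-down sketch of that pointer-keyed bookkeeping, using hypothetical types rather than the FAISS ones:

#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <unordered_map>

// Remember how each pointer was produced so the matching release
// path can be chosen at free time.
enum class Space { Heap };

struct Record {
    Space space;
    std::size_t size;
};

std::unordered_map<void*, Record> live;

void* alloc(std::size_t size) {
    void* p = std::malloc(size);
    if (p) {
        live[p] = {Space::Heap, size}; // bookkeeping at allocation time
    }
    return p;
}

void dealloc(void* p) {
    if (!p) {
        return; // freeing nullptr is a no-op, as in the diff above
    }
    auto it = live.find(p);
    assert(it != live.end()); // catches double/foreign frees
    if (it->second.space == Space::Heap) {
        std::free(p);
    }
    live.erase(it);
}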
-size_t
-StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
-  FAISS_ASSERT(isInitialized(device));
+size_t StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
+    FAISS_ASSERT(isInitialized(device));
 
-  auto it = tempMemory_.find(device);
-  FAISS_ASSERT(it != tempMemory_.end());
+    auto it = tempMemory_.find(device);
+    FAISS_ASSERT(it != tempMemory_.end());
 
-  return it->second->getSizeAvailable();
+    return it->second->getSizeAvailable();
 }
 
 std::map<int, std::map<std::string, std::pair<int, size_t>>>
 StandardGpuResourcesImpl::getMemoryInfo() const {
-  using AT = std::map<std::string, std::pair<int, size_t>>;
+    using AT = std::map<std::string, std::pair<int, size_t>>;
 
-  std::map<int, AT> out;
+    std::map<int, AT> out;
 
-  for (auto& entry : allocs_) {
-    AT outDevice;
+    for (auto& entry : allocs_) {
+        AT outDevice;
 
-    for (auto& a : entry.second) {
-      auto& v = outDevice[allocTypeToString(a.second.type)];
-      v.first++;
-      v.second += a.second.size;
-    }
+        for (auto& a : entry.second) {
+            auto& v = outDevice[allocTypeToString(a.second.type)];
+            v.first++;
+            v.second += a.second.size;
+        }
 
-    out[entry.first] = std::move(outDevice);
-  }
+        out[entry.first] = std::move(outDevice);
+    }
 
-  return out;
+    return out;
 }
 
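getMemoryInfo condenses the live allocation map into a per-device report: for each allocation-type name, a pair of (number of live allocations, total bytes). A usage sketch for walking that nested map; the return type is as declared above, while the printing is my own illustration:

#include <cstddef>
#include <iostream>
#include <map>
#include <string>
#include <utility>

using MemoryInfo =
        std::map<int, std::map<std::string, std::pair<int, std::size_t>>>;

// Print one line per (device, allocation type).
void printMemoryInfo(const MemoryInfo& info) {
    for (const auto& dev : info) {
        for (const auto& type : dev.second) {
            std::cout << "device " << dev.first << " " << type.first << ": "
                      << type.second.first << " allocs, "
                      << type.second.second << " bytes\n";
        }
    }
}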
 //
@@ -536,70 +559,58 @@ StandardGpuResourcesImpl::getMemoryInfo() const {
 //
 
 StandardGpuResources::StandardGpuResources()
-  : res_(new StandardGpuResourcesImpl) {
-}
+        : res_(new StandardGpuResourcesImpl) {}
 
-StandardGpuResources::~StandardGpuResources() {
-}
+StandardGpuResources::~StandardGpuResources() {}
 
-std::shared_ptr<GpuResources>
-StandardGpuResources::getResources() {
-  return res_;
+std::shared_ptr<GpuResources> StandardGpuResources::getResources() {
+    return res_;
 }
 
-void
-StandardGpuResources::noTempMemory() {
-  res_->noTempMemory();
+void StandardGpuResources::noTempMemory() {
+    res_->noTempMemory();
 }
 
-void
-StandardGpuResources::setTempMemory(size_t size) {
-  res_->setTempMemory(size);
+void StandardGpuResources::setTempMemory(size_t size) {
+    res_->setTempMemory(size);
 }
 
-void
-StandardGpuResources::setPinnedMemory(size_t size) {
-  res_->setPinnedMemory(size);
+void StandardGpuResources::setPinnedMemory(size_t size) {
+    res_->setPinnedMemory(size);
 }
 
-void
-StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
-  res_->setDefaultStream(device, stream);
+void StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
+    res_->setDefaultStream(device, stream);
 }
 
-void
-StandardGpuResources::revertDefaultStream(int device) {
-  res_->revertDefaultStream(device);
+void StandardGpuResources::revertDefaultStream(int device) {
+    res_->revertDefaultStream(device);
 }
 
-void
-StandardGpuResources::setDefaultNullStreamAllDevices() {
-  res_->setDefaultNullStreamAllDevices();
+void StandardGpuResources::setDefaultNullStreamAllDevices() {
+    res_->setDefaultNullStreamAllDevices();
 }
 
 std::map<int, std::map<std::string, std::pair<int, size_t>>>
 StandardGpuResources::getMemoryInfo() const {
-  return res_->getMemoryInfo();
+    return res_->getMemoryInfo();
 }
 
-cudaStream_t
-StandardGpuResources::getDefaultStream(int device) {
-  return res_->getDefaultStream(device);
+cudaStream_t StandardGpuResources::getDefaultStream(int device) {
+    return res_->getDefaultStream(device);
 }
 
-size_t
-StandardGpuResources::getTempMemoryAvailable(int device) const {
-  return res_->getTempMemoryAvailable(device);
+size_t StandardGpuResources::getTempMemoryAvailable(int device) const {
+    return res_->getTempMemoryAvailable(device);
 }
 
-void
-StandardGpuResources::syncDefaultStreamCurrentDevice() {
-  res_->syncDefaultStreamCurrentDevice();
+void StandardGpuResources::syncDefaultStreamCurrentDevice() {
+    res_->syncDefaultStreamCurrentDevice();
}
 
-void
-StandardGpuResources::setLogMemoryAllocations(bool enable) {
-  res_->setLogMemoryAllocations(enable);
+void StandardGpuResources::setLogMemoryAllocations(bool enable) {
+    res_->setLogMemoryAllocations(enable);
 }
 
-} } // namespace
+} // namespace gpu
+} // namespace faiss
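After this change, StandardGpuResources remains a thin wrapper that forwards every call to a shared StandardGpuResourcesImpl (res_ is a std::shared_ptr), so one resource object can back several GPU indexes. A hedged usage sketch restricted to the methods visible in this diff; the header path and the 512 MiB figure are my assumptions, not part of the change:

#include <faiss/gpu/StandardGpuResources.h> // assumed header location

int main() {
    faiss::gpu::StandardGpuResources res;

    // Cap the temporary-memory region and log every alloc/dealloc.
    res.setTempMemory(static_cast<size_t>(512) * 1024 * 1024);
    res.setLogMemoryAllocations(true);

    // ... build and query GPU indexes that share `res` here ...

    // Drain pending work on the current device, then inspect what is
    // still allocated, per device and per allocation type.
    res.syncDefaultStreamCurrentDevice();
    auto info = res.getMemoryInfo();
    return 0;
}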