faiss 0.2.0 → 0.2.1

Files changed (202)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/faiss/version.rb +1 -1
  4. data/vendor/faiss/faiss/AutoTune.cpp +292 -291
  5. data/vendor/faiss/faiss/AutoTune.h +55 -56
  6. data/vendor/faiss/faiss/Clustering.cpp +334 -195
  7. data/vendor/faiss/faiss/Clustering.h +88 -35
  8. data/vendor/faiss/faiss/IVFlib.cpp +171 -195
  9. data/vendor/faiss/faiss/IVFlib.h +48 -51
  10. data/vendor/faiss/faiss/Index.cpp +85 -103
  11. data/vendor/faiss/faiss/Index.h +54 -48
  12. data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
  13. data/vendor/faiss/faiss/Index2Layer.h +22 -22
  14. data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
  15. data/vendor/faiss/faiss/IndexBinary.h +140 -132
  16. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
  17. data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
  18. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
  19. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
  20. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
  21. data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
  22. data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
  23. data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
  24. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
  25. data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
  26. data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
  27. data/vendor/faiss/faiss/IndexFlat.h +35 -46
  28. data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
  29. data/vendor/faiss/faiss/IndexHNSW.h +57 -41
  30. data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
  31. data/vendor/faiss/faiss/IndexIVF.h +146 -113
  32. data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
  33. data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
  34. data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
  35. data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
  36. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
  37. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
  38. data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
  39. data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
  40. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
  41. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
  42. data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
  43. data/vendor/faiss/faiss/IndexLSH.h +21 -26
  44. data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
  45. data/vendor/faiss/faiss/IndexLattice.h +11 -16
  46. data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
  47. data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
  48. data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
  49. data/vendor/faiss/faiss/IndexNSG.h +85 -0
  50. data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
  51. data/vendor/faiss/faiss/IndexPQ.h +64 -67
  52. data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
  53. data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
  54. data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
  55. data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
  56. data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
  57. data/vendor/faiss/faiss/IndexRefine.h +22 -23
  58. data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
  59. data/vendor/faiss/faiss/IndexReplicas.h +62 -56
  60. data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
  61. data/vendor/faiss/faiss/IndexResidual.h +152 -0
  62. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
  63. data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
  64. data/vendor/faiss/faiss/IndexShards.cpp +256 -240
  65. data/vendor/faiss/faiss/IndexShards.h +85 -73
  66. data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
  67. data/vendor/faiss/faiss/MatrixStats.h +7 -10
  68. data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
  69. data/vendor/faiss/faiss/MetaIndexes.h +40 -34
  70. data/vendor/faiss/faiss/MetricType.h +7 -7
  71. data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
  72. data/vendor/faiss/faiss/VectorTransform.h +61 -89
  73. data/vendor/faiss/faiss/clone_index.cpp +77 -73
  74. data/vendor/faiss/faiss/clone_index.h +4 -9
  75. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
  76. data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
  77. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
  78. data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
  79. data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
  80. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
  81. data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
  82. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
  83. data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
  84. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
  85. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
  86. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
  87. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
  88. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
  89. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
  90. data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
  91. data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
  92. data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
  93. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
  94. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
  95. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
  96. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
  97. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
  98. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
  99. data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
  100. data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
  101. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
  102. data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
  103. data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
  104. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
  105. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
  106. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
  107. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
  108. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
  109. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
  110. data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
  111. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
  112. data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
  113. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
  114. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
  115. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
  116. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
  117. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
  118. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
  119. data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
  120. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
  121. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
  122. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
  123. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
  124. data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
  125. data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
  126. data/vendor/faiss/faiss/impl/FaissException.h +41 -29
  127. data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
  128. data/vendor/faiss/faiss/impl/HNSW.h +179 -200
  129. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
  130. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
  131. data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
  132. data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
  133. data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
  134. data/vendor/faiss/faiss/impl/NSG.h +199 -0
  135. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
  136. data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
  137. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
  138. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
  139. data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
  140. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
  141. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
  142. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
  143. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
  144. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
  145. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
  146. data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
  147. data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
  148. data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
  149. data/vendor/faiss/faiss/impl/io.cpp +75 -94
  150. data/vendor/faiss/faiss/impl/io.h +31 -41
  151. data/vendor/faiss/faiss/impl/io_macros.h +40 -29
  152. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
  153. data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
  154. data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
  155. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
  156. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
  157. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
  158. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
  159. data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
  160. data/vendor/faiss/faiss/index_factory.cpp +269 -218
  161. data/vendor/faiss/faiss/index_factory.h +6 -7
  162. data/vendor/faiss/faiss/index_io.h +23 -26
  163. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
  164. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
  165. data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
  166. data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
  167. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
  168. data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
  169. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
  170. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
  171. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
  172. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
  173. data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
  174. data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
  175. data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
  176. data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
  177. data/vendor/faiss/faiss/utils/Heap.h +186 -209
  178. data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
  179. data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
  180. data/vendor/faiss/faiss/utils/distances.cpp +301 -310
  181. data/vendor/faiss/faiss/utils/distances.h +133 -118
  182. data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
  183. data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
  184. data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
  185. data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
  186. data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
  187. data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
  188. data/vendor/faiss/faiss/utils/hamming.h +62 -85
  189. data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
  190. data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
  191. data/vendor/faiss/faiss/utils/partitioning.h +26 -21
  192. data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
  193. data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
  194. data/vendor/faiss/faiss/utils/random.cpp +39 -63
  195. data/vendor/faiss/faiss/utils/random.h +13 -16
  196. data/vendor/faiss/faiss/utils/simdlib.h +4 -2
  197. data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
  198. data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
  199. data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
  200. data/vendor/faiss/faiss/utils/utils.cpp +304 -287
  201. data/vendor/faiss/faiss/utils/utils.h +53 -48
  202. metadata +20 -2
data/vendor/faiss/faiss/gpu/GpuResources.h

@@ -5,55 +5,59 @@
  * LICENSE file in the root directory of this source tree.
  */

-
  #pragma once

- #include <faiss/impl/FaissAssert.h>
- #include <cuda_runtime.h>
  #include <cublas_v2.h>
+ #include <cuda_runtime.h>
+ #include <faiss/impl/FaissAssert.h>
  #include <memory>
  #include <utility>
  #include <vector>

- namespace faiss { namespace gpu {
+ namespace faiss {
+ namespace gpu {

  class GpuResources;

  enum AllocType {
- /// Unknown allocation type or miscellaneous (not currently categorized)
- Other = 0,
-
- /// Primary data storage for GpuIndexFlat (the raw matrix of vectors and
- /// vector norms if needed)
- FlatData = 1,
-
- /// Primary data storage for GpuIndexIVF* (the storage for each individual IVF
- /// list)
- IVFLists = 2,
-
- /// Quantizer (PQ, SQ) dictionary information
- Quantizer = 3,
-
- /// For GpuIndexIVFPQ, "precomputed codes" for more efficient PQ lookup
- /// require the use of possibly large tables. These are marked separately from
- /// Quantizer as these can frequently be 100s - 1000s of MiB in size
- QuantizerPrecomputedCodes = 4,
-
- ///
- /// StandardGpuResources implementation specific types
- ///
-
- /// When using StandardGpuResources, temporary memory allocations
- /// (MemorySpace::Temporary) come out of a stack region of memory that is
- /// allocated up front for each gpu (e.g., 1.5 GiB upon initialization). This
- /// allocation by StandardGpuResources is marked with this AllocType.
- TemporaryMemoryBuffer = 10,
-
- /// When using StandardGpuResources, any MemorySpace::Temporary allocations
- /// that cannot be satisfied within the TemporaryMemoryBuffer region fall back
- /// to calling cudaMalloc which are sized to just the request at hand. These
- /// "overflow" temporary allocations are marked with this AllocType.
- TemporaryMemoryOverflow = 11,
+ /// Unknown allocation type or miscellaneous (not currently categorized)
+ Other = 0,
+
+ /// Primary data storage for GpuIndexFlat (the raw matrix of vectors and
+ /// vector norms if needed)
+ FlatData = 1,
+
+ /// Primary data storage for GpuIndexIVF* (the storage for each individual
+ /// IVF
+ /// list)
+ IVFLists = 2,
+
+ /// Quantizer (PQ, SQ) dictionary information
+ Quantizer = 3,
+
+ /// For GpuIndexIVFPQ, "precomputed codes" for more efficient PQ lookup
+ /// require the use of possibly large tables. These are marked separately
+ /// from
+ /// Quantizer as these can frequently be 100s - 1000s of MiB in size
+ QuantizerPrecomputedCodes = 4,
+
+ ///
+ /// StandardGpuResources implementation specific types
+ ///
+
+ /// When using StandardGpuResources, temporary memory allocations
+ /// (MemorySpace::Temporary) come out of a stack region of memory that is
+ /// allocated up front for each gpu (e.g., 1.5 GiB upon initialization).
+ /// This
+ /// allocation by StandardGpuResources is marked with this AllocType.
+ TemporaryMemoryBuffer = 10,
+
+ /// When using StandardGpuResources, any MemorySpace::Temporary allocations
+ /// that cannot be satisfied within the TemporaryMemoryBuffer region fall
+ /// back
+ /// to calling cudaMalloc which are sized to just the request at hand. These
+ /// "overflow" temporary allocations are marked with this AllocType.
+ TemporaryMemoryOverflow = 11,
  };

  /// Convert an AllocType to string
@@ -61,16 +65,17 @@ std::string allocTypeToString(AllocType t);

  /// Memory regions accessible to the GPU
  enum MemorySpace {
- /// Temporary device memory (guaranteed to no longer be used upon exit of a
- /// top-level index call, and where the streams using it have completed GPU
- /// work). Typically backed by Device memory (cudaMalloc/cudaFree).
- Temporary = 0,
+ /// Temporary device memory (guaranteed to no longer be used upon exit of a
+ /// top-level index call, and where the streams using it have completed GPU
+ /// work). Typically backed by Device memory (cudaMalloc/cudaFree).
+ Temporary = 0,

- /// Managed using cudaMalloc/cudaFree (typical GPU device memory)
- Device = 1,
+ /// Managed using cudaMalloc/cudaFree (typical GPU device memory)
+ Device = 1,

- /// Managed using cudaMallocManaged/cudaFree (typical Unified CPU/GPU memory)
- Unified = 2,
+ /// Managed using cudaMallocManaged/cudaFree (typical Unified CPU/GPU
+ /// memory)
+ Unified = 2,
  };

  /// Convert a MemorySpace to string
@@ -78,44 +83,36 @@ std::string memorySpaceToString(MemorySpace s);

  /// Information on what/where an allocation is
  struct AllocInfo {
- inline AllocInfo()
- : type(AllocType::Other),
- device(0),
- space(MemorySpace::Device),
- stream(nullptr) {
- }
-
- inline AllocInfo(AllocType at,
- int dev,
- MemorySpace sp,
- cudaStream_t st)
- : type(at),
- device(dev),
- space(sp),
- stream(st) {
- }
-
- /// Returns a string representation of this info
- std::string toString() const;
-
- /// The internal category of the allocation
- AllocType type;
-
- /// The device on which the allocation is happening
- int device;
-
- /// The memory space of the allocation
- MemorySpace space;
-
- /// The stream on which new work on the memory will be ordered (e.g., if a
- /// piece of memory cached and to be returned for this call was last used on
- /// stream 3 and a new memory request is for stream 4, the memory manager will
- /// synchronize stream 4 to wait for the completion of stream 3 via events or
- /// other stream synchronization.
- ///
- /// The memory manager guarantees that the returned memory is free to use
- /// without data races on this stream specified.
- cudaStream_t stream;
+ inline AllocInfo()
+ : type(AllocType::Other),
+ device(0),
+ space(MemorySpace::Device),
+ stream(nullptr) {}
+
+ inline AllocInfo(AllocType at, int dev, MemorySpace sp, cudaStream_t st)
+ : type(at), device(dev), space(sp), stream(st) {}
+
+ /// Returns a string representation of this info
+ std::string toString() const;
+
+ /// The internal category of the allocation
+ AllocType type;
+
+ /// The device on which the allocation is happening
+ int device;
+
+ /// The memory space of the allocation
+ MemorySpace space;
+
+ /// The stream on which new work on the memory will be ordered (e.g., if a
+ /// piece of memory cached and to be returned for this call was last used on
+ /// stream 3 and a new memory request is for stream 4, the memory manager
+ /// will synchronize stream 4 to wait for the completion of stream 3 via
+ /// events or other stream synchronization.
+ ///
+ /// The memory manager guarantees that the returned memory is free to use
+ /// without data races on this stream specified.
+ cudaStream_t stream;
  };

  /// Create an AllocInfo for the current device with MemorySpace::Device
@@ -129,140 +126,139 @@ AllocInfo makeSpaceAlloc(AllocType at, MemorySpace sp, cudaStream_t st);

  /// Information on what/where an allocation is, along with how big it should be
  struct AllocRequest : public AllocInfo {
- inline AllocRequest()
- : AllocInfo(),
- size(0) {
- }
-
- inline AllocRequest(const AllocInfo& info,
- size_t sz)
- : AllocInfo(info),
- size(sz) {
- }
-
- inline AllocRequest(AllocType at,
- int dev,
- MemorySpace sp,
- cudaStream_t st,
- size_t sz)
- : AllocInfo(at, dev, sp, st),
- size(sz) {
- }
-
- /// Returns a string representation of this request
- std::string toString() const;
-
- /// The size in bytes of the allocation
- size_t size;
+ inline AllocRequest() : AllocInfo(), size(0) {}
+
+ inline AllocRequest(const AllocInfo& info, size_t sz)
+ : AllocInfo(info), size(sz) {}
+
+ inline AllocRequest(
+ AllocType at,
+ int dev,
+ MemorySpace sp,
+ cudaStream_t st,
+ size_t sz)
+ : AllocInfo(at, dev, sp, st), size(sz) {}
+
+ /// Returns a string representation of this request
+ std::string toString() const;
+
+ /// The size in bytes of the allocation
+ size_t size;
  };

  /// A RAII object that manages a temporary memory request
  struct GpuMemoryReservation {
- GpuMemoryReservation();
- GpuMemoryReservation(GpuResources* r,
- int dev,
- cudaStream_t str,
- void* p,
- size_t sz);
- GpuMemoryReservation(GpuMemoryReservation&& m) noexcept;
- ~GpuMemoryReservation();
-
- GpuMemoryReservation& operator=(GpuMemoryReservation&& m);
-
- inline void* get() { return data; }
-
- void release();
-
- GpuResources* res;
- int device;
- cudaStream_t stream;
- void* data;
- size_t size;
+ GpuMemoryReservation();
+ GpuMemoryReservation(
+ GpuResources* r,
+ int dev,
+ cudaStream_t str,
+ void* p,
+ size_t sz);
+ GpuMemoryReservation(GpuMemoryReservation&& m) noexcept;
+ ~GpuMemoryReservation();
+
+ GpuMemoryReservation& operator=(GpuMemoryReservation&& m);
+
+ inline void* get() {
+ return data;
+ }
+
+ void release();
+
+ GpuResources* res;
+ int device;
+ cudaStream_t stream;
+ void* data;
+ size_t size;
  };

  /// Base class of GPU-side resource provider; hides provision of
  /// cuBLAS handles, CUDA streams and all device memory allocation performed
  class GpuResources {
- public:
- virtual ~GpuResources();
+ public:
+ virtual ~GpuResources();

- /// Call to pre-allocate resources for a particular device. If this is
- /// not called, then resources will be allocated at the first time
- /// of demand
- virtual void initializeForDevice(int device) = 0;
+ /// Call to pre-allocate resources for a particular device. If this is
+ /// not called, then resources will be allocated at the first time
+ /// of demand
+ virtual void initializeForDevice(int device) = 0;

- /// Returns the cuBLAS handle that we use for the given device
- virtual cublasHandle_t getBlasHandle(int device) = 0;
+ /// Returns the cuBLAS handle that we use for the given device
+ virtual cublasHandle_t getBlasHandle(int device) = 0;

- /// Returns the stream that we order all computation on for the
- /// given device
- virtual cudaStream_t getDefaultStream(int device) = 0;
+ /// Returns the stream that we order all computation on for the
+ /// given device
+ virtual cudaStream_t getDefaultStream(int device) = 0;

- /// Overrides the default stream for a device to the user-supplied stream. The
- /// resources object does not own this stream (i.e., it will not destroy it).
- virtual void setDefaultStream(int device, cudaStream_t stream) = 0;
+ /// Overrides the default stream for a device to the user-supplied stream.
+ /// The resources object does not own this stream (i.e., it will not destroy
+ /// it).
+ virtual void setDefaultStream(int device, cudaStream_t stream) = 0;

- /// Returns the set of alternative streams that we use for the given device
- virtual std::vector<cudaStream_t> getAlternateStreams(int device) = 0;
+ /// Returns the set of alternative streams that we use for the given device
+ virtual std::vector<cudaStream_t> getAlternateStreams(int device) = 0;

- /// Memory management
- /// Returns an allocation from the given memory space, ordered with respect to
- /// the given stream (i.e., the first user will be a kernel in this stream).
- /// All allocations are sized internally to be the next highest multiple of 16
- /// bytes, and all allocations returned are guaranteed to be 16 byte aligned.
- virtual void* allocMemory(const AllocRequest& req) = 0;
+ /// Memory management
+ /// Returns an allocation from the given memory space, ordered with respect
+ /// to the given stream (i.e., the first user will be a kernel in this
+ /// stream). All allocations are sized internally to be the next highest
+ /// multiple of 16 bytes, and all allocations returned are guaranteed to be
+ /// 16 byte aligned.
+ virtual void* allocMemory(const AllocRequest& req) = 0;

- /// Returns a previous allocation
- virtual void deallocMemory(int device, void* in) = 0;
+ /// Returns a previous allocation
+ virtual void deallocMemory(int device, void* in) = 0;

- /// For MemorySpace::Temporary, how much space is immediately available
- /// without cudaMalloc allocation?
- virtual size_t getTempMemoryAvailable(int device) const = 0;
+ /// For MemorySpace::Temporary, how much space is immediately available
+ /// without cudaMalloc allocation?
+ virtual size_t getTempMemoryAvailable(int device) const = 0;

- /// Returns the available CPU pinned memory buffer
- virtual std::pair<void*, size_t> getPinnedMemory() = 0;
+ /// Returns the available CPU pinned memory buffer
+ virtual std::pair<void*, size_t> getPinnedMemory() = 0;

- /// Returns the stream on which we perform async CPU <-> GPU copies
- virtual cudaStream_t getAsyncCopyStream(int device) = 0;
+ /// Returns the stream on which we perform async CPU <-> GPU copies
+ virtual cudaStream_t getAsyncCopyStream(int device) = 0;

- ///
- /// Functions provided by default
- ///
+ ///
+ /// Functions provided by default
+ ///

- /// Calls getBlasHandle with the current device
- cublasHandle_t getBlasHandleCurrentDevice();
+ /// Calls getBlasHandle with the current device
+ cublasHandle_t getBlasHandleCurrentDevice();

- /// Calls getDefaultStream with the current device
- cudaStream_t getDefaultStreamCurrentDevice();
+ /// Calls getDefaultStream with the current device
+ cudaStream_t getDefaultStreamCurrentDevice();

- /// Calls getTempMemoryAvailable with the current device
- size_t getTempMemoryAvailableCurrentDevice() const;
+ /// Calls getTempMemoryAvailable with the current device
+ size_t getTempMemoryAvailableCurrentDevice() const;

- /// Returns a temporary memory allocation via a RAII object
- GpuMemoryReservation allocMemoryHandle(const AllocRequest& req);
+ /// Returns a temporary memory allocation via a RAII object
+ GpuMemoryReservation allocMemoryHandle(const AllocRequest& req);

- /// Synchronizes the CPU with respect to the default stream for the
- /// given device
- // equivalent to cudaDeviceSynchronize(getDefaultStream(device))
- void syncDefaultStream(int device);
+ /// Synchronizes the CPU with respect to the default stream for the
+ /// given device
+ // equivalent to cudaDeviceSynchronize(getDefaultStream(device))
+ void syncDefaultStream(int device);

- /// Calls syncDefaultStream for the current device
- void syncDefaultStreamCurrentDevice();
+ /// Calls syncDefaultStream for the current device
+ void syncDefaultStreamCurrentDevice();

- /// Calls getAlternateStreams for the current device
- std::vector<cudaStream_t> getAlternateStreamsCurrentDevice();
+ /// Calls getAlternateStreams for the current device
+ std::vector<cudaStream_t> getAlternateStreamsCurrentDevice();

- /// Calls getAsyncCopyStream for the current device
- cudaStream_t getAsyncCopyStreamCurrentDevice();
+ /// Calls getAsyncCopyStream for the current device
+ cudaStream_t getAsyncCopyStreamCurrentDevice();
  };

  /// Interface for a provider of a shared resources object
  class GpuResourcesProvider {
- public:
- virtual ~GpuResourcesProvider();
+ public:
+ virtual ~GpuResourcesProvider();

- /// Returns the shared resources object
- virtual std::shared_ptr<GpuResources> getResources() = 0;
+ /// Returns the shared resources object
+ virtual std::shared_ptr<GpuResources> getResources() = 0;
  };

- } } // namespace
+ } // namespace gpu
+ } // namespace faiss
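
Taken together, the declarations above form the vendored FAISS GPU allocator API: an AllocInfo/AllocRequest describes what is being allocated and on which device and stream, and allocMemoryHandle() hands back a RAII GpuMemoryReservation. A minimal usage sketch follows (illustrative only, not code shipped in this gem; it assumes a GpuResources object obtained from a provider such as StandardGpuResources):

    #include <faiss/gpu/GpuResources.h>

    void scratchExample(faiss::gpu::GpuResources* res) {
        // Stream that all work for the current device is ordered on
        cudaStream_t stream = res->getDefaultStreamCurrentDevice();

        // Describe a temporary, stream-ordered allocation on the current device
        faiss::gpu::AllocInfo info = faiss::gpu::makeSpaceAlloc(
                faiss::gpu::AllocType::Other,
                faiss::gpu::MemorySpace::Temporary,
                stream);

        // Request 1 MiB; the reservation hands the memory back when it goes out of scope
        faiss::gpu::GpuMemoryReservation scratch =
                res->allocMemoryHandle(faiss::gpu::AllocRequest(info, 1 << 20));

        void* p = scratch.get(); // safe to use on `stream` without extra synchronization
        (void)p;
    }
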
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp

@@ -5,16 +5,16 @@
  * LICENSE file in the root directory of this source tree.
  */

-
  #include <faiss/gpu/StandardGpuResources.h>
  #include <faiss/gpu/utils/DeviceUtils.h>
  #include <faiss/gpu/utils/StaticUtils.h>
  #include <faiss/impl/FaissAssert.h>
- #include <limits>
  #include <iostream>
+ #include <limits>
  #include <sstream>

- namespace faiss { namespace gpu {
+ namespace faiss {
+ namespace gpu {

  namespace {

@@ -22,513 +22,536 @@ namespace {
  constexpr int kNumStreams = 2;

  // Use 256 MiB of pinned memory for async CPU <-> GPU copies by default
- constexpr size_t kDefaultPinnedMemoryAllocation = (size_t) 256 * 1024 * 1024;
+ constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024;

  // Default temporary memory allocation for <= 4 GiB memory GPUs
- constexpr size_t k4GiBTempMem = (size_t) 512 * 1024 * 1024;
+ constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024;

  // Default temporary memory allocation for <= 8 GiB memory GPUs
- constexpr size_t k8GiBTempMem = (size_t) 1024 * 1024 * 1024;
+ constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024;

  // Maximum temporary memory allocation for all GPUs
- constexpr size_t kMaxTempMem = (size_t) 1536 * 1024 * 1024;
+ constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024;

  std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
- // Produce a sorted list of all outstanding allocations by type
- std::unordered_map<AllocType, std::pair<int, size_t>> stats;
-
- for (auto& entry : map) {
- auto& a = entry.second;
-
- auto it = stats.find(a.type);
- if (it != stats.end()) {
- stats[a.type].first++;
- stats[a.type].second += a.size;
- } else {
- stats[a.type] = std::make_pair(1, a.size);
+ // Produce a sorted list of all outstanding allocations by type
+ std::unordered_map<AllocType, std::pair<int, size_t>> stats;
+
+ for (auto& entry : map) {
+ auto& a = entry.second;
+
+ auto it = stats.find(a.type);
+ if (it != stats.end()) {
+ stats[a.type].first++;
+ stats[a.type].second += a.size;
+ } else {
+ stats[a.type] = std::make_pair(1, a.size);
+ }
  }
- }

- std::stringstream ss;
- for (auto& entry : stats) {
- ss << "Alloc type " << allocTypeToString(entry.first) << ": "
- << entry.second.first << " allocations, "
- << entry.second.second << " bytes\n";
- }
+ std::stringstream ss;
+ for (auto& entry : stats) {
+ ss << "Alloc type " << allocTypeToString(entry.first) << ": "
+ << entry.second.first << " allocations, " << entry.second.second
+ << " bytes\n";
+ }

- return ss.str();
+ return ss.str();
  }

- }
+ } // namespace

  //
  // StandardGpuResourcesImpl
  //

- StandardGpuResourcesImpl::StandardGpuResourcesImpl() :
- pinnedMemAlloc_(nullptr),
- pinnedMemAllocSize_(0),
- // let the adjustment function determine the memory size for us by passing
- // in a huge value that will then be adjusted
- tempMemSize_(getDefaultTempMemForGPU(-1,
- std::numeric_limits<size_t>::max())),
- pinnedMemSize_(kDefaultPinnedMemoryAllocation),
- allocLogging_(false) {
- }
+ StandardGpuResourcesImpl::StandardGpuResourcesImpl()
+ : pinnedMemAlloc_(nullptr),
+ pinnedMemAllocSize_(0),
+ // let the adjustment function determine the memory size for us by
+ // passing in a huge value that will then be adjusted
+ tempMemSize_(getDefaultTempMemForGPU(
+ -1,
+ std::numeric_limits<size_t>::max())),
+ pinnedMemSize_(kDefaultPinnedMemoryAllocation),
+ allocLogging_(false) {}

  StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
- // The temporary memory allocator has allocated memory through us, so clean
- // that up before we finish fully de-initializing ourselves
- tempMemory_.clear();
-
- // Make sure all allocations have been freed
- bool allocError = false;
-
- for (auto& entry : allocs_) {
- auto& map = entry.second;
-
- if (!map.empty()) {
- std::cerr
- << "StandardGpuResources destroyed with allocations outstanding:\n"
- << "Device " << entry.first << " outstanding allocations:\n";
- std::cerr << allocsToString(map);
- allocError = true;
+ // The temporary memory allocator has allocated memory through us, so clean
+ // that up before we finish fully de-initializing ourselves
+ tempMemory_.clear();
+
+ // Make sure all allocations have been freed
+ bool allocError = false;
+
+ for (auto& entry : allocs_) {
+ auto& map = entry.second;
+
+ if (!map.empty()) {
+ std::cerr
+ << "StandardGpuResources destroyed with allocations outstanding:\n"
+ << "Device " << entry.first
+ << " outstanding allocations:\n";
+ std::cerr << allocsToString(map);
+ allocError = true;
+ }
  }
- }

- FAISS_ASSERT_MSG(!allocError, "GPU memory allocations not properly cleaned up");
+ FAISS_ASSERT_MSG(
+ !allocError, "GPU memory allocations not properly cleaned up");

- for (auto& entry : defaultStreams_) {
- DeviceScope scope(entry.first);
+ for (auto& entry : defaultStreams_) {
+ DeviceScope scope(entry.first);

- // We created these streams, so are responsible for destroying them
- CUDA_VERIFY(cudaStreamDestroy(entry.second));
- }
+ // We created these streams, so are responsible for destroying them
+ CUDA_VERIFY(cudaStreamDestroy(entry.second));
+ }

- for (auto& entry : alternateStreams_) {
- DeviceScope scope(entry.first);
+ for (auto& entry : alternateStreams_) {
+ DeviceScope scope(entry.first);

- for (auto stream : entry.second) {
- CUDA_VERIFY(cudaStreamDestroy(stream));
+ for (auto stream : entry.second) {
+ CUDA_VERIFY(cudaStreamDestroy(stream));
+ }
  }
- }

- for (auto& entry : asyncCopyStreams_) {
- DeviceScope scope(entry.first);
+ for (auto& entry : asyncCopyStreams_) {
+ DeviceScope scope(entry.first);

- CUDA_VERIFY(cudaStreamDestroy(entry.second));
- }
+ CUDA_VERIFY(cudaStreamDestroy(entry.second));
+ }

- for (auto& entry : blasHandles_) {
- DeviceScope scope(entry.first);
+ for (auto& entry : blasHandles_) {
+ DeviceScope scope(entry.first);

- auto blasStatus = cublasDestroy(entry.second);
- FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
- }
+ auto blasStatus = cublasDestroy(entry.second);
+ FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+ }

- if (pinnedMemAlloc_) {
- auto err = cudaFreeHost(pinnedMemAlloc_);
- FAISS_ASSERT_FMT(err == cudaSuccess,
- "Failed to cudaFreeHost pointer %p (error %d %s)",
- pinnedMemAlloc_, (int) err, cudaGetErrorString(err));
- }
+ if (pinnedMemAlloc_) {
+ auto err = cudaFreeHost(pinnedMemAlloc_);
+ FAISS_ASSERT_FMT(
+ err == cudaSuccess,
+ "Failed to cudaFreeHost pointer %p (error %d %s)",
+ pinnedMemAlloc_,
+ (int)err,
+ cudaGetErrorString(err));
+ }
  }

- size_t
- StandardGpuResourcesImpl::getDefaultTempMemForGPU(int device,
- size_t requested) {
- auto totalMem = device != -1 ?
- getDeviceProperties(device).totalGlobalMem :
- std::numeric_limits<size_t>::max();
+ size_t StandardGpuResourcesImpl::getDefaultTempMemForGPU(
+ int device,
+ size_t requested) {
+ auto totalMem = device != -1 ? getDeviceProperties(device).totalGlobalMem
+ : std::numeric_limits<size_t>::max();

- if (totalMem <= (size_t) 4 * 1024 * 1024 * 1024) {
- // If the GPU has <= 4 GiB of memory, reserve 512 MiB
+ if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) {
+ // If the GPU has <= 4 GiB of memory, reserve 512 MiB

- if (requested > k4GiBTempMem) {
- return k4GiBTempMem;
- }
- } else if (totalMem <= (size_t) 8 * 1024 * 1024 * 1024) {
- // If the GPU has <= 8 GiB of memory, reserve 1 GiB
+ if (requested > k4GiBTempMem) {
+ return k4GiBTempMem;
+ }
+ } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) {
+ // If the GPU has <= 8 GiB of memory, reserve 1 GiB

- if (requested > k8GiBTempMem) {
- return k8GiBTempMem;
- }
- } else {
- // Never use more than 1.5 GiB
- if (requested > kMaxTempMem) {
- return kMaxTempMem;
+ if (requested > k8GiBTempMem) {
+ return k8GiBTempMem;
+ }
+ } else {
+ // Never use more than 1.5 GiB
+ if (requested > kMaxTempMem) {
+ return kMaxTempMem;
+ }
  }
- }
-
- // use whatever lower limit the user requested
- return requested;
- }
-
- void
- StandardGpuResourcesImpl::noTempMemory() {
- setTempMemory(0);
- }
-
- void
- StandardGpuResourcesImpl::setTempMemory(size_t size) {
- if (tempMemSize_ != size) {
- // adjust based on general limits
- tempMemSize_ = getDefaultTempMemForGPU(-1, size);
-
- // We need to re-initialize memory resources for all current devices that
- // have been initialized.
- // This should be safe to do, even if we are currently running work, because
- // the cudaFree call that this implies will force-synchronize all GPUs with
- // the CPU
- for (auto& p : tempMemory_) {
- int device = p.first;
- // Free the existing memory first
- p.second.reset();
-
- // Allocate new
- p.second = std::unique_ptr<StackDeviceMemory>(
- new StackDeviceMemory(this,
- p.first,
- // adjust for this specific device
- getDefaultTempMemForGPU(device, tempMemSize_)));
+
+ // use whatever lower limit the user requested
+ return requested;
+ }
+
+ void StandardGpuResourcesImpl::noTempMemory() {
+ setTempMemory(0);
+ }
+
+ void StandardGpuResourcesImpl::setTempMemory(size_t size) {
+ if (tempMemSize_ != size) {
+ // adjust based on general limits
+ tempMemSize_ = getDefaultTempMemForGPU(-1, size);
+
+ // We need to re-initialize memory resources for all current devices
+ // that have been initialized. This should be safe to do, even if we are
+ // currently running work, because the cudaFree call that this implies
+ // will force-synchronize all GPUs with the CPU
+ for (auto& p : tempMemory_) {
+ int device = p.first;
+ // Free the existing memory first
+ p.second.reset();
+
+ // Allocate new
+ p.second = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
+ this,
+ p.first,
+ // adjust for this specific device
+ getDefaultTempMemForGPU(device, tempMemSize_)));
+ }
  }
- }
  }

- void
- StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
- // Should not call this after devices have been initialized
- FAISS_ASSERT(defaultStreams_.size() == 0);
- FAISS_ASSERT(!pinnedMemAlloc_);
+ void StandardGpuResourcesImpl::setPinnedMemory(size_t size) {
+ // Should not call this after devices have been initialized
+ FAISS_ASSERT(defaultStreams_.size() == 0);
+ FAISS_ASSERT(!pinnedMemAlloc_);

- pinnedMemSize_ = size;
+ pinnedMemSize_ = size;
  }

- void
- StandardGpuResourcesImpl::setDefaultStream(int device, cudaStream_t stream) {
- if (isInitialized(device)) {
- // A new series of calls may not be ordered with what was the previous
- // stream, so if the stream being specified is different, then we need to
- // ensure ordering between the two (new stream waits on old).
- auto it = userDefaultStreams_.find(device);
- cudaStream_t prevStream = nullptr;
+ void StandardGpuResourcesImpl::setDefaultStream(
+ int device,
+ cudaStream_t stream) {
+ if (isInitialized(device)) {
+ // A new series of calls may not be ordered with what was the previous
+ // stream, so if the stream being specified is different, then we need
+ // to ensure ordering between the two (new stream waits on old).
+ auto it = userDefaultStreams_.find(device);
+ cudaStream_t prevStream = nullptr;

- if (it != userDefaultStreams_.end()) {
- prevStream = it->second;
- } else {
- FAISS_ASSERT(defaultStreams_.count(device));
- prevStream = defaultStreams_[device];
- }
+ if (it != userDefaultStreams_.end()) {
+ prevStream = it->second;
+ } else {
+ FAISS_ASSERT(defaultStreams_.count(device));
+ prevStream = defaultStreams_[device];
+ }

- if (prevStream != stream) {
- streamWait({stream}, {prevStream});
+ if (prevStream != stream) {
+ streamWait({stream}, {prevStream});
+ }
  }
- }

- userDefaultStreams_[device] = stream;
+ userDefaultStreams_[device] = stream;
  }

- void
- StandardGpuResourcesImpl::revertDefaultStream(int device) {
- if (isInitialized(device)) {
- auto it = userDefaultStreams_.find(device);
+ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
+ if (isInitialized(device)) {
+ auto it = userDefaultStreams_.find(device);

- if (it != userDefaultStreams_.end()) {
- // There was a user stream set that we need to synchronize against
- cudaStream_t prevStream = userDefaultStreams_[device];
+ if (it != userDefaultStreams_.end()) {
+ // There was a user stream set that we need to synchronize against
+ cudaStream_t prevStream = userDefaultStreams_[device];

- FAISS_ASSERT(defaultStreams_.count(device));
- cudaStream_t newStream = defaultStreams_[device];
+ FAISS_ASSERT(defaultStreams_.count(device));
+ cudaStream_t newStream = defaultStreams_[device];

- streamWait({newStream}, {prevStream});
+ streamWait({newStream}, {prevStream});
+ }
  }
- }

- userDefaultStreams_.erase(device);
+ userDefaultStreams_.erase(device);
  }

- void
- StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
- for (int dev = 0; dev < getNumDevices(); ++dev) {
- setDefaultStream(dev, nullptr);
- }
+ void StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() {
+ for (int dev = 0; dev < getNumDevices(); ++dev) {
+ setDefaultStream(dev, nullptr);
+ }
  }

- void
- StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
- allocLogging_ = enable;
+ void StandardGpuResourcesImpl::setLogMemoryAllocations(bool enable) {
+ allocLogging_ = enable;
  }

- bool
- StandardGpuResourcesImpl::isInitialized(int device) const {
- // Use default streams as a marker for whether or not a certain
- // device has been initialized
- return defaultStreams_.count(device) != 0;
+ bool StandardGpuResourcesImpl::isInitialized(int device) const {
+ // Use default streams as a marker for whether or not a certain
+ // device has been initialized
+ return defaultStreams_.count(device) != 0;
  }

- void
- StandardGpuResourcesImpl::initializeForDevice(int device) {
- if (isInitialized(device)) {
- return;
- }
-
- // If this is the first device that we're initializing, create our
- // pinned memory allocation
- if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
- auto err =
- cudaHostAlloc(&pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
+ void StandardGpuResourcesImpl::initializeForDevice(int device) {
+ if (isInitialized(device)) {
+ return;
+ }

- FAISS_THROW_IF_NOT_FMT(
- err == cudaSuccess,
- "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
- "async copy buffer (error %d %s)",
- pinnedMemSize_, (int) err, cudaGetErrorString(err));
+ // If this is the first device that we're initializing, create our
+ // pinned memory allocation
+ if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
+ auto err = cudaHostAlloc(
+ &pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
+
+ FAISS_THROW_IF_NOT_FMT(
+ err == cudaSuccess,
+ "failed to cudaHostAlloc %zu bytes for CPU <-> GPU "
+ "async copy buffer (error %d %s)",
+ pinnedMemSize_,
+ (int)err,
+ cudaGetErrorString(err));
+
+ pinnedMemAllocSize_ = pinnedMemSize_;
+ }

- pinnedMemAllocSize_ = pinnedMemSize_;
- }
+ FAISS_ASSERT(device < getNumDevices());
+ DeviceScope scope(device);

- FAISS_ASSERT(device < getNumDevices());
- DeviceScope scope(device);
+ // Make sure that device properties for all devices are cached
+ auto& prop = getDeviceProperties(device);

- // Make sure that device properties for all devices are cached
- auto& prop = getDeviceProperties(device);
+ // Also check to make sure we meet our minimum compute capability (3.0)
+ FAISS_ASSERT_FMT(
+ prop.major >= 3,
+ "Device id %d with CC %d.%d not supported, "
+ "need 3.0+ compute capability",
+ device,
+ prop.major,
+ prop.minor);

- // Also check to make sure we meet our minimum compute capability (3.0)
- FAISS_ASSERT_FMT(prop.major >= 3,
- "Device id %d with CC %d.%d not supported, "
- "need 3.0+ compute capability",
- device, prop.major, prop.minor);
+ // Our code is pre-built with and expects warpSize == 32, validate that
+ FAISS_ASSERT_FMT(
+ prop.warpSize == 32,
+ "Device id %d does not have expected warpSize of 32",
+ device);

- // Create streams
- cudaStream_t defaultStream = 0;
- CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream,
- cudaStreamNonBlocking));
+ // Create streams
+ cudaStream_t defaultStream = 0;
+ CUDA_VERIFY(
+ cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));

- defaultStreams_[device] = defaultStream;
+ defaultStreams_[device] = defaultStream;

- cudaStream_t asyncCopyStream = 0;
- CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream,
- cudaStreamNonBlocking));
+ cudaStream_t asyncCopyStream = 0;
+ CUDA_VERIFY(
+ cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));

- asyncCopyStreams_[device] = asyncCopyStream;
+ asyncCopyStreams_[device] = asyncCopyStream;

- std::vector<cudaStream_t> deviceStreams;
- for (int j = 0; j < kNumStreams; ++j) {
- cudaStream_t stream = 0;
- CUDA_VERIFY(cudaStreamCreateWithFlags(&stream,
- cudaStreamNonBlocking));
+ std::vector<cudaStream_t> deviceStreams;
+ for (int j = 0; j < kNumStreams; ++j) {
+ cudaStream_t stream = 0;
+ CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

- deviceStreams.push_back(stream);
- }
+ deviceStreams.push_back(stream);
+ }

- alternateStreams_[device] = std::move(deviceStreams);
+ alternateStreams_[device] = std::move(deviceStreams);

- // Create cuBLAS handle
- cublasHandle_t blasHandle = 0;
- auto blasStatus = cublasCreate(&blasHandle);
- FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
- blasHandles_[device] = blasHandle;
+ // Create cuBLAS handle
+ cublasHandle_t blasHandle = 0;
+ auto blasStatus = cublasCreate(&blasHandle);
+ FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
+ blasHandles_[device] = blasHandle;

- // For CUDA 10 on V100, enabling tensor core usage would enable automatic
- // rounding down of inputs to f16 (though accumulate in f32) which results in
- // unacceptable loss of precision in general.
- // For CUDA 11 / A100, only enable tensor core support if it doesn't result in
- // a loss of precision.
+ // For CUDA 10 on V100, enabling tensor core usage would enable automatic
+ // rounding down of inputs to f16 (though accumulate in f32) which results
+ // in unacceptable loss of precision in general. For CUDA 11 / A100, only
+ // enable tensor core support if it doesn't result in a loss of precision.
  #if CUDA_VERSION >= 11000
- cublasSetMathMode(blasHandle,
- CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
+ cublasSetMathMode(
+ blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
  #endif

- FAISS_ASSERT(allocs_.count(device) == 0);
- allocs_[device] = std::unordered_map<void*, AllocRequest>();
+ FAISS_ASSERT(allocs_.count(device) == 0);
+ allocs_[device] = std::unordered_map<void*, AllocRequest>();

- FAISS_ASSERT(tempMemory_.count(device) == 0);
- auto mem = std::unique_ptr<StackDeviceMemory>(
- new StackDeviceMemory(this,
- device,
- // adjust for this specific device
- getDefaultTempMemForGPU(device, tempMemSize_)));
+ FAISS_ASSERT(tempMemory_.count(device) == 0);
+ auto mem = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
+ this,
+ device,
+ // adjust for this specific device
+ getDefaultTempMemForGPU(device, tempMemSize_)));

- tempMemory_.emplace(device, std::move(mem));
+ tempMemory_.emplace(device, std::move(mem));
  }

- cublasHandle_t
- StandardGpuResourcesImpl::getBlasHandle(int device) {
- initializeForDevice(device);
- return blasHandles_[device];
+ cublasHandle_t StandardGpuResourcesImpl::getBlasHandle(int device) {
+ initializeForDevice(device);
+ return blasHandles_[device];
  }

- cudaStream_t
- StandardGpuResourcesImpl::getDefaultStream(int device) {
- initializeForDevice(device);
+ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
+ initializeForDevice(device);

- auto it = userDefaultStreams_.find(device);
- if (it != userDefaultStreams_.end()) {
- // There is a user override stream set
- return it->second;
- }
+ auto it = userDefaultStreams_.find(device);
+ if (it != userDefaultStreams_.end()) {
+ // There is a user override stream set
+ return it->second;
+ }

- // Otherwise, our base default stream
- return defaultStreams_[device];
+ // Otherwise, our base default stream
+ return defaultStreams_[device];
  }

- std::vector<cudaStream_t>
- StandardGpuResourcesImpl::getAlternateStreams(int device) {
- initializeForDevice(device);
- return alternateStreams_[device];
+ std::vector<cudaStream_t> StandardGpuResourcesImpl::getAlternateStreams(
+ int device) {
+ initializeForDevice(device);
+ return alternateStreams_[device];
  }

- std::pair<void*, size_t>
- StandardGpuResourcesImpl::getPinnedMemory() {
- return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
+ std::pair<void*, size_t> StandardGpuResourcesImpl::getPinnedMemory() {
+ return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_);
  }

- cudaStream_t
- StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
- initializeForDevice(device);
- return asyncCopyStreams_[device];
+ cudaStream_t StandardGpuResourcesImpl::getAsyncCopyStream(int device) {
+ initializeForDevice(device);
+ return asyncCopyStreams_[device];
  }

- void*
- StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
- initializeForDevice(req.device);
-
- // We don't allocate a placeholder for zero-sized allocations
- if (req.size == 0) {
- return nullptr;
- }
-
- // Make sure that the allocation is a multiple of 16 bytes for alignment
- // purposes
- auto adjReq = req;
- adjReq.size = utils::roundUp(adjReq.size, (size_t) 16);
-
- void* p = nullptr;
-
- if (allocLogging_) {
- std::cout << "StandardGpuResources: alloc " << adjReq.toString() << "\n";
- }
-
- if (adjReq.space == MemorySpace::Temporary) {
- // If we don't have enough space in our temporary memory manager, we need
- // to allocate this request separately
- auto& tempMem = tempMemory_[adjReq.device];
-
- if (adjReq.size > tempMem->getSizeAvailable()) {
- // We need to allocate this ourselves
- AllocRequest newReq = adjReq;
- newReq.space = MemorySpace::Device;
- newReq.type = AllocType::TemporaryMemoryOverflow;
+ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
+ initializeForDevice(req.device);

- return allocMemory(newReq);
+ // We don't allocate a placeholder for zero-sized allocations
+ if (req.size == 0) {
+ return nullptr;
  }

- // Otherwise, we can handle this locally
- p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
-
- } else if (adjReq.space == MemorySpace::Device) {
- auto err = cudaMalloc(&p, adjReq.size);
-
- // Throw if we fail to allocate
- if (err != cudaSuccess) {
- auto& map = allocs_[req.device];
-
- std::stringstream ss;
- ss << "Failed to cudaMalloc " << adjReq.size << " bytes "
- << "on device " << adjReq.device << " (error "
- << (int) err << " " << cudaGetErrorString(err)
- << "\nOutstanding allocations:\n" << allocsToString(map);
- auto str = ss.str();
-
- FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+ // cudaMalloc guarantees allocation alignment to 256 bytes; do the same here
+ // for alignment purposes (to reduce memory transaction overhead etc)
+ auto adjReq = req;
+ adjReq.size = utils::roundUp(adjReq.size, (size_t)256);
+
+ void* p = nullptr;
+
+ if (adjReq.space == MemorySpace::Temporary) {
+ // If we don't have enough space in our temporary memory manager, we
+ // need to allocate this request separately
+ auto& tempMem = tempMemory_[adjReq.device];
+
+ if (adjReq.size > tempMem->getSizeAvailable()) {
+ // We need to allocate this ourselves
+ AllocRequest newReq = adjReq;
+ newReq.space = MemorySpace::Device;
+ newReq.type = AllocType::TemporaryMemoryOverflow;
+
+ if (allocLogging_) {
+ std::cout
+ << "StandardGpuResources: alloc fail "
+ << adjReq.toString()
+ << " (no temp space); retrying as MemorySpace::Device\n";
+ }
+
+ return allocMemory(newReq);
+ }
+
+ // Otherwise, we can handle this locally
+ p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
+
+ } else if (adjReq.space == MemorySpace::Device) {
+ auto err = cudaMalloc(&p, adjReq.size);
+
+ // Throw if we fail to allocate
+ if (err != cudaSuccess) {
+ // FIXME: as of CUDA 11, a memory allocation error appears to be
+ // presented via cudaGetLastError as well, and needs to be cleared.
+ // Just call the function to clear it
+ cudaGetLastError();
+
+ std::stringstream ss;
+ ss << "StandardGpuResources: alloc fail " << adjReq.toString()
+ << " (cudaMalloc error " << cudaGetErrorString(err) << " ["
+ << (int)err << "])\n";
+ auto str = ss.str();
+
+ if (allocLogging_) {
+ std::cout << str;
+ }
+
+ FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+ }
+ } else if (adjReq.space == MemorySpace::Unified) {
+ auto err = cudaMallocManaged(&p, adjReq.size);
+
+ if (err != cudaSuccess) {
+ // FIXME: as of CUDA 11, a memory allocation error appears to be
+ // presented via cudaGetLastError as well, and needs to be cleared.
+ // Just call the function to clear it
+ cudaGetLastError();
+
+ std::stringstream ss;
+ ss << "StandardGpuResources: alloc fail " << adjReq.toString()
+ << " failed (cudaMallocManaged error " << cudaGetErrorString(err)
+ << " [" << (int)err << "])\n";
+ auto str = ss.str();
+
+ if (allocLogging_) {
+ std::cout << str;
+ }
+
+ FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+ }
+ } else {
+ FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space);
  }
- } else if (adjReq.space == MemorySpace::Unified) {
- auto err = cudaMallocManaged(&p, adjReq.size);
-
- if (err != cudaSuccess) {
- auto& map = allocs_[req.device];

- std::stringstream ss;
- ss << "Failed to cudaMallocManaged " << adjReq.size << " bytes "
- << "(error " << (int) err << " " << cudaGetErrorString(err)
- << "\nOutstanding allocations:\n" << allocsToString(map);
- auto str = ss.str();
-
- FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
+ if (allocLogging_) {
+ std::cout << "StandardGpuResources: alloc ok " << adjReq.toString()
+ << " ptr 0x" << p << "\n";
  }
- } else {
- FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) adjReq.space);
- }

- allocs_[adjReq.device][p] = adjReq;
+ allocs_[adjReq.device][p] = adjReq;

- return p;
+ return p;
  }

- void
- StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
- FAISS_ASSERT(isInitialized(device));
+ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
+ FAISS_ASSERT(isInitialized(device));

- if (!p) {
- return;
- }
+ if (!p) {
+ return;
+ }

- auto& a = allocs_[device];
- auto it = a.find(p);
- FAISS_ASSERT(it != a.end());
+ auto& a = allocs_[device];
+ auto it = a.find(p);
+ FAISS_ASSERT(it != a.end());

- auto& req = it->second;
+ auto& req = it->second;

- if (allocLogging_) {
- std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
- }
+ if (allocLogging_) {
+ std::cout << "StandardGpuResources: dealloc " << req.toString() << "\n";
+ }

- if (req.space == MemorySpace::Temporary) {
- tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
+ if (req.space == MemorySpace::Temporary) {
+ tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);

- } else if (req.space == MemorySpace::Device ||
- req.space == MemorySpace::Unified) {
- auto err = cudaFree(p);
- FAISS_ASSERT_FMT(err == cudaSuccess,
- "Failed to cudaFree pointer %p (error %d %s)",
- p, (int) err, cudaGetErrorString(err));
+ } else if (
+ req.space == MemorySpace::Device ||
+ req.space == MemorySpace::Unified) {
+ auto err = cudaFree(p);
+ FAISS_ASSERT_FMT(
+ err == cudaSuccess,
+ "Failed to cudaFree pointer %p (error %d %s)",
+ p,
+ (int)err,
+ cudaGetErrorString(err));

- } else {
- FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int) req.space);
- }
+ } else {
+ FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space);
+ }

- a.erase(it);
+ a.erase(it);
  }

- size_t
- StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
- FAISS_ASSERT(isInitialized(device));
527
+ size_t StandardGpuResourcesImpl::getTempMemoryAvailable(int device) const {
528
+ FAISS_ASSERT(isInitialized(device));
506
529
 
507
- auto it = tempMemory_.find(device);
508
- FAISS_ASSERT(it != tempMemory_.end());
530
+ auto it = tempMemory_.find(device);
531
+ FAISS_ASSERT(it != tempMemory_.end());
509
532
 
510
- return it->second->getSizeAvailable();
533
+ return it->second->getSizeAvailable();
511
534
  }
512
535
 
513
536
  std::map<int, std::map<std::string, std::pair<int, size_t>>>
514
537
  StandardGpuResourcesImpl::getMemoryInfo() const {
515
- using AT = std::map<std::string, std::pair<int, size_t>>;
538
+ using AT = std::map<std::string, std::pair<int, size_t>>;
516
539
 
517
- std::map<int, AT> out;
540
+ std::map<int, AT> out;
518
541
 
519
- for (auto& entry : allocs_) {
520
- AT outDevice;
542
+ for (auto& entry : allocs_) {
543
+ AT outDevice;
521
544
 
522
- for (auto& a : entry.second) {
523
- auto& v = outDevice[allocTypeToString(a.second.type)];
524
- v.first++;
525
- v.second += a.second.size;
526
- }
545
+ for (auto& a : entry.second) {
546
+ auto& v = outDevice[allocTypeToString(a.second.type)];
547
+ v.first++;
548
+ v.second += a.second.size;
549
+ }
527
550
 
528
- out[entry.first] = std::move(outDevice);
529
- }
551
+ out[entry.first] = std::move(outDevice);
552
+ }
530
553
 
531
- return out;
554
+ return out;
532
555
  }
533
556
 
534
557
  //
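The reworked allocMemory above first pads every request up to a 256-byte multiple (via utils::roundUp) before routing it to the temporary-memory arena, cudaMalloc, or cudaMallocManaged, and retries as MemorySpace::Device when the arena is too small. A minimal standalone sketch of that padding step follows; roundUp256 is a hypothetical local helper used only for illustration, not faiss's own utils::roundUp.

#include <cstddef>
#include <cstdio>

// Hypothetical stand-in for the 256-byte round-up performed in allocMemory
static size_t roundUp256(size_t size) {
    // Next multiple of 256; exact multiples are left unchanged
    return ((size + 255) / 256) * 256;
}

int main() {
    // A 1000-byte request is padded to 1024 bytes; 4096 stays 4096
    std::printf("%zu -> %zu\n", (size_t)1000, roundUp256(1000));
    std::printf("%zu -> %zu\n", (size_t)4096, roundUp256(4096));
    return 0;
}
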
@@ -536,70 +559,58 @@ StandardGpuResourcesImpl::getMemoryInfo() const {
 //
 
 StandardGpuResources::StandardGpuResources()
-    : res_(new StandardGpuResourcesImpl) {
-}
+        : res_(new StandardGpuResourcesImpl) {}
 
-StandardGpuResources::~StandardGpuResources() {
-}
+StandardGpuResources::~StandardGpuResources() {}
 
-std::shared_ptr<GpuResources>
-StandardGpuResources::getResources() {
-  return res_;
+std::shared_ptr<GpuResources> StandardGpuResources::getResources() {
+    return res_;
 }
 
-void
-StandardGpuResources::noTempMemory() {
-  res_->noTempMemory();
+void StandardGpuResources::noTempMemory() {
+    res_->noTempMemory();
 }
 
-void
-StandardGpuResources::setTempMemory(size_t size) {
-  res_->setTempMemory(size);
+void StandardGpuResources::setTempMemory(size_t size) {
+    res_->setTempMemory(size);
 }
 
-void
-StandardGpuResources::setPinnedMemory(size_t size) {
-  res_->setPinnedMemory(size);
+void StandardGpuResources::setPinnedMemory(size_t size) {
+    res_->setPinnedMemory(size);
 }
 
-void
-StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
-  res_->setDefaultStream(device, stream);
+void StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) {
+    res_->setDefaultStream(device, stream);
 }
 
-void
-StandardGpuResources::revertDefaultStream(int device) {
-  res_->revertDefaultStream(device);
+void StandardGpuResources::revertDefaultStream(int device) {
+    res_->revertDefaultStream(device);
 }
 
-void
-StandardGpuResources::setDefaultNullStreamAllDevices() {
-  res_->setDefaultNullStreamAllDevices();
+void StandardGpuResources::setDefaultNullStreamAllDevices() {
+    res_->setDefaultNullStreamAllDevices();
 }
 
 std::map<int, std::map<std::string, std::pair<int, size_t>>>
 StandardGpuResources::getMemoryInfo() const {
-  return res_->getMemoryInfo();
+    return res_->getMemoryInfo();
 }
 
-cudaStream_t
-StandardGpuResources::getDefaultStream(int device) {
-  return res_->getDefaultStream(device);
+cudaStream_t StandardGpuResources::getDefaultStream(int device) {
+    return res_->getDefaultStream(device);
 }
 
-size_t
-StandardGpuResources::getTempMemoryAvailable(int device) const {
-  return res_->getTempMemoryAvailable(device);
+size_t StandardGpuResources::getTempMemoryAvailable(int device) const {
+    return res_->getTempMemoryAvailable(device);
 }
 
-void
-StandardGpuResources::syncDefaultStreamCurrentDevice() {
-  res_->syncDefaultStreamCurrentDevice();
+void StandardGpuResources::syncDefaultStreamCurrentDevice() {
+    res_->syncDefaultStreamCurrentDevice();
 }
 
-void
-StandardGpuResources::setLogMemoryAllocations(bool enable) {
-  res_->setLogMemoryAllocations(enable);
+void StandardGpuResources::setLogMemoryAllocations(bool enable) {
+    res_->setLogMemoryAllocations(enable);
 }
 
-} } // namespace
+} // namespace gpu
+} // namespace faiss
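
For context, a hypothetical usage sketch of the StandardGpuResources entry points touched by this diff (setTempMemory, setLogMemoryAllocations, getMemoryInfo). It assumes a GPU-enabled faiss build with the usual faiss/gpu/StandardGpuResources.h header and a CUDA-capable device; the index-building step is elided.

#include <faiss/gpu/StandardGpuResources.h>

#include <cstdio>

int main() {
    faiss::gpu::StandardGpuResources res;

    // Cap the temporary-memory arena and log allocations, so the
    // "alloc fail ... retrying as MemorySpace::Device" overflow path in the
    // diff above becomes visible whenever a request exceeds the arena.
    res.setTempMemory(256 * 1024 * 1024); // 256 MiB
    res.setLogMemoryAllocations(true);

    // ... build/search a GPU index with `res` here ...

    // Per-device, per-AllocType summary: (allocation count, total bytes)
    for (const auto& device : res.getMemoryInfo()) {
        for (const auto& type : device.second) {
            std::printf("device %d, %s: %d allocations, %zu bytes\n",
                        device.first,
                        type.first.c_str(),
                        type.second.first,
                        type.second.second);
        }
    }
    return 0;
}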