faiss 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (215) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +7 -7
  5. data/ext/faiss/extconf.rb +6 -3
  6. data/ext/faiss/numo.hpp +4 -4
  7. data/ext/faiss/utils.cpp +1 -1
  8. data/ext/faiss/utils.h +1 -1
  9. data/lib/faiss/version.rb +1 -1
  10. data/vendor/faiss/faiss/AutoTune.cpp +292 -291
  11. data/vendor/faiss/faiss/AutoTune.h +55 -56
  12. data/vendor/faiss/faiss/Clustering.cpp +365 -194
  13. data/vendor/faiss/faiss/Clustering.h +102 -35
  14. data/vendor/faiss/faiss/IVFlib.cpp +171 -195
  15. data/vendor/faiss/faiss/IVFlib.h +48 -51
  16. data/vendor/faiss/faiss/Index.cpp +85 -103
  17. data/vendor/faiss/faiss/Index.h +54 -48
  18. data/vendor/faiss/faiss/Index2Layer.cpp +126 -224
  19. data/vendor/faiss/faiss/Index2Layer.h +22 -36
  20. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +407 -0
  21. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +195 -0
  22. data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
  23. data/vendor/faiss/faiss/IndexBinary.h +140 -132
  24. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
  25. data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
  26. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
  27. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
  28. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
  29. data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
  30. data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
  31. data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
  32. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
  33. data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
  34. data/vendor/faiss/faiss/IndexFlat.cpp +115 -176
  35. data/vendor/faiss/faiss/IndexFlat.h +42 -59
  36. data/vendor/faiss/faiss/IndexFlatCodes.cpp +67 -0
  37. data/vendor/faiss/faiss/IndexFlatCodes.h +47 -0
  38. data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
  39. data/vendor/faiss/faiss/IndexHNSW.h +57 -41
  40. data/vendor/faiss/faiss/IndexIVF.cpp +545 -453
  41. data/vendor/faiss/faiss/IndexIVF.h +169 -118
  42. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +316 -0
  43. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +121 -0
  44. data/vendor/faiss/faiss/IndexIVFFlat.cpp +247 -252
  45. data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
  46. data/vendor/faiss/faiss/IndexIVFPQ.cpp +459 -517
  47. data/vendor/faiss/faiss/IndexIVFPQ.h +75 -67
  48. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
  49. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
  50. data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
  51. data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
  52. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +163 -150
  53. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +38 -25
  54. data/vendor/faiss/faiss/IndexLSH.cpp +66 -113
  55. data/vendor/faiss/faiss/IndexLSH.h +20 -38
  56. data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
  57. data/vendor/faiss/faiss/IndexLattice.h +11 -16
  58. data/vendor/faiss/faiss/IndexNNDescent.cpp +229 -0
  59. data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
  60. data/vendor/faiss/faiss/IndexNSG.cpp +301 -0
  61. data/vendor/faiss/faiss/IndexNSG.h +85 -0
  62. data/vendor/faiss/faiss/IndexPQ.cpp +387 -495
  63. data/vendor/faiss/faiss/IndexPQ.h +64 -82
  64. data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
  65. data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
  66. data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
  67. data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
  68. data/vendor/faiss/faiss/IndexRefine.cpp +139 -127
  69. data/vendor/faiss/faiss/IndexRefine.h +32 -23
  70. data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
  71. data/vendor/faiss/faiss/IndexReplicas.h +62 -56
  72. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +111 -172
  73. data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -59
  74. data/vendor/faiss/faiss/IndexShards.cpp +256 -240
  75. data/vendor/faiss/faiss/IndexShards.h +85 -73
  76. data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
  77. data/vendor/faiss/faiss/MatrixStats.h +7 -10
  78. data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
  79. data/vendor/faiss/faiss/MetaIndexes.h +40 -34
  80. data/vendor/faiss/faiss/MetricType.h +7 -7
  81. data/vendor/faiss/faiss/VectorTransform.cpp +654 -475
  82. data/vendor/faiss/faiss/VectorTransform.h +64 -89
  83. data/vendor/faiss/faiss/clone_index.cpp +78 -73
  84. data/vendor/faiss/faiss/clone_index.h +4 -9
  85. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
  86. data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
  87. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +198 -171
  88. data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
  89. data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
  90. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
  91. data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
  92. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
  93. data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
  94. data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
  95. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
  96. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
  97. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
  98. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
  99. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
  100. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
  101. data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
  102. data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
  103. data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
  104. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
  105. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
  106. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
  107. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
  108. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
  109. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
  110. data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
  111. data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
  112. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
  113. data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
  114. data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
  115. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
  116. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
  117. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
  118. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
  119. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
  120. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
  121. data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
  122. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
  123. data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
  124. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
  125. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
  126. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
  127. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
  128. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
  129. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
  130. data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
  131. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +503 -0
  132. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +175 -0
  133. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
  134. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
  135. data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
  136. data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
  137. data/vendor/faiss/faiss/impl/FaissException.h +41 -29
  138. data/vendor/faiss/faiss/impl/HNSW.cpp +606 -617
  139. data/vendor/faiss/faiss/impl/HNSW.h +179 -200
  140. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +855 -0
  141. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +244 -0
  142. data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
  143. data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
  144. data/vendor/faiss/faiss/impl/NSG.cpp +679 -0
  145. data/vendor/faiss/faiss/impl/NSG.h +199 -0
  146. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
  147. data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
  148. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
  149. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
  150. data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
  151. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +758 -0
  152. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +188 -0
  153. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
  154. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +647 -707
  155. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
  156. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
  157. data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
  158. data/vendor/faiss/faiss/impl/index_read.cpp +631 -480
  159. data/vendor/faiss/faiss/impl/index_write.cpp +547 -407
  160. data/vendor/faiss/faiss/impl/io.cpp +76 -95
  161. data/vendor/faiss/faiss/impl/io.h +31 -41
  162. data/vendor/faiss/faiss/impl/io_macros.h +60 -29
  163. data/vendor/faiss/faiss/impl/kmeans1d.cpp +301 -0
  164. data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
  165. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
  166. data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
  167. data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
  168. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
  169. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
  170. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
  171. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
  172. data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
  173. data/vendor/faiss/faiss/index_factory.cpp +619 -397
  174. data/vendor/faiss/faiss/index_factory.h +8 -6
  175. data/vendor/faiss/faiss/index_io.h +23 -26
  176. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
  177. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
  178. data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
  179. data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
  180. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
  181. data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
  182. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
  183. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
  184. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
  185. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
  186. data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
  187. data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
  188. data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
  189. data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
  190. data/vendor/faiss/faiss/utils/Heap.h +186 -209
  191. data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
  192. data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
  193. data/vendor/faiss/faiss/utils/distances.cpp +305 -312
  194. data/vendor/faiss/faiss/utils/distances.h +170 -122
  195. data/vendor/faiss/faiss/utils/distances_simd.cpp +498 -508
  196. data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
  197. data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
  198. data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
  199. data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
  200. data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
  201. data/vendor/faiss/faiss/utils/hamming.h +62 -85
  202. data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
  203. data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
  204. data/vendor/faiss/faiss/utils/partitioning.h +26 -21
  205. data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
  206. data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
  207. data/vendor/faiss/faiss/utils/random.cpp +39 -63
  208. data/vendor/faiss/faiss/utils/random.h +13 -16
  209. data/vendor/faiss/faiss/utils/simdlib.h +4 -2
  210. data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
  211. data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
  212. data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
  213. data/vendor/faiss/faiss/utils/utils.cpp +304 -287
  214. data/vendor/faiss/faiss/utils/utils.h +54 -49
  215. metadata +29 -4
@@ -5,210 +5,204 @@
5
5
  * LICENSE file in the root directory of this source tree.
6
6
  */
7
7
 
8
-
9
- #include <faiss/gpu/utils/StackDeviceMemory.h>
10
8
  #include <faiss/gpu/utils/DeviceUtils.h>
9
+ #include <faiss/gpu/utils/StackDeviceMemory.h>
11
10
  #include <faiss/gpu/utils/StaticUtils.h>
12
11
  #include <faiss/impl/FaissAssert.h>
13
12
  #include <algorithm>
14
13
  #include <sstream>
15
14
 
16
- namespace faiss { namespace gpu {
15
+ namespace faiss {
16
+ namespace gpu {
17
17
 
18
18
  namespace {
19
19
 
20
20
  size_t adjustStackSize(size_t sz) {
21
- if (sz == 0) {
22
- return 0;
23
- } else {
24
- // ensure that we have at least 16 bytes, as all allocations are bumped up
25
- // to 16
26
- return utils::roundUp(sz, (size_t) 16);
27
- }
21
+ if (sz == 0) {
22
+ return 0;
23
+ } else {
24
+ // ensure that we have at least 16 bytes, as all allocations are bumped
25
+ // up to 16
26
+ return utils::roundUp(sz, (size_t)16);
27
+ }
28
28
  }
29
29
 
30
30
  } // namespace
31
31
 
32
32
  StackDeviceMemory::Stack::Stack(GpuResources* res, int d, size_t sz)
33
- : res_(res),
34
- device_(d),
35
- alloc_(nullptr),
36
- allocSize_(adjustStackSize(sz)),
37
- start_(nullptr),
38
- end_(nullptr),
39
- head_(nullptr),
40
- highWaterMemoryUsed_(0) {
41
- if (allocSize_ == 0) {
42
- return;
43
- }
44
-
45
- DeviceScope s(device_);
46
- auto req = AllocRequest(AllocType::TemporaryMemoryBuffer,
47
- device_,
48
- MemorySpace::Device,
49
- res_->getDefaultStream(device_),
50
- allocSize_);
51
-
52
- alloc_ = (char*) res_->allocMemory(req);
53
- FAISS_ASSERT_FMT(
54
- alloc_,
55
- "could not reserve temporary memory region of size %zu", allocSize_);
56
-
57
- // In order to disambiguate between our entire region of temporary memory
58
- // versus the first allocation in the temporary memory region, ensure that the
59
- // first address returned is +16 bytes from the beginning
60
- start_ = alloc_ + 16;
61
- head_ = start_;
62
- end_ = alloc_ + allocSize_;
33
+ : res_(res),
34
+ device_(d),
35
+ alloc_(nullptr),
36
+ allocSize_(adjustStackSize(sz)),
37
+ start_(nullptr),
38
+ end_(nullptr),
39
+ head_(nullptr),
40
+ highWaterMemoryUsed_(0) {
41
+ if (allocSize_ == 0) {
42
+ return;
43
+ }
44
+
45
+ DeviceScope s(device_);
46
+ auto req = AllocRequest(
47
+ AllocType::TemporaryMemoryBuffer,
48
+ device_,
49
+ MemorySpace::Device,
50
+ res_->getDefaultStream(device_),
51
+ allocSize_);
52
+
53
+ alloc_ = (char*)res_->allocMemory(req);
54
+ FAISS_ASSERT_FMT(
55
+ alloc_,
56
+ "could not reserve temporary memory region of size %zu",
57
+ allocSize_);
58
+
59
+ // In order to disambiguate between our entire region of temporary memory
60
+ // versus the first allocation in the temporary memory region, ensure that
61
+ // the first address returned is +16 bytes from the beginning
62
+ start_ = alloc_ + 16;
63
+ head_ = start_;
64
+ end_ = alloc_ + allocSize_;
63
65
  }
64
66
 
65
67
  StackDeviceMemory::Stack::~Stack() {
66
- DeviceScope s(device_);
68
+ DeviceScope s(device_);
67
69
 
68
- // FIXME: make sure there are no outstanding memory allocations?
69
- if (alloc_) {
70
- res_->deallocMemory(device_, alloc_);
71
- }
70
+ // FIXME: make sure there are no outstanding memory allocations?
71
+ if (alloc_) {
72
+ res_->deallocMemory(device_, alloc_);
73
+ }
72
74
  }
73
75
 
74
- size_t
75
- StackDeviceMemory::Stack::getSizeAvailable() const {
76
- return (end_ - head_);
76
+ size_t StackDeviceMemory::Stack::getSizeAvailable() const {
77
+ return (end_ - head_);
77
78
  }
78
79
 
79
- char*
80
- StackDeviceMemory::Stack::getAlloc(size_t size,
81
- cudaStream_t stream) {
82
- // The user must check to see that the allocation fit within us
83
- auto sizeRemaining = getSizeAvailable();
80
+ char* StackDeviceMemory::Stack::getAlloc(size_t size, cudaStream_t stream) {
81
+ // The user must check to see that the allocation fit within us
82
+ auto sizeRemaining = getSizeAvailable();
84
83
 
85
- FAISS_ASSERT(size <= sizeRemaining);
84
+ FAISS_ASSERT(size <= sizeRemaining);
86
85
 
87
- // We can make the allocation out of our stack
88
- // Find all the ranges that we overlap that may have been
89
- // previously allocated; our allocation will be [head, endAlloc)
90
- char* startAlloc = head_;
91
- char* endAlloc = head_ + size;
86
+ // We can make the allocation out of our stack
87
+ // Find all the ranges that we overlap that may have been
88
+ // previously allocated; our allocation will be [head, endAlloc)
89
+ char* startAlloc = head_;
90
+ char* endAlloc = head_ + size;
92
91
 
93
- while (lastUsers_.size() > 0) {
94
- auto& prevUser = lastUsers_.back();
92
+ while (lastUsers_.size() > 0) {
93
+ auto& prevUser = lastUsers_.back();
95
94
 
96
- // Because there is a previous user, we must overlap it
97
- FAISS_ASSERT(prevUser.start_ <= endAlloc && prevUser.end_ >= startAlloc);
95
+ // Because there is a previous user, we must overlap it
96
+ FAISS_ASSERT(
97
+ prevUser.start_ <= endAlloc && prevUser.end_ >= startAlloc);
98
98
 
99
- if (stream != prevUser.stream_) {
100
- // Synchronization required
101
- streamWait({stream}, {prevUser.stream_});
102
- }
99
+ if (stream != prevUser.stream_) {
100
+ // Synchronization required
101
+ streamWait({stream}, {prevUser.stream_});
102
+ }
103
103
 
104
- if (endAlloc < prevUser.end_) {
105
- // Update the previous user info
106
- prevUser.start_ = endAlloc;
104
+ if (endAlloc < prevUser.end_) {
105
+ // Update the previous user info
106
+ prevUser.start_ = endAlloc;
107
107
 
108
- break;
109
- }
108
+ break;
109
+ }
110
110
 
111
- // If we're the exact size of the previous request, then we
112
- // don't need to continue
113
- bool done = (prevUser.end_ == endAlloc);
111
+ // If we're the exact size of the previous request, then we
112
+ // don't need to continue
113
+ bool done = (prevUser.end_ == endAlloc);
114
114
 
115
- lastUsers_.pop_back();
115
+ lastUsers_.pop_back();
116
116
 
117
- if (done) {
118
- break;
117
+ if (done) {
118
+ break;
119
+ }
119
120
  }
120
- }
121
121
 
122
- head_ = endAlloc;
123
- FAISS_ASSERT(head_ <= end_);
122
+ head_ = endAlloc;
123
+ FAISS_ASSERT(head_ <= end_);
124
124
 
125
- highWaterMemoryUsed_ = std::max(highWaterMemoryUsed_,
126
- (size_t) (head_ - start_));
127
- FAISS_ASSERT(startAlloc);
128
- return startAlloc;
125
+ highWaterMemoryUsed_ =
126
+ std::max(highWaterMemoryUsed_, (size_t)(head_ - start_));
127
+ FAISS_ASSERT(startAlloc);
128
+ return startAlloc;
129
129
  }
130
130
 
131
- void
132
- StackDeviceMemory::Stack::returnAlloc(char* p,
133
- size_t size,
134
- cudaStream_t stream) {
135
- // This allocation should be within ourselves
136
- FAISS_ASSERT(p >= start_ && p < end_);
131
+ void StackDeviceMemory::Stack::returnAlloc(
132
+ char* p,
133
+ size_t size,
134
+ cudaStream_t stream) {
135
+ // This allocation should be within ourselves
136
+ FAISS_ASSERT(p >= start_ && p < end_);
137
137
 
138
- // All allocations should have been adjusted to a multiple of 16 bytes
139
- FAISS_ASSERT(size % 16 == 0);
138
+ // All allocations should have been adjusted to a multiple of 16 bytes
139
+ FAISS_ASSERT(size % 16 == 0);
140
140
 
141
- // This is on our stack
142
- // Allocations should be freed in the reverse order they are made
143
- if (p + size != head_) {
144
- FAISS_ASSERT(p + size == head_);
145
- }
141
+ // This is on our stack
142
+ // Allocations should be freed in the reverse order they are made
143
+ if (p + size != head_) {
144
+ FAISS_ASSERT(p + size == head_);
145
+ }
146
146
 
147
- head_ = p;
148
- lastUsers_.push_back(Range(p, p + size, stream));
147
+ head_ = p;
148
+ lastUsers_.push_back(Range(p, p + size, stream));
149
149
  }
150
150
 
151
- std::string
152
- StackDeviceMemory::Stack::toString() const {
153
- std::stringstream s;
151
+ std::string StackDeviceMemory::Stack::toString() const {
152
+ std::stringstream s;
154
153
 
155
- s << "SDM device " << device_ << ": Total memory " << allocSize_ << " ["
156
- << (void*) start_ << ", " << (void*) end_ << ")\n";
157
- s << " Available memory " << (size_t) (end_ - head_)
158
- << " [" << (void*) head_ << ", " << (void*) end_ << ")\n";
159
- s << " High water temp alloc " << highWaterMemoryUsed_ << "\n";
154
+ s << "SDM device " << device_ << ": Total memory " << allocSize_ << " ["
155
+ << (void*)start_ << ", " << (void*)end_ << ")\n";
156
+ s << " Available memory " << (size_t)(end_ - head_) << " ["
157
+ << (void*)head_ << ", " << (void*)end_ << ")\n";
158
+ s << " High water temp alloc " << highWaterMemoryUsed_ << "\n";
160
159
 
161
- int i = lastUsers_.size();
162
- for (auto it = lastUsers_.rbegin(); it != lastUsers_.rend(); ++it) {
163
- s << i-- << ": size " << (size_t) (it->end_ - it->start_)
164
- << " stream " << it->stream_
165
- << " [" << (void*) it->start_ << ", " << (void*) it->end_ << ")\n";
166
- }
160
+ int i = lastUsers_.size();
161
+ for (auto it = lastUsers_.rbegin(); it != lastUsers_.rend(); ++it) {
162
+ s << i-- << ": size " << (size_t)(it->end_ - it->start_) << " stream "
163
+ << it->stream_ << " [" << (void*)it->start_ << ", " << (void*)it->end_
164
+ << ")\n";
165
+ }
167
166
 
168
- return s.str();
167
+ return s.str();
169
168
  }
170
169
 
171
- StackDeviceMemory::StackDeviceMemory(GpuResources* res,
172
- int device,
173
- size_t allocPerDevice)
174
- : device_(device),
175
- stack_(res, device, allocPerDevice) {
176
- }
170
+ StackDeviceMemory::StackDeviceMemory(
171
+ GpuResources* res,
172
+ int device,
173
+ size_t allocPerDevice)
174
+ : device_(device), stack_(res, device, allocPerDevice) {}
177
175
 
178
- StackDeviceMemory::~StackDeviceMemory() {
179
- }
176
+ StackDeviceMemory::~StackDeviceMemory() {}
180
177
 
181
- int
182
- StackDeviceMemory::getDevice() const {
183
- return device_;
178
+ int StackDeviceMemory::getDevice() const {
179
+ return device_;
184
180
  }
185
181
 
186
- size_t
187
- StackDeviceMemory::getSizeAvailable() const {
188
- return stack_.getSizeAvailable();
182
+ size_t StackDeviceMemory::getSizeAvailable() const {
183
+ return stack_.getSizeAvailable();
189
184
  }
190
185
 
191
- std::string
192
- StackDeviceMemory::toString() const {
193
- return stack_.toString();
186
+ std::string StackDeviceMemory::toString() const {
187
+ return stack_.toString();
194
188
  }
195
189
 
196
- void*
197
- StackDeviceMemory::allocMemory(cudaStream_t stream, size_t size) {
198
- // All allocations should have been adjusted to a multiple of 16 bytes
199
- FAISS_ASSERT(size % 16 == 0);
200
- return stack_.getAlloc(size, stream);
190
+ void* StackDeviceMemory::allocMemory(cudaStream_t stream, size_t size) {
191
+ // All allocations should have been adjusted to a multiple of 16 bytes
192
+ FAISS_ASSERT(size % 16 == 0);
193
+ return stack_.getAlloc(size, stream);
201
194
  }
202
195
 
203
- void
204
- StackDeviceMemory::deallocMemory(int device,
205
- cudaStream_t stream,
206
- size_t size,
207
- void* p) {
208
- FAISS_ASSERT(p);
209
- FAISS_ASSERT(device == device_);
196
+ void StackDeviceMemory::deallocMemory(
197
+ int device,
198
+ cudaStream_t stream,
199
+ size_t size,
200
+ void* p) {
201
+ FAISS_ASSERT(p);
202
+ FAISS_ASSERT(device == device_);
210
203
 
211
- stack_.returnAlloc((char*) p, size, stream);
204
+ stack_.returnAlloc((char*)p, size, stream);
212
205
  }
213
206
 
214
- } } // namespace
207
+ } // namespace gpu
208
+ } // namespace faiss
@@ -5,110 +5,108 @@
5
5
  * LICENSE file in the root directory of this source tree.
6
6
  */
7
7
 
8
-
9
8
  #pragma once
10
9
 
11
- #include <faiss/gpu/GpuResources.h>
12
10
  #include <cuda_runtime.h>
11
+ #include <faiss/gpu/GpuResources.h>
13
12
  #include <list>
14
13
  #include <memory>
15
- #include <unordered_map>
16
14
  #include <tuple>
15
+ #include <unordered_map>
17
16
 
18
- namespace faiss { namespace gpu {
17
+ namespace faiss {
18
+ namespace gpu {
19
19
 
20
20
  /// Device memory manager that provides temporary memory allocations
21
21
  /// out of a region of memory, for a single device
22
22
  class StackDeviceMemory {
23
- public:
24
- /// Allocate a new region of memory that we manage
25
- StackDeviceMemory(GpuResources* res,
26
- int device,
27
- size_t allocPerDevice);
23
+ public:
24
+ /// Allocate a new region of memory that we manage
25
+ StackDeviceMemory(GpuResources* res, int device, size_t allocPerDevice);
28
26
 
29
- /// Manage a region of memory for a particular device, with or
30
- /// without ownership
31
- StackDeviceMemory(int device, void* p, size_t size, bool isOwner);
27
+ /// Manage a region of memory for a particular device, with or
28
+ /// without ownership
29
+ StackDeviceMemory(int device, void* p, size_t size, bool isOwner);
32
30
 
33
- ~StackDeviceMemory();
31
+ ~StackDeviceMemory();
34
32
 
35
- int getDevice() const;
33
+ int getDevice() const;
36
34
 
37
- /// All allocations requested should be a multiple of 16 bytes
38
- void* allocMemory(cudaStream_t stream, size_t size);
39
- void deallocMemory(int device, cudaStream_t, size_t size, void* p);
35
+ /// All allocations requested should be a multiple of 16 bytes
36
+ void* allocMemory(cudaStream_t stream, size_t size);
37
+ void deallocMemory(int device, cudaStream_t, size_t size, void* p);
40
38
 
41
- size_t getSizeAvailable() const;
42
- std::string toString() const;
39
+ size_t getSizeAvailable() const;
40
+ std::string toString() const;
43
41
 
44
- protected:
45
- /// Previous allocation ranges and the streams for which
46
- /// synchronization is required
47
- struct Range {
48
- inline Range(char* s, char* e, cudaStream_t str) :
49
- start_(s), end_(e), stream_(str) {
50
- }
42
+ protected:
43
+ /// Previous allocation ranges and the streams for which
44
+ /// synchronization is required
45
+ struct Range {
46
+ inline Range(char* s, char* e, cudaStream_t str)
47
+ : start_(s), end_(e), stream_(str) {}
51
48
 
52
- // References a memory range [start, end)
53
- char* start_;
54
- char* end_;
55
- cudaStream_t stream_;
56
- };
49
+ // References a memory range [start, end)
50
+ char* start_;
51
+ char* end_;
52
+ cudaStream_t stream_;
53
+ };
57
54
 
58
- struct Stack {
59
- /// Constructor that allocates memory via cudaMalloc
60
- Stack(GpuResources* res, int device, size_t size);
55
+ struct Stack {
56
+ /// Constructor that allocates memory via cudaMalloc
57
+ Stack(GpuResources* res, int device, size_t size);
61
58
 
62
- ~Stack();
59
+ ~Stack();
63
60
 
64
- /// Returns how much size is available for an allocation without
65
- /// calling cudaMalloc
66
- size_t getSizeAvailable() const;
61
+ /// Returns how much size is available for an allocation without
62
+ /// calling cudaMalloc
63
+ size_t getSizeAvailable() const;
67
64
 
68
- /// Obtains an allocation; all allocations are guaranteed to be 16
69
- /// byte aligned
70
- char* getAlloc(size_t size, cudaStream_t stream);
65
+ /// Obtains an allocation; all allocations are guaranteed to be 16
66
+ /// byte aligned
67
+ char* getAlloc(size_t size, cudaStream_t stream);
71
68
 
72
- /// Returns an allocation
73
- void returnAlloc(char* p, size_t size, cudaStream_t stream);
69
+ /// Returns an allocation
70
+ void returnAlloc(char* p, size_t size, cudaStream_t stream);
74
71
 
75
- /// Returns the stack state
76
- std::string toString() const;
72
+ /// Returns the stack state
73
+ std::string toString() const;
77
74
 
78
- /// Our GpuResources object
79
- GpuResources* res_;
75
+ /// Our GpuResources object
76
+ GpuResources* res_;
80
77
 
81
- /// Device this allocation is on
82
- int device_;
78
+ /// Device this allocation is on
79
+ int device_;
83
80
 
84
- /// Where our temporary memory buffer is allocated; we allocate starting 16
85
- /// bytes into this
86
- char* alloc_;
81
+ /// Where our temporary memory buffer is allocated; we allocate starting
82
+ /// 16 bytes into this
83
+ char* alloc_;
87
84
 
88
- /// Total size of our allocation
89
- size_t allocSize_;
85
+ /// Total size of our allocation
86
+ size_t allocSize_;
90
87
 
91
- /// Our temporary memory region; [start_, end_) is valid
92
- char* start_;
93
- char* end_;
88
+ /// Our temporary memory region; [start_, end_) is valid
89
+ char* start_;
90
+ char* end_;
94
91
 
95
- /// Stack head within [start, end)
96
- char* head_;
92
+ /// Stack head within [start, end)
93
+ char* head_;
97
94
 
98
- /// List of previous last users of allocations on our stack, for
99
- /// possible synchronization purposes
100
- std::list<Range> lastUsers_;
95
+ /// List of previous last users of allocations on our stack, for
96
+ /// possible synchronization purposes
97
+ std::list<Range> lastUsers_;
101
98
 
102
- /// What's the high water mark in terms of memory used from the
103
- /// temporary buffer?
104
- size_t highWaterMemoryUsed_;
105
- };
99
+ /// What's the high water mark in terms of memory used from the
100
+ /// temporary buffer?
101
+ size_t highWaterMemoryUsed_;
102
+ };
106
103
 
107
- /// Our device
108
- int device_;
104
+ /// Our device
105
+ int device_;
109
106
 
110
- /// Memory stack
111
- Stack stack_;
107
+ /// Memory stack
108
+ Stack stack_;
112
109
  };
113
110
 
114
- } } // namespace
111
+ } // namespace gpu
112
+ } // namespace faiss
@@ -5,7 +5,6 @@
5
5
  * LICENSE file in the root directory of this source tree.
6
6
  */
7
7
 
8
-
9
8
  #pragma once
10
9
 
11
10
  #include <cuda.h>
@@ -16,43 +15,45 @@
16
15
  #define __device__
17
16
  #endif
18
17
 
19
- namespace faiss { namespace gpu { namespace utils {
18
+ namespace faiss {
19
+ namespace gpu {
20
+ namespace utils {
20
21
 
21
22
  template <typename U, typename V>
22
23
  constexpr __host__ __device__ auto divDown(U a, V b) -> decltype(a + b) {
23
- return (a / b);
24
+ return (a / b);
24
25
  }
25
26
 
26
27
  template <typename U, typename V>
27
28
  constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) {
28
- return (a + b - 1) / b;
29
+ return (a + b - 1) / b;
29
30
  }
30
31
 
31
32
  template <typename U, typename V>
32
33
  constexpr __host__ __device__ auto roundDown(U a, V b) -> decltype(a + b) {
33
- return divDown(a, b) * b;
34
+ return divDown(a, b) * b;
34
35
  }
35
36
 
36
37
  template <typename U, typename V>
37
38
  constexpr __host__ __device__ auto roundUp(U a, V b) -> decltype(a + b) {
38
- return divUp(a, b) * b;
39
+ return divUp(a, b) * b;
39
40
  }
40
41
 
41
42
  template <class T>
42
43
  constexpr __host__ __device__ T pow(T n, T power) {
43
- return (power > 0 ? n * pow(n, power - 1) : 1);
44
+ return (power > 0 ? n * pow(n, power - 1) : 1);
44
45
  }
45
46
 
46
47
  template <class T>
47
48
  constexpr __host__ __device__ T pow2(T n) {
48
- return pow(2, (T) n);
49
+ return pow(2, (T)n);
49
50
  }
50
51
 
51
52
  static_assert(pow2(8) == 256, "pow2");
52
53
 
53
54
  template <typename T>
54
55
  constexpr __host__ __device__ int log2(T n, int p = 0) {
55
- return (n <= 1) ? p : log2(n / 2, p + 1);
56
+ return (n <= 1) ? p : log2(n / 2, p + 1);
56
57
  }
57
58
 
58
59
  static_assert(log2(2) == 1, "log2");
@@ -61,7 +62,7 @@ static_assert(log2(4) == 2, "log2");
61
62
 
62
63
  template <typename T>
63
64
  constexpr __host__ __device__ bool isPowerOf2(T v) {
64
- return (v && !(v & (v - 1)));
65
+ return (v && !(v & (v - 1)));
65
66
  }
66
67
 
67
68
  static_assert(isPowerOf2(2048), "isPowerOf2");
@@ -69,7 +70,7 @@ static_assert(!isPowerOf2(3333), "isPowerOf2");
69
70
 
70
71
  template <typename T>
71
72
  constexpr __host__ __device__ T nextHighestPowerOf2(T v) {
72
- return (isPowerOf2(v) ? (T) 2 * v : ((T) 1 << (log2(v) + 1)));
73
+ return (isPowerOf2(v) ? (T)2 * v : ((T)1 << (log2(v) + 1)));
73
74
  }
74
75
 
75
76
  static_assert(nextHighestPowerOf2(1) == 2, "nextHighestPowerOf2");
@@ -81,9 +82,13 @@ static_assert(nextHighestPowerOf2(15) == 16, "nextHighestPowerOf2");
81
82
  static_assert(nextHighestPowerOf2(16) == 32, "nextHighestPowerOf2");
82
83
  static_assert(nextHighestPowerOf2(17) == 32, "nextHighestPowerOf2");
83
84
 
84
- static_assert(nextHighestPowerOf2(1536000000u) == 2147483648u,
85
- "nextHighestPowerOf2");
86
- static_assert(nextHighestPowerOf2((size_t) 2147483648ULL) ==
87
- (size_t) 4294967296ULL, "nextHighestPowerOf2");
85
+ static_assert(
86
+ nextHighestPowerOf2(1536000000u) == 2147483648u,
87
+ "nextHighestPowerOf2");
88
+ static_assert(
89
+ nextHighestPowerOf2((size_t)2147483648ULL) == (size_t)4294967296ULL,
90
+ "nextHighestPowerOf2");
88
91
 
89
- } } } // namespace
92
+ } // namespace utils
93
+ } // namespace gpu
94
+ } // namespace faiss