faiss 0.3.0 → 0.3.2

This diff shows the changes between publicly released versions of the package as they appear in the supported public registries. The information is provided for informational purposes only.
Files changed (216):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +9 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +1 -1
  5. data/ext/faiss/extconf.rb +9 -2
  6. data/ext/faiss/index.cpp +1 -1
  7. data/ext/faiss/index_binary.cpp +2 -2
  8. data/ext/faiss/product_quantizer.cpp +1 -1
  9. data/lib/faiss/version.rb +1 -1
  10. data/vendor/faiss/faiss/AutoTune.cpp +7 -7
  11. data/vendor/faiss/faiss/AutoTune.h +1 -2
  12. data/vendor/faiss/faiss/Clustering.cpp +39 -22
  13. data/vendor/faiss/faiss/Clustering.h +40 -21
  14. data/vendor/faiss/faiss/IVFlib.cpp +26 -12
  15. data/vendor/faiss/faiss/Index.cpp +1 -1
  16. data/vendor/faiss/faiss/Index.h +40 -10
  17. data/vendor/faiss/faiss/Index2Layer.cpp +7 -7
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +176 -166
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +15 -15
  20. data/vendor/faiss/faiss/IndexBinary.cpp +9 -4
  21. data/vendor/faiss/faiss/IndexBinary.h +8 -19
  22. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -1
  23. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +24 -31
  24. data/vendor/faiss/faiss/IndexBinaryHNSW.h +1 -1
  25. data/vendor/faiss/faiss/IndexBinaryHash.cpp +25 -50
  26. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +107 -188
  27. data/vendor/faiss/faiss/IndexFastScan.cpp +95 -146
  28. data/vendor/faiss/faiss/IndexFastScan.h +9 -8
  29. data/vendor/faiss/faiss/IndexFlat.cpp +206 -10
  30. data/vendor/faiss/faiss/IndexFlat.h +20 -1
  31. data/vendor/faiss/faiss/IndexFlatCodes.cpp +170 -5
  32. data/vendor/faiss/faiss/IndexFlatCodes.h +23 -4
  33. data/vendor/faiss/faiss/IndexHNSW.cpp +231 -382
  34. data/vendor/faiss/faiss/IndexHNSW.h +62 -49
  35. data/vendor/faiss/faiss/IndexIDMap.cpp +69 -28
  36. data/vendor/faiss/faiss/IndexIDMap.h +24 -2
  37. data/vendor/faiss/faiss/IndexIVF.cpp +162 -56
  38. data/vendor/faiss/faiss/IndexIVF.h +46 -6
  39. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +33 -26
  40. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +6 -2
  41. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +19 -46
  42. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -3
  43. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +502 -401
  44. data/vendor/faiss/faiss/IndexIVFFastScan.h +63 -26
  45. data/vendor/faiss/faiss/IndexIVFFlat.cpp +15 -5
  46. data/vendor/faiss/faiss/IndexIVFFlat.h +3 -2
  47. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +172 -0
  48. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +56 -0
  49. data/vendor/faiss/faiss/IndexIVFPQ.cpp +79 -125
  50. data/vendor/faiss/faiss/IndexIVFPQ.h +6 -7
  51. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +39 -52
  52. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +4 -3
  53. data/vendor/faiss/faiss/IndexIVFPQR.cpp +45 -29
  54. data/vendor/faiss/faiss/IndexIVFPQR.h +5 -2
  55. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +25 -27
  56. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +6 -6
  57. data/vendor/faiss/faiss/IndexLSH.cpp +14 -16
  58. data/vendor/faiss/faiss/IndexLattice.cpp +1 -19
  59. data/vendor/faiss/faiss/IndexLattice.h +3 -22
  60. data/vendor/faiss/faiss/IndexNNDescent.cpp +3 -33
  61. data/vendor/faiss/faiss/IndexNNDescent.h +1 -1
  62. data/vendor/faiss/faiss/IndexNSG.cpp +11 -27
  63. data/vendor/faiss/faiss/IndexNSG.h +11 -11
  64. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +56 -0
  65. data/vendor/faiss/faiss/IndexNeuralNetCodec.h +49 -0
  66. data/vendor/faiss/faiss/IndexPQ.cpp +72 -88
  67. data/vendor/faiss/faiss/IndexPQ.h +1 -4
  68. data/vendor/faiss/faiss/IndexPQFastScan.cpp +1 -1
  69. data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -31
  70. data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
  71. data/vendor/faiss/faiss/IndexRefine.cpp +54 -24
  72. data/vendor/faiss/faiss/IndexRefine.h +7 -0
  73. data/vendor/faiss/faiss/IndexReplicas.cpp +23 -26
  74. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +25 -17
  75. data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -4
  76. data/vendor/faiss/faiss/IndexShards.cpp +21 -29
  77. data/vendor/faiss/faiss/IndexShardsIVF.cpp +1 -2
  78. data/vendor/faiss/faiss/MatrixStats.cpp +17 -32
  79. data/vendor/faiss/faiss/MatrixStats.h +21 -9
  80. data/vendor/faiss/faiss/MetaIndexes.cpp +35 -35
  81. data/vendor/faiss/faiss/MetricType.h +7 -2
  82. data/vendor/faiss/faiss/VectorTransform.cpp +13 -26
  83. data/vendor/faiss/faiss/VectorTransform.h +7 -7
  84. data/vendor/faiss/faiss/clone_index.cpp +15 -10
  85. data/vendor/faiss/faiss/clone_index.h +3 -0
  86. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +95 -17
  87. data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +152 -0
  88. data/vendor/faiss/faiss/cppcontrib/factory_tools.h +24 -0
  89. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +83 -30
  90. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +123 -8
  91. data/vendor/faiss/faiss/gpu/GpuCloner.h +22 -0
  92. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +13 -0
  93. data/vendor/faiss/faiss/gpu/GpuDistance.h +46 -38
  94. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -1
  95. data/vendor/faiss/faiss/gpu/GpuIndex.h +30 -12
  96. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +282 -0
  97. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +4 -4
  98. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +14 -9
  99. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +20 -3
  100. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -11
  101. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +1 -3
  102. data/vendor/faiss/faiss/gpu/GpuResources.cpp +24 -3
  103. data/vendor/faiss/faiss/gpu/GpuResources.h +39 -11
  104. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +142 -17
  105. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +57 -3
  106. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +26 -21
  107. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +7 -1
  108. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +8 -5
  109. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +25 -0
  110. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +129 -9
  111. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +332 -40
  112. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +299 -208
  113. data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +1 -0
  114. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
  115. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
  116. data/vendor/faiss/faiss/gpu/utils/RaftUtils.h +75 -0
  117. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +4 -1
  118. data/vendor/faiss/faiss/gpu/utils/Timer.h +1 -1
  119. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +3 -1
  120. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +5 -5
  121. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +26 -1
  122. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +10 -3
  123. data/vendor/faiss/faiss/impl/DistanceComputer.h +70 -1
  124. data/vendor/faiss/faiss/impl/FaissAssert.h +4 -2
  125. data/vendor/faiss/faiss/impl/FaissException.h +13 -34
  126. data/vendor/faiss/faiss/impl/HNSW.cpp +605 -186
  127. data/vendor/faiss/faiss/impl/HNSW.h +52 -30
  128. data/vendor/faiss/faiss/impl/IDSelector.h +4 -4
  129. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +11 -9
  130. data/vendor/faiss/faiss/impl/LookupTableScaler.h +34 -0
  131. data/vendor/faiss/faiss/impl/NNDescent.cpp +42 -27
  132. data/vendor/faiss/faiss/impl/NSG.cpp +0 -29
  133. data/vendor/faiss/faiss/impl/NSG.h +1 -1
  134. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +14 -12
  135. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +1 -1
  136. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -22
  137. data/vendor/faiss/faiss/impl/ProductQuantizer.h +6 -2
  138. data/vendor/faiss/faiss/impl/Quantizer.h +1 -1
  139. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +27 -1015
  140. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +5 -63
  141. data/vendor/faiss/faiss/impl/ResultHandler.h +347 -172
  142. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +1104 -147
  143. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +3 -8
  144. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +285 -42
  145. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +248 -0
  146. data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +21 -14
  147. data/vendor/faiss/faiss/impl/code_distance/code_distance.h +22 -12
  148. data/vendor/faiss/faiss/impl/index_read.cpp +74 -34
  149. data/vendor/faiss/faiss/impl/index_read_utils.h +37 -0
  150. data/vendor/faiss/faiss/impl/index_write.cpp +88 -51
  151. data/vendor/faiss/faiss/impl/io.cpp +23 -15
  152. data/vendor/faiss/faiss/impl/io.h +4 -4
  153. data/vendor/faiss/faiss/impl/io_macros.h +6 -0
  154. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
  155. data/vendor/faiss/faiss/impl/platform_macros.h +40 -1
  156. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +14 -0
  157. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +7 -6
  158. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +52 -38
  159. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +487 -49
  160. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +960 -0
  161. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +176 -0
  162. data/vendor/faiss/faiss/impl/simd_result_handlers.h +481 -225
  163. data/vendor/faiss/faiss/index_factory.cpp +41 -20
  164. data/vendor/faiss/faiss/index_io.h +12 -5
  165. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +28 -8
  166. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +3 -0
  167. data/vendor/faiss/faiss/invlists/DirectMap.cpp +10 -2
  168. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +73 -17
  169. data/vendor/faiss/faiss/invlists/InvertedLists.h +26 -8
  170. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +24 -9
  171. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -1
  172. data/vendor/faiss/faiss/python/python_callbacks.cpp +4 -4
  173. data/vendor/faiss/faiss/utils/Heap.cpp +3 -1
  174. data/vendor/faiss/faiss/utils/Heap.h +105 -0
  175. data/vendor/faiss/faiss/utils/NeuralNet.cpp +342 -0
  176. data/vendor/faiss/faiss/utils/NeuralNet.h +147 -0
  177. data/vendor/faiss/faiss/utils/WorkerThread.h +1 -0
  178. data/vendor/faiss/faiss/utils/bf16.h +36 -0
  179. data/vendor/faiss/faiss/utils/distances.cpp +147 -123
  180. data/vendor/faiss/faiss/utils/distances.h +86 -9
  181. data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +5 -5
  182. data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
  183. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
  184. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +1 -1
  185. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +5 -5
  186. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +1 -1
  187. data/vendor/faiss/faiss/utils/distances_simd.cpp +1589 -243
  188. data/vendor/faiss/faiss/utils/extra_distances-inl.h +70 -0
  189. data/vendor/faiss/faiss/utils/extra_distances.cpp +85 -137
  190. data/vendor/faiss/faiss/utils/extra_distances.h +3 -2
  191. data/vendor/faiss/faiss/utils/fp16-arm.h +29 -0
  192. data/vendor/faiss/faiss/utils/fp16.h +2 -0
  193. data/vendor/faiss/faiss/utils/hamming.cpp +163 -111
  194. data/vendor/faiss/faiss/utils/hamming.h +58 -0
  195. data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +16 -89
  196. data/vendor/faiss/faiss/utils/hamming_distance/common.h +1 -0
  197. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +19 -88
  198. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +58 -0
  199. data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +14 -104
  200. data/vendor/faiss/faiss/utils/partitioning.cpp +3 -4
  201. data/vendor/faiss/faiss/utils/prefetch.h +77 -0
  202. data/vendor/faiss/faiss/utils/quantize_lut.cpp +0 -14
  203. data/vendor/faiss/faiss/utils/random.cpp +43 -0
  204. data/vendor/faiss/faiss/utils/random.h +25 -0
  205. data/vendor/faiss/faiss/utils/simdlib.h +10 -1
  206. data/vendor/faiss/faiss/utils/simdlib_avx2.h +0 -6
  207. data/vendor/faiss/faiss/utils/simdlib_avx512.h +296 -0
  208. data/vendor/faiss/faiss/utils/simdlib_neon.h +77 -79
  209. data/vendor/faiss/faiss/utils/simdlib_ppc64.h +1084 -0
  210. data/vendor/faiss/faiss/utils/sorting.cpp +140 -5
  211. data/vendor/faiss/faiss/utils/sorting.h +27 -0
  212. data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +176 -0
  213. data/vendor/faiss/faiss/utils/utils.cpp +120 -7
  214. data/vendor/faiss/faiss/utils/utils.h +60 -20
  215. metadata +23 -4
  216. data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +0 -102
@@ -4,6 +4,29 @@
4
4
  * This source code is licensed under the MIT license found in the
5
5
  * LICENSE file in the root directory of this source tree.
6
6
  */
7
+ /*
8
+ * Copyright (c) 2023, NVIDIA CORPORATION.
9
+ *
10
+ * Licensed under the Apache License, Version 2.0 (the "License");
11
+ * you may not use this file except in compliance with the License.
12
+ * You may obtain a copy of the License at
13
+ *
14
+ * http://www.apache.org/licenses/LICENSE-2.0
15
+ *
16
+ * Unless required by applicable law or agreed to in writing, software
17
+ * distributed under the License is distributed on an "AS IS" BASIS,
18
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ * See the License for the specific language governing permissions and
20
+ * limitations under the License.
21
+ */
22
+
23
+ #if defined USE_NVIDIA_RAFT
24
+ #include <raft/core/device_resources.hpp>
25
+ #include <rmm/mr/device/managed_memory_resource.hpp>
26
+ #include <rmm/mr/device/per_device_resource.hpp>
27
+ #include <rmm/mr/host/pinned_memory_resource.hpp>
28
+ #include <memory>
29
+ #endif
7
30
 
8
31
  #include <faiss/gpu/StandardGpuResources.h>
9
32
  #include <faiss/gpu/utils/DeviceUtils.h>
@@ -66,7 +89,12 @@ std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
66
89
  //
67
90
 
68
91
  StandardGpuResourcesImpl::StandardGpuResourcesImpl()
69
- : pinnedMemAlloc_(nullptr),
92
+ :
93
+ #if defined USE_NVIDIA_RAFT
94
+ mmr_(new rmm::mr::managed_memory_resource),
95
+ pmr_(new rmm::mr::pinned_memory_resource),
96
+ #endif
97
+ pinnedMemAlloc_(nullptr),
70
98
  pinnedMemAllocSize_(0),
71
99
  // let the adjustment function determine the memory size for us by
72
100
  // passing in a huge value that will then be adjusted
@@ -74,7 +102,8 @@ StandardGpuResourcesImpl::StandardGpuResourcesImpl()
74
102
  -1,
75
103
  std::numeric_limits<size_t>::max())),
76
104
  pinnedMemSize_(kDefaultPinnedMemoryAllocation),
77
- allocLogging_(false) {}
105
+ allocLogging_(false) {
106
+ }
78
107
 
79
108
  StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
80
109
  // The temporary memory allocator has allocated memory through us, so clean
@@ -129,6 +158,9 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
129
158
  }
130
159
 
131
160
  if (pinnedMemAlloc_) {
161
+ #if defined USE_NVIDIA_RAFT
162
+ pmr_->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_);
163
+ #else
132
164
  auto err = cudaFreeHost(pinnedMemAlloc_);
133
165
  FAISS_ASSERT_FMT(
134
166
  err == cudaSuccess,
@@ -136,6 +168,7 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
136
168
  pinnedMemAlloc_,
137
169
  (int)err,
138
170
  cudaGetErrorString(err));
171
+ #endif
139
172
  }
140
173
  }
141
174
 
@@ -187,11 +220,11 @@ void StandardGpuResourcesImpl::setTempMemory(size_t size) {
187
220
  p.second.reset();
188
221
 
189
222
  // Allocate new
190
- p.second = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
223
+ p.second = std::make_unique<StackDeviceMemory>(
191
224
  this,
192
225
  p.first,
193
226
  // adjust for this specific device
194
- getDefaultTempMemForGPU(device, tempMemSize_)));
227
+ getDefaultTempMemForGPU(device, tempMemSize_));
195
228
  }
196
229
  }
197
230
  }
@@ -224,6 +257,14 @@ void StandardGpuResourcesImpl::setDefaultStream(
224
257
  if (prevStream != stream) {
225
258
  streamWait({stream}, {prevStream});
226
259
  }
260
+ #if defined USE_NVIDIA_RAFT
261
+ // delete the raft handle for this device, which will be initialized
262
+ // with the updated stream during any subsequent calls to getRaftHandle
263
+ auto it2 = raftHandles_.find(device);
264
+ if (it2 != raftHandles_.end()) {
265
+ raftHandles_.erase(it2);
266
+ }
267
+ #endif
227
268
  }
228
269
 
229
270
  userDefaultStreams_[device] = stream;
@@ -242,6 +283,14 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
242
283
 
243
284
  streamWait({newStream}, {prevStream});
244
285
  }
286
+ #if defined USE_NVIDIA_RAFT
287
+ // delete the raft handle for this device, which will be initialized
288
+ // with the updated stream during any subsequent calls to getRaftHandle
289
+ auto it2 = raftHandles_.find(device);
290
+ if (it2 != raftHandles_.end()) {
291
+ raftHandles_.erase(it2);
292
+ }
293
+ #endif
245
294
  }
246
295
 
247
296
  userDefaultStreams_.erase(device);
@@ -274,6 +323,19 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
274
323
  // If this is the first device that we're initializing, create our
275
324
  // pinned memory allocation
276
325
  if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
326
+ #if defined USE_NVIDIA_RAFT
327
+ // If this is the first device that we're initializing, create our
328
+ // pinned memory allocation
329
+ if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
330
+ try {
331
+ pinnedMemAlloc_ = pmr_->allocate(pinnedMemSize_);
332
+ } catch (const std::bad_alloc& rmm_ex) {
333
+ FAISS_THROW_MSG("CUDA memory allocation error");
334
+ }
335
+
336
+ pinnedMemAllocSize_ = pinnedMemSize_;
337
+ }
338
+ #else
277
339
  auto err = cudaHostAlloc(
278
340
  &pinnedMemAlloc_, pinnedMemSize_, cudaHostAllocDefault);
279
341
 
@@ -286,6 +348,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
286
348
  cudaGetErrorString(err));
287
349
 
288
350
  pinnedMemAllocSize_ = pinnedMemSize_;
351
+ #endif
289
352
  }
290
353
 
291
354
  // Make sure that device properties for all devices are cached
@@ -300,19 +363,32 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
300
363
  prop.major,
301
364
  prop.minor);
302
365
 
366
+ #if USE_AMD_ROCM
367
+ // Our code is pre-built with and expects warpSize == 32 or 64, validate
368
+ // that
369
+ FAISS_ASSERT_FMT(
370
+ prop.warpSize == 32 || prop.warpSize == 64,
371
+ "Device id %d does not have expected warpSize of 32 or 64",
372
+ device);
373
+ #else
303
374
  // Our code is pre-built with and expects warpSize == 32, validate that
304
375
  FAISS_ASSERT_FMT(
305
376
  prop.warpSize == 32,
306
377
  "Device id %d does not have expected warpSize of 32",
307
378
  device);
379
+ #endif
308
380
 
309
381
  // Create streams
310
- cudaStream_t defaultStream = 0;
382
+ cudaStream_t defaultStream = nullptr;
311
383
  CUDA_VERIFY(
312
384
  cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));
313
385
 
314
386
  defaultStreams_[device] = defaultStream;
315
387
 
388
+ #if defined USE_NVIDIA_RAFT
389
+ raftHandles_.emplace(std::make_pair(device, defaultStream));
390
+ #endif
391
+
316
392
  cudaStream_t asyncCopyStream = 0;
317
393
  CUDA_VERIFY(
318
394
  cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));
@@ -321,7 +397,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
321
397
 
322
398
  std::vector<cudaStream_t> deviceStreams;
323
399
  for (int j = 0; j < kNumStreams; ++j) {
324
- cudaStream_t stream = 0;
400
+ cudaStream_t stream = nullptr;
325
401
  CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
326
402
 
327
403
  deviceStreams.push_back(stream);
@@ -330,7 +406,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
330
406
  alternateStreams_[device] = std::move(deviceStreams);
331
407
 
332
408
  // Create cuBLAS handle
333
- cublasHandle_t blasHandle = 0;
409
+ cublasHandle_t blasHandle = nullptr;
334
410
  auto blasStatus = cublasCreate(&blasHandle);
335
411
  FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS);
336
412
  blasHandles_[device] = blasHandle;
@@ -348,11 +424,11 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
348
424
  allocs_[device] = std::unordered_map<void*, AllocRequest>();
349
425
 
350
426
  FAISS_ASSERT(tempMemory_.count(device) == 0);
351
- auto mem = std::unique_ptr<StackDeviceMemory>(new StackDeviceMemory(
427
+ auto mem = std::make_unique<StackDeviceMemory>(
352
428
  this,
353
429
  device,
354
430
  // adjust for this specific device
355
- getDefaultTempMemForGPU(device, tempMemSize_)));
431
+ getDefaultTempMemForGPU(device, tempMemSize_));
356
432
 
357
433
  tempMemory_.emplace(device, std::move(mem));
358
434
  }
@@ -375,6 +451,25 @@ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
375
451
  return defaultStreams_[device];
376
452
  }
377
453
 
454
+ #if defined USE_NVIDIA_RAFT
455
+ raft::device_resources& StandardGpuResourcesImpl::getRaftHandle(int device) {
456
+ initializeForDevice(device);
457
+
458
+ auto it = raftHandles_.find(device);
459
+ if (it == raftHandles_.end()) {
460
+ // Make sure we are using the stream the user may have already assigned
461
+ // to the current GpuResources
462
+ raftHandles_.emplace(device, getDefaultStream(device));
463
+
464
+ // Initialize cublas handle
465
+ raftHandles_[device].get_cublas_handle();
466
+ }
467
+
468
+ // Otherwise, our base default handle
469
+ return raftHandles_[device];
470
+ }
471
+ #endif
472
+
378
473
  std::vector<cudaStream_t> StandardGpuResourcesImpl::getAlternateStreams(
379
474
  int device) {
380
475
  initializeForDevice(device);
@@ -406,8 +501,6 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
406
501
  void* p = nullptr;
407
502
 
408
503
  if (adjReq.space == MemorySpace::Temporary) {
409
- // If we don't have enough space in our temporary memory manager, we
410
- // need to allocate this request separately
411
504
  auto& tempMem = tempMemory_[adjReq.device];
412
505
 
413
506
  if (adjReq.size > tempMem->getSizeAvailable()) {
@@ -428,15 +521,25 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
428
521
 
429
522
  // Otherwise, we can handle this locally
430
523
  p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
431
-
432
524
  } else if (adjReq.space == MemorySpace::Device) {
525
+ #if defined USE_NVIDIA_RAFT
526
+ try {
527
+ rmm::mr::device_memory_resource* current_mr =
528
+ rmm::mr::get_per_device_resource(
529
+ rmm::cuda_device_id{adjReq.device});
530
+ p = current_mr->allocate_async(adjReq.size, adjReq.stream);
531
+ adjReq.mr = current_mr;
532
+ } catch (const std::bad_alloc& rmm_ex) {
533
+ FAISS_THROW_MSG("CUDA memory allocation error");
534
+ }
535
+ #else
433
536
  auto err = cudaMalloc(&p, adjReq.size);
434
537
 
435
538
  // Throw if we fail to allocate
436
539
  if (err != cudaSuccess) {
437
540
  // FIXME: as of CUDA 11, a memory allocation error appears to be
438
- // presented via cudaGetLastError as well, and needs to be cleared.
439
- // Just call the function to clear it
541
+ // presented via cudaGetLastError as well, and needs to be
542
+ // cleared. Just call the function to clear it
440
543
  cudaGetLastError();
441
544
 
442
545
  std::stringstream ss;
@@ -451,7 +554,20 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
451
554
 
452
555
  FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
453
556
  }
557
+ #endif
454
558
  } else if (adjReq.space == MemorySpace::Unified) {
559
+ #if defined USE_NVIDIA_RAFT
560
+ try {
561
+ // for now, use our own managed MR to do Unified Memory allocations.
562
+ // TODO: change this to use the current device resource once RMM has
563
+ // a way to retrieve a "guaranteed" managed memory resource for a
564
+ // device.
565
+ p = mmr_->allocate_async(adjReq.size, adjReq.stream);
566
+ adjReq.mr = mmr_.get();
567
+ } catch (const std::bad_alloc& rmm_ex) {
568
+ FAISS_THROW_MSG("CUDA memory allocation error");
569
+ }
570
+ #else
455
571
  auto err = cudaMallocManaged(&p, adjReq.size);
456
572
 
457
573
  if (err != cudaSuccess) {
@@ -472,6 +588,7 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
472
588
 
473
589
  FAISS_THROW_IF_NOT_FMT(err == cudaSuccess, "%s", str.c_str());
474
590
  }
591
+ #endif
475
592
  } else {
476
593
  FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space);
477
594
  }
@@ -505,10 +622,12 @@ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
505
622
 
506
623
  if (req.space == MemorySpace::Temporary) {
507
624
  tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
508
-
509
625
  } else if (
510
626
  req.space == MemorySpace::Device ||
511
627
  req.space == MemorySpace::Unified) {
628
+ #if defined USE_NVIDIA_RAFT
629
+ req.mr->deallocate_async(p, req.size, req.stream);
630
+ #else
512
631
  auto err = cudaFree(p);
513
632
  FAISS_ASSERT_FMT(
514
633
  err == cudaSuccess,
@@ -516,7 +635,7 @@ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
516
635
  p,
517
636
  (int)err,
518
637
  cudaGetErrorString(err));
519
-
638
+ #endif
520
639
  } else {
521
640
  FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space);
522
641
  }
@@ -561,7 +680,7 @@ StandardGpuResourcesImpl::getMemoryInfo() const {
561
680
  StandardGpuResources::StandardGpuResources()
562
681
  : res_(new StandardGpuResourcesImpl) {}
563
682
 
564
- StandardGpuResources::~StandardGpuResources() {}
683
+ StandardGpuResources::~StandardGpuResources() = default;
565
684
 
566
685
  std::shared_ptr<GpuResources> StandardGpuResources::getResources() {
567
686
  return res_;
@@ -600,6 +719,12 @@ cudaStream_t StandardGpuResources::getDefaultStream(int device) {
600
719
  return res_->getDefaultStream(device);
601
720
  }
602
721
 
722
+ #if defined USE_NVIDIA_RAFT
723
+ raft::device_resources& StandardGpuResources::getRaftHandle(int device) {
724
+ return res_->getRaftHandle(device);
725
+ }
726
+ #endif
727
+
603
728
  size_t StandardGpuResources::getTempMemoryAvailable(int device) const {
604
729
  return res_->getTempMemoryAvailable(device);
605
730
  }
@@ -4,9 +4,29 @@
4
4
  * This source code is licensed under the MIT license found in the
5
5
  * LICENSE file in the root directory of this source tree.
6
6
  */
7
+ /*
8
+ * Copyright (c) 2023, NVIDIA CORPORATION.
9
+ *
10
+ * Licensed under the Apache License, Version 2.0 (the "License");
11
+ * you may not use this file except in compliance with the License.
12
+ * You may obtain a copy of the License at
13
+ *
14
+ * http://www.apache.org/licenses/LICENSE-2.0
15
+ *
16
+ * Unless required by applicable law or agreed to in writing, software
17
+ * distributed under the License is distributed on an "AS IS" BASIS,
18
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ * See the License for the specific language governing permissions and
20
+ * limitations under the License.
21
+ */
7
22
 
8
23
  #pragma once
9
24
 
25
+ #if defined USE_NVIDIA_RAFT
26
+ #include <raft/core/device_resources.hpp>
27
+ #include <rmm/mr/host/pinned_memory_resource.hpp>
28
+ #endif
29
+
10
30
  #include <faiss/gpu/GpuResources.h>
11
31
  #include <faiss/gpu/utils/DeviceUtils.h>
12
32
  #include <faiss/gpu/utils/StackDeviceMemory.h>
@@ -15,6 +35,7 @@
15
35
  #include <unordered_map>
16
36
  #include <vector>
17
37
 
38
+ #pragma GCC visibility push(default)
18
39
  namespace faiss {
19
40
  namespace gpu {
20
41
 
@@ -58,6 +79,12 @@ class StandardGpuResourcesImpl : public GpuResources {
58
79
  /// this stream upon exit from an index or other Faiss GPU call.
59
80
  cudaStream_t getDefaultStream(int device) override;
60
81
 
82
+ #if defined USE_NVIDIA_RAFT
83
+ /// Returns the raft handle for the given device which can be used to
84
+ /// make calls to other raft primitives.
85
+ raft::device_resources& getRaftHandle(int device) override;
86
+ #endif
87
+
61
88
  /// Called to change the work ordering streams to the null stream
62
89
  /// for all devices
63
90
  void setDefaultNullStreamAllDevices();
@@ -92,7 +119,7 @@ class StandardGpuResourcesImpl : public GpuResources {
92
119
 
93
120
  cudaStream_t getAsyncCopyStream(int device) override;
94
121
 
95
- private:
122
+ protected:
96
123
  /// Have GPU resources been initialized for this device yet?
97
124
  bool isInitialized(int device) const;
98
125
 
@@ -100,7 +127,7 @@ class StandardGpuResourcesImpl : public GpuResources {
100
127
  /// memory size
101
128
  static size_t getDefaultTempMemForGPU(int device, size_t requested);
102
129
 
103
- private:
130
+ protected:
104
131
  /// Set of currently outstanding memory allocations per device
105
132
  /// device -> (alloc request, allocated ptr)
106
133
  std::unordered_map<int, std::unordered_map<void*, AllocRequest>> allocs_;
@@ -124,6 +151,27 @@ class StandardGpuResourcesImpl : public GpuResources {
124
151
  /// cuBLAS handle for each device
125
152
  std::unordered_map<int, cublasHandle_t> blasHandles_;
126
153
 
154
+ #if defined USE_NVIDIA_RAFT
155
+ /// raft handle for each device
156
+ std::unordered_map<int, raft::device_resources> raftHandles_;
157
+
158
+ /**
159
+ * FIXME: Integrating these in a separate code path for now. Ultimately,
160
+ * it would be nice if we use a simple memory resource abstraction
161
+ * in FAISS so we could plug in whether to use RMM's memory resources
162
+ * or the default.
163
+ *
164
+ * There's enough duplicated logic that it doesn't *seem* to make sense
165
+ * to create a subclass only for the RMM memory resources.
166
+ */
167
+
168
+ // managed_memory_resource
169
+ std::unique_ptr<rmm::mr::device_memory_resource> mmr_;
170
+
171
+ // pinned_memory_resource
172
+ std::unique_ptr<rmm::mr::host_memory_resource> pmr_;
173
+ #endif
174
+
127
175
  /// Pinned memory allocation for use with this GPU
128
176
  void* pinnedMemAlloc_;
129
177
  size_t pinnedMemAllocSize_;
@@ -183,10 +231,15 @@ class StandardGpuResources : public GpuResourcesProvider {
183
231
  /// Export a description of memory used for Python
184
232
  std::map<int, std::map<std::string, std::pair<int, size_t>>> getMemoryInfo()
185
233
  const;
186
-
187
234
  /// Returns the current default stream
188
235
  cudaStream_t getDefaultStream(int device);
189
236
 
237
+ #if defined USE_NVIDIA_RAFT
238
+ /// Returns the raft handle for the given device which can be used to
239
+ /// make calls to other raft primitives.
240
+ raft::device_resources& getRaftHandle(int device);
241
+ #endif
242
+
190
243
  /// Returns the current amount of temp memory available
191
244
  size_t getTempMemoryAvailable(int device) const;
192
245
 
@@ -203,3 +256,4 @@ class StandardGpuResources : public GpuResourcesProvider {
203
256
 
204
257
  } // namespace gpu
205
258
  } // namespace faiss
259
+ #pragma GCC visibility pop
@@ -6,6 +6,7 @@
6
6
  */
7
7
 
8
8
  #include <faiss/gpu/impl/InterleavedCodes.h>
9
+ #include <faiss/gpu/utils/DeviceUtils.h>
9
10
  #include <faiss/gpu/utils/StaticUtils.h>
10
11
  #include <faiss/impl/FaissAssert.h>
11
12
 
@@ -166,15 +167,16 @@ void unpackInterleavedWord(
166
167
  int numVecs,
167
168
  int dims,
168
169
  int bitsPerCode) {
169
- int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
170
+ int warpSize = getWarpSizeCurrentDevice();
171
+ int wordsPerDimBlock = (size_t)warpSize * bitsPerCode / (8 * sizeof(T));
170
172
  int wordsPerBlock = wordsPerDimBlock * dims;
171
- int numBlocks = utils::divUp(numVecs, 32);
173
+ int numBlocks = utils::divUp(numVecs, warpSize);
172
174
 
173
175
  #pragma omp parallel for
174
176
  for (int i = 0; i < numVecs; ++i) {
175
- int block = i / 32;
177
+ int block = i / warpSize;
176
178
  FAISS_ASSERT(block < numBlocks);
177
- int lane = i % 32;
179
+ int lane = i % warpSize;
178
180
 
179
181
  for (int j = 0; j < dims; ++j) {
180
182
  int srcOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
@@ -188,9 +190,10 @@ std::vector<uint8_t> unpackInterleaved(
188
190
  int numVecs,
189
191
  int dims,
190
192
  int bitsPerCode) {
191
- int bytesPerDimBlock = 32 * bitsPerCode / 8;
193
+ int warpSize = getWarpSizeCurrentDevice();
194
+ int bytesPerDimBlock = warpSize * bitsPerCode / 8;
192
195
  int bytesPerBlock = bytesPerDimBlock * dims;
193
- int numBlocks = utils::divUp(numVecs, 32);
196
+ int numBlocks = utils::divUp(numVecs, warpSize);
194
197
  size_t totalSize = (size_t)bytesPerBlock * numBlocks;
195
198
  FAISS_ASSERT(data.size() == totalSize);
196
199
 
@@ -217,8 +220,8 @@ std::vector<uint8_t> unpackInterleaved(
217
220
  } else if (bitsPerCode == 4) {
218
221
  #pragma omp parallel for
219
222
  for (int i = 0; i < numVecs; ++i) {
220
- int block = i / 32;
221
- int lane = i % 32;
223
+ int block = i / warpSize;
224
+ int lane = i % warpSize;
222
225
 
223
226
  int word = lane / 2;
224
227
  int subWord = lane % 2;
@@ -235,8 +238,8 @@ std::vector<uint8_t> unpackInterleaved(
235
238
  } else if (bitsPerCode == 5) {
236
239
  #pragma omp parallel for
237
240
  for (int i = 0; i < numVecs; ++i) {
238
- int block = i / 32;
239
- int blockVector = i % 32;
241
+ int block = i / warpSize;
242
+ int blockVector = i % warpSize;
240
243
 
241
244
  for (int j = 0; j < dims; ++j) {
242
245
  uint8_t* dimBlock =
@@ -257,8 +260,8 @@ std::vector<uint8_t> unpackInterleaved(
257
260
  } else if (bitsPerCode == 6) {
258
261
  #pragma omp parallel for
259
262
  for (int i = 0; i < numVecs; ++i) {
260
- int block = i / 32;
261
- int blockVector = i % 32;
263
+ int block = i / warpSize;
264
+ int blockVector = i % warpSize;
262
265
 
263
266
  for (int j = 0; j < dims; ++j) {
264
267
  uint8_t* dimBlock =
@@ -442,17 +445,18 @@ void packInterleavedWord(
442
445
  int numVecs,
443
446
  int dims,
444
447
  int bitsPerCode) {
445
- int wordsPerDimBlock = 32 * bitsPerCode / (8 * sizeof(T));
448
+ int warpSize = getWarpSizeCurrentDevice();
449
+ int wordsPerDimBlock = (size_t)warpSize * bitsPerCode / (8 * sizeof(T));
446
450
  int wordsPerBlock = wordsPerDimBlock * dims;
447
- int numBlocks = utils::divUp(numVecs, 32);
451
+ int numBlocks = utils::divUp(numVecs, warpSize);
448
452
 
449
453
  // We're guaranteed that all other slots not filled by the vectors present
450
454
  // are initialized to zero (from the vector constructor in packInterleaved)
451
455
  #pragma omp parallel for
452
456
  for (int i = 0; i < numVecs; ++i) {
453
- int block = i / 32;
457
+ int block = i / warpSize;
454
458
  FAISS_ASSERT(block < numBlocks);
455
- int lane = i % 32;
459
+ int lane = i % warpSize;
456
460
 
457
461
  for (int j = 0; j < dims; ++j) {
458
462
  int dstOffset = block * wordsPerBlock + j * wordsPerDimBlock + lane;
@@ -466,9 +470,10 @@ std::vector<uint8_t> packInterleaved(
466
470
  int numVecs,
467
471
  int dims,
468
472
  int bitsPerCode) {
469
- int bytesPerDimBlock = 32 * bitsPerCode / 8;
473
+ int warpSize = getWarpSizeCurrentDevice();
474
+ int bytesPerDimBlock = warpSize * bitsPerCode / 8;
470
475
  int bytesPerBlock = bytesPerDimBlock * dims;
471
- int numBlocks = utils::divUp(numVecs, 32);
476
+ int numBlocks = utils::divUp(numVecs, warpSize);
472
477
  size_t totalSize = (size_t)bytesPerBlock * numBlocks;
473
478
 
474
479
  // bit codes padded to whole bytes
@@ -499,7 +504,7 @@ std::vector<uint8_t> packInterleaved(
499
504
  for (int i = 0; i < numBlocks; ++i) {
500
505
  for (int j = 0; j < dims; ++j) {
501
506
  for (int k = 0; k < bytesPerDimBlock; ++k) {
502
- int loVec = i * 32 + k * 2;
507
+ int loVec = i * warpSize + k * 2;
503
508
  int hiVec = loVec + 1;
504
509
 
505
510
  uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
@@ -516,7 +521,7 @@ std::vector<uint8_t> packInterleaved(
516
521
  for (int j = 0; j < dims; ++j) {
517
522
  for (int k = 0; k < bytesPerDimBlock; ++k) {
518
523
  // What input vectors we are pulling from
519
- int loVec = i * 32 + (k * 8) / 5;
524
+ int loVec = i * warpSize + (k * 8) / 5;
520
525
  int hiVec = loVec + 1;
521
526
  int hiVec2 = hiVec + 1;
522
527
 
@@ -536,7 +541,7 @@ std::vector<uint8_t> packInterleaved(
536
541
  for (int j = 0; j < dims; ++j) {
537
542
  for (int k = 0; k < bytesPerDimBlock; ++k) {
538
543
  // What input vectors we are pulling from
539
- int loVec = i * 32 + (k * 8) / 6;
544
+ int loVec = i * warpSize + (k * 8) / 6;
540
545
  int hiVec = loVec + 1;
541
546
 
542
547
  uint8_t lo = loVec < numVecs ? data[loVec * dims + j] : 0;
@@ -17,6 +17,7 @@
17
17
  #include <vector>
18
18
 
19
19
  #include <cuda_profiler_api.h>
20
+ #include <faiss/impl/AuxIndexStructures.h>
20
21
 
21
22
  DEFINE_int32(num, 10000, "# of vecs");
22
23
  DEFINE_int32(k, 100, "# of clusters");
@@ -34,6 +35,7 @@ DEFINE_int64(
34
35
  "minimum size to use CPU -> GPU paged copies");
35
36
  DEFINE_int64(pinned_mem, -1, "pinned memory allocation to use");
36
37
  DEFINE_int32(max_points, -1, "max points per centroid");
38
+ DEFINE_double(timeout, 0, "timeout in seconds");
37
39
 
38
40
  using namespace faiss::gpu;
39
41
 
@@ -42,7 +44,7 @@ int main(int argc, char** argv) {
42
44
 
43
45
  cudaProfilerStop();
44
46
 
45
- auto seed = FLAGS_seed != -1L ? FLAGS_seed : time(nullptr);
47
+ auto seed = FLAGS_seed != -1 ? FLAGS_seed : time(nullptr);
46
48
  printf("using seed %ld\n", seed);
47
49
 
48
50
  std::vector<float> vecs((size_t)FLAGS_num * FLAGS_dim);
@@ -99,10 +101,14 @@ int main(int argc, char** argv) {
99
101
  cp.max_points_per_centroid = FLAGS_max_points;
100
102
  }
101
103
 
104
+ auto tc = new faiss::TimeoutCallback();
105
+ faiss::InterruptCallback::instance.reset(tc);
106
+
102
107
  faiss::Clustering kmeans(FLAGS_dim, FLAGS_k, cp);
103
108
 
104
109
  // Time k-means
105
110
  {
111
+ tc->set_timeout(FLAGS_timeout);
106
112
  CpuTimer timer;
107
113
 
108
114
  kmeans.train(FLAGS_num, vecs.data(), *(gpuIndex.getIndex()));
@@ -7,6 +7,7 @@
7
7
 
8
8
  #include <faiss/gpu/impl/InterleavedCodes.h>
9
9
  #include <faiss/gpu/test/TestUtils.h>
10
+ #include <faiss/gpu/utils/DeviceUtils.h>
10
11
  #include <faiss/gpu/utils/StaticUtils.h>
11
12
  #include <gtest/gtest.h>
12
13
  #include <cmath>
@@ -119,8 +120,9 @@ TEST(TestCodePacking, InterleavedCodes_UnpackPack) {
119
120
  std::cout << bitsPerCode << " " << dims << " " << numVecs
120
121
  << "\n";
121
122
 
122
- int blocks = utils::divUp(numVecs, 32);
123
- int bytesPerDimBlock = 32 * bitsPerCode / 8;
123
+ int warpSize = getWarpSizeCurrentDevice();
124
+ int blocks = utils::divUp(numVecs, warpSize);
125
+ int bytesPerDimBlock = warpSize * bitsPerCode / 8;
124
126
  int bytesPerBlock = bytesPerDimBlock * dims;
125
127
  int size = blocks * bytesPerBlock;
126
128
 
@@ -132,9 +134,9 @@ TEST(TestCodePacking, InterleavedCodes_UnpackPack) {
132
134
 
133
135
  for (int i = 0; i < blocks; ++i) {
134
136
  for (int j = 0; j < dims; ++j) {
135
- for (int k = 0; k < 32; ++k) {
137
+ for (int k = 0; k < warpSize; ++k) {
136
138
  for (int l = 0; l < bytesPerCode; ++l) {
137
- int vec = i * 32 + k;
139
+ int vec = i * warpSize + k;
138
140
  if (vec < numVecs) {
139
141
  data[i * bytesPerBlock +
140
142
  j * bytesPerDimBlock +
@@ -148,7 +150,8 @@ TEST(TestCodePacking, InterleavedCodes_UnpackPack) {
148
150
  for (int i = 0; i < blocks; ++i) {
149
151
  for (int j = 0; j < dims; ++j) {
150
152
  for (int k = 0; k < bytesPerDimBlock; ++k) {
151
- int loVec = i * 32 + (k * 8) / bitsPerCode;
153
+ int loVec =
154
+ i * warpSize + (k * 8) / bitsPerCode;
152
155
  int hiVec = loVec + 1;
153
156
  int hiVec2 = hiVec + 1;
154
157