faiss 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/faiss/version.rb +1 -1
  4. data/vendor/faiss/faiss/AutoTune.h +1 -1
  5. data/vendor/faiss/faiss/Clustering.cpp +35 -4
  6. data/vendor/faiss/faiss/Clustering.h +10 -1
  7. data/vendor/faiss/faiss/IVFlib.cpp +4 -1
  8. data/vendor/faiss/faiss/Index.h +21 -6
  9. data/vendor/faiss/faiss/IndexBinaryHNSW.h +1 -1
  10. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -1
  11. data/vendor/faiss/faiss/IndexFastScan.cpp +22 -4
  12. data/vendor/faiss/faiss/IndexFlat.cpp +11 -7
  13. data/vendor/faiss/faiss/IndexFlatCodes.cpp +159 -5
  14. data/vendor/faiss/faiss/IndexFlatCodes.h +20 -3
  15. data/vendor/faiss/faiss/IndexHNSW.cpp +143 -90
  16. data/vendor/faiss/faiss/IndexHNSW.h +52 -3
  17. data/vendor/faiss/faiss/IndexIVF.cpp +3 -3
  18. data/vendor/faiss/faiss/IndexIVF.h +9 -1
  19. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +15 -0
  20. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +3 -0
  21. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +130 -57
  22. data/vendor/faiss/faiss/IndexIVFFastScan.h +14 -7
  23. data/vendor/faiss/faiss/IndexIVFPQ.cpp +1 -3
  24. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +21 -2
  25. data/vendor/faiss/faiss/IndexLattice.cpp +1 -19
  26. data/vendor/faiss/faiss/IndexLattice.h +3 -22
  27. data/vendor/faiss/faiss/IndexNNDescent.cpp +0 -29
  28. data/vendor/faiss/faiss/IndexNNDescent.h +1 -1
  29. data/vendor/faiss/faiss/IndexNSG.h +1 -1
  30. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +56 -0
  31. data/vendor/faiss/faiss/IndexNeuralNetCodec.h +49 -0
  32. data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
  33. data/vendor/faiss/faiss/IndexRefine.cpp +5 -5
  34. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +3 -1
  35. data/vendor/faiss/faiss/MetricType.h +7 -2
  36. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +95 -17
  37. data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +152 -0
  38. data/vendor/faiss/faiss/cppcontrib/factory_tools.h +24 -0
  39. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +83 -30
  40. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +36 -4
  41. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +6 -0
  42. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -1
  43. data/vendor/faiss/faiss/gpu/GpuIndex.h +2 -8
  44. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +282 -0
  45. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +6 -0
  46. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +2 -0
  47. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +25 -0
  48. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +26 -21
  49. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +6 -0
  50. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +8 -5
  51. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +65 -0
  52. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
  53. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
  54. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +4 -1
  55. data/vendor/faiss/faiss/gpu/utils/Timer.h +1 -1
  56. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +25 -0
  57. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +9 -1
  58. data/vendor/faiss/faiss/impl/DistanceComputer.h +46 -0
  59. data/vendor/faiss/faiss/impl/FaissAssert.h +4 -2
  60. data/vendor/faiss/faiss/impl/HNSW.cpp +358 -190
  61. data/vendor/faiss/faiss/impl/HNSW.h +43 -22
  62. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +8 -8
  63. data/vendor/faiss/faiss/impl/LookupTableScaler.h +34 -0
  64. data/vendor/faiss/faiss/impl/NNDescent.cpp +13 -8
  65. data/vendor/faiss/faiss/impl/NSG.cpp +0 -29
  66. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +1 -0
  67. data/vendor/faiss/faiss/impl/ProductQuantizer.h +5 -1
  68. data/vendor/faiss/faiss/impl/ResultHandler.h +151 -32
  69. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +719 -102
  70. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +3 -0
  71. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +5 -0
  72. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +248 -0
  73. data/vendor/faiss/faiss/impl/index_read.cpp +29 -15
  74. data/vendor/faiss/faiss/impl/index_read_utils.h +37 -0
  75. data/vendor/faiss/faiss/impl/index_write.cpp +28 -10
  76. data/vendor/faiss/faiss/impl/io.cpp +13 -5
  77. data/vendor/faiss/faiss/impl/io.h +4 -4
  78. data/vendor/faiss/faiss/impl/io_macros.h +6 -0
  79. data/vendor/faiss/faiss/impl/platform_macros.h +22 -0
  80. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +11 -0
  81. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +1 -1
  82. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +448 -1
  83. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +5 -5
  84. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
  85. data/vendor/faiss/faiss/impl/simd_result_handlers.h +143 -59
  86. data/vendor/faiss/faiss/index_factory.cpp +31 -13
  87. data/vendor/faiss/faiss/index_io.h +12 -5
  88. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +28 -8
  89. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +3 -0
  90. data/vendor/faiss/faiss/invlists/DirectMap.cpp +9 -1
  91. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +55 -17
  92. data/vendor/faiss/faiss/invlists/InvertedLists.h +18 -9
  93. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +21 -6
  94. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -1
  95. data/vendor/faiss/faiss/python/python_callbacks.cpp +3 -3
  96. data/vendor/faiss/faiss/utils/Heap.h +105 -0
  97. data/vendor/faiss/faiss/utils/NeuralNet.cpp +342 -0
  98. data/vendor/faiss/faiss/utils/NeuralNet.h +147 -0
  99. data/vendor/faiss/faiss/utils/bf16.h +36 -0
  100. data/vendor/faiss/faiss/utils/distances.cpp +58 -88
  101. data/vendor/faiss/faiss/utils/distances.h +5 -5
  102. data/vendor/faiss/faiss/utils/distances_simd.cpp +997 -9
  103. data/vendor/faiss/faiss/utils/extra_distances-inl.h +70 -0
  104. data/vendor/faiss/faiss/utils/extra_distances.cpp +85 -137
  105. data/vendor/faiss/faiss/utils/extra_distances.h +3 -2
  106. data/vendor/faiss/faiss/utils/hamming.cpp +1 -1
  107. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +4 -1
  108. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +2 -1
  109. data/vendor/faiss/faiss/utils/random.cpp +43 -0
  110. data/vendor/faiss/faiss/utils/random.h +25 -0
  111. data/vendor/faiss/faiss/utils/simdlib.h +10 -1
  112. data/vendor/faiss/faiss/utils/simdlib_avx512.h +296 -0
  113. data/vendor/faiss/faiss/utils/simdlib_neon.h +5 -2
  114. data/vendor/faiss/faiss/utils/simdlib_ppc64.h +1084 -0
  115. data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +176 -0
  116. data/vendor/faiss/faiss/utils/utils.cpp +10 -3
  117. data/vendor/faiss/faiss/utils/utils.h +3 -0
  118. metadata +16 -4
  119. data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +0 -102
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h
@@ -12,10 +12,19 @@
 #include <cstdint>
 
 #include <faiss/cppcontrib/detail/CoarseBitType.h>
+#include <faiss/impl/platform_macros.h>
 
 namespace faiss {
 namespace cppcontrib {
 
+bool isBigEndian() {
+#ifdef FAISS_BIG_ENDIAN
+    return true;
+#else
+    return false;
+#endif
+}
+
 ////////////////////////////////////////////////////////////////////////////////////
 /// Index2LevelDecoder
 ////////////////////////////////////////////////////////////////////////////////////
@@ -72,9 +81,14 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode = coarse[coarseCentroidIdx];
-        const intptr_t fineCode = fine[fineCentroidIdx];
+        intptr_t coarseCode, fineCode;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode = Swap2Bytes(coarse[coarseCentroidIdx]);
+            fineCode = Swap2Bytes(fine[fineCentroidIdx]);
+        } else {
+            coarseCode = coarse[coarseCentroidIdx];
+            fineCode = fine[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) *
@@ -112,9 +126,14 @@ struct Index2LevelDecoder {
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
 
-        const intptr_t coarseCode = coarse[coarseCentroidIdx];
-        const intptr_t fineCode = fine[fineCentroidIdx];
-
+        intptr_t coarseCode, fineCode;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode = Swap2Bytes(coarse[coarseCentroidIdx]);
+            fineCode = Swap2Bytes(fine[fineCentroidIdx]);
+        } else {
+            coarseCode = coarse[coarseCentroidIdx];
+            fineCode = fine[fineCentroidIdx];
+        }
         const float* const __restrict coarsePtr = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) *
                 COARSE_SIZE +
@@ -162,11 +181,18 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
+        intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids0 +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
@@ -222,11 +248,18 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
        const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
+        intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
@@ -292,13 +325,23 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
-        const intptr_t coarseCode2 = coarse2[coarseCentroidIdx];
-        const intptr_t fineCode2 = fine2[fineCentroidIdx];
+        intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1;
+        intptr_t coarseCode2, fineCode2;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+            coarseCode2 = Swap2Bytes(coarse2[coarseCentroidIdx]);
+            fineCode2 = Swap2Bytes(fine2[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+            coarseCode2 = coarse2[coarseCentroidIdx];
+            fineCode2 = fine2[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids0 +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
@@ -369,13 +412,23 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
-        const intptr_t coarseCode2 = coarse2[coarseCentroidIdx];
-        const intptr_t fineCode2 = fine2[fineCentroidIdx];
+        intptr_t coarseCode0, fineCode0, coarseCode1, fineCode1;
+        intptr_t coarseCode2, fineCode2;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+            coarseCode2 = Swap2Bytes(coarse2[coarseCentroidIdx]);
+            fineCode2 = Swap2Bytes(fine2[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+            coarseCode2 = coarse2[coarseCentroidIdx];
+            fineCode2 = fine2[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
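The Level2-inl.h changes above make the two-level PQ decoder endian-aware: 16-bit codes are stored in little-endian order, so a big-endian host must byte-swap them before using them as centroid-table offsets. Below is a minimal standalone sketch of the same guard; `Swap2Bytes` here is a hypothetical stand-in for the macro that faiss/impl/platform_macros.h is expected to provide, and `FAISS_BIG_ENDIAN` is assumed to be defined by the build system on big-endian targets.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the 16-bit byte-swap macro from platform_macros.h.
#define Swap2Bytes(x) \
    ((uint16_t)((((x) & 0x00ffU) << 8) | (((x) & 0xff00U) >> 8)))

// Mirrors the helper added in the diff; FAISS_BIG_ENDIAN is assumed to be
// set by the build system.
bool isBigEndian() {
#ifdef FAISS_BIG_ENDIAN
    return true;
#else
    return false;
#endif
}

int main() {
    // A 16-bit coarse code as laid out in a file written on a
    // little-endian machine.
    uint16_t stored = 0x0102;

    // On a big-endian reader the bytes arrive reversed, so they are
    // swapped back before being used as a table offset.
    uint16_t code = isBigEndian() ? Swap2Bytes(stored) : stored;
    std::printf("decoded code: 0x%04x\n", code);
    return 0;
}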
data/vendor/faiss/faiss/gpu/GpuCloner.cpp
@@ -14,6 +14,9 @@
 
 #include <faiss/IndexBinaryFlat.h>
 #include <faiss/IndexFlat.h>
+#if defined USE_NVIDIA_RAFT
+#include <faiss/IndexHNSW.h>
+#endif
 #include <faiss/IndexIVF.h>
 #include <faiss/IndexIVFFlat.h>
 #include <faiss/IndexIVFPQ.h>
@@ -24,6 +27,9 @@
 #include <faiss/MetaIndexes.h>
 #include <faiss/gpu/GpuIndex.h>
 #include <faiss/gpu/GpuIndexBinaryFlat.h>
+#if defined USE_NVIDIA_RAFT
+#include <faiss/gpu/GpuIndexCagra.h>
+#endif
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/GpuIndexIVFFlat.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
@@ -85,7 +91,15 @@ Index* ToCPUCloner::clone_Index(const Index* index) {
         // objective is to make a single component out of them
         // (inverse op of ToGpuClonerMultiple)
 
-    } else if (auto ish = dynamic_cast<const IndexShards*>(index)) {
+    }
+#if defined USE_NVIDIA_RAFT
+    else if (auto icg = dynamic_cast<const GpuIndexCagra*>(index)) {
+        IndexHNSWCagra* res = new IndexHNSWCagra();
+        icg->copyTo(res);
+        return res;
+    }
+#endif
+    else if (auto ish = dynamic_cast<const IndexShards*>(index)) {
         int nshard = ish->count();
         FAISS_ASSERT(nshard > 0);
         Index* res = clone_Index(ish->at(0));
@@ -153,6 +167,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.indicesOptions = indicesOptions;
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
         config.use_raft = use_raft;
+        config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer;
 
         GpuIndexIVFFlat* res = new GpuIndexIVFFlat(
                 provider, ifl->d, ifl->nlist, ifl->metric_type, config);
@@ -205,6 +220,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.usePrecomputedTables = usePrecomputed;
         config.use_raft = use_raft;
         config.interleavedLayout = use_raft;
+        config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer;
 
         GpuIndexIVFPQ* res = new GpuIndexIVFPQ(provider, ipq, config);
 
@@ -213,9 +229,25 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         }
 
         return res;
-    } else {
-        // default: use CPU cloner
-        return Cloner::clone_Index(index);
+    }
+#if defined USE_NVIDIA_RAFT
+    else if (auto icg = dynamic_cast<const faiss::IndexHNSWCagra*>(index)) {
+        GpuIndexCagraConfig config;
+        config.device = device;
+        GpuIndexCagra* res =
+                new GpuIndexCagra(provider, icg->d, icg->metric_type, config);
+        res->copyFrom(icg);
+        return res;
+    }
+#endif
+    else {
+        // use CPU cloner for IDMap and PreTransform
+        auto index_idmap = dynamic_cast<const IndexIDMap*>(index);
+        auto index_pt = dynamic_cast<const IndexPreTransform*>(index);
+        if (index_idmap || index_pt) {
+            return Cloner::clone_Index(index);
+        }
+        FAISS_THROW_MSG("This index type is not implemented on GPU.");
    }
 }
 
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h
@@ -43,6 +43,12 @@ struct GpuClonerOptions {
 #else
     bool use_raft = false;
 #endif
+
+    /// This flag controls the CPU fallback logic for coarse quantizer
+    /// component of the index. When set to false (default), the cloner will
+    /// throw an exception for indices not implemented on GPU. When set to
+    /// true, it will fallback to a CPU implementation.
+    bool allowCpuCoarseQuantizer = false;
 };
 
 struct GpuMultipleClonerOptions : public GpuClonerOptions {
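Taken together with the GpuCloner.cpp change above, cloning a CPU index with components that have no GPU implementation now throws unless this flag opts in to a CPU fallback (IDMap and PreTransform wrappers still fall through to the CPU cloner). A hedged sketch of how the flag might be used; the HNSW coarse quantizer is an illustrative choice of a component without a GPU counterpart:

#include <faiss/IndexHNSW.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/gpu/GpuCloner.h>
#include <faiss/gpu/StandardGpuResources.h>

int main() {
    faiss::gpu::StandardGpuResources res;

    // An IVF index whose coarse quantizer (HNSW) stays CPU-only.
    faiss::IndexHNSWFlat quantizer(64, 32);
    faiss::IndexIVFFlat cpu_index(&quantizer, 64, 1024);

    faiss::gpu::GpuClonerOptions options;
    // Opt in to keeping the unsupported coarse quantizer on the CPU
    // instead of letting the cloner throw.
    options.allowCpuCoarseQuantizer = true;

    faiss::Index* gpu_index = faiss::gpu::index_cpu_to_gpu(
            &res, /*device=*/0, &cpu_index, &options);
    delete gpu_index;
    return 0;
}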
data/vendor/faiss/faiss/gpu/GpuFaissAssert.h
@@ -15,7 +15,7 @@
 /// Assertions
 ///
 
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(USE_AMD_ROCM)
 #define GPU_FAISS_ASSERT(X) assert(X)
 #define GPU_FAISS_ASSERT_MSG(X, MSG) assert(X)
 #define GPU_FAISS_ASSERT_FMT(X, FMT, ...) assert(X)
data/vendor/faiss/faiss/gpu/GpuIndex.h
@@ -84,19 +84,14 @@ class GpuIndex : public faiss::Index {
 
     /// `x` and `labels` can be resident on the CPU or any GPU; copies are
     /// performed as needed
-    void assign(
-            idx_t n,
-            const float* x,
-            idx_t* labels,
-            // faiss::Index has idx_t for k
-            idx_t k = 1) const override;
+    void assign(idx_t n, const float* x, idx_t* labels, idx_t k = 1)
+            const override;
 
     /// `x`, `distances` and `labels` can be resident on the CPU or any
     /// GPU; copies are performed as needed
     void search(
             idx_t n,
             const float* x,
-            // faiss::Index has idx_t for k
             idx_t k,
             float* distances,
             idx_t* labels,
@@ -107,7 +102,6 @@ class GpuIndex : public faiss::Index {
     void search_and_reconstruct(
             idx_t n,
             const float* x,
-            // faiss::Index has idx_t for k
             idx_t k,
             float* distances,
             idx_t* labels,
data/vendor/faiss/faiss/gpu/GpuIndexCagra.h (new file)
@@ -0,0 +1,282 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <faiss/IndexIVF.h>
+#include <faiss/gpu/GpuIndex.h>
+#include <faiss/gpu/GpuIndexIVFPQ.h>
+
+namespace faiss {
+struct IndexHNSWCagra;
+}
+
+namespace faiss {
+namespace gpu {
+
+class RaftCagra;
+
+enum class graph_build_algo {
+    /// Use IVF-PQ to build all-neighbors knn graph
+    IVF_PQ,
+    /// Experimental, use NN-Descent to build all-neighbors knn graph
+    NN_DESCENT
+};
+
+/// A type for specifying how PQ codebooks are created.
+enum class codebook_gen { // NOLINT
+    PER_SUBSPACE = 0, // NOLINT
+    PER_CLUSTER = 1, // NOLINT
+};
+
+struct IVFPQBuildCagraConfig {
+    ///
+    /// The number of inverted lists (clusters)
+    ///
+    /// Hint: the number of vectors per cluster (`n_rows/n_lists`) should be
+    /// approximately 1,000 to 10,000.
+
+    uint32_t n_lists = 1024;
+    /// The number of iterations searching for kmeans centers (index building).
+    uint32_t kmeans_n_iters = 20;
+    /// The fraction of data to use during iterative kmeans building.
+    double kmeans_trainset_fraction = 0.5;
+    ///
+    /// The bit length of the vector element after compression by PQ.
+    ///
+    /// Possible values: [4, 5, 6, 7, 8].
+    ///
+    /// Hint: the smaller the 'pq_bits', the smaller the index size and the
+    /// better the search performance, but the lower the recall.
+
+    uint32_t pq_bits = 8;
+    ///
+    /// The dimensionality of the vector after compression by PQ. When zero, an
+    /// optimal value is selected using a heuristic.
+    ///
+    /// NB: `pq_dim /// pq_bits` must be a multiple of 8.
+    ///
+    /// Hint: a smaller 'pq_dim' results in a smaller index size and better
+    /// search performance, but lower recall. If 'pq_bits' is 8, 'pq_dim' can be
+    /// set to any number, but multiple of 8 are desirable for good performance.
+    /// If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8. For good
+    /// performance, it is desirable that 'pq_dim' is a multiple of 32. Ideally,
+    /// 'pq_dim' should be also a divisor of the dataset dim.
+
+    uint32_t pq_dim = 0;
+    /// How PQ codebooks are created.
+    codebook_gen codebook_kind = codebook_gen::PER_SUBSPACE;
+    ///
+    /// Apply a random rotation matrix on the input data and queries even if
+    /// `dim % pq_dim == 0`.
+    ///
+    /// Note: if `dim` is not multiple of `pq_dim`, a random rotation is always
+    /// applied to the input data and queries to transform the working space
+    /// from `dim` to `rot_dim`, which may be slightly larger than the original
+    /// space and and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`).
+    /// However, this transform is not necessary when `dim` is multiple of
+    /// `pq_dim`
+    /// (`dim == rot_dim`, hence no need in adding "extra" data columns /
+    /// features).
+    ///
+    /// By default, if `dim == rot_dim`, the rotation transform is initialized
+    /// with the identity matrix. When `force_random_rotation == true`, a random
+    /// orthogonal transform matrix is generated regardless of the values of
+    /// `dim` and `pq_dim`.
+
+    bool force_random_rotation = false;
+    ///
+    /// By default, the algorithm allocates more space than necessary for
+    /// individual clusters
+    /// (`list_data`). This allows to amortize the cost of memory allocation and
+    /// reduce the number of data copies during repeated calls to `extend`
+    /// (extending the database).
+    ///
+    /// The alternative is the conservative allocation behavior; when enabled,
+    /// the algorithm always allocates the minimum amount of memory required to
+    /// store the given number of records. Set this flag to `true` if you prefer
+    /// to use as little GPU memory for the database as possible.
+
+    bool conservative_memory_allocation = false;
+};
+
+struct IVFPQSearchCagraConfig {
+    /// The number of clusters to search.
+    uint32_t n_probes = 20;
+    ///
+    /// Data type of look up table to be created dynamically at search time.
+    ///
+    /// Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U]
+    ///
+    /// The use of low-precision types reduces the amount of shared memory
+    /// required at search time, so fast shared memory kernels can be used even
+    /// for datasets with large dimansionality. Note that the recall is slightly
+    /// degraded when low-precision type is selected.
+
+    cudaDataType_t lut_dtype = CUDA_R_32F;
+    ///
+    /// Storage data type for distance/similarity computed at search time.
+    ///
+    /// Possible values: [CUDA_R_16F, CUDA_R_32F]
+    ///
+    /// If the performance limiter at search time is device memory access,
+    /// selecting FP16 will improve performance slightly.
+
+    cudaDataType_t internal_distance_dtype = CUDA_R_32F;
+    ///
+    /// Preferred fraction of SM's unified memory / L1 cache to be used as
+    /// shared memory.
+    ///
+    /// Possible values: [0.0 - 1.0] as a fraction of the
+    /// `sharedMemPerMultiprocessor`.
+    ///
+    /// One wants to increase the carveout to make sure a good GPU occupancy for
+    /// the main search kernel, but not to keep it too high to leave some memory
+    /// to be used as L1 cache. Note, this value is interpreted only as a hint.
+    /// Moreover, a GPU usually allows only a fixed set of cache configurations,
+    /// so the provided value is rounded up to the nearest configuration. Refer
+    /// to the NVIDIA tuning guide for the target GPU architecture.
+    ///
+    /// Note, this is a low-level tuning parameter that can have drastic
+    /// negative effects on the search performance if tweaked incorrectly.
+
+    double preferred_shmem_carveout = 1.0;
+};
+
+struct GpuIndexCagraConfig : public GpuIndexConfig {
+    /// Degree of input graph for pruning.
+    size_t intermediate_graph_degree = 128;
+    /// Degree of output graph.
+    size_t graph_degree = 64;
+    /// ANN algorithm to build knn graph.
+    graph_build_algo build_algo = graph_build_algo::IVF_PQ;
+    /// Number of Iterations to run if building with NN_DESCENT
+    size_t nn_descent_niter = 20;
+
+    IVFPQBuildCagraConfig* ivf_pq_params = nullptr;
+    IVFPQSearchCagraConfig* ivf_pq_search_params = nullptr;
+};
+
+enum class search_algo {
+    /// For large batch sizes.
+    SINGLE_CTA,
+    /// For small batch sizes.
+    MULTI_CTA,
+    MULTI_KERNEL,
+    AUTO
+};
+
+enum class hash_mode { HASH, SMALL, AUTO };
+
+struct SearchParametersCagra : SearchParameters {
+    /// Maximum number of queries to search at the same time (batch size). Auto
+    /// select when 0.
+    size_t max_queries = 0;
+
+    /// Number of intermediate search results retained during the search.
+    ///
+    /// This is the main knob to adjust trade off between accuracy and search
+    /// speed. Higher values improve the search accuracy.
+
+    size_t itopk_size = 64;
+
+    /// Upper limit of search iterations. Auto select when 0.
+    size_t max_iterations = 0;
+
+    // In the following we list additional search parameters for fine tuning.
+    // Reasonable default values are automatically chosen.
+
+    /// Which search implementation to use.
+    search_algo algo = search_algo::AUTO;
+
+    /// Number of threads used to calculate a single distance. 4, 8, 16, or 32.
+
+    size_t team_size = 0;
+
+    /// Number of graph nodes to select as the starting point for the search in
+    /// each iteration. aka search width?
+    size_t search_width = 1;
+    /// Lower limit of search iterations.
+    size_t min_iterations = 0;
+
+    /// Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0.
+    size_t thread_block_size = 0;
+    /// Hashmap type. Auto selection when AUTO.
+    hash_mode hashmap_mode = hash_mode::AUTO;
+    /// Lower limit of hashmap bit length. More than 8.
+    size_t hashmap_min_bitlen = 0;
+    /// Upper limit of hashmap fill rate. More than 0.1, less than 0.9.
+    float hashmap_max_fill_rate = 0.5;
+
+    /// Number of iterations of initial random seed node selection. 1 or more.
+
+    uint32_t num_random_samplings = 1;
+    /// Bit mask used for initial random seed node selection.
+    uint64_t seed = 0x128394;
+};
+
+struct GpuIndexCagra : public GpuIndex {
+   public:
+    GpuIndexCagra(
+            GpuResourcesProvider* provider,
+            int dims,
+            faiss::MetricType metric = faiss::METRIC_L2,
+            GpuIndexCagraConfig config = GpuIndexCagraConfig());
+
+    /// Trains CAGRA based on the given vector data
+    void train(idx_t n, const float* x) override;
+
+    /// Initialize ourselves from the given CPU index; will overwrite
+    /// all data in ourselves
+    void copyFrom(const faiss::IndexHNSWCagra* index);
+
+    /// Copy ourselves to the given CPU index; will overwrite all data
+    /// in the index instance
+    void copyTo(faiss::IndexHNSWCagra* index) const;
+
+    void reset() override;
+
+    std::vector<idx_t> get_knngraph() const;
+
+   protected:
+    bool addImplRequiresIDs_() const override;
+
+    void addImpl_(idx_t n, const float* x, const idx_t* ids) override;
+
+    /// Called from GpuIndex for search
+    void searchImpl_(
+            idx_t n,
+            const float* x,
+            int k,
+            float* distances,
+            idx_t* labels,
+            const SearchParameters* search_params) const override;
+
+    /// Our configuration options
+    const GpuIndexCagraConfig cagraConfig_;
+
+    /// Instance that we own; contains the inverted lists
+    std::shared_ptr<RaftCagra> index_;
+};
+
+} // namespace gpu
+} // namespace faiss
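GpuIndexCagra is the headline addition of this release: a GPU graph index (RAFT CAGRA) that can be converted to and from the new CPU IndexHNSWCagra via the cloners shown earlier. A sketch of plausible usage based on the header above, assuming a build with USE_NVIDIA_RAFT; the parameter values are illustrative, not recommendations:

#include <faiss/gpu/GpuIndexCagra.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <vector>

int main() {
    faiss::gpu::StandardGpuResources res;

    faiss::gpu::GpuIndexCagraConfig config;
    config.device = 0;
    config.graph_degree = 64;
    config.intermediate_graph_degree = 128;
    config.build_algo = faiss::gpu::graph_build_algo::IVF_PQ;

    int d = 128;
    faiss::gpu::GpuIndexCagra index(&res, d, faiss::METRIC_L2, config);

    // CAGRA builds its knn graph from the vectors passed to train().
    std::vector<float> xb(10000 * d, 0.0f); // fill with real data
    index.train(10000, xb.data());

    // Search with the CAGRA-specific parameters declared above.
    faiss::gpu::SearchParametersCagra params;
    params.itopk_size = 64;

    int k = 10;
    std::vector<float> distances(k);
    std::vector<faiss::idx_t> labels(k);
    index.search(1, xb.data(), k, distances.data(), labels.data(), &params);
    return 0;
}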
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h
@@ -26,6 +26,12 @@ struct GpuIndexIVFConfig : public GpuIndexConfig {
 
     /// Configuration for the coarse quantizer object
     GpuIndexFlatConfig flatConfig;
+
+    /// This flag controls the CPU fallback logic for coarse quantizer
+    /// component of the index. When set to false (default), the cloner will
+    /// throw an exception for indices not implemented on GPU. When set to
+    /// true, it will fallback to a CPU implementation.
+    bool allowCpuCoarseQuantizer = false;
 };
 
 /// Base class of all GPU IVF index types. This (for now) deliberately does not
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h
@@ -87,6 +87,8 @@ class GpuIndexIVFFlat : public GpuIndexIVF {
     /// Trains the coarse quantizer based on the given vector data
     void train(idx_t n, const float* x) override;
 
+    void reconstruct_n(idx_t i0, idx_t n, float* out) const override;
+
   protected:
     /// Initialize appropriate index
    void setIndex_(
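The new reconstruct_n override lets stored vectors be decoded back out of a GpuIndexIVFFlat over a contiguous id range (exercised by the new tests in TestGpuIndexIVFFlat.cpp). A hedged sketch; it assumes vectors were added with default sequential ids, which is what makes a range request well-defined:

#include <faiss/gpu/GpuIndexIVFFlat.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <vector>

int main() {
    faiss::gpu::StandardGpuResources res;
    int d = 32;
    faiss::gpu::GpuIndexIVFFlat index(&res, d, /*nlist=*/64, faiss::METRIC_L2);

    std::vector<float> xb(1000 * d, 0.5f); // placeholder data
    index.train(1000, xb.data());
    index.add(1000, xb.data()); // default ids 0..999

    // Recover the first ten stored vectors from the GPU index.
    std::vector<float> recons(10 * d);
    index.reconstruct_n(/*i0=*/0, /*n=*/10, recons.data());
    return 0;
}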
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp
@@ -257,6 +257,14 @@ void StandardGpuResourcesImpl::setDefaultStream(
         if (prevStream != stream) {
             streamWait({stream}, {prevStream});
         }
+#if defined USE_NVIDIA_RAFT
+        // delete the raft handle for this device, which will be initialized
+        // with the updated stream during any subsequent calls to getRaftHandle
+        auto it2 = raftHandles_.find(device);
+        if (it2 != raftHandles_.end()) {
+            raftHandles_.erase(it2);
+        }
+#endif
     }
 
     userDefaultStreams_[device] = stream;
@@ -275,6 +283,14 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
 
            streamWait({newStream}, {prevStream});
        }
+#if defined USE_NVIDIA_RAFT
+        // delete the raft handle for this device, which will be initialized
+        // with the updated stream during any subsequent calls to getRaftHandle
+        auto it2 = raftHandles_.find(device);
+        if (it2 != raftHandles_.end()) {
+            raftHandles_.erase(it2);
+        }
+#endif
     }
 
     userDefaultStreams_.erase(device);
@@ -347,11 +363,20 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
             prop.major,
             prop.minor);
 
+#if USE_AMD_ROCM
+    // Our code is pre-built with and expects warpSize == 32 or 64, validate
+    // that
+    FAISS_ASSERT_FMT(
+            prop.warpSize == 32 || prop.warpSize == 64,
+            "Device id %d does not have expected warpSize of 32 or 64",
+            device);
+#else
     // Our code is pre-built with and expects warpSize == 32, validate that
     FAISS_ASSERT_FMT(
             prop.warpSize == 32,
             "Device id %d does not have expected warpSize of 32",
             device);
+#endif
 
     // Create streams
     cudaStream_t defaultStream = nullptr;
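Two behavioral notes fall out of these hunks: on RAFT builds, replacing or reverting a device's default stream now also discards the cached RAFT handle, so the handle is lazily rebuilt against the new stream on the next use; and ROCm builds additionally accept a warpSize of 64. A sketch of the stream-swap path, assuming a CUDA build:

#include <cuda_runtime.h>
#include <faiss/gpu/StandardGpuResources.h>

int main() {
    faiss::gpu::StandardGpuResources res;

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // With USE_NVIDIA_RAFT, this also drops the cached RAFT handle for
    // device 0; it is recreated on the new stream when next requested.
    res.setDefaultStream(/*device=*/0, stream);

    // ... issue index operations ordered on `stream` ...

    res.revertDefaultStream(0); // same handle-refresh logic applies
    cudaStreamDestroy(stream);
    return 0;
}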