faiss 0.3.1 → 0.3.2

Files changed (119)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/faiss/version.rb +1 -1
  4. data/vendor/faiss/faiss/AutoTune.h +1 -1
  5. data/vendor/faiss/faiss/Clustering.cpp +35 -4
  6. data/vendor/faiss/faiss/Clustering.h +10 -1
  7. data/vendor/faiss/faiss/IVFlib.cpp +4 -1
  8. data/vendor/faiss/faiss/Index.h +21 -6
  9. data/vendor/faiss/faiss/IndexBinaryHNSW.h +1 -1
  10. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -1
  11. data/vendor/faiss/faiss/IndexFastScan.cpp +22 -4
  12. data/vendor/faiss/faiss/IndexFlat.cpp +11 -7
  13. data/vendor/faiss/faiss/IndexFlatCodes.cpp +159 -5
  14. data/vendor/faiss/faiss/IndexFlatCodes.h +20 -3
  15. data/vendor/faiss/faiss/IndexHNSW.cpp +143 -90
  16. data/vendor/faiss/faiss/IndexHNSW.h +52 -3
  17. data/vendor/faiss/faiss/IndexIVF.cpp +3 -3
  18. data/vendor/faiss/faiss/IndexIVF.h +9 -1
  19. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +15 -0
  20. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +3 -0
  21. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +130 -57
  22. data/vendor/faiss/faiss/IndexIVFFastScan.h +14 -7
  23. data/vendor/faiss/faiss/IndexIVFPQ.cpp +1 -3
  24. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +21 -2
  25. data/vendor/faiss/faiss/IndexLattice.cpp +1 -19
  26. data/vendor/faiss/faiss/IndexLattice.h +3 -22
  27. data/vendor/faiss/faiss/IndexNNDescent.cpp +0 -29
  28. data/vendor/faiss/faiss/IndexNNDescent.h +1 -1
  29. data/vendor/faiss/faiss/IndexNSG.h +1 -1
  30. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +56 -0
  31. data/vendor/faiss/faiss/IndexNeuralNetCodec.h +49 -0
  32. data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
  33. data/vendor/faiss/faiss/IndexRefine.cpp +5 -5
  34. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +3 -1
  35. data/vendor/faiss/faiss/MetricType.h +7 -2
  36. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +95 -17
  37. data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +152 -0
  38. data/vendor/faiss/faiss/cppcontrib/factory_tools.h +24 -0
  39. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +83 -30
  40. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +36 -4
  41. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +6 -0
  42. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -1
  43. data/vendor/faiss/faiss/gpu/GpuIndex.h +2 -8
  44. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +282 -0
  45. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +6 -0
  46. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +2 -0
  47. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +25 -0
  48. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +26 -21
  49. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +6 -0
  50. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +8 -5
  51. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +65 -0
  52. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
  53. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
  54. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +4 -1
  55. data/vendor/faiss/faiss/gpu/utils/Timer.h +1 -1
  56. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +25 -0
  57. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +9 -1
  58. data/vendor/faiss/faiss/impl/DistanceComputer.h +46 -0
  59. data/vendor/faiss/faiss/impl/FaissAssert.h +4 -2
  60. data/vendor/faiss/faiss/impl/HNSW.cpp +358 -190
  61. data/vendor/faiss/faiss/impl/HNSW.h +43 -22
  62. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +8 -8
  63. data/vendor/faiss/faiss/impl/LookupTableScaler.h +34 -0
  64. data/vendor/faiss/faiss/impl/NNDescent.cpp +13 -8
  65. data/vendor/faiss/faiss/impl/NSG.cpp +0 -29
  66. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +1 -0
  67. data/vendor/faiss/faiss/impl/ProductQuantizer.h +5 -1
  68. data/vendor/faiss/faiss/impl/ResultHandler.h +151 -32
  69. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +719 -102
  70. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +3 -0
  71. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +5 -0
  72. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +248 -0
  73. data/vendor/faiss/faiss/impl/index_read.cpp +29 -15
  74. data/vendor/faiss/faiss/impl/index_read_utils.h +37 -0
  75. data/vendor/faiss/faiss/impl/index_write.cpp +28 -10
  76. data/vendor/faiss/faiss/impl/io.cpp +13 -5
  77. data/vendor/faiss/faiss/impl/io.h +4 -4
  78. data/vendor/faiss/faiss/impl/io_macros.h +6 -0
  79. data/vendor/faiss/faiss/impl/platform_macros.h +22 -0
  80. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +11 -0
  81. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +1 -1
  82. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +448 -1
  83. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +5 -5
  84. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
  85. data/vendor/faiss/faiss/impl/simd_result_handlers.h +143 -59
  86. data/vendor/faiss/faiss/index_factory.cpp +31 -13
  87. data/vendor/faiss/faiss/index_io.h +12 -5
  88. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +28 -8
  89. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +3 -0
  90. data/vendor/faiss/faiss/invlists/DirectMap.cpp +9 -1
  91. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +55 -17
  92. data/vendor/faiss/faiss/invlists/InvertedLists.h +18 -9
  93. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +21 -6
  94. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -1
  95. data/vendor/faiss/faiss/python/python_callbacks.cpp +3 -3
  96. data/vendor/faiss/faiss/utils/Heap.h +105 -0
  97. data/vendor/faiss/faiss/utils/NeuralNet.cpp +342 -0
  98. data/vendor/faiss/faiss/utils/NeuralNet.h +147 -0
  99. data/vendor/faiss/faiss/utils/bf16.h +36 -0
  100. data/vendor/faiss/faiss/utils/distances.cpp +58 -88
  101. data/vendor/faiss/faiss/utils/distances.h +5 -5
  102. data/vendor/faiss/faiss/utils/distances_simd.cpp +997 -9
  103. data/vendor/faiss/faiss/utils/extra_distances-inl.h +70 -0
  104. data/vendor/faiss/faiss/utils/extra_distances.cpp +85 -137
  105. data/vendor/faiss/faiss/utils/extra_distances.h +3 -2
  106. data/vendor/faiss/faiss/utils/hamming.cpp +1 -1
  107. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +4 -1
  108. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +2 -1
  109. data/vendor/faiss/faiss/utils/random.cpp +43 -0
  110. data/vendor/faiss/faiss/utils/random.h +25 -0
  111. data/vendor/faiss/faiss/utils/simdlib.h +10 -1
  112. data/vendor/faiss/faiss/utils/simdlib_avx512.h +296 -0
  113. data/vendor/faiss/faiss/utils/simdlib_neon.h +5 -2
  114. data/vendor/faiss/faiss/utils/simdlib_ppc64.h +1084 -0
  115. data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +176 -0
  116. data/vendor/faiss/faiss/utils/utils.cpp +10 -3
  117. data/vendor/faiss/faiss/utils/utils.h +3 -0
  118. metadata +16 -4
  119. data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +0 -102
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h

@@ -12,10 +12,19 @@
 #include <cstdint>
 
 #include <faiss/cppcontrib/detail/CoarseBitType.h>
+#include <faiss/impl/platform_macros.h>
 
 namespace faiss {
 namespace cppcontrib {
 
+bool isBigEndian() {
+#ifdef FAISS_BIG_ENDIAN
+    return true;
+#else
+    return false;
+#endif
+}
+
 ////////////////////////////////////////////////////////////////////////////////////
 /// Index2LevelDecoder
 ////////////////////////////////////////////////////////////////////////////////////
@@ -72,9 +81,14 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode = coarse[coarseCentroidIdx];
-        const intptr_t fineCode = fine[fineCentroidIdx];
+        intptr_t coarseCode, fineCode;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode = Swap2Bytes(coarse[coarseCentroidIdx]);
+            fineCode = Swap2Bytes(fine[fineCentroidIdx]);
+        } else {
+            coarseCode = coarse[coarseCentroidIdx];
+            fineCode = fine[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) *
@@ -112,9 +126,14 @@ struct Index2LevelDecoder {
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
 
-        const intptr_t coarseCode = coarse[coarseCentroidIdx];
-        const intptr_t fineCode = fine[fineCentroidIdx];
-
+        intptr_t coarseCode, fineCode;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode = Swap2Bytes(coarse[coarseCentroidIdx]);
+            fineCode = Swap2Bytes(fine[fineCentroidIdx]);
+        } else {
+            coarseCode = coarse[coarseCentroidIdx];
+            fineCode = fine[fineCentroidIdx];
+        }
         const float* const __restrict coarsePtr = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) *
                 COARSE_SIZE +
@@ -162,11 +181,18 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
+        intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids0 +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
@@ -222,11 +248,18 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
+        intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
@@ -292,13 +325,23 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
-        const intptr_t coarseCode2 = coarse2[coarseCentroidIdx];
-        const intptr_t fineCode2 = fine2[fineCentroidIdx];
+        intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1;
+        intptr_t coarseCode2, fineCode2;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+            coarseCode2 = Swap2Bytes(coarse2[coarseCentroidIdx]);
+            fineCode2 = Swap2Bytes(fine2[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+            coarseCode2 = coarse2[coarseCentroidIdx];
+            fineCode2 = fine2[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids0 +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
@@ -369,13 +412,23 @@ struct Index2LevelDecoder {
         const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
         const intptr_t fineCentroidIdx = i / FINE_SIZE;
         const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-        const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-        const intptr_t fineCode0 = fine0[fineCentroidIdx];
-        const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-        const intptr_t fineCode1 = fine1[fineCentroidIdx];
-        const intptr_t coarseCode2 = coarse2[coarseCentroidIdx];
-        const intptr_t fineCode2 = fine2[fineCentroidIdx];
+        intptr_t coarseCode0, fineCode0, coarseCode1, fineCode1;
+        intptr_t coarseCode2, fineCode2;
+        if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+            coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+            fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+            coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+            fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+            coarseCode2 = Swap2Bytes(coarse2[coarseCentroidIdx]);
+            fineCode2 = Swap2Bytes(fine2[fineCentroidIdx]);
+        } else {
+            coarseCode0 = coarse0[coarseCentroidIdx];
+            fineCode0 = fine0[fineCentroidIdx];
+            coarseCode1 = coarse1[coarseCentroidIdx];
+            fineCode1 = fine1[fineCentroidIdx];
+            coarseCode2 = coarse2[coarseCentroidIdx];
+            fineCode2 = fine2[fineCentroidIdx];
+        }
 
         const float* const __restrict coarsePtr0 = pqCoarseCentroids +
                 (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
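
These hunks make the two-level SA decoder endianness-aware: 16-bit coarse and fine codes are serialized little-endian, so a big-endian host must byte-swap them before using them as centroid table offsets. A minimal standalone sketch of the same idea, using a hypothetical runtime check and swap helper instead of the FAISS_BIG_ENDIAN macro and Swap2Bytes from platform_macros.h:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Hypothetical helpers for illustration only; the vendored code uses the
    // compile-time FAISS_BIG_ENDIAN macro and the Swap2Bytes macro instead.
    static bool is_big_endian() {
        const uint16_t probe = 0x0102;
        uint8_t first;
        std::memcpy(&first, &probe, 1);
        return first == 0x01;
    }

    static uint16_t swap16(uint16_t v) {
        return static_cast<uint16_t>((v >> 8) | (v << 8));
    }

    // Read one 16-bit PQ code that was serialized on a little-endian machine.
    static uint16_t read_code_le(const uint16_t* codes, std::size_t i) {
        const uint16_t c = codes[i];
        return is_big_endian() ? swap16(c) : c;
    }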
data/vendor/faiss/faiss/gpu/GpuCloner.cpp

@@ -14,6 +14,9 @@
 
 #include <faiss/IndexBinaryFlat.h>
 #include <faiss/IndexFlat.h>
+#if defined USE_NVIDIA_RAFT
+#include <faiss/IndexHNSW.h>
+#endif
 #include <faiss/IndexIVF.h>
 #include <faiss/IndexIVFFlat.h>
 #include <faiss/IndexIVFPQ.h>
@@ -24,6 +27,9 @@
 #include <faiss/MetaIndexes.h>
 #include <faiss/gpu/GpuIndex.h>
 #include <faiss/gpu/GpuIndexBinaryFlat.h>
+#if defined USE_NVIDIA_RAFT
+#include <faiss/gpu/GpuIndexCagra.h>
+#endif
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/GpuIndexIVFFlat.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
@@ -85,7 +91,15 @@ Index* ToCPUCloner::clone_Index(const Index* index) {
         // objective is to make a single component out of them
         // (inverse op of ToGpuClonerMultiple)
 
-    } else if (auto ish = dynamic_cast<const IndexShards*>(index)) {
+    }
+#if defined USE_NVIDIA_RAFT
+    else if (auto icg = dynamic_cast<const GpuIndexCagra*>(index)) {
+        IndexHNSWCagra* res = new IndexHNSWCagra();
+        icg->copyTo(res);
+        return res;
+    }
+#endif
+    else if (auto ish = dynamic_cast<const IndexShards*>(index)) {
         int nshard = ish->count();
         FAISS_ASSERT(nshard > 0);
         Index* res = clone_Index(ish->at(0));
@@ -153,6 +167,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.indicesOptions = indicesOptions;
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
         config.use_raft = use_raft;
+        config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer;
 
         GpuIndexIVFFlat* res = new GpuIndexIVFFlat(
                 provider, ifl->d, ifl->nlist, ifl->metric_type, config);
@@ -205,6 +220,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.usePrecomputedTables = usePrecomputed;
         config.use_raft = use_raft;
         config.interleavedLayout = use_raft;
+        config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer;
 
         GpuIndexIVFPQ* res = new GpuIndexIVFPQ(provider, ipq, config);
 
@@ -213,9 +229,25 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         }
 
         return res;
-    } else {
-        // default: use CPU cloner
-        return Cloner::clone_Index(index);
+    }
+#if defined USE_NVIDIA_RAFT
+    else if (auto icg = dynamic_cast<const faiss::IndexHNSWCagra*>(index)) {
+        GpuIndexCagraConfig config;
+        config.device = device;
+        GpuIndexCagra* res =
+                new GpuIndexCagra(provider, icg->d, icg->metric_type, config);
+        res->copyFrom(icg);
+        return res;
+    }
+#endif
+    else {
+        // use CPU cloner for IDMap and PreTransform
+        auto index_idmap = dynamic_cast<const IndexIDMap*>(index);
+        auto index_pt = dynamic_cast<const IndexPreTransform*>(index);
+        if (index_idmap || index_pt) {
+            return Cloner::clone_Index(index);
+        }
+        FAISS_THROW_MSG("This index type is not implemented on GPU.");
     }
 }
 
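
With these branches in place, the standard cloner can move a CAGRA index in both directions in a RAFT-enabled build. A hedged sketch of the GPU-to-CPU direction through the existing index_gpu_to_cpu helper; the dimensions and random dataset are made up for illustration, and a visible GPU plus a USE_NVIDIA_RAFT build are assumed:

    #include <faiss/gpu/GpuCloner.h>
    #include <faiss/gpu/GpuIndexCagra.h>
    #include <faiss/gpu/StandardGpuResources.h>

    #include <memory>
    #include <random>
    #include <vector>

    int main() {
        const int d = 64;
        const faiss::idx_t n = 10000;

        // Random training data, purely illustrative.
        std::vector<float> xb(size_t(n) * d);
        std::mt19937 rng(123);
        std::uniform_real_distribution<float> dist(0.f, 1.f);
        for (auto& v : xb) {
            v = dist(rng);
        }

        faiss::gpu::StandardGpuResources res;
        faiss::gpu::GpuIndexCagraConfig config;
        config.device = 0;

        faiss::gpu::GpuIndexCagra gpu_index(&res, d, faiss::METRIC_L2, config);
        gpu_index.train(n, xb.data()); // builds the CAGRA graph on the GPU

        // ToCPUCloner now maps GpuIndexCagra onto IndexHNSWCagra.
        std::unique_ptr<faiss::Index> cpu_index(
                faiss::gpu::index_gpu_to_cpu(&gpu_index));
        return 0;
    }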
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h

@@ -43,6 +43,12 @@ struct GpuClonerOptions {
 #else
     bool use_raft = false;
 #endif
+
+    /// This flag controls the CPU fallback logic for coarse quantizer
+    /// component of the index. When set to false (default), the cloner will
+    /// throw an exception for indices not implemented on GPU. When set to
+    /// true, it will fallback to a CPU implementation.
+    bool allowCpuCoarseQuantizer = false;
 };
 
 struct GpuMultipleClonerOptions : public GpuClonerOptions {
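
The same flag is plumbed into GpuIndexIVFConfig further down, so an IVF index whose coarse quantizer has no GPU implementation can still be cloned. A hedged sketch of opting into the fallback when copying a CPU index to the GPU; the populated IndexIVFFlat with an HNSW coarse quantizer is an assumed input, not something built here:

    #include <faiss/gpu/GpuCloner.h>
    #include <faiss/gpu/GpuClonerOptions.h>
    #include <faiss/gpu/StandardGpuResources.h>

    // `cpu_index` is assumed to be a trained IndexIVFFlat whose coarse
    // quantizer (e.g. an IndexHNSWFlat) is not implemented on the GPU.
    faiss::Index* clone_with_cpu_quantizer(
            faiss::gpu::StandardGpuResources* res,
            const faiss::Index* cpu_index) {
        faiss::gpu::GpuClonerOptions options;
        // Keep the unsupported coarse quantizer on the CPU instead of
        // having the cloner raise an exception.
        options.allowCpuCoarseQuantizer = true;
        return faiss::gpu::index_cpu_to_gpu(res, /*device=*/0, cpu_index, &options);
    }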
data/vendor/faiss/faiss/gpu/GpuFaissAssert.h

@@ -15,7 +15,7 @@
 /// Assertions
 ///
 
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(USE_AMD_ROCM)
 #define GPU_FAISS_ASSERT(X) assert(X)
 #define GPU_FAISS_ASSERT_MSG(X, MSG) assert(X)
 #define GPU_FAISS_ASSERT_FMT(X, FMT, ...) assert(X)
data/vendor/faiss/faiss/gpu/GpuIndex.h

@@ -84,19 +84,14 @@ class GpuIndex : public faiss::Index {
 
     /// `x` and `labels` can be resident on the CPU or any GPU; copies are
     /// performed as needed
-    void assign(
-            idx_t n,
-            const float* x,
-            idx_t* labels,
-            // faiss::Index has idx_t for k
-            idx_t k = 1) const override;
+    void assign(idx_t n, const float* x, idx_t* labels, idx_t k = 1)
+            const override;
 
     /// `x`, `distances` and `labels` can be resident on the CPU or any
     /// GPU; copies are performed as needed
     void search(
             idx_t n,
             const float* x,
-            // faiss::Index has idx_t for k
             idx_t k,
             float* distances,
             idx_t* labels,
@@ -107,7 +102,6 @@ class GpuIndex : public faiss::Index {
     void search_and_reconstruct(
             idx_t n,
             const float* x,
-            // faiss::Index has idx_t for k
             idx_t k,
             float* distances,
             idx_t* labels,
data/vendor/faiss/faiss/gpu/GpuIndexCagra.h (new file)

@@ -0,0 +1,282 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <faiss/IndexIVF.h>
+#include <faiss/gpu/GpuIndex.h>
+#include <faiss/gpu/GpuIndexIVFPQ.h>
+
+namespace faiss {
+struct IndexHNSWCagra;
+}
+
+namespace faiss {
+namespace gpu {
+
+class RaftCagra;
+
+enum class graph_build_algo {
+    /// Use IVF-PQ to build all-neighbors knn graph
+    IVF_PQ,
+    /// Experimental, use NN-Descent to build all-neighbors knn graph
+    NN_DESCENT
+};
+
+/// A type for specifying how PQ codebooks are created.
+enum class codebook_gen { // NOLINT
+    PER_SUBSPACE = 0, // NOLINT
+    PER_CLUSTER = 1, // NOLINT
+};
+
+struct IVFPQBuildCagraConfig {
+    ///
+    /// The number of inverted lists (clusters)
+    ///
+    /// Hint: the number of vectors per cluster (`n_rows/n_lists`) should be
+    /// approximately 1,000 to 10,000.
+
+    uint32_t n_lists = 1024;
+    /// The number of iterations searching for kmeans centers (index building).
+    uint32_t kmeans_n_iters = 20;
+    /// The fraction of data to use during iterative kmeans building.
+    double kmeans_trainset_fraction = 0.5;
+    ///
+    /// The bit length of the vector element after compression by PQ.
+    ///
+    /// Possible values: [4, 5, 6, 7, 8].
+    ///
+    /// Hint: the smaller the 'pq_bits', the smaller the index size and the
+    /// better the search performance, but the lower the recall.

+    uint32_t pq_bits = 8;
+    ///
+    /// The dimensionality of the vector after compression by PQ. When zero, an
+    /// optimal value is selected using a heuristic.
+    ///
+    /// NB: `pq_dim /// pq_bits` must be a multiple of 8.
+    ///
+    /// Hint: a smaller 'pq_dim' results in a smaller index size and better
+    /// search performance, but lower recall. If 'pq_bits' is 8, 'pq_dim' can be
+    /// set to any number, but multiple of 8 are desirable for good performance.
+    /// If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8. For good
+    /// performance, it is desirable that 'pq_dim' is a multiple of 32. Ideally,
+    /// 'pq_dim' should be also a divisor of the dataset dim.

+    uint32_t pq_dim = 0;
+    /// How PQ codebooks are created.
+    codebook_gen codebook_kind = codebook_gen::PER_SUBSPACE;
+    ///
+    /// Apply a random rotation matrix on the input data and queries even if
+    /// `dim % pq_dim == 0`.
+    ///
+    /// Note: if `dim` is not multiple of `pq_dim`, a random rotation is always
+    /// applied to the input data and queries to transform the working space
+    /// from `dim` to `rot_dim`, which may be slightly larger than the original
+    /// space and and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`).
+    /// However, this transform is not necessary when `dim` is multiple of
+    /// `pq_dim`
+    /// (`dim == rot_dim`, hence no need in adding "extra" data columns /
+    /// features).
+    ///
+    /// By default, if `dim == rot_dim`, the rotation transform is initialized
+    /// with the identity matrix. When `force_random_rotation == true`, a random
+    /// orthogonal transform matrix is generated regardless of the values of
+    /// `dim` and `pq_dim`.

+    bool force_random_rotation = false;
+    ///
+    /// By default, the algorithm allocates more space than necessary for
+    /// individual clusters
+    /// (`list_data`). This allows to amortize the cost of memory allocation and
+    /// reduce the number of data copies during repeated calls to `extend`
+    /// (extending the database).
+    ///
+    /// The alternative is the conservative allocation behavior; when enabled,
+    /// the algorithm always allocates the minimum amount of memory required to
+    /// store the given number of records. Set this flag to `true` if you prefer
+    /// to use as little GPU memory for the database as possible.

+    bool conservative_memory_allocation = false;
+};
+
+struct IVFPQSearchCagraConfig {
+    /// The number of clusters to search.
+    uint32_t n_probes = 20;
+    ///
+    /// Data type of look up table to be created dynamically at search time.
+    ///
+    /// Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U]
+    ///
+    /// The use of low-precision types reduces the amount of shared memory
+    /// required at search time, so fast shared memory kernels can be used even
+    /// for datasets with large dimansionality. Note that the recall is slightly
+    /// degraded when low-precision type is selected.

+    cudaDataType_t lut_dtype = CUDA_R_32F;
+    ///
+    /// Storage data type for distance/similarity computed at search time.
+    ///
+    /// Possible values: [CUDA_R_16F, CUDA_R_32F]
+    ///
+    /// If the performance limiter at search time is device memory access,
+    /// selecting FP16 will improve performance slightly.

+    cudaDataType_t internal_distance_dtype = CUDA_R_32F;
+    ///
+    /// Preferred fraction of SM's unified memory / L1 cache to be used as
+    /// shared memory.
+    ///
+    /// Possible values: [0.0 - 1.0] as a fraction of the
+    /// `sharedMemPerMultiprocessor`.
+    ///
+    /// One wants to increase the carveout to make sure a good GPU occupancy for
+    /// the main search kernel, but not to keep it too high to leave some memory
+    /// to be used as L1 cache. Note, this value is interpreted only as a hint.
+    /// Moreover, a GPU usually allows only a fixed set of cache configurations,
+    /// so the provided value is rounded up to the nearest configuration. Refer
+    /// to the NVIDIA tuning guide for the target GPU architecture.
+    ///
+    /// Note, this is a low-level tuning parameter that can have drastic
+    /// negative effects on the search performance if tweaked incorrectly.

+    double preferred_shmem_carveout = 1.0;
+};
+
+struct GpuIndexCagraConfig : public GpuIndexConfig {
+    /// Degree of input graph for pruning.
+    size_t intermediate_graph_degree = 128;
+    /// Degree of output graph.
+    size_t graph_degree = 64;
+    /// ANN algorithm to build knn graph.
+    graph_build_algo build_algo = graph_build_algo::IVF_PQ;
+    /// Number of Iterations to run if building with NN_DESCENT
+    size_t nn_descent_niter = 20;
+
+    IVFPQBuildCagraConfig* ivf_pq_params = nullptr;
+    IVFPQSearchCagraConfig* ivf_pq_search_params = nullptr;
+};
+
+enum class search_algo {
+    /// For large batch sizes.
+    SINGLE_CTA,
+    /// For small batch sizes.
+    MULTI_CTA,
+    MULTI_KERNEL,
+    AUTO
+};
+
+enum class hash_mode { HASH, SMALL, AUTO };
+
+struct SearchParametersCagra : SearchParameters {
+    /// Maximum number of queries to search at the same time (batch size). Auto
+    /// select when 0.
+    size_t max_queries = 0;
+
+    /// Number of intermediate search results retained during the search.
+    ///
+    /// This is the main knob to adjust trade off between accuracy and search
+    /// speed. Higher values improve the search accuracy.

+    size_t itopk_size = 64;
+
+    /// Upper limit of search iterations. Auto select when 0.
+    size_t max_iterations = 0;
+
+    // In the following we list additional search parameters for fine tuning.
+    // Reasonable default values are automatically chosen.
+
+    /// Which search implementation to use.
+    search_algo algo = search_algo::AUTO;
+
+    /// Number of threads used to calculate a single distance. 4, 8, 16, or 32.

+    size_t team_size = 0;
+
+    /// Number of graph nodes to select as the starting point for the search in
+    /// each iteration. aka search width?
+    size_t search_width = 1;
+    /// Lower limit of search iterations.
+    size_t min_iterations = 0;
+
+    /// Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0.
+    size_t thread_block_size = 0;
+    /// Hashmap type. Auto selection when AUTO.
+    hash_mode hashmap_mode = hash_mode::AUTO;
+    /// Lower limit of hashmap bit length. More than 8.
+    size_t hashmap_min_bitlen = 0;
+    /// Upper limit of hashmap fill rate. More than 0.1, less than 0.9.
+    float hashmap_max_fill_rate = 0.5;
+
+    /// Number of iterations of initial random seed node selection. 1 or more.

+    uint32_t num_random_samplings = 1;
+    /// Bit mask used for initial random seed node selection.
+    uint64_t seed = 0x128394;
+};
+
+struct GpuIndexCagra : public GpuIndex {
+   public:
+    GpuIndexCagra(
+            GpuResourcesProvider* provider,
+            int dims,
+            faiss::MetricType metric = faiss::METRIC_L2,
+            GpuIndexCagraConfig config = GpuIndexCagraConfig());
+
+    /// Trains CAGRA based on the given vector data
+    void train(idx_t n, const float* x) override;
+
+    /// Initialize ourselves from the given CPU index; will overwrite
+    /// all data in ourselves
+    void copyFrom(const faiss::IndexHNSWCagra* index);
+
+    /// Copy ourselves to the given CPU index; will overwrite all data
+    /// in the index instance
+    void copyTo(faiss::IndexHNSWCagra* index) const;
+
+    void reset() override;
+
+    std::vector<idx_t> get_knngraph() const;
+
+   protected:
+    bool addImplRequiresIDs_() const override;
+
+    void addImpl_(idx_t n, const float* x, const idx_t* ids) override;
+
+    /// Called from GpuIndex for search
+    void searchImpl_(
+            idx_t n,
+            const float* x,
+            int k,
+            float* distances,
+            idx_t* labels,
+            const SearchParameters* search_params) const override;
+
+    /// Our configuration options
+    const GpuIndexCagraConfig cagraConfig_;
+
+    /// Instance that we own; contains the inverted lists
+    std::shared_ptr<RaftCagra> index_;
+};
+
+} // namespace gpu
+} // namespace faiss
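
Given a trained GpuIndexCagra, the CAGRA-specific knobs are passed through SearchParametersCagra; itopk_size is the main recall/speed trade-off. A hedged sketch of a query batch (index construction and training are omitted, and the shapes are illustrative):

    #include <faiss/gpu/GpuIndexCagra.h>

    #include <vector>

    // Sketch only: `index` is a trained faiss::gpu::GpuIndexCagra of dimension d,
    // and `xq` holds the query vectors laid out contiguously.
    void query_cagra(
            const faiss::gpu::GpuIndexCagra& index,
            const std::vector<float>& xq,
            int d) {
        const faiss::idx_t nq = xq.size() / d;
        const faiss::idx_t k = 10;

        faiss::gpu::SearchParametersCagra params;
        params.itopk_size = 128; // wider internal candidate list, higher recall
        params.algo = faiss::gpu::search_algo::AUTO;

        std::vector<float> distances(nq * k);
        std::vector<faiss::idx_t> labels(nq * k);
        index.search(nq, xq.data(), k, distances.data(), labels.data(), &params);
    }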
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h

@@ -26,6 +26,12 @@ struct GpuIndexIVFConfig : public GpuIndexConfig {
 
     /// Configuration for the coarse quantizer object
     GpuIndexFlatConfig flatConfig;
+
+    /// This flag controls the CPU fallback logic for coarse quantizer
+    /// component of the index. When set to false (default), the cloner will
+    /// throw an exception for indices not implemented on GPU. When set to
+    /// true, it will fallback to a CPU implementation.
+    bool allowCpuCoarseQuantizer = false;
 };
 
 /// Base class of all GPU IVF index types. This (for now) deliberately does not
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h

@@ -87,6 +87,8 @@ class GpuIndexIVFFlat : public GpuIndexIVF {
     /// Trains the coarse quantizer based on the given vector data
     void train(idx_t n, const float* x) override;
 
+    void reconstruct_n(idx_t i0, idx_t n, float* out) const override;
+
    protected:
     /// Initialize appropriate index
     void setIndex_(
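
GpuIndexIVFFlat now overrides reconstruct_n, so a contiguous range of stored vectors can be decoded back into host memory. A hedged sketch, assuming a populated index whose vectors were added with default sequential ids and which holds at least `i0 + count` entries:

    #include <faiss/gpu/GpuIndexIVFFlat.h>

    #include <vector>

    std::vector<float> fetch_range(
            const faiss::gpu::GpuIndexIVFFlat& index,
            faiss::idx_t i0,
            faiss::idx_t count) {
        std::vector<float> out(count * index.d);
        // Decode `count` stored vectors starting at id `i0` into host memory.
        index.reconstruct_n(i0, count, out.data());
        return out;
    }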
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp

@@ -257,6 +257,14 @@ void StandardGpuResourcesImpl::setDefaultStream(
         if (prevStream != stream) {
             streamWait({stream}, {prevStream});
         }
+#if defined USE_NVIDIA_RAFT
+        // delete the raft handle for this device, which will be initialized
+        // with the updated stream during any subsequent calls to getRaftHandle
+        auto it2 = raftHandles_.find(device);
+        if (it2 != raftHandles_.end()) {
+            raftHandles_.erase(it2);
+        }
+#endif
     }
 
     userDefaultStreams_[device] = stream;
@@ -275,6 +283,14 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
 
             streamWait({newStream}, {prevStream});
         }
+#if defined USE_NVIDIA_RAFT
+        // delete the raft handle for this device, which will be initialized
+        // with the updated stream during any subsequent calls to getRaftHandle
+        auto it2 = raftHandles_.find(device);
+        if (it2 != raftHandles_.end()) {
+            raftHandles_.erase(it2);
+        }
+#endif
     }
 
     userDefaultStreams_.erase(device);
@@ -347,11 +363,20 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
             prop.major,
             prop.minor);
 
+#if USE_AMD_ROCM
+    // Our code is pre-built with and expects warpSize == 32 or 64, validate
+    // that
+    FAISS_ASSERT_FMT(
+            prop.warpSize == 32 || prop.warpSize == 64,
+            "Device id %d does not have expected warpSize of 32 or 64",
+            device);
+#else
     // Our code is pre-built with and expects warpSize == 32, validate that
     FAISS_ASSERT_FMT(
             prop.warpSize == 32,
             "Device id %d does not have expected warpSize of 32",
             device);
+#endif
 
     // Create streams
     cudaStream_t defaultStream = nullptr;