faiss 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +23 -21
  4. data/ext/faiss/extconf.rb +11 -0
  5. data/ext/faiss/index.cpp +4 -4
  6. data/ext/faiss/index_binary.cpp +6 -6
  7. data/ext/faiss/product_quantizer.cpp +4 -4
  8. data/lib/faiss/version.rb +1 -1
  9. data/vendor/faiss/faiss/AutoTune.cpp +13 -0
  10. data/vendor/faiss/faiss/IVFlib.cpp +101 -2
  11. data/vendor/faiss/faiss/IVFlib.h +26 -2
  12. data/vendor/faiss/faiss/Index.cpp +36 -3
  13. data/vendor/faiss/faiss/Index.h +43 -6
  14. data/vendor/faiss/faiss/Index2Layer.cpp +6 -2
  15. data/vendor/faiss/faiss/Index2Layer.h +6 -1
  16. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +219 -16
  17. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +63 -5
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +299 -0
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +199 -0
  20. data/vendor/faiss/faiss/IndexBinary.cpp +20 -4
  21. data/vendor/faiss/faiss/IndexBinary.h +18 -3
  22. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +9 -2
  23. data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -2
  24. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -1
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -1
  26. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +5 -1
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -1
  28. data/vendor/faiss/faiss/IndexBinaryHash.cpp +17 -4
  29. data/vendor/faiss/faiss/IndexBinaryHash.h +8 -4
  30. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +28 -13
  31. data/vendor/faiss/faiss/IndexBinaryIVF.h +10 -7
  32. data/vendor/faiss/faiss/IndexFastScan.cpp +626 -0
  33. data/vendor/faiss/faiss/IndexFastScan.h +145 -0
  34. data/vendor/faiss/faiss/IndexFlat.cpp +34 -21
  35. data/vendor/faiss/faiss/IndexFlat.h +7 -4
  36. data/vendor/faiss/faiss/IndexFlatCodes.cpp +35 -1
  37. data/vendor/faiss/faiss/IndexFlatCodes.h +12 -0
  38. data/vendor/faiss/faiss/IndexHNSW.cpp +66 -138
  39. data/vendor/faiss/faiss/IndexHNSW.h +4 -2
  40. data/vendor/faiss/faiss/IndexIDMap.cpp +247 -0
  41. data/vendor/faiss/faiss/IndexIDMap.h +107 -0
  42. data/vendor/faiss/faiss/IndexIVF.cpp +121 -33
  43. data/vendor/faiss/faiss/IndexIVF.h +35 -16
  44. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +84 -7
  45. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +63 -1
  46. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +590 -0
  47. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +171 -0
  48. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +1290 -0
  49. data/vendor/faiss/faiss/IndexIVFFastScan.h +213 -0
  50. data/vendor/faiss/faiss/IndexIVFFlat.cpp +37 -17
  51. data/vendor/faiss/faiss/IndexIVFFlat.h +4 -2
  52. data/vendor/faiss/faiss/IndexIVFPQ.cpp +234 -50
  53. data/vendor/faiss/faiss/IndexIVFPQ.h +5 -1
  54. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +23 -852
  55. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +7 -112
  56. data/vendor/faiss/faiss/IndexIVFPQR.cpp +3 -3
  57. data/vendor/faiss/faiss/IndexIVFPQR.h +1 -1
  58. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +3 -1
  59. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -1
  60. data/vendor/faiss/faiss/IndexLSH.cpp +4 -2
  61. data/vendor/faiss/faiss/IndexLSH.h +2 -1
  62. data/vendor/faiss/faiss/IndexLattice.cpp +7 -1
  63. data/vendor/faiss/faiss/IndexLattice.h +3 -1
  64. data/vendor/faiss/faiss/IndexNNDescent.cpp +4 -3
  65. data/vendor/faiss/faiss/IndexNNDescent.h +2 -1
  66. data/vendor/faiss/faiss/IndexNSG.cpp +37 -3
  67. data/vendor/faiss/faiss/IndexNSG.h +25 -1
  68. data/vendor/faiss/faiss/IndexPQ.cpp +106 -69
  69. data/vendor/faiss/faiss/IndexPQ.h +19 -5
  70. data/vendor/faiss/faiss/IndexPQFastScan.cpp +15 -450
  71. data/vendor/faiss/faiss/IndexPQFastScan.h +15 -78
  72. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -8
  73. data/vendor/faiss/faiss/IndexPreTransform.h +15 -3
  74. data/vendor/faiss/faiss/IndexRefine.cpp +8 -4
  75. data/vendor/faiss/faiss/IndexRefine.h +4 -2
  76. data/vendor/faiss/faiss/IndexReplicas.cpp +4 -2
  77. data/vendor/faiss/faiss/IndexReplicas.h +2 -1
  78. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +438 -0
  79. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +92 -0
  80. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +26 -15
  81. data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -7
  82. data/vendor/faiss/faiss/IndexShards.cpp +4 -1
  83. data/vendor/faiss/faiss/IndexShards.h +2 -1
  84. data/vendor/faiss/faiss/MetaIndexes.cpp +5 -178
  85. data/vendor/faiss/faiss/MetaIndexes.h +3 -81
  86. data/vendor/faiss/faiss/VectorTransform.cpp +43 -0
  87. data/vendor/faiss/faiss/VectorTransform.h +22 -4
  88. data/vendor/faiss/faiss/clone_index.cpp +23 -1
  89. data/vendor/faiss/faiss/clone_index.h +3 -0
  90. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +300 -0
  91. data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +24 -0
  92. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +195 -0
  93. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2058 -0
  94. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +408 -0
  95. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2147 -0
  96. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +460 -0
  97. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +465 -0
  98. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +1618 -0
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +251 -0
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +1452 -0
  101. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +1 -0
  102. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +0 -4
  103. data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
  104. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -1
  105. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +10 -8
  106. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +75 -14
  107. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +19 -32
  108. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -31
  109. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +22 -28
  110. data/vendor/faiss/faiss/gpu/GpuResources.cpp +14 -0
  111. data/vendor/faiss/faiss/gpu/GpuResources.h +16 -3
  112. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +3 -3
  113. data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +32 -0
  114. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -0
  115. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +311 -75
  116. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +10 -0
  117. data/vendor/faiss/faiss/gpu/test/TestUtils.h +3 -0
  118. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
  119. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +5 -4
  120. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +116 -47
  121. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +44 -13
  122. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +0 -54
  123. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -76
  124. data/vendor/faiss/faiss/impl/DistanceComputer.h +64 -0
  125. data/vendor/faiss/faiss/impl/HNSW.cpp +123 -27
  126. data/vendor/faiss/faiss/impl/HNSW.h +19 -16
  127. data/vendor/faiss/faiss/impl/IDSelector.cpp +125 -0
  128. data/vendor/faiss/faiss/impl/IDSelector.h +135 -0
  129. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +6 -28
  130. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +6 -1
  131. data/vendor/faiss/faiss/impl/LookupTableScaler.h +77 -0
  132. data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -0
  133. data/vendor/faiss/faiss/impl/NSG.cpp +1 -1
  134. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +383 -0
  135. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +154 -0
  136. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +225 -145
  137. data/vendor/faiss/faiss/impl/ProductQuantizer.h +29 -10
  138. data/vendor/faiss/faiss/impl/Quantizer.h +43 -0
  139. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +192 -36
  140. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +40 -20
  141. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -0
  142. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +97 -173
  143. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +18 -18
  144. data/vendor/faiss/faiss/impl/index_read.cpp +240 -9
  145. data/vendor/faiss/faiss/impl/index_write.cpp +237 -5
  146. data/vendor/faiss/faiss/impl/kmeans1d.cpp +6 -4
  147. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +56 -16
  148. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +25 -8
  149. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +66 -25
  150. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +75 -27
  151. data/vendor/faiss/faiss/index_factory.cpp +196 -7
  152. data/vendor/faiss/faiss/index_io.h +5 -0
  153. data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -0
  154. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +4 -1
  155. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -1
  156. data/vendor/faiss/faiss/python/python_callbacks.cpp +27 -0
  157. data/vendor/faiss/faiss/python/python_callbacks.h +15 -0
  158. data/vendor/faiss/faiss/utils/Heap.h +31 -15
  159. data/vendor/faiss/faiss/utils/distances.cpp +380 -56
  160. data/vendor/faiss/faiss/utils/distances.h +113 -15
  161. data/vendor/faiss/faiss/utils/distances_simd.cpp +726 -6
  162. data/vendor/faiss/faiss/utils/extra_distances.cpp +12 -7
  163. data/vendor/faiss/faiss/utils/extra_distances.h +3 -1
  164. data/vendor/faiss/faiss/utils/fp16-fp16c.h +21 -0
  165. data/vendor/faiss/faiss/utils/fp16-inl.h +101 -0
  166. data/vendor/faiss/faiss/utils/fp16.h +11 -0
  167. data/vendor/faiss/faiss/utils/hamming-inl.h +54 -0
  168. data/vendor/faiss/faiss/utils/hamming.cpp +0 -48
  169. data/vendor/faiss/faiss/utils/ordered_key_value.h +10 -0
  170. data/vendor/faiss/faiss/utils/quantize_lut.cpp +62 -0
  171. data/vendor/faiss/faiss/utils/quantize_lut.h +20 -0
  172. data/vendor/faiss/faiss/utils/random.cpp +53 -0
  173. data/vendor/faiss/faiss/utils/random.h +5 -0
  174. data/vendor/faiss/faiss/utils/simdlib_avx2.h +4 -0
  175. data/vendor/faiss/faiss/utils/simdlib_emulated.h +6 -1
  176. data/vendor/faiss/faiss/utils/simdlib_neon.h +7 -2
  177. metadata +37 -3
@@ -43,19 +43,27 @@ struct VectorTransform {
43
43
  */
44
44
  virtual void train(idx_t n, const float* x);
45
45
 
46
- /** apply the random rotation, return new allocated matrix
47
- * @param x size n * d_in
48
- * @return size n * d_out
46
+ /** apply the transformation and return the result in an allocated pointer
47
+ * @param n number of vectors to transform
48
+ * @param x input vectors, size n * d_in
49
+ * @return output vectors, size n * d_out
49
50
  */
50
51
  float* apply(idx_t n, const float* x) const;
51
52
 
52
- /// same as apply, but result is pre-allocated
53
+ /** apply the transformation and return the result in a provided matrix
54
+ * @param n number of vectors to transform
55
+ * @param x input vectors, size n * d_in
56
+ * @param xt output vectors, size n * d_out
57
+ */
53
58
  virtual void apply_noalloc(idx_t n, const float* x, float* xt) const = 0;
54
59
 
55
60
  /// reverse transformation. May not be implemented or may return
56
61
  /// approximate result
57
62
  virtual void reverse_transform(idx_t n, const float* xt, float* x) const;
58
63
 
64
+ // check that the two transforms are identical (to merge indexes)
65
+ virtual void check_identical(const VectorTransform& other) const = 0;
66
+
59
67
  virtual ~VectorTransform() {}
60
68
  };
61
69
 
@@ -100,6 +108,8 @@ struct LinearTransform : VectorTransform {
100
108
  int n,
101
109
  int d) const;
102
110
 
111
+ void check_identical(const VectorTransform& other) const override;
112
+
103
113
  ~LinearTransform() override {}
104
114
  };
105
115
 
@@ -207,6 +217,8 @@ struct ITQTransform : VectorTransform {
207
217
  void train(idx_t n, const float* x) override;
208
218
 
209
219
  void apply_noalloc(idx_t n, const float* x, float* xt) const override;
220
+
221
+ void check_identical(const VectorTransform& other) const override;
210
222
  };
211
223
 
212
224
  struct ProductQuantizer;
@@ -260,6 +272,8 @@ struct RemapDimensionsTransform : VectorTransform {
260
272
  void reverse_transform(idx_t n, const float* xt, float* x) const override;
261
273
 
262
274
  RemapDimensionsTransform() {}
275
+
276
+ void check_identical(const VectorTransform& other) const override;
263
277
  };
264
278
 
265
279
  /** per-vector normalization */
@@ -273,6 +287,8 @@ struct NormalizationTransform : VectorTransform {
273
287
 
274
288
  /// Identity transform since norm is not revertible
275
289
  void reverse_transform(idx_t n, const float* xt, float* x) const override;
290
+
291
+ void check_identical(const VectorTransform& other) const override;
276
292
  };
277
293
 
278
294
  /** Subtract the mean of each component from the vectors. */
@@ -290,6 +306,8 @@ struct CenteringTransform : VectorTransform {
290
306
 
291
307
  /// add the mean
292
308
  void reverse_transform(idx_t n, const float* xt, float* x) const override;
309
+
310
+ void check_identical(const VectorTransform& other) const override;
293
311
  };
294
312
 
295
313
  } // namespace faiss
@@ -32,6 +32,11 @@
32
32
  #include <faiss/MetaIndexes.h>
33
33
  #include <faiss/VectorTransform.h>
34
34
 
35
+ #include <faiss/impl/LocalSearchQuantizer.h>
36
+ #include <faiss/impl/ProductQuantizer.h>
37
+ #include <faiss/impl/ResidualQuantizer.h>
38
+ #include <faiss/impl/ScalarQuantizer.h>
39
+
35
40
  namespace faiss {
36
41
 
37
42
  /*************************************************************
@@ -117,7 +122,9 @@ Index* Cloner::clone_Index(const Index* index) {
117
122
  return res;
118
123
  } else if (
119
124
  const IndexIDMap* idmap = dynamic_cast<const IndexIDMap*>(index)) {
120
- IndexIDMap* res = new IndexIDMap(*idmap);
125
+ const IndexIDMap2* idmap2 = dynamic_cast<const IndexIDMap2*>(index);
126
+ IndexIDMap* res =
127
+ idmap2 ? new IndexIDMap2(*idmap2) : new IndexIDMap(*idmap);
121
128
  res->own_fields = true;
122
129
  res->index = clone_Index(idmap->index);
123
130
  return res;
@@ -137,6 +144,13 @@ Index* Cloner::clone_Index(const Index* index) {
137
144
  res->own_fields = true;
138
145
  res->storage = clone_Index(insg->storage);
139
146
  return res;
147
+ } else if (
148
+ const IndexNNDescent* innd =
149
+ dynamic_cast<const IndexNNDescent*>(index)) {
150
+ IndexNNDescent* res = new IndexNNDescent(*innd);
151
+ res->own_fields = true;
152
+ res->storage = clone_Index(innd->storage);
153
+ return res;
140
154
  } else if (
141
155
  const Index2Layer* i2l = dynamic_cast<const Index2Layer*>(index)) {
142
156
  Index2Layer* res = new Index2Layer(*i2l);
@@ -149,4 +163,12 @@ Index* Cloner::clone_Index(const Index* index) {
149
163
  return nullptr;
150
164
  }
151
165
 
166
+ Quantizer* clone_Quantizer(const Quantizer* quant) {
167
+ TRYCLONE(ResidualQuantizer, quant)
168
+ TRYCLONE(LocalSearchQuantizer, quant)
169
+ TRYCLONE(ProductQuantizer, quant)
170
+ TRYCLONE(ScalarQuantizer, quant)
171
+ FAISS_THROW_MSG("Did not recognize quantizer to clone");
172
+ }
173
+
152
174
  } // namespace faiss
@@ -16,6 +16,7 @@ namespace faiss {
16
16
  struct Index;
17
17
  struct IndexIVF;
18
18
  struct VectorTransform;
19
+ struct Quantizer;
19
20
 
20
21
  /* cloning functions */
21
22
  Index* clone_index(const Index*);
@@ -30,4 +31,6 @@ struct Cloner {
30
31
  virtual ~Cloner() {}
31
32
  };
32
33
 
34
+ Quantizer* clone_Quantizer(const Quantizer* quant);
35
+
33
36
  } // namespace faiss
@@ -0,0 +1,300 @@
1
+ // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
2
+
3
+ #pragma once
4
+
5
+ // This file contains a custom fast implementation of faiss::Index::sa_decode()
6
+ // function for the following index families:
7
+ // * IVF256,PQ[1]x8np
8
+ // * Residual[1]x8,PQ[2]x8
9
+ // * IVF[2^9-2^16 bit],PQ[1]x8 (such as IVF1024,PQ16np)
10
+ // * Residual1x[9-16 bit],PQ[1]x8 (such as Residual1x9,PQ8)
11
+ // * PQ[1]x8
12
+ // Additionally, AVX2 and ARM versions support
13
+ // * Residual[1]x8,PQ[2]x10
14
+ // * Residual[1]x8,PQ[2]x16
15
+ // * Residual[1]x10,PQ[2]x10
16
+ // * Residual[1]x10,PQ[2]x16
17
+ // * Residual[1]x16,PQ[2]x10
18
+ // * Residual[1]x16,PQ[2]x16
19
+ // * Residual1x[9-16 bit],PQ[1]x10 (such as Residual1x9,PQ16x10)
20
+ // * * (use with COARSE_BITS=16)
21
+ // * Residual1x[9-16 bit],PQ[1]x16 (such as Residual1x9,PQ16x16)
22
+ // * * (use with COARSE_BITS=16)
23
+ // * PQ[1]x10
24
+ // * PQ[1]x16
25
+ // Unfortunately, currently Faiss does not support something like
26
+ // IVF256,PQ16x10np
27
+ //
28
+ // The goal was to achieve the maximum performance, so the template version it
29
+ // is. The provided index families share the same code for sa_decode.
30
+ //
31
+ // The front-end code provides two high-level structures.
32
+ //
33
+ // First one:
34
+ // {
35
+ // template <
36
+ // intptr_t DIM,
37
+ // intptr_t COARSE_SIZE,
38
+ // intptr_t FINE_SIZE,
39
+ // intptr_t COARSE_BITS = 8
40
+ // intptr_t FINE_BITS = 8>
41
+ // struct Index2LevelDecoder { /*...*/ };
42
+ // }
43
+ // * DIM is the dimensionality of data
44
+ // * COARSE_SIZE is the dimensionality of the coarse quantizer (IVF, Residual)
45
+ // * FINE_SIZE is the dimensionality of the ProductQuantizer dsq
46
+ // * COARSE_BITS is the number of bits that are needed to represent a coarse
47
+ // quantizer code.
48
+ // * FINE_BITS is the number of bits that are needed to represent a fine
49
+ // quantizer code.
50
+ // For example, "IVF256,PQ8np" for 160-dim data translates into
51
+ // Index2LevelDecoder<160,160,20,8>
52
+ // For example, "Residual4x8,PQ16" for 256-dim data translates into
53
+ // Index2LevelDecoder<256,64,1,8>
54
+ // For example, "IVF1024,PQ16np" for 256-dim data translates into
55
+ // Index2LevelDecoder<256,256,16,10>. But as there are only 1 coarse code
56
+ // element, Index2LevelDecoder<256,256,16,16> can be used as a faster
57
+ // decoder.
58
+ // For example, "Residual4x10,PQ16x10np" for 256-dim data translates into
59
+ // Index2LevelDecoder<256,64,16,10,10>
60
+ //
61
+ // Additional supported values for COARSE_BITS and FINE_BITS may be added later.
62
+ //
63
+ // Second one:
64
+ // {
65
+ // template <
66
+ // intptr_t DIM,
67
+ // intptr_t FINE_SIZE,
68
+ // intptr_t FINE_BITS = 8>
69
+ // struct IndexPQDecoder { /*...*/ };
70
+ // }
71
+ // * DIM is the dimensionality of data
72
+ // * FINE_SIZE is the dimensionality of the ProductQuantizer dsq
73
+ // * FINE_BITS is the number of bits that are needed to represent a fine
74
+ // quantizer code.
75
+ // For example, "PQ8np" for 160-dim data translates into
76
+ // IndexPQDecoder<160,20>
77
+ //
78
+ // Unlike the general purpose version in faiss::Index::sa_decode(),
79
+ // this version provides the following functions (please note that
80
+ // pqCoarseCentroids params are not available for IndexPQDecoder,
81
+ // but the functionality is the same as for Index2LevelDecoder):
82
+ //
83
+ // * ::store(), which is similar to sa_decode(1, input, output),
84
+ // The method signature is the following:
85
+ // {
86
+ // void store(
87
+ // const float* const __restrict pqCoarseCentroids,
88
+ // const float* const __restrict pqFineCentroids,
89
+ // const uint8_t* const __restrict code,
90
+ // float* const __restrict outputStore);
91
+ // }
92
+ //
93
+ // * ::accum(), which is used to create a linear combination
94
+ // of decoded vectors:
95
+ // {
96
+ // const faiss::Index* const index;
97
+ // const uint8_t* const input;
98
+ // float weight;
99
+ //
100
+ // std::vector<float> buffer(d, 0);
101
+ //
102
+ // index->sa_decode(1, input, buffer.data());
103
+ // for (size_t iDim = 0; iDim < d; iDim++)
104
+ // output[iDim] += weight * buffer[iDim];
105
+ // }
106
+ // The method signature is the following:
107
+ // {
108
+ // static void accum(
109
+ // const float* const __restrict pqCoarseCentroids,
110
+ // const float* const __restrict pqFineCentroids,
111
+ // const uint8_t* const __restrict code,
112
+ // const float weight,
113
+ // float* const __restrict outputAccum);
114
+ // }
115
+ //
116
+ // * There is an additional overload for ::accum() that decodes two vectors
117
+ // per call. This provides an additional speedup because of a CPU
118
+ // superscalar architecture:
119
+ // {
120
+ // const faiss::Index* const index;
121
+ // const uint8_t* const input0;
122
+ // float weight0;
123
+ // const uint8_t* const input1;
124
+ // float weight1;
125
+ //
126
+ // std::vector<float> buffer(d, 0);
127
+ //
128
+ // index->sa_decode(1, input0, buffer.data());
129
+ // for (size_t iDim = 0; iDim < d; iDim++)
130
+ // output[iDim] += weight0 * buffer[iDim];
131
+ //
132
+ // index->sa_decode(1, input1, buffer.data());
133
+ // for (size_t iDim = 0; iDim < d; iDim++)
134
+ // output[iDim] += weight1 * buffer[iDim];
135
+ // }
136
+ // If each code uses its own coarse quantizer centroids table and its own fine
137
+ // quantizer centroids table, then the following overload can be used:
138
+ // {
139
+ // static void accum(
140
+ // const float* const __restrict pqCoarseCentroids0,
141
+ // const float* const __restrict pqFineCentroids0,
142
+ // const uint8_t* const __restrict code0,
143
+ // const float weight0,
144
+ // const float* const __restrict pqCoarseCentroids1,
145
+ // const float* const __restrict pqFineCentroids1,
146
+ // const uint8_t* const __restrict code1,
147
+ // const float weight1,
148
+ // float* const __restrict outputAccum);
149
+ // }
150
+ // If codes share the coarse quantizer centroids table and also share
151
+ // the fine quantizer centroids table, then the following overload can be
152
+ // used:
153
+ // {
154
+ // static void accum(
155
+ // const float* const __restrict pqCoarseCentroids,
156
+ // const float* const __restrict pqFineCentroids,
157
+ // const uint8_t* const __restrict code0,
158
+ // const float weight0,
159
+ // const uint8_t* const __restrict code1,
160
+ // const float weight1,
161
+ // float* const __restrict outputAccum);
162
+ // }
163
+ //
164
+ // * And one more overload for ::accum() that decodes and accumulates
165
+ // three vectors per call.
166
+ // {
167
+ // const faiss::Index* const index;
168
+ // const uint8_t* const input0;
169
+ // float weight0;
170
+ // const uint8_t* const input1;
171
+ // float weight1;
172
+ // const uint8_t* const input2;
173
+ // float weight2;
174
+ //
175
+ // std::vector<float> buffer(d, 0);
176
+ //
177
+ // index->sa_decode(1, input0, buffer.data());
178
+ // for (size_t iDim = 0; iDim < d; iDim++)
179
+ // output[iDim] += weight0 * buffer[iDim];
180
+ //
181
+ // index->sa_decode(1, input1, buffer.data());
182
+ // for (size_t iDim = 0; iDim < d; iDim++)
183
+ // output[iDim] += weight1 * buffer[iDim];
184
+ //
185
+ // index->sa_decode(1, input2, buffer.data());
186
+ // for (size_t iDim = 0; iDim < d; iDim++)
187
+ // output[iDim] += weight2 * buffer[iDim];
188
+ // }
189
+ //
190
+ // If each code uses its own coarse quantizer centroids table and its own fine
191
+ // quantizer centroids table, then the following overload can be used:
192
+ // {
193
+ // static void accum(
194
+ // const float* const __restrict pqCoarseCentroids0,
195
+ // const float* const __restrict pqFineCentroids0,
196
+ // const uint8_t* const __restrict code0,
197
+ // const float weight0,
198
+ // const float* const __restrict pqCoarseCentroids1,
199
+ // const float* const __restrict pqFineCentroids1,
200
+ // const uint8_t* const __restrict code1,
201
+ // const float weight1,
202
+ // const float* const __restrict pqCoarseCentroids2,
203
+ // const float* const __restrict pqFineCentroids2,
204
+ // const uint8_t* const __restrict code2,
205
+ // const float weight2,
206
+ // float* const __restrict outputAccum);
207
+ // }
208
+ // If codes share the coarse quantizer centroids table and also share
209
+ // the fine quantizer centroids table, then the following overload can be
210
+ // used:
211
+ // {
212
+ // static void accum(
213
+ // const float* const __restrict pqCoarseCentroids,
214
+ // const float* const __restrict pqFineCentroids,
215
+ // const uint8_t* const __restrict code0,
216
+ // const float weight0,
217
+ // const uint8_t* const __restrict code1,
218
+ // const float weight1,
219
+ // const uint8_t* const __restrict code2,
220
+ // const float weight2,
221
+ // float* const __restrict outputAccum);
222
+ // }
223
+ //
224
+ // The provided version is not multithreaded.
225
+ //
226
+ // Currently, an AVX2+FMA implementation is available. AVX512 version is also
227
+ // doable, but it was found to be slower than AVX2 for real world applications
228
+ // that I needed.
229
+ //
230
+ ////////////////////////////////////////////////////////////////////////////////////
231
+ //
232
+ // It is possible to use an additional index wrapper on top of IVFPQ /
233
+ // Residual+PQ, known as IndexRowwiseMinMax / IndexRowwiseMinMaxFP16. Index
234
+ // wrapper that performs rowwise normalization to [0,1], preserving the
235
+ // coefficients. This is a vector codec index only.
236
+ // For more details please refer to the description in
237
+ // faiss/IndexRowwiseMinMax.h file.
238
+ //
239
+ // If such a wrapper is used, then the quantizer will look like, say,
240
+ // MinMaxFP16,IVF256,PQ32np
241
+ // or
242
+ // MinMax,PQ16np
243
 + // In this case, please use the following construction for the decoding,
244
+ // basically, wrapping a kernel in a kernel:
245
+ // {
246
+ // using SubT = faiss::cppcontrib::Index2LevelDecoder<128, 128, 2>;
247
+ // using T = faiss::cppcontrib::IndexMinMaxFP16Decoder<SubT>;
248
+ // // do T::store(...) or T::accum(...)
249
+ // }
250
+ //
251
+ // T::accum(...) contains an additional function variable which is
252
+ // used for accumulating scaling. Thus, the code pattern is the following:
253
+ // {
254
+ // const float* const __restrict pqCoarseCentroidsQ;
255
+ // const float* const __restrict pqFineCentroidsQ;
256
+ // const uint8_t* const __restrict input;
257
+ // const float* const __restrict weights;
258
+ // float* const __restrict output;
259
+ // float outputAccumMin = 0;
260
+ //
261
+ // for (size_t i = 0; i < n; i++) {
262
+ // T::accum(
263
+ // pqCoarseCentroidsQ,
264
+ // pqFineCentroidsQ,
265
+ // input + i * code_size,
266
+ // weights[i],
267
+ // output,
268
+ // outputAccumMin);
269
+ // }
270
+ // for (size_t j = 0; j < d; j++)
271
+ // output[j] += outputAccumMin;
272
+ // }
273
+ // This is similar to the following regular pseudo-code:
274
+ // {
275
+ // const faiss::Index* const index;
276
+ // const uint8_t* const __restrict input;
277
+ // const float* const __restrict weights;
278
+ // float* const __restrict output;
279
+ //
280
+ // for (size_t i = 0; i < n; i++) {
281
+ // std::vector<float> buffer(d, 0);
282
+ //
283
+ // index->sa_decode(1, input + i * code_size, buffer.data());
284
+ // for (size_t j = 0; j < d; j++)
285
+ // output[j] += weights[i] * buffer[j];
286
+ // }
287
+
288
+ #include <faiss/cppcontrib/sa_decode/MinMax-inl.h>
289
+ #include <faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h>
290
+
291
+ #ifdef __AVX2__
292
+ #include <faiss/cppcontrib/sa_decode/Level2-avx2-inl.h>
293
+ #include <faiss/cppcontrib/sa_decode/PQ-avx2-inl.h>
294
+ #elif defined(__ARM_NEON)
295
+ #include <faiss/cppcontrib/sa_decode/Level2-neon-inl.h>
296
+ #include <faiss/cppcontrib/sa_decode/PQ-neon-inl.h>
297
+ #else
298
+ #include <faiss/cppcontrib/sa_decode/Level2-inl.h>
299
+ #include <faiss/cppcontrib/sa_decode/PQ-inl.h>
300
+ #endif
@@ -0,0 +1,24 @@
1
+ #pragma once
2
+
3
+ #include <cstdint>
4
+
5
+ namespace faiss {
6
+ namespace cppcontrib {
7
+ namespace detail {
8
+
9
+ template <int COARSE_BITS>
10
+ struct CoarseBitType {};
11
+
12
+ template <>
13
+ struct CoarseBitType<8> {
14
+ using bit_type = uint8_t;
15
+ };
16
+
17
+ template <>
18
+ struct CoarseBitType<16> {
19
+ using bit_type = uint16_t;
20
+ };
21
+
22
+ } // namespace detail
23
+ } // namespace cppcontrib
24
+ } // namespace faiss
@@ -0,0 +1,195 @@
1
+ #pragma once
2
+
3
+ #include <cstdint>
4
+
5
+ namespace faiss {
6
+ namespace cppcontrib {
7
+ namespace detail {
8
+
9
+ namespace {
10
+
11
+ template <intptr_t N_ELEMENTS, intptr_t CPOS>
12
+ struct Uint8Reader {
13
+ static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");
14
+
15
+ static intptr_t get(const uint8_t* const __restrict codes) {
16
+ // Read using 4-bytes, if possible.
17
+ // Reading using 8-byte takes too many registers somewhy.
18
+
19
+ constexpr intptr_t ELEMENT_TO_READ = CPOS / 4;
20
+ constexpr intptr_t SUB_ELEMENT = CPOS % 4;
21
+
22
+ switch (SUB_ELEMENT) {
23
+ case 0: {
24
+ if (N_ELEMENTS > CPOS + 3) {
25
+ const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
26
+ codes + ELEMENT_TO_READ * 4);
27
+ return (code32 & 0x000000FF);
28
+ } else {
29
+ return codes[CPOS];
30
+ }
31
+ }
32
+ case 1: {
33
+ if (N_ELEMENTS > CPOS + 2) {
34
+ const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
35
+ codes + ELEMENT_TO_READ * 4);
36
+ return (code32 & 0x0000FF00) >> 8;
37
+ } else {
38
+ return codes[CPOS];
39
+ }
40
+ }
41
+ case 2: {
42
+ if (N_ELEMENTS > CPOS + 1) {
43
+ const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
44
+ codes + ELEMENT_TO_READ * 4);
45
+ return (code32 & 0x00FF0000) >> 16;
46
+ } else {
47
+ return codes[CPOS];
48
+ }
49
+ }
50
+ case 3: {
51
+ if (N_ELEMENTS > CPOS) {
52
+ const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
53
+ codes + ELEMENT_TO_READ * 4);
54
+ return (code32) >> 24;
55
+ } else {
56
+ return codes[CPOS];
57
+ }
58
+ }
59
+ }
60
+ }
61
+ };
62
+
63
+ // reduces the number of read operations from RAM
64
+ ///////////////////////////////////////////////
65
+ // 76543210 76543210 76543210 76543210 76543210
66
+ // 00000000 00
67
+ // 111111 1111
68
+ // 2222 222222
69
+ // 33 33333333
70
+ template <intptr_t N_ELEMENTS, intptr_t CPOS>
71
+ struct Uint10Reader {
72
+ static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");
73
+
74
+ static intptr_t get(const uint8_t* const __restrict codes) {
75
+ // Read using 4-bytes or 2-bytes.
76
+
77
+ constexpr intptr_t ELEMENT_TO_READ = CPOS / 4;
78
+ constexpr intptr_t SUB_ELEMENT = CPOS % 4;
79
+
80
+ switch (SUB_ELEMENT) {
81
+ case 0: {
82
+ if (N_ELEMENTS > CPOS + 2) {
83
+ const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
84
+ codes + ELEMENT_TO_READ * 5);
85
+ return (code32 & 0b0000001111111111);
86
+ } else {
87
+ const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
88
+ codes + ELEMENT_TO_READ * 5 + 0);
89
+ return (code16 & 0b0000001111111111);
90
+ }
91
+ }
92
+ case 1: {
93
+ if (N_ELEMENTS > CPOS + 1) {
94
+ const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
95
+ codes + ELEMENT_TO_READ * 5);
96
+ return (code32 & 0b000011111111110000000000) >> 10;
97
+ } else {
98
+ const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
99
+ codes + ELEMENT_TO_READ * 5 + 1);
100
+ return (code16 & 0b0000111111111100) >> 2;
101
+ }
102
+ }
103
+ case 2: {
104
+ if (N_ELEMENTS > CPOS) {
105
+ const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
106
+ codes + ELEMENT_TO_READ * 5);
107
+ return (code32 & 0b00111111111100000000000000000000) >> 20;
108
+ } else {
109
+ const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
110
+ codes + ELEMENT_TO_READ * 5 + 2);
111
+ return (code16 & 0b0011111111110000) >> 4;
112
+ }
113
+ }
114
+ case 3: {
115
+ const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
116
+ codes + ELEMENT_TO_READ * 5 + 3);
117
+ return (code16 & 0b1111111111000000) >> 6;
118
+ }
119
+ }
120
+ }
121
+ };
122
+
123
+ // reduces the number of read operations from RAM
124
+ template <intptr_t N_ELEMENTS, intptr_t CPOS>
125
+ struct Uint16Reader {
126
+ static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");
127
+
128
+ static intptr_t get(const uint8_t* const __restrict codes) {
129
+ // Read using 4-bytes or 2-bytes.
130
+ // Reading using 8-byte takes too many registers somewhy.
131
+
132
+ constexpr intptr_t ELEMENT_TO_READ = CPOS / 2;
133
+ constexpr intptr_t SUB_ELEMENT = CPOS % 2;
134
+
135
+ switch (SUB_ELEMENT) {
136
+ case 0: {
137
+ if (N_ELEMENTS > CPOS + 1) {
138
+ const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
139
+ codes + ELEMENT_TO_READ * 4);
140
+ return (code32 & 0x0000FFFF);
141
+ } else {
142
+ const uint16_t* const __restrict codesFp16 =
143
+ reinterpret_cast<const uint16_t*>(codes);
144
+ return codesFp16[CPOS];
145
+ }
146
+ }
147
+ case 1: {
148
+ if (N_ELEMENTS > CPOS) {
149
+ const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
150
+ codes + ELEMENT_TO_READ * 4);
151
+ return code32 >> 16;
152
+ } else {
153
+ const uint16_t* const __restrict codesFp16 =
154
+ reinterpret_cast<const uint16_t*>(codes);
155
+ return codesFp16[CPOS];
156
+ }
157
+ }
158
+ }
159
+ }
160
+ };
161
+
162
+ //
163
+ template <intptr_t N_ELEMENTS, intptr_t CODE_BITS, intptr_t CPOS>
164
+ struct UintReaderImplType {};
165
+
166
+ template <intptr_t N_ELEMENTS, intptr_t CPOS>
167
+ struct UintReaderImplType<N_ELEMENTS, 8, CPOS> {
168
+ using reader_type = Uint8Reader<N_ELEMENTS, CPOS>;
169
+ };
170
+
171
+ template <intptr_t N_ELEMENTS, intptr_t CPOS>
172
+ struct UintReaderImplType<N_ELEMENTS, 10, CPOS> {
173
+ using reader_type = Uint10Reader<N_ELEMENTS, CPOS>;
174
+ };
175
+
176
+ template <intptr_t N_ELEMENTS, intptr_t CPOS>
177
+ struct UintReaderImplType<N_ELEMENTS, 16, CPOS> {
178
+ using reader_type = Uint16Reader<N_ELEMENTS, CPOS>;
179
+ };
180
+
181
+ } // namespace
182
+
183
+ // reduces the number of read operations from RAM
184
+ template <intptr_t DIM, intptr_t CODE_SIZE, intptr_t CODE_BITS, intptr_t CPOS>
185
+ using UintReader =
186
+ typename UintReaderImplType<DIM / CODE_SIZE, CODE_BITS, CPOS>::
187
+ reader_type;
188
+
189
+ template <intptr_t N_ELEMENTS, intptr_t CODE_BITS, intptr_t CPOS>
190
+ using UintReaderRaw =
191
+ typename UintReaderImplType<N_ELEMENTS, CODE_BITS, CPOS>::reader_type;
192
+
193
+ } // namespace detail
194
+ } // namespace cppcontrib
195
+ } // namespace faiss