faiss 0.2.4 → 0.2.5

Files changed (177)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +23 -21
  4. data/ext/faiss/extconf.rb +11 -0
  5. data/ext/faiss/index.cpp +4 -4
  6. data/ext/faiss/index_binary.cpp +6 -6
  7. data/ext/faiss/product_quantizer.cpp +4 -4
  8. data/lib/faiss/version.rb +1 -1
  9. data/vendor/faiss/faiss/AutoTune.cpp +13 -0
  10. data/vendor/faiss/faiss/IVFlib.cpp +101 -2
  11. data/vendor/faiss/faiss/IVFlib.h +26 -2
  12. data/vendor/faiss/faiss/Index.cpp +36 -3
  13. data/vendor/faiss/faiss/Index.h +43 -6
  14. data/vendor/faiss/faiss/Index2Layer.cpp +6 -2
  15. data/vendor/faiss/faiss/Index2Layer.h +6 -1
  16. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +219 -16
  17. data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +63 -5
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +299 -0
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +199 -0
  20. data/vendor/faiss/faiss/IndexBinary.cpp +20 -4
  21. data/vendor/faiss/faiss/IndexBinary.h +18 -3
  22. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +9 -2
  23. data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -2
  24. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -1
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -1
  26. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +5 -1
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -1
  28. data/vendor/faiss/faiss/IndexBinaryHash.cpp +17 -4
  29. data/vendor/faiss/faiss/IndexBinaryHash.h +8 -4
  30. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +28 -13
  31. data/vendor/faiss/faiss/IndexBinaryIVF.h +10 -7
  32. data/vendor/faiss/faiss/IndexFastScan.cpp +626 -0
  33. data/vendor/faiss/faiss/IndexFastScan.h +145 -0
  34. data/vendor/faiss/faiss/IndexFlat.cpp +34 -21
  35. data/vendor/faiss/faiss/IndexFlat.h +7 -4
  36. data/vendor/faiss/faiss/IndexFlatCodes.cpp +35 -1
  37. data/vendor/faiss/faiss/IndexFlatCodes.h +12 -0
  38. data/vendor/faiss/faiss/IndexHNSW.cpp +66 -138
  39. data/vendor/faiss/faiss/IndexHNSW.h +4 -2
  40. data/vendor/faiss/faiss/IndexIDMap.cpp +247 -0
  41. data/vendor/faiss/faiss/IndexIDMap.h +107 -0
  42. data/vendor/faiss/faiss/IndexIVF.cpp +121 -33
  43. data/vendor/faiss/faiss/IndexIVF.h +35 -16
  44. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +84 -7
  45. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +63 -1
  46. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +590 -0
  47. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +171 -0
  48. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +1290 -0
  49. data/vendor/faiss/faiss/IndexIVFFastScan.h +213 -0
  50. data/vendor/faiss/faiss/IndexIVFFlat.cpp +37 -17
  51. data/vendor/faiss/faiss/IndexIVFFlat.h +4 -2
  52. data/vendor/faiss/faiss/IndexIVFPQ.cpp +234 -50
  53. data/vendor/faiss/faiss/IndexIVFPQ.h +5 -1
  54. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +23 -852
  55. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +7 -112
  56. data/vendor/faiss/faiss/IndexIVFPQR.cpp +3 -3
  57. data/vendor/faiss/faiss/IndexIVFPQR.h +1 -1
  58. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +3 -1
  59. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -1
  60. data/vendor/faiss/faiss/IndexLSH.cpp +4 -2
  61. data/vendor/faiss/faiss/IndexLSH.h +2 -1
  62. data/vendor/faiss/faiss/IndexLattice.cpp +7 -1
  63. data/vendor/faiss/faiss/IndexLattice.h +3 -1
  64. data/vendor/faiss/faiss/IndexNNDescent.cpp +4 -3
  65. data/vendor/faiss/faiss/IndexNNDescent.h +2 -1
  66. data/vendor/faiss/faiss/IndexNSG.cpp +37 -3
  67. data/vendor/faiss/faiss/IndexNSG.h +25 -1
  68. data/vendor/faiss/faiss/IndexPQ.cpp +106 -69
  69. data/vendor/faiss/faiss/IndexPQ.h +19 -5
  70. data/vendor/faiss/faiss/IndexPQFastScan.cpp +15 -450
  71. data/vendor/faiss/faiss/IndexPQFastScan.h +15 -78
  72. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -8
  73. data/vendor/faiss/faiss/IndexPreTransform.h +15 -3
  74. data/vendor/faiss/faiss/IndexRefine.cpp +8 -4
  75. data/vendor/faiss/faiss/IndexRefine.h +4 -2
  76. data/vendor/faiss/faiss/IndexReplicas.cpp +4 -2
  77. data/vendor/faiss/faiss/IndexReplicas.h +2 -1
  78. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +438 -0
  79. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +92 -0
  80. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +26 -15
  81. data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -7
  82. data/vendor/faiss/faiss/IndexShards.cpp +4 -1
  83. data/vendor/faiss/faiss/IndexShards.h +2 -1
  84. data/vendor/faiss/faiss/MetaIndexes.cpp +5 -178
  85. data/vendor/faiss/faiss/MetaIndexes.h +3 -81
  86. data/vendor/faiss/faiss/VectorTransform.cpp +43 -0
  87. data/vendor/faiss/faiss/VectorTransform.h +22 -4
  88. data/vendor/faiss/faiss/clone_index.cpp +23 -1
  89. data/vendor/faiss/faiss/clone_index.h +3 -0
  90. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +300 -0
  91. data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +24 -0
  92. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +195 -0
  93. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2058 -0
  94. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +408 -0
  95. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2147 -0
  96. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +460 -0
  97. data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +465 -0
  98. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +1618 -0
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +251 -0
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +1452 -0
  101. data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +1 -0
  102. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +0 -4
  103. data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
  104. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -1
  105. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +10 -8
  106. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +75 -14
  107. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +19 -32
  108. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -31
  109. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +22 -28
  110. data/vendor/faiss/faiss/gpu/GpuResources.cpp +14 -0
  111. data/vendor/faiss/faiss/gpu/GpuResources.h +16 -3
  112. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +3 -3
  113. data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +32 -0
  114. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -0
  115. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +311 -75
  116. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +10 -0
  117. data/vendor/faiss/faiss/gpu/test/TestUtils.h +3 -0
  118. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
  119. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +5 -4
  120. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +116 -47
  121. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +44 -13
  122. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +0 -54
  123. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -76
  124. data/vendor/faiss/faiss/impl/DistanceComputer.h +64 -0
  125. data/vendor/faiss/faiss/impl/HNSW.cpp +123 -27
  126. data/vendor/faiss/faiss/impl/HNSW.h +19 -16
  127. data/vendor/faiss/faiss/impl/IDSelector.cpp +125 -0
  128. data/vendor/faiss/faiss/impl/IDSelector.h +135 -0
  129. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +6 -28
  130. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +6 -1
  131. data/vendor/faiss/faiss/impl/LookupTableScaler.h +77 -0
  132. data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -0
  133. data/vendor/faiss/faiss/impl/NSG.cpp +1 -1
  134. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +383 -0
  135. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +154 -0
  136. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +225 -145
  137. data/vendor/faiss/faiss/impl/ProductQuantizer.h +29 -10
  138. data/vendor/faiss/faiss/impl/Quantizer.h +43 -0
  139. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +192 -36
  140. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +40 -20
  141. data/vendor/faiss/faiss/impl/ResultHandler.h +96 -0
  142. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +97 -173
  143. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +18 -18
  144. data/vendor/faiss/faiss/impl/index_read.cpp +240 -9
  145. data/vendor/faiss/faiss/impl/index_write.cpp +237 -5
  146. data/vendor/faiss/faiss/impl/kmeans1d.cpp +6 -4
  147. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +56 -16
  148. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +25 -8
  149. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +66 -25
  150. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +75 -27
  151. data/vendor/faiss/faiss/index_factory.cpp +196 -7
  152. data/vendor/faiss/faiss/index_io.h +5 -0
  153. data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -0
  154. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +4 -1
  155. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -1
  156. data/vendor/faiss/faiss/python/python_callbacks.cpp +27 -0
  157. data/vendor/faiss/faiss/python/python_callbacks.h +15 -0
  158. data/vendor/faiss/faiss/utils/Heap.h +31 -15
  159. data/vendor/faiss/faiss/utils/distances.cpp +380 -56
  160. data/vendor/faiss/faiss/utils/distances.h +113 -15
  161. data/vendor/faiss/faiss/utils/distances_simd.cpp +726 -6
  162. data/vendor/faiss/faiss/utils/extra_distances.cpp +12 -7
  163. data/vendor/faiss/faiss/utils/extra_distances.h +3 -1
  164. data/vendor/faiss/faiss/utils/fp16-fp16c.h +21 -0
  165. data/vendor/faiss/faiss/utils/fp16-inl.h +101 -0
  166. data/vendor/faiss/faiss/utils/fp16.h +11 -0
  167. data/vendor/faiss/faiss/utils/hamming-inl.h +54 -0
  168. data/vendor/faiss/faiss/utils/hamming.cpp +0 -48
  169. data/vendor/faiss/faiss/utils/ordered_key_value.h +10 -0
  170. data/vendor/faiss/faiss/utils/quantize_lut.cpp +62 -0
  171. data/vendor/faiss/faiss/utils/quantize_lut.h +20 -0
  172. data/vendor/faiss/faiss/utils/random.cpp +53 -0
  173. data/vendor/faiss/faiss/utils/random.h +5 -0
  174. data/vendor/faiss/faiss/utils/simdlib_avx2.h +4 -0
  175. data/vendor/faiss/faiss/utils/simdlib_emulated.h +6 -1
  176. data/vendor/faiss/faiss/utils/simdlib_neon.h +7 -2
  177. metadata +37 -3
data/vendor/faiss/faiss/VectorTransform.h
@@ -43,19 +43,27 @@ struct VectorTransform {
      */
     virtual void train(idx_t n, const float* x);
 
-    /** apply the random rotation, return new allocated matrix
-     * @param x size n * d_in
-     * @return size n * d_out
+    /** apply the transformation and return the result in an allocated pointer
+     * @param n number of vectors to transform
+     * @param x input vectors, size n * d_in
+     * @return output vectors, size n * d_out
      */
     float* apply(idx_t n, const float* x) const;
 
-    /// same as apply, but result is pre-allocated
+    /** apply the transformation and return the result in a provided matrix
+     * @param n number of vectors to transform
+     * @param x input vectors, size n * d_in
+     * @param xt output vectors, size n * d_out
+     */
     virtual void apply_noalloc(idx_t n, const float* x, float* xt) const = 0;
 
     /// reverse transformation. May not be implemented or may return
     /// approximate result
     virtual void reverse_transform(idx_t n, const float* xt, float* x) const;
 
+    // check that the two transforms are identical (to merge indexes)
+    virtual void check_identical(const VectorTransform& other) const = 0;
+
     virtual ~VectorTransform() {}
 };
 
@@ -100,6 +108,8 @@ struct LinearTransform : VectorTransform {
             int n,
             int d) const;
 
+    void check_identical(const VectorTransform& other) const override;
+
     ~LinearTransform() override {}
 };
 
@@ -207,6 +217,8 @@ struct ITQTransform : VectorTransform {
     void train(idx_t n, const float* x) override;
 
     void apply_noalloc(idx_t n, const float* x, float* xt) const override;
+
+    void check_identical(const VectorTransform& other) const override;
 };
 
 struct ProductQuantizer;
@@ -260,6 +272,8 @@ struct RemapDimensionsTransform : VectorTransform {
     void reverse_transform(idx_t n, const float* xt, float* x) const override;
 
     RemapDimensionsTransform() {}
+
+    void check_identical(const VectorTransform& other) const override;
 };
 
 /** per-vector normalization */
@@ -273,6 +287,8 @@ struct NormalizationTransform : VectorTransform {
 
     /// Identity transform since norm is not revertible
     void reverse_transform(idx_t n, const float* xt, float* x) const override;
+
+    void check_identical(const VectorTransform& other) const override;
 };
 
 /** Subtract the mean of each component from the vectors. */
@@ -290,6 +306,8 @@ struct CenteringTransform : VectorTransform {
 
     /// add the mean
     void reverse_transform(idx_t n, const float* xt, float* x) const override;
+
+    void check_identical(const VectorTransform& other) const override;
 };
 
 } // namespace faiss
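
As a usage note for the clarified contract above: a minimal sketch, assuming a PCAMatrix (one of the LinearTransform subclasses) and hypothetical sizes, showing that apply() allocates the output matrix and hands ownership to the caller, while apply_noalloc() writes into a caller-provided buffer.

#include <cstdint>
#include <vector>
#include <faiss/VectorTransform.h>

void transform_example() {
    const int64_t n = 1000;          // number of vectors (hypothetical)
    const int d_in = 64, d_out = 16; // dimensions (hypothetical)
    std::vector<float> x(n * d_in);  // input vectors, filled elsewhere

    faiss::PCAMatrix pca(d_in, d_out);
    pca.train(n, x.data());

    // apply(): returns a newly allocated n * d_out matrix; caller frees it.
    float* xt_owned = pca.apply(n, x.data());
    delete[] xt_owned;

    // apply_noalloc(): writes into a pre-allocated n * d_out buffer.
    std::vector<float> xt(n * d_out);
    pca.apply_noalloc(n, x.data(), xt.data());
}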
data/vendor/faiss/faiss/clone_index.cpp
@@ -32,6 +32,11 @@
 #include <faiss/MetaIndexes.h>
 #include <faiss/VectorTransform.h>
 
+#include <faiss/impl/LocalSearchQuantizer.h>
+#include <faiss/impl/ProductQuantizer.h>
+#include <faiss/impl/ResidualQuantizer.h>
+#include <faiss/impl/ScalarQuantizer.h>
+
 namespace faiss {
 
 /*************************************************************
@@ -117,7 +122,9 @@ Index* Cloner::clone_Index(const Index* index) {
         return res;
     } else if (
             const IndexIDMap* idmap = dynamic_cast<const IndexIDMap*>(index)) {
-        IndexIDMap* res = new IndexIDMap(*idmap);
+        const IndexIDMap2* idmap2 = dynamic_cast<const IndexIDMap2*>(index);
+        IndexIDMap* res =
+                idmap2 ? new IndexIDMap2(*idmap2) : new IndexIDMap(*idmap);
         res->own_fields = true;
         res->index = clone_Index(idmap->index);
         return res;
@@ -137,6 +144,13 @@ Index* Cloner::clone_Index(const Index* index) {
         res->own_fields = true;
         res->storage = clone_Index(insg->storage);
         return res;
+    } else if (
+            const IndexNNDescent* innd =
+                    dynamic_cast<const IndexNNDescent*>(index)) {
+        IndexNNDescent* res = new IndexNNDescent(*innd);
+        res->own_fields = true;
+        res->storage = clone_Index(innd->storage);
+        return res;
     } else if (
             const Index2Layer* i2l = dynamic_cast<const Index2Layer*>(index)) {
         Index2Layer* res = new Index2Layer(*i2l);
@@ -149,4 +163,12 @@ Index* Cloner::clone_Index(const Index* index) {
     return nullptr;
 }
 
+Quantizer* clone_Quantizer(const Quantizer* quant) {
+    TRYCLONE(ResidualQuantizer, quant)
+    TRYCLONE(LocalSearchQuantizer, quant)
+    TRYCLONE(ProductQuantizer, quant)
+    TRYCLONE(ScalarQuantizer, quant)
+    FAISS_THROW_MSG("Did not recognize quantizer to clone");
+}
+
 } // namespace faiss
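
The new clone_Quantizer relies on a TRYCLONE macro defined earlier in clone_index.cpp, which this hunk does not show. A hedged reconstruction of what such a dynamic_cast-and-copy dispatch macro presumably looks like:

// Hypothetical reconstruction of the macro, for illustration only:
#define TRYCLONE(classname, obj)                                      \
    if (const classname* clo = dynamic_cast<const classname*>(obj)) { \
        return new classname(*clo);                                   \
    } else
// clone_Quantizer then expands into a chain of dynamic_casts over the four
// quantizer types, falling through to FAISS_THROW_MSG when none matches.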
data/vendor/faiss/faiss/clone_index.h
@@ -16,6 +16,7 @@ namespace faiss {
 struct Index;
 struct IndexIVF;
 struct VectorTransform;
+struct Quantizer;
 
 /* cloning functions */
 Index* clone_index(const Index*);
@@ -30,4 +31,6 @@ struct Cloner {
     virtual ~Cloner() {}
 };
 
+Quantizer* clone_Quantizer(const Quantizer* quant);
+
 } // namespace faiss
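
A short usage sketch for the newly exported function; the ResidualQuantizer parameters here are hypothetical:

#include <faiss/clone_index.h>
#include <faiss/impl/ResidualQuantizer.h>

void clone_quantizer_example() {
    faiss::ResidualQuantizer rq(128, 4, 8); // d=128, 4 codebooks of 8 bits
    faiss::Quantizer* copy = faiss::clone_Quantizer(&rq);
    // ... use the deep copy independently of rq ...
    delete copy;
}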
data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h (new file)
@@ -0,0 +1,300 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+// This file contains a custom fast implementation of the
+// faiss::Index::sa_decode() function for the following index families:
+// * IVF256,PQ[1]x8np
+// * Residual[1]x8,PQ[2]x8
+// * IVF[2^9-2^16 bit],PQ[1]x8 (such as IVF1024,PQ16np)
+// * Residual1x[9-16 bit],PQ[1]x8 (such as Residual1x9,PQ8)
+// * PQ[1]x8
+// Additionally, AVX2 and ARM versions support
+// * Residual[1]x8,PQ[2]x10
+// * Residual[1]x8,PQ[2]x16
+// * Residual[1]x10,PQ[2]x10
+// * Residual[1]x10,PQ[2]x16
+// * Residual[1]x16,PQ[2]x10
+// * Residual[1]x16,PQ[2]x16
+// * Residual1x[9-16 bit],PQ[1]x10 (such as Residual1x9,PQ16x10)
+//   * (use with COARSE_BITS=16)
+// * Residual1x[9-16 bit],PQ[1]x16 (such as Residual1x9,PQ16x16)
+//   * (use with COARSE_BITS=16)
+// * PQ[1]x10
+// * PQ[1]x16
+// Unfortunately, Faiss does not currently support something like
+// IVF256,PQ16x10np.
+//
+// The goal was to achieve maximum performance, hence the templated version.
+// The provided index families share the same code for sa_decode.
+//
+// The front-end code provides two high-level structures.
+//
+// First one:
+// {
+//     template <
+//             intptr_t DIM,
+//             intptr_t COARSE_SIZE,
+//             intptr_t FINE_SIZE,
+//             intptr_t COARSE_BITS = 8,
+//             intptr_t FINE_BITS = 8>
+//     struct Index2LevelDecoder { /*...*/ };
+// }
+// * DIM is the dimensionality of the data
+// * COARSE_SIZE is the dimensionality of the coarse quantizer (IVF, Residual)
+// * FINE_SIZE is the dimensionality of the ProductQuantizer dsq
+// * COARSE_BITS is the number of bits that are needed to represent a coarse
+//   quantizer code.
+// * FINE_BITS is the number of bits that are needed to represent a fine
+//   quantizer code.
+// For example, "IVF256,PQ8np" for 160-dim data translates into
+//     Index2LevelDecoder<160,160,20,8>
+// For example, "Residual4x8,PQ16" for 256-dim data translates into
+//     Index2LevelDecoder<256,64,16,8>
+// For example, "IVF1024,PQ16np" for 256-dim data translates into
+//     Index2LevelDecoder<256,256,16,10>. But as there is only 1 coarse code
+//     element, Index2LevelDecoder<256,256,16,16> can be used as a faster
+//     decoder.
+// For example, "Residual4x10,PQ16x10np" for 256-dim data translates into
+//     Index2LevelDecoder<256,64,16,10,10>
+//
+// Additional supported values for COARSE_BITS and FINE_BITS may be added later.
+//
+// Second one:
+// {
+//     template <
+//             intptr_t DIM,
+//             intptr_t FINE_SIZE,
+//             intptr_t FINE_BITS = 8>
+//     struct IndexPQDecoder { /*...*/ };
+// }
+// * DIM is the dimensionality of the data
+// * FINE_SIZE is the dimensionality of the ProductQuantizer dsq
+// * FINE_BITS is the number of bits that are needed to represent a fine
+//   quantizer code.
+// For example, "PQ8np" for 160-dim data translates into
+//     IndexPQDecoder<160,20>
+//
+// Unlike the general purpose version in faiss::Index::sa_decode(),
+// this version provides the following functions (please note that
+// the pqCoarseCentroids params are not available for IndexPQDecoder,
+// but the functionality is otherwise the same as for Index2LevelDecoder):
+//
+// * ::store(), which is similar to sa_decode(1, input, output).
+//   The method signature is the following:
+// {
+//     void store(
+//             const float* const __restrict pqCoarseCentroids,
+//             const float* const __restrict pqFineCentroids,
+//             const uint8_t* const __restrict code,
+//             float* const __restrict outputStore);
+// }
+//
+// * ::accum(), which is used to create a linear combination
+//   of decoded vectors:
+// {
+//     const faiss::Index* const index;
+//     const uint8_t* const input;
+//     float weight;
+//
+//     std::vector<float> buffer(d, 0);
+//
+//     index->sa_decode(1, input, buffer.data());
+//     for (size_t iDim = 0; iDim < d; iDim++)
+//         output[iDim] += weight * buffer[iDim];
+// }
+//   The method signature is the following:
+// {
+//     static void accum(
+//             const float* const __restrict pqCoarseCentroids,
+//             const float* const __restrict pqFineCentroids,
+//             const uint8_t* const __restrict code,
+//             const float weight,
+//             float* const __restrict outputAccum);
+// }
+//
+// * There is an additional overload for ::accum() that decodes two vectors
+//   per call. This provides an additional speedup thanks to the CPU's
+//   superscalar architecture:
+// {
+//     const faiss::Index* const index;
+//     const uint8_t* const input0;
+//     float weight0;
+//     const uint8_t* const input1;
+//     float weight1;
+//
+//     std::vector<float> buffer(d, 0);
+//
+//     index->sa_decode(1, input0, buffer.data());
+//     for (size_t iDim = 0; iDim < d; iDim++)
+//         output[iDim] += weight0 * buffer[iDim];
+//
+//     index->sa_decode(1, input1, buffer.data());
+//     for (size_t iDim = 0; iDim < d; iDim++)
+//         output[iDim] += weight1 * buffer[iDim];
+// }
+// If each code uses its own coarse quantizer centroids table and its own fine
+// quantizer centroids table, then the following overload can be used:
+// {
+//     static void accum(
+//             const float* const __restrict pqCoarseCentroids0,
+//             const float* const __restrict pqFineCentroids0,
+//             const uint8_t* const __restrict code0,
+//             const float weight0,
+//             const float* const __restrict pqCoarseCentroids1,
+//             const float* const __restrict pqFineCentroids1,
+//             const uint8_t* const __restrict code1,
+//             const float weight1,
+//             float* const __restrict outputAccum);
+// }
+// If the codes share the coarse quantizer centroids table and also share
+// the fine quantizer centroids table, then the following overload can be
+// used:
+// {
+//     static void accum(
+//             const float* const __restrict pqCoarseCentroids,
+//             const float* const __restrict pqFineCentroids,
+//             const uint8_t* const __restrict code0,
+//             const float weight0,
+//             const uint8_t* const __restrict code1,
+//             const float weight1,
+//             float* const __restrict outputAccum);
+// }
+//
+// * And one more overload for ::accum() that decodes and accumulates
+//   three vectors per call.
+// {
+//     const faiss::Index* const index;
+//     const uint8_t* const input0;
+//     float weight0;
+//     const uint8_t* const input1;
+//     float weight1;
+//     const uint8_t* const input2;
+//     float weight2;
+//
+//     std::vector<float> buffer(d, 0);
+//
+//     index->sa_decode(1, input0, buffer.data());
+//     for (size_t iDim = 0; iDim < d; iDim++)
+//         output[iDim] += weight0 * buffer[iDim];
+//
+//     index->sa_decode(1, input1, buffer.data());
+//     for (size_t iDim = 0; iDim < d; iDim++)
+//         output[iDim] += weight1 * buffer[iDim];
+//
+//     index->sa_decode(1, input2, buffer.data());
+//     for (size_t iDim = 0; iDim < d; iDim++)
+//         output[iDim] += weight2 * buffer[iDim];
+// }
+//
+// If each code uses its own coarse quantizer centroids table and its own fine
+// quantizer centroids table, then the following overload can be used:
+// {
+//     static void accum(
+//             const float* const __restrict pqCoarseCentroids0,
+//             const float* const __restrict pqFineCentroids0,
+//             const uint8_t* const __restrict code0,
+//             const float weight0,
+//             const float* const __restrict pqCoarseCentroids1,
+//             const float* const __restrict pqFineCentroids1,
+//             const uint8_t* const __restrict code1,
+//             const float weight1,
+//             const float* const __restrict pqCoarseCentroids2,
+//             const float* const __restrict pqFineCentroids2,
+//             const uint8_t* const __restrict code2,
+//             const float weight2,
+//             float* const __restrict outputAccum);
+// }
+// If the codes share the coarse quantizer centroids table and also share
+// the fine quantizer centroids table, then the following overload can be
+// used:
+// {
+//     static void accum(
+//             const float* const __restrict pqCoarseCentroids,
+//             const float* const __restrict pqFineCentroids,
+//             const uint8_t* const __restrict code0,
+//             const float weight0,
+//             const uint8_t* const __restrict code1,
+//             const float weight1,
+//             const uint8_t* const __restrict code2,
+//             const float weight2,
+//             float* const __restrict outputAccum);
+// }
+//
+// The provided version is not multithreaded.
+//
+// Currently, an AVX2+FMA implementation is available. An AVX512 version is
+// also doable, but it was found to be slower than AVX2 for the real-world
+// applications that I needed.
+//
+////////////////////////////////////////////////////////////////////////////////////
+//
+// It is possible to use an additional index wrapper on top of IVFPQ /
+// Residual+PQ, known as IndexRowwiseMinMax / IndexRowwiseMinMaxFP16: an index
+// wrapper that performs rowwise normalization to [0,1], preserving the
+// coefficients. This is a vector codec index only.
+// For more details please refer to the description in the
+// faiss/IndexRowwiseMinMax.h file.
+//
+// If such a wrapper is used, then the quantizer will look like, say,
+//     MinMaxFP16,IVF256,PQ32np
+// or
+//     MinMax,PQ16np
+// In this case, please use the following construction for the decoding,
+// basically wrapping a kernel in a kernel:
+// {
+//     using SubT = faiss::cppcontrib::Index2LevelDecoder<128, 128, 2>;
+//     using T = faiss::cppcontrib::IndexMinMaxFP16Decoder<SubT>;
+//     // do T::store(...) or T::accum(...)
+// }
+//
+// T::accum(...) takes an additional in/out argument, which is used to
+// accumulate the scaling. Thus, the code pattern is the following:
+// {
+//     const float* const __restrict pqCoarseCentroidsQ;
+//     const float* const __restrict pqFineCentroidsQ;
+//     const uint8_t* const __restrict input;
+//     const float* const __restrict weights;
+//     float* const __restrict output;
+//     float outputAccumMin = 0;
+//
+//     for (size_t i = 0; i < n; i++) {
+//         T::accum(
+//                 pqCoarseCentroidsQ,
+//                 pqFineCentroidsQ,
+//                 input + i * code_size,
+//                 weights[i],
+//                 output,
+//                 outputAccumMin);
+//     }
+//     for (size_t j = 0; j < d; j++)
+//         output[j] += outputAccumMin;
+// }
+// This is similar to the following regular pseudo-code:
+// {
+//     const faiss::Index* const index;
+//     const uint8_t* const __restrict input;
+//     const float* const __restrict weights;
+//     float* const __restrict output;
+//
+//     for (size_t i = 0; i < n; i++) {
+//         std::vector<float> buffer(d, 0);
+//
+//         index->sa_decode(1, input + i * code_size, buffer.data());
+//         for (size_t j = 0; j < d; j++)
+//             output[j] += weights[i] * buffer[j];
+//     }
+
+#include <faiss/cppcontrib/sa_decode/MinMax-inl.h>
+#include <faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h>
+
+#ifdef __AVX2__
+#include <faiss/cppcontrib/sa_decode/Level2-avx2-inl.h>
+#include <faiss/cppcontrib/sa_decode/PQ-avx2-inl.h>
+#elif defined(__ARM_NEON)
+#include <faiss/cppcontrib/sa_decode/Level2-neon-inl.h>
+#include <faiss/cppcontrib/sa_decode/PQ-neon-inl.h>
+#else
+#include <faiss/cppcontrib/sa_decode/Level2-inl.h>
+#include <faiss/cppcontrib/sa_decode/PQ-inl.h>
+#endif
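
Following the header comment's own example, a minimal sketch of how the decoder is instantiated and used: "IVF256,PQ8np" over 160-dim data maps to Index2LevelDecoder<160, 160, 20>, and ::store() (per the signature quoted in the comment) decodes one code into 160 floats. The centroid tables and the encoded vector are assumed to come from a trained index.

#include <cstdint>
#include <faiss/cppcontrib/SaDecodeKernels.h>

// "IVF256,PQ8np", 160-dim: DIM=160, COARSE_SIZE=160, FINE_SIZE=20.
using Decoder = faiss::cppcontrib::Index2LevelDecoder<160, 160, 20>;

void decode_one(
        const float* coarseCentroids, // coarse quantizer centroid table
        const float* fineCentroids,   // PQ centroid table
        const uint8_t* code,          // one encoded vector
        float* out) {                 // receives 160 decoded floats
    Decoder::store(coarseCentroids, fineCentroids, code, out);
}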
data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h (new file)
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <cstdint>
+
+namespace faiss {
+namespace cppcontrib {
+namespace detail {
+
+template <int COARSE_BITS>
+struct CoarseBitType {};
+
+template <>
+struct CoarseBitType<8> {
+    using bit_type = uint8_t;
+};
+
+template <>
+struct CoarseBitType<16> {
+    using bit_type = uint16_t;
+};
+
+} // namespace detail
+} // namespace cppcontrib
+} // namespace faiss
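
A sketch of what this trait does: it maps a compile-time coarse-code bit width to the narrowest unsigned storage type, which the checks below verify at compile time.

#include <cstdint>
#include <type_traits>
#include <faiss/cppcontrib/detail/CoarseBitType.h>

using faiss::cppcontrib::detail::CoarseBitType;

static_assert(
        std::is_same<CoarseBitType<8>::bit_type, uint8_t>::value,
        "8-bit coarse codes are stored as uint8_t");
static_assert(
        std::is_same<CoarseBitType<16>::bit_type, uint16_t>::value,
        "16-bit coarse codes are stored as uint16_t");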
data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h (new file)
@@ -0,0 +1,195 @@
+#pragma once
+
+#include <cstdint>
+
+namespace faiss {
+namespace cppcontrib {
+namespace detail {
+
+namespace {
+
+template <intptr_t N_ELEMENTS, intptr_t CPOS>
+struct Uint8Reader {
+    static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");
+
+    static intptr_t get(const uint8_t* const __restrict codes) {
+        // Read using 4 bytes, if possible.
+        // Reading using 8 bytes takes too many registers for some reason.
+
+        constexpr intptr_t ELEMENT_TO_READ = CPOS / 4;
+        constexpr intptr_t SUB_ELEMENT = CPOS % 4;
+
+        switch (SUB_ELEMENT) {
+            case 0: {
+                if (N_ELEMENTS > CPOS + 3) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4);
+                    return (code32 & 0x000000FF);
+                } else {
+                    return codes[CPOS];
+                }
+            }
+            case 1: {
+                if (N_ELEMENTS > CPOS + 2) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4);
+                    return (code32 & 0x0000FF00) >> 8;
+                } else {
+                    return codes[CPOS];
+                }
+            }
+            case 2: {
+                if (N_ELEMENTS > CPOS + 1) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4);
+                    return (code32 & 0x00FF0000) >> 16;
+                } else {
+                    return codes[CPOS];
+                }
+            }
+            case 3: {
+                if (N_ELEMENTS > CPOS) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4);
+                    return (code32) >> 24;
+                } else {
+                    return codes[CPOS];
+                }
+            }
+        }
+    }
+};
+
+// reduces the number of read operations from RAM
+///////////////////////////////////////////////
+// 76543210 76543210 76543210 76543210 76543210
+// 00000000       00
+//          111111       1111
+//                   2222       222222
+//                            33        33333333
+template <intptr_t N_ELEMENTS, intptr_t CPOS>
+struct Uint10Reader {
+    static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");
+
+    static intptr_t get(const uint8_t* const __restrict codes) {
+        // Read using 4 bytes or 2 bytes.
+
+        constexpr intptr_t ELEMENT_TO_READ = CPOS / 4;
+        constexpr intptr_t SUB_ELEMENT = CPOS % 4;
+
+        switch (SUB_ELEMENT) {
+            case 0: {
+                if (N_ELEMENTS > CPOS + 2) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 5);
+                    return (code32 & 0b0000001111111111);
+                } else {
+                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                            codes + ELEMENT_TO_READ * 5 + 0);
+                    return (code16 & 0b0000001111111111);
+                }
+            }
+            case 1: {
+                if (N_ELEMENTS > CPOS + 1) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 5);
+                    return (code32 & 0b000011111111110000000000) >> 10;
+                } else {
+                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                            codes + ELEMENT_TO_READ * 5 + 1);
+                    return (code16 & 0b0000111111111100) >> 2;
+                }
+            }
+            case 2: {
+                if (N_ELEMENTS > CPOS) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 5);
+                    return (code32 & 0b00111111111100000000000000000000) >> 20;
+                } else {
+                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                            codes + ELEMENT_TO_READ * 5 + 2);
+                    return (code16 & 0b0011111111110000) >> 4;
+                }
+            }
+            case 3: {
+                const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                        codes + ELEMENT_TO_READ * 5 + 3);
+                return (code16 & 0b1111111111000000) >> 6;
+            }
+        }
+    }
+};
+
+// reduces the number of read operations from RAM
+template <intptr_t N_ELEMENTS, intptr_t CPOS>
+struct Uint16Reader {
+    static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");
+
+    static intptr_t get(const uint8_t* const __restrict codes) {
+        // Read using 4 bytes or 2 bytes.
+        // Reading using 8 bytes takes too many registers for some reason.
+
+        constexpr intptr_t ELEMENT_TO_READ = CPOS / 2;
+        constexpr intptr_t SUB_ELEMENT = CPOS % 2;
+
+        switch (SUB_ELEMENT) {
+            case 0: {
+                if (N_ELEMENTS > CPOS + 1) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4);
+                    return (code32 & 0x0000FFFF);
+                } else {
+                    const uint16_t* const __restrict codesFp16 =
+                            reinterpret_cast<const uint16_t*>(codes);
+                    return codesFp16[CPOS];
+                }
+            }
+            case 1: {
+                if (N_ELEMENTS > CPOS) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4);
+                    return code32 >> 16;
+                } else {
+                    const uint16_t* const __restrict codesFp16 =
+                            reinterpret_cast<const uint16_t*>(codes);
+                    return codesFp16[CPOS];
+                }
+            }
+        }
+    }
+};
+
+//
+template <intptr_t N_ELEMENTS, intptr_t CODE_BITS, intptr_t CPOS>
+struct UintReaderImplType {};
+
+template <intptr_t N_ELEMENTS, intptr_t CPOS>
+struct UintReaderImplType<N_ELEMENTS, 8, CPOS> {
+    using reader_type = Uint8Reader<N_ELEMENTS, CPOS>;
+};
+
+template <intptr_t N_ELEMENTS, intptr_t CPOS>
+struct UintReaderImplType<N_ELEMENTS, 10, CPOS> {
+    using reader_type = Uint10Reader<N_ELEMENTS, CPOS>;
+};
+
+template <intptr_t N_ELEMENTS, intptr_t CPOS>
+struct UintReaderImplType<N_ELEMENTS, 16, CPOS> {
+    using reader_type = Uint16Reader<N_ELEMENTS, CPOS>;
+};
+
+} // namespace
+
+// reduces the number of read operations from RAM
+template <intptr_t DIM, intptr_t CODE_SIZE, intptr_t CODE_BITS, intptr_t CPOS>
+using UintReader =
+        typename UintReaderImplType<DIM / CODE_SIZE, CODE_BITS, CPOS>::
+                reader_type;
+
+template <intptr_t N_ELEMENTS, intptr_t CODE_BITS, intptr_t CPOS>
+using UintReaderRaw =
+        typename UintReaderImplType<N_ELEMENTS, CODE_BITS, CPOS>::reader_type;
+
+} // namespace detail
+} // namespace cppcontrib
+} // namespace faiss
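
A usage sketch: reading the third 10-bit code from a buffer of eight packed codes (8 codes x 10 bits = 10 bytes). All template arguments are compile-time constants, so the masks and shifts above resolve statically.

#include <cstdint>
#include <faiss/cppcontrib/detail/UintReader.h>

intptr_t third_of_eight(const uint8_t* codes) {
    // N_ELEMENTS = 8 codes, CODE_BITS = 10, CPOS = 2
    return faiss::cppcontrib::detail::UintReaderRaw<8, 10, 2>::get(codes);
}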