RubyGems - faiss - Versions diffs - 0.2.4 → 0.2.5 - Mend

faiss 0.2.4 → 0.2.5

Files changed (177) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/README.md +23 -21
data/ext/faiss/extconf.rb +11 -0
data/ext/faiss/index.cpp +4 -4
data/ext/faiss/index_binary.cpp +6 -6
data/ext/faiss/product_quantizer.cpp +4 -4
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +13 -0
data/vendor/faiss/faiss/IVFlib.cpp +101 -2
data/vendor/faiss/faiss/IVFlib.h +26 -2
data/vendor/faiss/faiss/Index.cpp +36 -3
data/vendor/faiss/faiss/Index.h +43 -6
data/vendor/faiss/faiss/Index2Layer.cpp +6 -2
data/vendor/faiss/faiss/Index2Layer.h +6 -1
data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +219 -16
data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +63 -5
data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +299 -0
data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +199 -0
data/vendor/faiss/faiss/IndexBinary.cpp +20 -4
data/vendor/faiss/faiss/IndexBinary.h +18 -3
data/vendor/faiss/faiss/IndexBinaryFlat.cpp +9 -2
data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -2
data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -1
data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -1
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +5 -1
data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -1
data/vendor/faiss/faiss/IndexBinaryHash.cpp +17 -4
data/vendor/faiss/faiss/IndexBinaryHash.h +8 -4
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +28 -13
data/vendor/faiss/faiss/IndexBinaryIVF.h +10 -7
data/vendor/faiss/faiss/IndexFastScan.cpp +626 -0
data/vendor/faiss/faiss/IndexFastScan.h +145 -0
data/vendor/faiss/faiss/IndexFlat.cpp +34 -21
data/vendor/faiss/faiss/IndexFlat.h +7 -4
data/vendor/faiss/faiss/IndexFlatCodes.cpp +35 -1
data/vendor/faiss/faiss/IndexFlatCodes.h +12 -0
data/vendor/faiss/faiss/IndexHNSW.cpp +66 -138
data/vendor/faiss/faiss/IndexHNSW.h +4 -2
data/vendor/faiss/faiss/IndexIDMap.cpp +247 -0
data/vendor/faiss/faiss/IndexIDMap.h +107 -0
data/vendor/faiss/faiss/IndexIVF.cpp +121 -33
data/vendor/faiss/faiss/IndexIVF.h +35 -16
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +84 -7
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +63 -1
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +590 -0
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +171 -0
data/vendor/faiss/faiss/IndexIVFFastScan.cpp +1290 -0
data/vendor/faiss/faiss/IndexIVFFastScan.h +213 -0
data/vendor/faiss/faiss/IndexIVFFlat.cpp +37 -17
data/vendor/faiss/faiss/IndexIVFFlat.h +4 -2
data/vendor/faiss/faiss/IndexIVFPQ.cpp +234 -50
data/vendor/faiss/faiss/IndexIVFPQ.h +5 -1
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +23 -852
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +7 -112
data/vendor/faiss/faiss/IndexIVFPQR.cpp +3 -3
data/vendor/faiss/faiss/IndexIVFPQR.h +1 -1
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +3 -1
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -1
data/vendor/faiss/faiss/IndexLSH.cpp +4 -2
data/vendor/faiss/faiss/IndexLSH.h +2 -1
data/vendor/faiss/faiss/IndexLattice.cpp +7 -1
data/vendor/faiss/faiss/IndexLattice.h +3 -1
data/vendor/faiss/faiss/IndexNNDescent.cpp +4 -3
data/vendor/faiss/faiss/IndexNNDescent.h +2 -1
data/vendor/faiss/faiss/IndexNSG.cpp +37 -3
data/vendor/faiss/faiss/IndexNSG.h +25 -1
data/vendor/faiss/faiss/IndexPQ.cpp +106 -69
data/vendor/faiss/faiss/IndexPQ.h +19 -5
data/vendor/faiss/faiss/IndexPQFastScan.cpp +15 -450
data/vendor/faiss/faiss/IndexPQFastScan.h +15 -78
data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -8
data/vendor/faiss/faiss/IndexPreTransform.h +15 -3
data/vendor/faiss/faiss/IndexRefine.cpp +8 -4
data/vendor/faiss/faiss/IndexRefine.h +4 -2
data/vendor/faiss/faiss/IndexReplicas.cpp +4 -2
data/vendor/faiss/faiss/IndexReplicas.h +2 -1
data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +438 -0
data/vendor/faiss/faiss/IndexRowwiseMinMax.h +92 -0
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +26 -15
data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -7
data/vendor/faiss/faiss/IndexShards.cpp +4 -1
data/vendor/faiss/faiss/IndexShards.h +2 -1
data/vendor/faiss/faiss/MetaIndexes.cpp +5 -178
data/vendor/faiss/faiss/MetaIndexes.h +3 -81
data/vendor/faiss/faiss/VectorTransform.cpp +43 -0
data/vendor/faiss/faiss/VectorTransform.h +22 -4
data/vendor/faiss/faiss/clone_index.cpp +23 -1
data/vendor/faiss/faiss/clone_index.h +3 -0
data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +300 -0
data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +24 -0
data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +195 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2058 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +408 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2147 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +460 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +465 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +1618 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +251 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +1452 -0
data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +1 -0
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +0 -4
data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -1
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +10 -8
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +75 -14
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +19 -32
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -31
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +22 -28
data/vendor/faiss/faiss/gpu/GpuResources.cpp +14 -0
data/vendor/faiss/faiss/gpu/GpuResources.h +16 -3
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +3 -3
data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +32 -0
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -0
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +311 -75
data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +10 -0
data/vendor/faiss/faiss/gpu/test/TestUtils.h +3 -0
data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +5 -4
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +116 -47
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +44 -13
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +0 -54
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -76
data/vendor/faiss/faiss/impl/DistanceComputer.h +64 -0
data/vendor/faiss/faiss/impl/HNSW.cpp +123 -27
data/vendor/faiss/faiss/impl/HNSW.h +19 -16
data/vendor/faiss/faiss/impl/IDSelector.cpp +125 -0
data/vendor/faiss/faiss/impl/IDSelector.h +135 -0
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +6 -28
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +6 -1
data/vendor/faiss/faiss/impl/LookupTableScaler.h +77 -0
data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -0
data/vendor/faiss/faiss/impl/NSG.cpp +1 -1
data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +383 -0
data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +154 -0
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +225 -145
data/vendor/faiss/faiss/impl/ProductQuantizer.h +29 -10
data/vendor/faiss/faiss/impl/Quantizer.h +43 -0
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +192 -36
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +40 -20
data/vendor/faiss/faiss/impl/ResultHandler.h +96 -0
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +97 -173
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +18 -18
data/vendor/faiss/faiss/impl/index_read.cpp +240 -9
data/vendor/faiss/faiss/impl/index_write.cpp +237 -5
data/vendor/faiss/faiss/impl/kmeans1d.cpp +6 -4
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +56 -16
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +25 -8
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +66 -25
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +75 -27
data/vendor/faiss/faiss/index_factory.cpp +196 -7
data/vendor/faiss/faiss/index_io.h +5 -0
data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -0
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +4 -1
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -1
data/vendor/faiss/faiss/python/python_callbacks.cpp +27 -0
data/vendor/faiss/faiss/python/python_callbacks.h +15 -0
data/vendor/faiss/faiss/utils/Heap.h +31 -15
data/vendor/faiss/faiss/utils/distances.cpp +380 -56
data/vendor/faiss/faiss/utils/distances.h +113 -15
data/vendor/faiss/faiss/utils/distances_simd.cpp +726 -6
data/vendor/faiss/faiss/utils/extra_distances.cpp +12 -7
data/vendor/faiss/faiss/utils/extra_distances.h +3 -1
data/vendor/faiss/faiss/utils/fp16-fp16c.h +21 -0
data/vendor/faiss/faiss/utils/fp16-inl.h +101 -0
data/vendor/faiss/faiss/utils/fp16.h +11 -0
data/vendor/faiss/faiss/utils/hamming-inl.h +54 -0
data/vendor/faiss/faiss/utils/hamming.cpp +0 -48
data/vendor/faiss/faiss/utils/ordered_key_value.h +10 -0
data/vendor/faiss/faiss/utils/quantize_lut.cpp +62 -0
data/vendor/faiss/faiss/utils/quantize_lut.h +20 -0
data/vendor/faiss/faiss/utils/random.cpp +53 -0
data/vendor/faiss/faiss/utils/random.h +5 -0
data/vendor/faiss/faiss/utils/simdlib_avx2.h +4 -0
data/vendor/faiss/faiss/utils/simdlib_emulated.h +6 -1
data/vendor/faiss/faiss/utils/simdlib_neon.h +7 -2
metadata +37 -3

data/vendor/faiss/faiss/VectorTransform.h CHANGED Viewed

@@ -43,19 +43,27 @@ struct VectorTransform {
      */
     virtual void train(idx_t n, const float* x);
-    /** apply the random rotation, return new allocated matrix
-     * @param     x size n * d_in
-     * @return    size n * d_out
+    /** apply the transformation and return the result in an allocated pointer
+     * @param     n number of vectors to transform
+     * @param     x input vectors, size n * d_in
+     * @return    output vectors, size n * d_out
      */
     float* apply(idx_t n, const float* x) const;
-    /// same as apply, but result is pre-allocated
+    /** apply the transformation and return the result in a provided matrix
+     * @param     n number of vectors to transform
+     * @param     x input vectors, size n * d_in
+     * @param    xt output vectors, size n * d_out
+     */
     virtual void apply_noalloc(idx_t n, const float* x, float* xt) const = 0;
     /// reverse transformation. May not be implemented or may return
     /// approximate result
     virtual void reverse_transform(idx_t n, const float* xt, float* x) const;
+    // check that the two transforms are identical (to merge indexes)
+    virtual void check_identical(const VectorTransform& other) const = 0;
     virtual ~VectorTransform() {}
 };
@@ -100,6 +108,8 @@ struct LinearTransform : VectorTransform {
             int n,
             int d) const;
+    void check_identical(const VectorTransform& other) const override;
     ~LinearTransform() override {}
 };
@@ -207,6 +217,8 @@ struct ITQTransform : VectorTransform {
     void train(idx_t n, const float* x) override;
     void apply_noalloc(idx_t n, const float* x, float* xt) const override;
+    void check_identical(const VectorTransform& other) const override;
 };
 struct ProductQuantizer;
@@ -260,6 +272,8 @@ struct RemapDimensionsTransform : VectorTransform {
     void reverse_transform(idx_t n, const float* xt, float* x) const override;
     RemapDimensionsTransform() {}
+    void check_identical(const VectorTransform& other) const override;
 };
 /** per-vector normalization */
@@ -273,6 +287,8 @@ struct NormalizationTransform : VectorTransform {
     /// Identity transform since norm is not revertible
     void reverse_transform(idx_t n, const float* xt, float* x) const override;
+    void check_identical(const VectorTransform& other) const override;
 };
 /** Subtract the mean of each component from the vectors. */
@@ -290,6 +306,8 @@ struct CenteringTransform : VectorTransform {
     /// add the mean
     void reverse_transform(idx_t n, const float* xt, float* x) const override;
+    void check_identical(const VectorTransform& other) const override;
 };
 } // namespace faiss

data/vendor/faiss/faiss/clone_index.cpp CHANGED Viewed

@@ -32,6 +32,11 @@
 #include <faiss/MetaIndexes.h>
 #include <faiss/VectorTransform.h>
+#include <faiss/impl/LocalSearchQuantizer.h>
+#include <faiss/impl/ProductQuantizer.h>
+#include <faiss/impl/ResidualQuantizer.h>
+#include <faiss/impl/ScalarQuantizer.h>
 namespace faiss {
 /*************************************************************
@@ -117,7 +122,9 @@ Index* Cloner::clone_Index(const Index* index) {
         return res;
     } else if (
             const IndexIDMap* idmap = dynamic_cast<const IndexIDMap*>(index)) {
-        IndexIDMap* res = new IndexIDMap(*idmap);
+        const IndexIDMap2* idmap2 = dynamic_cast<const IndexIDMap2*>(index);
+        IndexIDMap* res =
+                idmap2 ? new IndexIDMap2(*idmap2) : new IndexIDMap(*idmap);
         res->own_fields = true;
         res->index = clone_Index(idmap->index);
         return res;
@@ -137,6 +144,13 @@ Index* Cloner::clone_Index(const Index* index) {
         res->own_fields = true;
         res->storage = clone_Index(insg->storage);
         return res;
+    } else if (
+            const IndexNNDescent* innd =
+                    dynamic_cast<const IndexNNDescent*>(index)) {
+        IndexNNDescent* res = new IndexNNDescent(*innd);
+        res->own_fields = true;
+        res->storage = clone_Index(innd->storage);
+        return res;
     } else if (
             const Index2Layer* i2l = dynamic_cast<const Index2Layer*>(index)) {
         Index2Layer* res = new Index2Layer(*i2l);
@@ -149,4 +163,12 @@ Index* Cloner::clone_Index(const Index* index) {
     return nullptr;
 }
+Quantizer* clone_Quantizer(const Quantizer* quant) { // clone `quant` by probing the concrete quantizer types listed below, in order
+    TRYCLONE(ResidualQuantizer, quant) // NOTE(review): TRYCLONE is defined outside this hunk; presumably it returns a copy when `quant` has this dynamic type — confirm
+    TRYCLONE(LocalSearchQuantizer, quant)
+    TRYCLONE(ProductQuantizer, quant)
+    TRYCLONE(ScalarQuantizer, quant)
+    FAISS_THROW_MSG("Did not recognize quantizer to clone"); // presumably reached only when no TRYCLONE above matched
+}
 } // namespace faiss

data/vendor/faiss/faiss/clone_index.h CHANGED Viewed

@@ -16,6 +16,7 @@ namespace faiss {
 struct Index;
 struct IndexIVF;
 struct VectorTransform;
+struct Quantizer;
 /* cloning functions */
 Index* clone_index(const Index*);
@@ -30,4 +31,6 @@ struct Cloner {
     virtual ~Cloner() {}
 };
+Quantizer* clone_Quantizer(const Quantizer* quant);
 } // namespace faiss

data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h ADDED Viewed

@@ -0,0 +1,300 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+#pragma once
+// This file contains a custom fast implementation of faiss::Index::sa_decode()
+//   function for the following index families:
+//   * IVF256,PQ[1]x8np
+//   * Residual[1]x8,PQ[2]x8
+//   * IVF[2^9-2^16 bit],PQ[1]x8 (such as IVF1024,PQ16np)
+//   * Residual1x[9-16 bit],PQ[1]x8 (such as Residual1x9,PQ8)
+//   * PQ[1]x8
+// Additionally, AVX2 and ARM versions support
+//   * Residual[1]x8,PQ[2]x10
+//   * Residual[1]x8,PQ[2]x16
+//   * Residual[1]x10,PQ[2]x10
+//   * Residual[1]x10,PQ[2]x16
+//   * Residual[1]x16,PQ[2]x10
+//   * Residual[1]x16,PQ[2]x16
+//   * Residual1x[9-16 bit],PQ[1]x10 (such as Residual1x9,PQ16x10)
+//   * * (use with COARSE_BITS=16)
+//   * Residual1x[9-16 bit],PQ[1]x16 (such as Residual1x9,PQ16x16)
+//   * * (use with COARSE_BITS=16)
+//   * PQ[1]x10
+//   * PQ[1]x16
+// Unfortunately, currently Faiss does not support something like
+//   IVF256,PQ16x10np
+//
+// The goal was to achieve maximum performance, hence the template-based
+// implementation. The provided index families share the same code for sa_decode.
+//
+// The front-end code provides two high-level structures.
+//
+// First one:
+//   {
+//     template <
+//        intptr_t DIM,
+//        intptr_t COARSE_SIZE,
+//        intptr_t FINE_SIZE,
+//        intptr_t COARSE_BITS = 8,
+//        intptr_t FINE_BITS = 8>
+//     struct Index2LevelDecoder { /*...*/ };
+//   }
+// * DIM is the dimensionality of data
+// * COARSE_SIZE is the dimensionality of the coarse quantizer (IVF, Residual)
+// * FINE_SIZE is the dimensionality of the ProductQuantizer dsq
+// * COARSE_BITS is the number of bits that are needed to represent a coarse
+//   quantizer code.
+// * FINE_BITS is the number of bits that are needed to represent a fine
+//   quantizer code.
+// For example, "IVF256,PQ8np" for 160-dim data translates into
+//   Index2LevelDecoder<160,160,20,8>
+// For example, "Residual4x8,PQ16" for 256-dim data translates into
+//   Index2LevelDecoder<256,64,16,8>
+// For example, "IVF1024,PQ16np" for 256-dim data translates into
+//   Index2LevelDecoder<256,256,16,10>. But as there is only 1 coarse code
+//   element, Index2LevelDecoder<256,256,16,16> can be used as a faster
+//   decoder.
+// For example, "Residual4x10,PQ16x10np" for 256-dim data translates into
+//   Index2LevelDecoder<256,64,16,10,10>
+//
+// Additional supported values for COARSE_BITS and FINE_BITS may be added later.
+//
+// Second one:
+//   {
+//     template <
+//        intptr_t DIM,
+//        intptr_t FINE_SIZE,
+//        intptr_t FINE_BITS = 8>
+//     struct IndexPQDecoder { /*...*/ };
+//   }
+// * DIM is the dimensionality of data
+// * FINE_SIZE is the dimensionality of the ProductQuantizer dsq
+// * FINE_BITS is the number of bits that are needed to represent a fine
+//   quantizer code.
+// For example, "PQ8np" for 160-dim data translates into
+//   IndexPQDecoder<160,20>
+//
+// Unlike the general purpose version in faiss::Index::sa_decode(),
+//   this version provides the following functions (please note that
+//   pqCoarseCentroids params are not available for IndexPQDecoder,
+//   but the functionality is the same as for Index2LevelDecoder):
+//
+// * ::store(), which is similar to sa_decode(1, input, output),
+//   The method signature is the following:
+//   {
+//     void store(
+//       const float* const __restrict pqCoarseCentroids,
+//       const float* const __restrict pqFineCentroids,
+//       const uint8_t* const __restrict code,
+//       float* const __restrict outputStore);
+//   }
+//
+// * ::accum(), which is used to create a linear combination
+//   of decoded vectors:
+//   {
+//     const faiss::Index* const index;
+//     const uint8_t* const input;
+//     float weight;
+//
+//     std::vector<float> buffer(d, 0);
+//
+//     index->sa_decode(1, input, buffer.data());
+//     for (size_t iDim = 0; iDim < d; iDim++)
+//       output[iDim] += weight * buffer[iDim];
+//   }
+//   The method signature is the following:
+//   {
+//    static void accum(
+//      const float* const __restrict pqCoarseCentroids,
+//      const float* const __restrict pqFineCentroids,
+//      const uint8_t* const __restrict code,
+//      const float weight,
+//      float* const __restrict outputAccum);
+//   }
+//
+// * There is an additional overload for ::accum() that decodes two vectors
+//   per call. This provides an additional speedup thanks to the CPU's
+//   superscalar architecture:
+//   {
+//     const faiss::Index* const index;
+//     const uint8_t* const input0;
+//     float weight0;
+//     const uint8_t* const input1;
+//     float weight1;
+//
+//     std::vector<float> buffer(d, 0);
+//
+//     index->sa_decode(1, input0, buffer.data());
+//     for (size_t iDim = 0; iDim < d; iDim++)
+//       output[iDim] += weight0 * buffer[iDim];
+//
+//     index->sa_decode(1, input1, buffer.data());
+//     for (size_t iDim = 0; iDim < d; iDim++)
+//       output[iDim] += weight1 * buffer[iDim];
+//   }
+//   If each code uses its own coarse quantizer centroids table and its own fine
+//   quantizer centroids table, then the following overload can be used:
+//   {
+//    static void accum(
+//      const float* const __restrict pqCoarseCentroids0,
+//      const float* const __restrict pqFineCentroids0,
+//      const uint8_t* const __restrict code0,
+//      const float weight0,
+//      const float* const __restrict pqCoarseCentroids1,
+//      const float* const __restrict pqFineCentroids1,
+//      const uint8_t* const __restrict code1,
+//      const float weight1,
+//      float* const __restrict outputAccum);
+//   }
+//   If codes share the coarse quantizer centroids table and also share
+//   the fine quantizer centroids table, then the following overload can be
+//   used:
+//   {
+//    static void accum(
+//      const float* const __restrict pqCoarseCentroids,
+//      const float* const __restrict pqFineCentroids,
+//      const uint8_t* const __restrict code0,
+//      const float weight0,
+//      const uint8_t* const __restrict code1,
+//      const float weight1,
+//      float* const __restrict outputAccum);
+//   }
+//
+// * And one more overload for ::accum() that decodes and accumulates
+//   three vectors per call.
+//   {
+//     const faiss::Index* const index;
+//     const uint8_t* const input0;
+//     float weight0;
+//     const uint8_t* const input1;
+//     float weight1;
+//     const uint8_t* const input2;
+//     float weight2;
+//
+//     std::vector<float> buffer(d, 0);
+//
+//     index->sa_decode(1, input0, buffer.data());
+//     for (size_t iDim = 0; iDim < d; iDim++)
+//       output[iDim] += weight0 * buffer[iDim];
+//
+//     index->sa_decode(1, input1, buffer.data());
+//     for (size_t iDim = 0; iDim < d; iDim++)
+//       output[iDim] += weight1 * buffer[iDim];
+//
+//     index->sa_decode(1, input2, buffer.data());
+//     for (size_t iDim = 0; iDim < d; iDim++)
+//       output[iDim] += weight2 * buffer[iDim];
+//   }
+//
+//   If each code uses its own coarse quantizer centroids table and its own fine
+//   quantizer centroids table, then the following overload can be used:
+//   {
+//    static void accum(
+//      const float* const __restrict pqCoarseCentroids0,
+//      const float* const __restrict pqFineCentroids0,
+//      const uint8_t* const __restrict code0,
+//      const float weight0,
+//      const float* const __restrict pqCoarseCentroids1,
+//      const float* const __restrict pqFineCentroids1,
+//      const uint8_t* const __restrict code1,
+//      const float weight1,
+//      const float* const __restrict pqCoarseCentroids2,
+//      const float* const __restrict pqFineCentroids2,
+//      const uint8_t* const __restrict code2,
+//      const float weight2,
+//      float* const __restrict outputAccum);
+//   }
+//   If codes share the coarse quantizer centroids table and also share
+//   the fine quantizer centroids table, then the following overload can be
+//   used:
+//   {
+//    static void accum(
+//      const float* const __restrict pqCoarseCentroids,
+//      const float* const __restrict pqFineCentroids,
+//      const uint8_t* const __restrict code0,
+//      const float weight0,
+//      const uint8_t* const __restrict code1,
+//      const float weight1,
+//      const uint8_t* const __restrict code2,
+//      const float weight2,
+//      float* const __restrict outputAccum);
+//   }
+//
+// The provided version is not multithreaded.
+//
+// Currently, an AVX2+FMA implementation is available. AVX512 version is also
+//   doable, but it was found to be slower than AVX2 for real world applications
+//   that I needed.
+//
+////////////////////////////////////////////////////////////////////////////////////
+//
+// It is possible to use an additional index wrapper on top of IVFPQ /
+// Residual+PQ, known as IndexRowwiseMinMax / IndexRowwiseMinMaxFP16. Index
+// wrapper that performs rowwise normalization to [0,1], preserving the
+// coefficients, is placed on top. This is a vector codec index only.
+// For more details please refer to the description in
+// faiss/IndexRowwiseMinMax.h file.
+//
+// If such a wrapper is used, then the quantizer will look like, say,
+//    MinMaxFP16,IVF256,PQ32np
+//  or
+//    MinMax,PQ16np
+// In this case, please use the following construction for the decoding,
+// basically, wrapping a kernel in a kernel:
+//   {
+//      using SubT = faiss::cppcontrib::Index2LevelDecoder<128, 128, 2>;
+//      using T = faiss::cppcontrib::IndexMinMaxFP16Decoder<SubT>;
+//      // do T::store(...) or T::accum(...)
+//   }
+//
+// T::accum(...) takes an additional function parameter (outputAccumMin below)
+// that accumulates an offset to be added to every output component once at the end. Thus, the code pattern is the following:
+//   {
+//     const float* const __restrict pqCoarseCentroidsQ;
+//     const float* const __restrict pqFineCentroidsQ;
+//     const uint8_t* const __restrict input;
+//     const float* const __restrict weights;
+//     float* const __restrict output;
+//     float outputAccumMin = 0;
+//
+//     for (size_t i = 0; i < n; i++) {
+//         T::accum(
+//                 pqCoarseCentroidsQ,
+//                 pqFineCentroidsQ,
+//                 input + i * code_size,
+//                 weights[i],
+//                 output,
+//                 outputAccumMin);
+//     }
+//     for (size_t j = 0; j < d; j++)
+//         output[j] += outputAccumMin;
+//   }
+// This is similar to the following regular pseudo-code:
+//   {
+//     const faiss::Index* const index;
+//     const uint8_t* const __restrict input;
+//     const float* const __restrict weights;
+//     float* const __restrict output;
+//
+//     for (size_t i = 0; i < n; i++) {
+//       std::vector<float> buffer(d, 0);
+//
+//       index->sa_decode(1, input + i * code_size, buffer.data());
+//       for (size_t j = 0; j < d; j++)
+//         output[j] += weights[i] * buffer[j];
+//     }
+//   }
+#include <faiss/cppcontrib/sa_decode/MinMax-inl.h>
+#include <faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h>
+#ifdef __AVX2__
+#include <faiss/cppcontrib/sa_decode/Level2-avx2-inl.h>
+#include <faiss/cppcontrib/sa_decode/PQ-avx2-inl.h>
+#elif defined(__ARM_NEON)
+#include <faiss/cppcontrib/sa_decode/Level2-neon-inl.h>
+#include <faiss/cppcontrib/sa_decode/PQ-neon-inl.h>
+#else
+#include <faiss/cppcontrib/sa_decode/Level2-inl.h>
+#include <faiss/cppcontrib/sa_decode/PQ-inl.h>
+#endif

data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h ADDED Viewed

@@ -0,0 +1,24 @@
+#pragma once
+#include <cstdint>
+namespace faiss {
+namespace cppcontrib {
+namespace detail {
+template <int COARSE_BITS> // trait mapping a coarse-code bit width to an unsigned integer type
+struct CoarseBitType {}; // primary template has no bit_type, so only the widths specialized below can be used
+template <>
+struct CoarseBitType<8> {
+    using bit_type = uint8_t; // 8-bit width -> uint8_t
+};
+template <>
+struct CoarseBitType<16> {
+    using bit_type = uint16_t; // 16-bit width -> uint16_t
+};
+} // namespace detail
+} // namespace cppcontrib
+} // namespace faiss

data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h ADDED Viewed

@@ -0,0 +1,195 @@
+#pragma once
+#include <cstdint>
+namespace faiss {
+namespace cppcontrib {
+namespace detail {
+namespace {
+template <intptr_t N_ELEMENTS, intptr_t CPOS>
+struct Uint8Reader { // compile-time accessor for the CPOS-th 8-bit code in a packed array of N_ELEMENTS codes
+    static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");
+    static intptr_t get(const uint8_t* const __restrict codes) { // returns the code at index CPOS, widened to intptr_t
+        // Read the 4-code group containing CPOS with one 32-bit load when the whole group lies inside N_ELEMENTS; otherwise read the single byte.
+        // Reading 8 bytes at a time was found to take too many registers, for reasons that are unclear.
+        constexpr intptr_t ELEMENT_TO_READ = CPOS / 4; // index of the 4-byte group that holds CPOS
+        constexpr intptr_t SUB_ELEMENT = CPOS % 4; // position of CPOS inside that group
+        switch (SUB_ELEMENT) {
+            case 0: {
+                if (N_ELEMENTS > CPOS + 3) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4); // NOTE(review): nothing here shows this address is 4-byte aligned; the load relies on the target tolerating unaligned access
+                    return (code32 & 0x000000FF); // lowest byte of the word; equals codes[CPOS] only with little-endian byte order
+                } else {
+                    return codes[CPOS];
+                }
+            }
+            case 1: {
+                if (N_ELEMENTS > CPOS + 2) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4);
+                    return (code32 & 0x0000FF00) >> 8; // second byte of the word (little-endian)
+                } else {
+                    return codes[CPOS];
+                }
+            }
+            case 2: {
+                if (N_ELEMENTS > CPOS + 1) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4);
+                    return (code32 & 0x00FF0000) >> 16; // third byte of the word (little-endian)
+                } else {
+                    return codes[CPOS];
+                }
+            }
+            case 3: {
+                if (N_ELEMENTS > CPOS) { // always true because of the static_assert above, so the else branch is never taken
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4);
+                    return (code32) >> 24; // top byte of the word (little-endian)
+                } else {
+                    return codes[CPOS];
+                }
+            }
+        } // no return follows the switch: for a non-negative CPOS, CPOS % 4 is always one of the four cases above
+    }
+};
+// Reads the CPOS-th 10-bit code from a packed stream (four codes per 5 bytes, laid out as below); uses wide loads, which reduces the number of read operations from RAM
+///////////////////////////////////////////////
+// 76543210 76543210 76543210 76543210 76543210
+// 00000000 00
+//            111111 1111
+//                       2222 222222
+//                                  33 33333333
+template <intptr_t N_ELEMENTS, intptr_t CPOS>
+struct Uint10Reader {
+    static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");
+    static intptr_t get(const uint8_t* const __restrict codes) { // returns the 10-bit code at index CPOS as an intptr_t in [0, 1023]
+        // Read using a 4-byte load when enough codes follow in the same group, otherwise a 2-byte load (presumably to keep the load inside the code buffer — confirm).
+        constexpr intptr_t ELEMENT_TO_READ = CPOS / 4; // index of the 5-byte group (4 codes) that holds CPOS
+        constexpr intptr_t SUB_ELEMENT = CPOS % 4; // position of CPOS inside that group
+        switch (SUB_ELEMENT) {
+            case 0: {
+                if (N_ELEMENTS > CPOS + 2) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 5); // NOTE(review): group offsets are multiples of 5 bytes, so this load is generally unaligned
+                    return (code32 & 0b0000001111111111); // bits 0-9 of the group (little-endian word)
+                } else {
+                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                            codes + ELEMENT_TO_READ * 5 + 0);
+                    return (code16 & 0b0000001111111111); // same bits 0-9, taken from the first two bytes only
+                }
+            }
+            case 1: {
+                if (N_ELEMENTS > CPOS + 1) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 5);
+                    return (code32 & 0b000011111111110000000000) >> 10; // bits 10-19 of the group
+                } else {
+                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                            codes + ELEMENT_TO_READ * 5 + 1);
+                    return (code16 & 0b0000111111111100) >> 2; // bits 10-19 of the group are bits 2-11 of the 16-bit word starting at byte 1
+                }
+            }
+            case 2: {
+                if (N_ELEMENTS > CPOS) { // always true because of the static_assert above, so the else branch is never taken
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 5);
+                    return (code32 & 0b00111111111100000000000000000000) >> 20; // bits 20-29 of the group
+                } else {
+                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                            codes + ELEMENT_TO_READ * 5 + 2);
+                    return (code16 & 0b0011111111110000) >> 4; // bits 20-29 of the group are bits 4-13 of the 16-bit word starting at byte 2
+                }
+            }
+            case 3: {
+                const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                        codes + ELEMENT_TO_READ * 5 + 3); // last two bytes of the 5-byte group
+                return (code16 & 0b1111111111000000) >> 6; // bits 30-39 of the group are bits 6-15 of the 16-bit word starting at byte 3
+            }
+        } // no return follows the switch: for a non-negative CPOS, CPOS % 4 is always one of the four cases above
+    }
+};
+// Reads the CPOS-th 16-bit code from a packed array of N_ELEMENTS codes; loads two codes at once where possible, which reduces the number of read operations from RAM
+template <intptr_t N_ELEMENTS, intptr_t CPOS>
+struct Uint16Reader {
+    static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");
+    static intptr_t get(const uint8_t* const __restrict codes) { // returns the 16-bit code at index CPOS, widened to intptr_t
+        // Read the pair of codes containing CPOS with one 4-byte load when the whole pair lies inside N_ELEMENTS, otherwise read 2 bytes.
+        // Reading 8 bytes at a time was found to take too many registers, for reasons that are unclear.
+        constexpr intptr_t ELEMENT_TO_READ = CPOS / 2; // index of the 4-byte pair that holds CPOS
+        constexpr intptr_t SUB_ELEMENT = CPOS % 2; // 0 = first code of the pair, 1 = second
+        switch (SUB_ELEMENT) {
+            case 0: {
+                if (N_ELEMENTS > CPOS + 1) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4); // NOTE(review): nothing here shows `codes` is 4-byte aligned
+                    return (code32 & 0x0000FFFF); // low half of the word; the first code of the pair only with little-endian byte order
+                } else {
+                    const uint16_t* const __restrict codesFp16 =
+                            reinterpret_cast<const uint16_t*>(codes); // despite the "Fp16" name, the values are returned as plain integer codes
+                    return codesFp16[CPOS];
+                }
+            }
+            case 1: {
+                if (N_ELEMENTS > CPOS) { // always true because of the static_assert above, so the else branch is never taken
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4);
+                    return code32 >> 16; // high half of the word (little-endian: second code of the pair)
+                } else {
+                    const uint16_t* const __restrict codesFp16 =
+                            reinterpret_cast<const uint16_t*>(codes);
+                    return codesFp16[CPOS];
+                }
+            }
+        } // no return follows the switch: for a non-negative CPOS, CPOS % 2 is always 0 or 1
+    }
+};
+// Selects the reader struct that matches CODE_BITS (8, 10 or 16); exposed as the nested alias reader_type.
+template <intptr_t N_ELEMENTS, intptr_t CODE_BITS, intptr_t CPOS>
+struct UintReaderImplType {}; // primary template has no reader_type, so other bit widths cannot be used
+template <intptr_t N_ELEMENTS, intptr_t CPOS>
+struct UintReaderImplType<N_ELEMENTS, 8, CPOS> {
+    using reader_type = Uint8Reader<N_ELEMENTS, CPOS>; // 8-bit codes
+};
+template <intptr_t N_ELEMENTS, intptr_t CPOS>
+struct UintReaderImplType<N_ELEMENTS, 10, CPOS> {
+    using reader_type = Uint10Reader<N_ELEMENTS, CPOS>; // 10-bit packed codes
+};
+template <intptr_t N_ELEMENTS, intptr_t CPOS>
+struct UintReaderImplType<N_ELEMENTS, 16, CPOS> {
+    using reader_type = Uint16Reader<N_ELEMENTS, CPOS>; // 16-bit codes
+};
+} // namespace
+// Reader for the CPOS-th code of CODE_BITS bits, with the number of codes computed as DIM / CODE_SIZE; reduces the number of read operations from RAM
+template <intptr_t DIM, intptr_t CODE_SIZE, intptr_t CODE_BITS, intptr_t CPOS>
+using UintReader =
+        typename UintReaderImplType<DIM / CODE_SIZE, CODE_BITS, CPOS>::
+                reader_type;
+template <intptr_t N_ELEMENTS, intptr_t CODE_BITS, intptr_t CPOS> // same selection, but the number of codes is passed directly as N_ELEMENTS
+using UintReaderRaw =
+        typename UintReaderImplType<N_ELEMENTS, CODE_BITS, CPOS>::reader_type;
+} // namespace detail
+} // namespace cppcontrib
+} // namespace faiss