faiss 0.2.6 → 0.2.7
This diff shows the content changes between publicly released versions of the package, as they appear in its public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/faiss/extconf.rb +1 -1
- data/lib/faiss/version.rb +1 -1
- data/lib/faiss.rb +2 -2
- data/vendor/faiss/faiss/AutoTune.cpp +15 -4
- data/vendor/faiss/faiss/AutoTune.h +0 -1
- data/vendor/faiss/faiss/Clustering.cpp +1 -5
- data/vendor/faiss/faiss/Clustering.h +0 -2
- data/vendor/faiss/faiss/IVFlib.h +0 -2
- data/vendor/faiss/faiss/Index.h +1 -2
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +17 -3
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +10 -1
- data/vendor/faiss/faiss/IndexBinary.h +0 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +2 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -0
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +1 -3
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +273 -48
- data/vendor/faiss/faiss/IndexBinaryIVF.h +18 -11
- data/vendor/faiss/faiss/IndexFastScan.cpp +13 -10
- data/vendor/faiss/faiss/IndexFastScan.h +5 -1
- data/vendor/faiss/faiss/IndexFlat.cpp +16 -3
- data/vendor/faiss/faiss/IndexFlat.h +1 -1
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +5 -0
- data/vendor/faiss/faiss/IndexFlatCodes.h +7 -2
- data/vendor/faiss/faiss/IndexHNSW.cpp +3 -6
- data/vendor/faiss/faiss/IndexHNSW.h +0 -1
- data/vendor/faiss/faiss/IndexIDMap.cpp +4 -4
- data/vendor/faiss/faiss/IndexIDMap.h +0 -2
- data/vendor/faiss/faiss/IndexIVF.cpp +155 -129
- data/vendor/faiss/faiss/IndexIVF.h +121 -61
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +12 -11
- data/vendor/faiss/faiss/IndexIVFFastScan.h +6 -1
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +221 -165
- data/vendor/faiss/faiss/IndexIVFPQ.h +1 -0
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +6 -1
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +0 -2
- data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -2
- data/vendor/faiss/faiss/IndexNNDescent.h +0 -1
- data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
- data/vendor/faiss/faiss/IndexPQ.cpp +7 -9
- data/vendor/faiss/faiss/IndexRefine.cpp +1 -1
- data/vendor/faiss/faiss/IndexReplicas.cpp +3 -4
- data/vendor/faiss/faiss/IndexReplicas.h +0 -1
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +8 -1
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +7 -0
- data/vendor/faiss/faiss/IndexShards.cpp +26 -109
- data/vendor/faiss/faiss/IndexShards.h +2 -3
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +246 -0
- data/vendor/faiss/faiss/IndexShardsIVF.h +42 -0
- data/vendor/faiss/faiss/MetaIndexes.cpp +86 -0
- data/vendor/faiss/faiss/MetaIndexes.h +29 -0
- data/vendor/faiss/faiss/MetricType.h +14 -0
- data/vendor/faiss/faiss/VectorTransform.cpp +8 -10
- data/vendor/faiss/faiss/VectorTransform.h +1 -3
- data/vendor/faiss/faiss/clone_index.cpp +232 -18
- data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +25 -3
- data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +7 -0
- data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +78 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +20 -6
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +7 -1
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +21 -7
- data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +7 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +7 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +10 -3
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +7 -1
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +11 -3
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +25 -2
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +76 -29
- data/vendor/faiss/faiss/gpu/GpuCloner.h +2 -2
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +14 -13
- data/vendor/faiss/faiss/gpu/GpuDistance.h +18 -6
- data/vendor/faiss/faiss/gpu/GpuIndex.h +23 -21
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +10 -10
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -12
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +29 -50
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +3 -3
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +8 -8
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +4 -4
- data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +2 -5
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +9 -7
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +4 -4
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +2 -2
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +1 -1
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +55 -6
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +20 -6
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +95 -25
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +67 -16
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +4 -4
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +7 -7
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +4 -4
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +0 -7
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +9 -9
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +1 -1
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +2 -7
- data/vendor/faiss/faiss/impl/CodePacker.cpp +67 -0
- data/vendor/faiss/faiss/impl/CodePacker.h +71 -0
- data/vendor/faiss/faiss/impl/DistanceComputer.h +0 -2
- data/vendor/faiss/faiss/impl/HNSW.cpp +3 -7
- data/vendor/faiss/faiss/impl/HNSW.h +6 -9
- data/vendor/faiss/faiss/impl/IDSelector.cpp +1 -1
- data/vendor/faiss/faiss/impl/IDSelector.h +39 -1
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +62 -51
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +11 -12
- data/vendor/faiss/faiss/impl/NNDescent.cpp +3 -9
- data/vendor/faiss/faiss/impl/NNDescent.h +10 -10
- data/vendor/faiss/faiss/impl/NSG.cpp +1 -6
- data/vendor/faiss/faiss/impl/NSG.h +4 -7
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +1 -15
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +11 -10
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +0 -7
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -12
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -4
- data/vendor/faiss/faiss/impl/Quantizer.h +6 -3
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +796 -174
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +16 -8
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +3 -5
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +4 -4
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +3 -3
- data/vendor/faiss/faiss/impl/ThreadedIndex.h +4 -4
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +291 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +74 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance.h +123 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +102 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +13 -10
- data/vendor/faiss/faiss/impl/index_write.cpp +3 -4
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +0 -1
- data/vendor/faiss/faiss/impl/kmeans1d.h +3 -3
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
- data/vendor/faiss/faiss/impl/platform_macros.h +61 -0
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +48 -4
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +18 -4
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +2 -2
- data/vendor/faiss/faiss/index_factory.cpp +8 -10
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +29 -12
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +8 -2
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -1
- data/vendor/faiss/faiss/invlists/DirectMap.h +2 -4
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +118 -18
- data/vendor/faiss/faiss/invlists/InvertedLists.h +44 -4
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +3 -3
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
- data/vendor/faiss/faiss/python/python_callbacks.cpp +1 -1
- data/vendor/faiss/faiss/python/python_callbacks.h +1 -1
- data/vendor/faiss/faiss/utils/AlignedTable.h +3 -1
- data/vendor/faiss/faiss/utils/Heap.cpp +139 -3
- data/vendor/faiss/faiss/utils/Heap.h +35 -1
- data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +84 -0
- data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +196 -0
- data/vendor/faiss/faiss/utils/approx_topk/generic.h +138 -0
- data/vendor/faiss/faiss/utils/approx_topk/mode.h +34 -0
- data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +367 -0
- data/vendor/faiss/faiss/utils/distances.cpp +61 -7
- data/vendor/faiss/faiss/utils/distances.h +11 -0
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +346 -0
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +36 -0
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +42 -0
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +40 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +352 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +32 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +515 -327
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +17 -1
- data/vendor/faiss/faiss/utils/extra_distances.cpp +37 -8
- data/vendor/faiss/faiss/utils/extra_distances.h +2 -1
- data/vendor/faiss/faiss/utils/fp16-fp16c.h +7 -0
- data/vendor/faiss/faiss/utils/fp16-inl.h +7 -0
- data/vendor/faiss/faiss/utils/fp16.h +7 -0
- data/vendor/faiss/faiss/utils/hamming-inl.h +0 -456
- data/vendor/faiss/faiss/utils/hamming.cpp +104 -120
- data/vendor/faiss/faiss/utils/hamming.h +21 -10
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +535 -0
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +48 -0
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +519 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +26 -0
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +614 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +21 -25
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +344 -3
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +390 -0
- data/vendor/faiss/faiss/utils/simdlib_neon.h +655 -130
- data/vendor/faiss/faiss/utils/sorting.cpp +692 -0
- data/vendor/faiss/faiss/utils/sorting.h +71 -0
- data/vendor/faiss/faiss/utils/transpose/transpose-avx2-inl.h +165 -0
- data/vendor/faiss/faiss/utils/utils.cpp +4 -176
- data/vendor/faiss/faiss/utils/utils.h +2 -9
- metadata +29 -3
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +0 -26
data/vendor/faiss/faiss/impl/ResidualQuantizer.h

@@ -13,6 +13,8 @@
 #include <faiss/Clustering.h>
 #include <faiss/impl/AdditiveQuantizer.h>
 
+#include <faiss/utils/approx_topk/mode.h>
+
 namespace faiss {
 
 /** Residual quantizer with variable number of bits per sub-quantizer
@@ -29,7 +31,7 @@ struct ResidualQuantizer : AdditiveQuantizer {
     using train_type_t = int;
 
     /// Binary or of the Train_* flags below
-    train_type_t train_type;
+    train_type_t train_type = Train_progressive_dim;
 
     /// regular k-means (minimal amount of computation)
     static const int Train_default = 0;
@@ -41,7 +43,7 @@ struct ResidualQuantizer : AdditiveQuantizer {
     static const int Train_refine_codebook = 2;
 
     /// number of iterations for codebook refinement.
-    int niter_codebook_refine;
+    int niter_codebook_refine = 5;
 
     /** set this bit on train_type if beam is to be trained only on the
      * first element of the beam (faster but less accurate) */
@@ -52,16 +54,20 @@ struct ResidualQuantizer : AdditiveQuantizer {
     static const int Skip_codebook_tables = 2048;
 
     /// beam size used for training and for encoding
-    int max_beam_size;
+    int max_beam_size = 5;
 
     /// use LUT for beam search
-    int use_beam_LUT;
+    int use_beam_LUT = 0;
+
+    /// Currently used mode of approximate min-k computations.
+    /// Default value is EXACT_TOPK.
+    ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK;
 
     /// clustering parameters
     ProgressiveDimClusteringParameters cp;
 
     /// if non-NULL, use this index for assignment
-    ProgressiveDimIndexFactory* assign_index_factory;
+    ProgressiveDimIndexFactory* assign_index_factory = nullptr;
 
     ResidualQuantizer(
             size_t d,
@@ -183,7 +189,8 @@ void beam_search_encode_step(
         int32_t* new_codes,
         float* new_residuals,
         float* new_distances,
-        Index* assign_index = nullptr
+        Index* assign_index = nullptr,
+        ApproxTopK_mode_t approx_topk = ApproxTopK_mode_t::EXACT_TOPK);
 
 /** Encode a set of vectors using their dot products with the codebooks
  *
@@ -202,7 +209,8 @@ void beam_search_encode_step_tab(
         const int32_t* codes, // n * beam_size * m
         const float* distances, // n * beam_size
         size_t new_beam_size,
-        int32_t* new_codes,
-        float* new_distances
+        int32_t* new_codes, // n * new_beam_size * (m + 1)
+        float* new_distances, // n * new_beam_size
+        ApproxTopK_mode_t approx_topk = ApproxTopK_mode_t::EXACT_TOPK);
 
 }; // namespace faiss

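The net effect of the ResidualQuantizer.h changes above is that the tuning fields gain in-class defaults and a new `approx_topk_mode` knob is exposed. A minimal caller-side sketch of what that means, assuming the usual `ResidualQuantizer(d, M, nbits)` constructor from the faiss C++ API (the constructor arguments and the override below are illustrative, not taken from the diff):

```cpp
#include <faiss/impl/ResidualQuantizer.h>

int main() {
    // d = 64, 4 sub-quantizers of 8 bits each (illustrative values)
    faiss::ResidualQuantizer rq(64, 4, 8);

    // Fields touched by this diff now carry in-class defaults:
    //   rq.train_type           == faiss::ResidualQuantizer::Train_progressive_dim
    //   rq.max_beam_size        == 5
    //   rq.use_beam_LUT         == 0
    //   rq.approx_topk_mode     == faiss::ApproxTopK_mode_t::EXACT_TOPK
    //   rq.assign_index_factory == nullptr
    rq.max_beam_size = 16; // defaults can still be overridden per instance
    return 0;
}
```
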
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp

@@ -54,7 +54,6 @@ namespace faiss {
 
 namespace {
 
-typedef Index::idx_t idx_t;
 typedef ScalarQuantizer::QuantizerType QuantizerType;
 typedef ScalarQuantizer::RangeStat RangeStat;
 using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer;
@@ -1048,12 +1047,11 @@ SQDistanceComputer* select_distance_computer(
 ********************************************************************/
 
 ScalarQuantizer::ScalarQuantizer(size_t d, QuantizerType qtype)
-        : Quantizer(d), qtype(qtype)
+        : Quantizer(d), qtype(qtype) {
     set_derived_sizes();
 }
 
-ScalarQuantizer::ScalarQuantizer()
-        : qtype(QT_8bit), rangestat(RS_minmax), rangestat_arg(0), bits(0) {}
+ScalarQuantizer::ScalarQuantizer() {}
 
 void ScalarQuantizer::set_derived_sizes() {
     switch (qtype) {
@@ -1131,7 +1129,7 @@ void ScalarQuantizer::train_residual(
     ScopeDeleter<float> del_x(x_in == x ? nullptr : x);
 
     if (by_residual) {
-        std::vector<
+        std::vector<idx_t> idx(n);
         quantizer->assign(n, x, idx.data());
 
         std::vector<float> residuals(n * d);

data/vendor/faiss/faiss/impl/ScalarQuantizer.h

@@ -34,7 +34,7 @@ struct ScalarQuantizer : Quantizer {
         QT_6bit, ///< 6 bits per component
     };
 
-    QuantizerType qtype;
+    QuantizerType qtype = QT_8bit;
 
     /** The uniform encoder can estimate the range of representable
      * values of the unform encoder using different statistics. Here
@@ -48,11 +48,11 @@ struct ScalarQuantizer : Quantizer {
         RS_optim, ///< alternate optimization of reconstruction error
     };
 
-    RangeStat rangestat;
-    float rangestat_arg;
+    RangeStat rangestat = RS_minmax;
+    float rangestat_arg = 0;
 
     /// bits per scalar code
-    size_t bits;
+    size_t bits = 0;
 
     /// trained values (including the range)
     std::vector<float> trained;

data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h

@@ -18,7 +18,7 @@ ThreadedIndex<IndexT>::ThreadedIndex(bool threaded)
 
 template <typename IndexT>
 ThreadedIndex<IndexT>::ThreadedIndex(int d, bool threaded)
-        : IndexT(d),
+        : IndexT(d), isThreaded_(threaded) {}
 
 template <typename IndexT>
 ThreadedIndex<IndexT>::~ThreadedIndex() {
@@ -35,7 +35,7 @@ ThreadedIndex<IndexT>::~ThreadedIndex() {
             FAISS_ASSERT(!(bool)p.second);
         }
 
-        if (
+        if (own_indices) {
             delete p.first;
         }
     }
@@ -102,7 +102,7 @@ void ThreadedIndex<IndexT>::removeIndex(IndexT* index) {
     indices_.erase(it);
     onAfterRemoveIndex(index);
 
-    if (
+    if (own_indices) {
         delete index;
     }
 

data/vendor/faiss/faiss/impl/ThreadedIndex.h

@@ -29,7 +29,7 @@ class ThreadedIndex : public IndexT {
     /// WARNING: once an index is added, it becomes unsafe to touch it from any
     /// other thread than that on which is managing it, until we are shut
    /// down. Use runOnIndex to perform work on it instead.
-    void addIndex(IndexT* index);
+    virtual void addIndex(IndexT* index);
 
     /// Remove an index that is managed by ourselves.
     /// This will flush all pending work on that index, and then shut
@@ -52,17 +52,17 @@ class ThreadedIndex : public IndexT {
     }
 
     /// Returns the i-th sub-index
-    IndexT* at(
+    IndexT* at(size_t i) {
         return indices_[i].first;
     }
 
     /// Returns the i-th sub-index (const version)
-    const IndexT* at(
+    const IndexT* at(size_t i) const {
         return indices_[i].first;
     }
 
     /// Whether or not we are responsible for deleting our contained indices
-    bool
+    bool own_indices = false;
 
    protected:
     /// Called just after an index is added

data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h (new file)

@@ -0,0 +1,291 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#ifdef __AVX2__
+
+#include <immintrin.h>
+
+#include <type_traits>
+
+#include <faiss/impl/code_distance/code_distance-generic.h>
+
+namespace {
+
+// Computes a horizontal sum over an __m256 register
+inline float horizontal_sum(const __m256 reg) {
+    const __m256 h0 = _mm256_hadd_ps(reg, reg);
+    const __m256 h1 = _mm256_hadd_ps(h0, h0);
+
+    // extract high and low __m128 regs from __m256
+    const __m128 h2 = _mm256_extractf128_ps(h1, 1);
+    const __m128 h3 = _mm256_castps256_ps128(h1);
+
+    // get a final hsum into all 4 regs
+    const __m128 h4 = _mm_add_ss(h2, h3);
+
+    // extract f[0] from __m128
+    const float hsum = _mm_cvtss_f32(h4);
+    return hsum;
+}
+
+} // namespace
+
+namespace faiss {
+
+template <typename PQDecoderT>
+typename std::enable_if<!std::is_same<PQDecoderT, PQDecoder8>::value, float>::
+        type inline distance_single_code_avx2(
+                // the product quantizer
+                const ProductQuantizer& pq,
+                // precomputed distances, layout (M, ksub)
+                const float* sim_table,
+                const uint8_t* code) {
+    // default implementation
+    return distance_single_code_generic<PQDecoderT>(pq, sim_table, code);
+}
+
+template <typename PQDecoderT>
+typename std::enable_if<std::is_same<PQDecoderT, PQDecoder8>::value, float>::
+        type inline distance_single_code_avx2(
+                // the product quantizer
+                const ProductQuantizer& pq,
+                // precomputed distances, layout (M, ksub)
+                const float* sim_table,
+                const uint8_t* code) {
+    float result = 0;
+
+    size_t m = 0;
+    const size_t pqM16 = pq.M / 16;
+
+    const float* tab = sim_table;
+
+    if (pqM16 > 0) {
+        // process 16 values per loop
+
+        const __m256i ksub = _mm256_set1_epi32(pq.ksub);
+        __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+        offsets_0 = _mm256_mullo_epi32(offsets_0, ksub);
+
+        // accumulators of partial sums
+        __m256 partialSum = _mm256_setzero_ps();
+
+        // loop
+        for (m = 0; m < pqM16 * 16; m += 16) {
+            // load 16 uint8 values
+            const __m128i mm1 = _mm_loadu_si128((const __m128i_u*)(code + m));
+            {
+                // convert uint8 values (low part of __m128i) to int32
+                // values
+                const __m256i idx1 = _mm256_cvtepu8_epi32(mm1);
+
+                // add offsets
+                const __m256i indices_to_read_from =
+                        _mm256_add_epi32(idx1, offsets_0);
+
+                // gather 8 values, similar to 8 operations of tab[idx]
+                __m256 collected = _mm256_i32gather_ps(
+                        tab, indices_to_read_from, sizeof(float));
+                tab += pq.ksub * 8;
+
+                // collect partial sums
+                partialSum = _mm256_add_ps(partialSum, collected);
+            }
+
+            // move high 8 uint8 to low ones
+            const __m128i mm2 = _mm_unpackhi_epi64(mm1, _mm_setzero_si128());
+            {
+                // convert uint8 values (low part of __m128i) to int32
+                // values
+                const __m256i idx1 = _mm256_cvtepu8_epi32(mm2);
+
+                // add offsets
+                const __m256i indices_to_read_from =
+                        _mm256_add_epi32(idx1, offsets_0);
+
+                // gather 8 values, similar to 8 operations of tab[idx]
+                __m256 collected = _mm256_i32gather_ps(
+                        tab, indices_to_read_from, sizeof(float));
+                tab += pq.ksub * 8;
+
+                // collect partial sums
+                partialSum = _mm256_add_ps(partialSum, collected);
+            }
+        }
+
+        // horizontal sum for partialSum
+        result += horizontal_sum(partialSum);
+    }
+
+    //
+    if (m < pq.M) {
+        // process leftovers
+        PQDecoder8 decoder(code + m, pq.nbits);
+
+        for (; m < pq.M; m++) {
+            result += tab[decoder.decode()];
+            tab += pq.ksub;
+        }
+    }
+
+    return result;
+}
+
+template <typename PQDecoderT>
+typename std::enable_if<!std::is_same<PQDecoderT, PQDecoder8>::value, void>::
+        type
+        distance_four_codes_avx2(
+                // the product quantizer
+                const ProductQuantizer& pq,
+                // precomputed distances, layout (M, ksub)
+                const float* sim_table,
+                // codes
+                const uint8_t* __restrict code0,
+                const uint8_t* __restrict code1,
+                const uint8_t* __restrict code2,
+                const uint8_t* __restrict code3,
+                // computed distances
+                float& result0,
+                float& result1,
+                float& result2,
+                float& result3) {
+    distance_four_codes_generic<PQDecoderT>(
+            pq,
+            sim_table,
+            code0,
+            code1,
+            code2,
+            code3,
+            result0,
+            result1,
+            result2,
+            result3);
+}
+
+// Combines 4 operations of distance_single_code()
+template <typename PQDecoderT>
+typename std::enable_if<std::is_same<PQDecoderT, PQDecoder8>::value, void>::type
+distance_four_codes_avx2(
+        // the product quantizer
+        const ProductQuantizer& pq,
+        // precomputed distances, layout (M, ksub)
+        const float* sim_table,
+        // codes
+        const uint8_t* __restrict code0,
+        const uint8_t* __restrict code1,
+        const uint8_t* __restrict code2,
+        const uint8_t* __restrict code3,
+        // computed distances
+        float& result0,
+        float& result1,
+        float& result2,
+        float& result3) {
+    result0 = 0;
+    result1 = 0;
+    result2 = 0;
+    result3 = 0;
+
+    size_t m = 0;
+    const size_t pqM16 = pq.M / 16;
+
+    constexpr intptr_t N = 4;
+
+    const float* tab = sim_table;
+
+    if (pqM16 > 0) {
+        // process 16 values per loop
+        const __m256i ksub = _mm256_set1_epi32(pq.ksub);
+        __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+        offsets_0 = _mm256_mullo_epi32(offsets_0, ksub);
+
+        // accumulators of partial sums
+        __m256 partialSums[N];
+        for (intptr_t j = 0; j < N; j++) {
+            partialSums[j] = _mm256_setzero_ps();
+        }
+
+        // loop
+        for (m = 0; m < pqM16 * 16; m += 16) {
+            // load 16 uint8 values
+            __m128i mm1[N];
+            mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m));
+            mm1[1] = _mm_loadu_si128((const __m128i_u*)(code1 + m));
+            mm1[2] = _mm_loadu_si128((const __m128i_u*)(code2 + m));
+            mm1[3] = _mm_loadu_si128((const __m128i_u*)(code3 + m));
+
+            // process first 8 codes
+            for (intptr_t j = 0; j < N; j++) {
+                // convert uint8 values (low part of __m128i) to int32
+                // values
+                const __m256i idx1 = _mm256_cvtepu8_epi32(mm1[j]);
+
+                // add offsets
+                const __m256i indices_to_read_from =
+                        _mm256_add_epi32(idx1, offsets_0);
+
+                // gather 8 values, similar to 8 operations of tab[idx]
+                __m256 collected = _mm256_i32gather_ps(
+                        tab, indices_to_read_from, sizeof(float));
+
+                // collect partial sums
+                partialSums[j] = _mm256_add_ps(partialSums[j], collected);
+            }
+            tab += pq.ksub * 8;
+
+            // process next 8 codes
+            for (intptr_t j = 0; j < N; j++) {
+                // move high 8 uint8 to low ones
+                const __m128i mm2 =
+                        _mm_unpackhi_epi64(mm1[j], _mm_setzero_si128());
+
+                // convert uint8 values (low part of __m128i) to int32
+                // values
+                const __m256i idx1 = _mm256_cvtepu8_epi32(mm2);
+
+                // add offsets
+                const __m256i indices_to_read_from =
+                        _mm256_add_epi32(idx1, offsets_0);
+
+                // gather 8 values, similar to 8 operations of tab[idx]
+                __m256 collected = _mm256_i32gather_ps(
+                        tab, indices_to_read_from, sizeof(float));
+
+                // collect partial sums
+                partialSums[j] = _mm256_add_ps(partialSums[j], collected);
+            }
+
+            tab += pq.ksub * 8;
+        }
+
+        // horizontal sum for partialSum
+        result0 += horizontal_sum(partialSums[0]);
+        result1 += horizontal_sum(partialSums[1]);
+        result2 += horizontal_sum(partialSums[2]);
+        result3 += horizontal_sum(partialSums[3]);
+    }
+
+    //
+    if (m < pq.M) {
+        // process leftovers
+        PQDecoder8 decoder0(code0 + m, pq.nbits);
+        PQDecoder8 decoder1(code1 + m, pq.nbits);
+        PQDecoder8 decoder2(code2 + m, pq.nbits);
+        PQDecoder8 decoder3(code3 + m, pq.nbits);
+        for (; m < pq.M; m++) {
+            result0 += tab[decoder0.decode()];
+            result1 += tab[decoder1.decode()];
+            result2 += tab[decoder2.decode()];
+            result3 += tab[decoder3.decode()];
+            tab += pq.ksub;
+        }
+    }
+}
+
+} // namespace faiss
+
+#endif

data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h (new file)

@@ -0,0 +1,74 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <faiss/impl/ProductQuantizer.h>
+
+namespace faiss {
+
+/// Returns the distance to a single code.
+template <typename PQDecoderT>
+inline float distance_single_code_generic(
+        // the product quantizer
+        const ProductQuantizer& pq,
+        // precomputed distances, layout (M, ksub)
+        const float* sim_table,
+        // the code
+        const uint8_t* code) {
+    PQDecoderT decoder(code, pq.nbits);
+
+    const float* tab = sim_table;
+    float result = 0;
+
+    for (size_t m = 0; m < pq.M; m++) {
+        result += tab[decoder.decode()];
+        tab += pq.ksub;
+    }
+
+    return result;
+}
+
+/// Combines 4 operations of distance_single_code()
+/// General-purpose version.
+template <typename PQDecoderT>
+inline void distance_four_codes_generic(
+        // the product quantizer
+        const ProductQuantizer& pq,
+        // precomputed distances, layout (M, ksub)
+        const float* sim_table,
+        // codes
+        const uint8_t* __restrict code0,
+        const uint8_t* __restrict code1,
+        const uint8_t* __restrict code2,
+        const uint8_t* __restrict code3,
+        // computed distances
+        float& result0,
+        float& result1,
+        float& result2,
+        float& result3) {
+    PQDecoderT decoder0(code0, pq.nbits);
+    PQDecoderT decoder1(code1, pq.nbits);
+    PQDecoderT decoder2(code2, pq.nbits);
+    PQDecoderT decoder3(code3, pq.nbits);
+
+    const float* tab = sim_table;
+    result0 = 0;
+    result1 = 0;
+    result2 = 0;
+    result3 = 0;
+
+    for (size_t m = 0; m < pq.M; m++) {
+        result0 += tab[decoder0.decode()];
+        result1 += tab[decoder1.decode()];
+        result2 += tab[decoder2.decode()];
+        result3 += tab[decoder3.decode()];
+        tab += pq.ksub;
+    }
+}
+
+} // namespace faiss

data/vendor/faiss/faiss/impl/code_distance/code_distance.h (new file)

@@ -0,0 +1,123 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <faiss/impl/platform_macros.h>
+
+// This directory contains functions to compute a distance
+// from a given PQ code to a query vector, given that the
+// distances to a query vector for pq.M codebooks are precomputed.
+//
+// The code was originally the part of IndexIVFPQ.cpp.
+// The baseline implementation can be found in
+// code_distance-generic.h, distance_single_code_generic().
+
+// The reason for this somewhat unusual structure is that
+// custom implementations may need to fall off to generic
+// implementation in certain cases. So, say, avx2 header file
+// needs to reference the generic header file. This is
+// why the names of the functions for custom implementations
+// have this _generic or _avx2 suffix.
+
+#ifdef __AVX2__
+
+#include <faiss/impl/code_distance/code_distance-avx2.h>
+
+namespace faiss {
+
+template <typename PQDecoderT>
+inline float distance_single_code(
+        // the product quantizer
+        const ProductQuantizer& pq,
+        // precomputed distances, layout (M, ksub)
+        const float* sim_table,
+        // the code
+        const uint8_t* code) {
+    return distance_single_code_avx2<PQDecoderT>(pq, sim_table, code);
+}
+
+template <typename PQDecoderT>
+inline void distance_four_codes(
+        // the product quantizer
+        const ProductQuantizer& pq,
+        // precomputed distances, layout (M, ksub)
+        const float* sim_table,
+        // codes
+        const uint8_t* __restrict code0,
+        const uint8_t* __restrict code1,
+        const uint8_t* __restrict code2,
+        const uint8_t* __restrict code3,
+        // computed distances
+        float& result0,
+        float& result1,
+        float& result2,
+        float& result3) {
+    distance_four_codes_avx2<PQDecoderT>(
+            pq,
+            sim_table,
+            code0,
+            code1,
+            code2,
+            code3,
+            result0,
+            result1,
+            result2,
+            result3);
+}
+
+} // namespace faiss
+
+#else
+
+#include <faiss/impl/code_distance/code_distance-generic.h>
+
+namespace faiss {
+
+template <typename PQDecoderT>
+inline float distance_single_code(
+        // the product quantizer
+        const ProductQuantizer& pq,
+        // precomputed distances, layout (M, ksub)
+        const float* sim_table,
+        // the code
+        const uint8_t* code) {
+    return distance_single_code_generic<PQDecoderT>(pq, sim_table, code);
+}
+
+template <typename PQDecoderT>
+inline void distance_four_codes(
+        // the product quantizer
+        const ProductQuantizer& pq,
+        // precomputed distances, layout (M, ksub)
+        const float* sim_table,
+        // codes
+        const uint8_t* __restrict code0,
+        const uint8_t* __restrict code1,
+        const uint8_t* __restrict code2,
+        const uint8_t* __restrict code3,
+        // computed distances
+        float& result0,
+        float& result1,
+        float& result2,
+        float& result3) {
+    distance_four_codes_generic<PQDecoderT>(
+            pq,
+            sim_table,
+            code0,
+            code1,
+            code2,
+            code3,
+            result0,
+            result1,
+            result2,
+            result3);
+}
+
+} // namespace faiss
+
+#endif

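For orientation, code_distance.h is the dispatch header a scanner includes: it forwards to the AVX2 kernels when the translation unit is compiled with `__AVX2__` and to the generic kernels otherwise. A hedged caller-side sketch, assuming `faiss::PQDecoder8` as the decoder and a `sim_table` laid out as (M, ksub) as the comments above state (the wrapper function name is ours, not part of faiss):

```cpp
#include <cstdint>

#include <faiss/impl/ProductQuantizer.h>
#include <faiss/impl/code_distance/code_distance.h>

// Distance from one encoded database vector to the query whose per-codebook
// distances were precomputed into sim_table (pq.M * pq.ksub floats).
float score_one_code(
        const faiss::ProductQuantizer& pq,
        const float* sim_table,
        const uint8_t* code) {
    // Resolves to distance_single_code_avx2 when built with AVX2 enabled,
    // otherwise to distance_single_code_generic.
    return faiss::distance_single_code<faiss::PQDecoder8>(pq, sim_table, code);
}
```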