RubyGems - faiss - Versions diffs - 0.2.5 → 0.2.7 - Mend

faiss 0.2.5 → 0.2.7

Files changed (191) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +9 -0
data/LICENSE.txt +1 -1
data/ext/faiss/extconf.rb +1 -1
data/ext/faiss/index.cpp +13 -0
data/lib/faiss/version.rb +1 -1
data/lib/faiss.rb +2 -2
data/vendor/faiss/faiss/AutoTune.cpp +15 -4
data/vendor/faiss/faiss/AutoTune.h +0 -1
data/vendor/faiss/faiss/Clustering.cpp +1 -5
data/vendor/faiss/faiss/Clustering.h +0 -2
data/vendor/faiss/faiss/IVFlib.h +0 -2
data/vendor/faiss/faiss/Index.h +1 -2
data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +17 -3
data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +10 -1
data/vendor/faiss/faiss/IndexBinary.h +0 -1
data/vendor/faiss/faiss/IndexBinaryFlat.cpp +2 -1
data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -0
data/vendor/faiss/faiss/IndexBinaryHash.cpp +1 -3
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +273 -48
data/vendor/faiss/faiss/IndexBinaryIVF.h +18 -11
data/vendor/faiss/faiss/IndexFastScan.cpp +13 -10
data/vendor/faiss/faiss/IndexFastScan.h +5 -1
data/vendor/faiss/faiss/IndexFlat.cpp +16 -3
data/vendor/faiss/faiss/IndexFlat.h +1 -1
data/vendor/faiss/faiss/IndexFlatCodes.cpp +5 -0
data/vendor/faiss/faiss/IndexFlatCodes.h +7 -2
data/vendor/faiss/faiss/IndexHNSW.cpp +3 -6
data/vendor/faiss/faiss/IndexHNSW.h +0 -1
data/vendor/faiss/faiss/IndexIDMap.cpp +4 -4
data/vendor/faiss/faiss/IndexIDMap.h +0 -2
data/vendor/faiss/faiss/IndexIVF.cpp +155 -129
data/vendor/faiss/faiss/IndexIVF.h +121 -61
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
data/vendor/faiss/faiss/IndexIVFFastScan.cpp +12 -11
data/vendor/faiss/faiss/IndexIVFFastScan.h +6 -1
data/vendor/faiss/faiss/IndexIVFPQ.cpp +221 -165
data/vendor/faiss/faiss/IndexIVFPQ.h +1 -0
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +6 -1
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +0 -2
data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -2
data/vendor/faiss/faiss/IndexNNDescent.h +0 -1
data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
data/vendor/faiss/faiss/IndexPQ.cpp +7 -9
data/vendor/faiss/faiss/IndexRefine.cpp +1 -1
data/vendor/faiss/faiss/IndexReplicas.cpp +3 -4
data/vendor/faiss/faiss/IndexReplicas.h +0 -1
data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +8 -1
data/vendor/faiss/faiss/IndexRowwiseMinMax.h +7 -0
data/vendor/faiss/faiss/IndexShards.cpp +26 -109
data/vendor/faiss/faiss/IndexShards.h +2 -3
data/vendor/faiss/faiss/IndexShardsIVF.cpp +246 -0
data/vendor/faiss/faiss/IndexShardsIVF.h +42 -0
data/vendor/faiss/faiss/MetaIndexes.cpp +86 -0
data/vendor/faiss/faiss/MetaIndexes.h +29 -0
data/vendor/faiss/faiss/MetricType.h +14 -0
data/vendor/faiss/faiss/VectorTransform.cpp +8 -10
data/vendor/faiss/faiss/VectorTransform.h +1 -3
data/vendor/faiss/faiss/clone_index.cpp +232 -18
data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +25 -3
data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +7 -0
data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +78 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +20 -6
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +7 -1
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +21 -7
data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +7 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +7 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +10 -3
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +7 -1
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +11 -3
data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +25 -2
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +76 -29
data/vendor/faiss/faiss/gpu/GpuCloner.h +2 -2
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +14 -13
data/vendor/faiss/faiss/gpu/GpuDistance.h +18 -6
data/vendor/faiss/faiss/gpu/GpuIndex.h +23 -21
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +10 -10
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -12
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +29 -50
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +3 -3
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +8 -8
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +4 -4
data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +2 -5
data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +9 -7
data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +4 -4
data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +2 -2
data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +1 -1
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +55 -6
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +20 -6
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +95 -25
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +67 -16
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +4 -4
data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +7 -7
data/vendor/faiss/faiss/gpu/test/TestUtils.h +4 -4
data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +0 -7
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +9 -9
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +1 -1
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +2 -7
data/vendor/faiss/faiss/impl/CodePacker.cpp +67 -0
data/vendor/faiss/faiss/impl/CodePacker.h +71 -0
data/vendor/faiss/faiss/impl/DistanceComputer.h +0 -2
data/vendor/faiss/faiss/impl/HNSW.cpp +3 -7
data/vendor/faiss/faiss/impl/HNSW.h +6 -9
data/vendor/faiss/faiss/impl/IDSelector.cpp +1 -1
data/vendor/faiss/faiss/impl/IDSelector.h +39 -1
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +62 -51
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +11 -12
data/vendor/faiss/faiss/impl/NNDescent.cpp +3 -9
data/vendor/faiss/faiss/impl/NNDescent.h +10 -10
data/vendor/faiss/faiss/impl/NSG.cpp +1 -6
data/vendor/faiss/faiss/impl/NSG.h +4 -7
data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +1 -15
data/vendor/faiss/faiss/impl/PolysemousTraining.h +11 -10
data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +0 -7
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -12
data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -4
data/vendor/faiss/faiss/impl/Quantizer.h +6 -3
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +796 -174
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +16 -8
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +3 -5
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +4 -4
data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +3 -3
data/vendor/faiss/faiss/impl/ThreadedIndex.h +4 -4
data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +291 -0
data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +74 -0
data/vendor/faiss/faiss/impl/code_distance/code_distance.h +123 -0
data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +102 -0
data/vendor/faiss/faiss/impl/index_read.cpp +13 -10
data/vendor/faiss/faiss/impl/index_write.cpp +3 -4
data/vendor/faiss/faiss/impl/kmeans1d.cpp +0 -1
data/vendor/faiss/faiss/impl/kmeans1d.h +3 -3
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
data/vendor/faiss/faiss/impl/platform_macros.h +61 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +48 -4
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +18 -4
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +2 -2
data/vendor/faiss/faiss/index_factory.cpp +8 -10
data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +29 -12
data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +8 -2
data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -1
data/vendor/faiss/faiss/invlists/DirectMap.h +2 -4
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +118 -18
data/vendor/faiss/faiss/invlists/InvertedLists.h +44 -4
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +3 -3
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
data/vendor/faiss/faiss/python/python_callbacks.cpp +1 -1
data/vendor/faiss/faiss/python/python_callbacks.h +1 -1
data/vendor/faiss/faiss/utils/AlignedTable.h +3 -1
data/vendor/faiss/faiss/utils/Heap.cpp +139 -3
data/vendor/faiss/faiss/utils/Heap.h +35 -1
data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +84 -0
data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +196 -0
data/vendor/faiss/faiss/utils/approx_topk/generic.h +138 -0
data/vendor/faiss/faiss/utils/approx_topk/mode.h +34 -0
data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +367 -0
data/vendor/faiss/faiss/utils/distances.cpp +61 -7
data/vendor/faiss/faiss/utils/distances.h +11 -0
data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +346 -0
data/vendor/faiss/faiss/utils/distances_fused/avx512.h +36 -0
data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +42 -0
data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +40 -0
data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +352 -0
data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +32 -0
data/vendor/faiss/faiss/utils/distances_simd.cpp +515 -327
data/vendor/faiss/faiss/utils/extra_distances-inl.h +17 -1
data/vendor/faiss/faiss/utils/extra_distances.cpp +37 -8
data/vendor/faiss/faiss/utils/extra_distances.h +2 -1
data/vendor/faiss/faiss/utils/fp16-fp16c.h +7 -0
data/vendor/faiss/faiss/utils/fp16-inl.h +7 -0
data/vendor/faiss/faiss/utils/fp16.h +7 -0
data/vendor/faiss/faiss/utils/hamming-inl.h +0 -456
data/vendor/faiss/faiss/utils/hamming.cpp +104 -120
data/vendor/faiss/faiss/utils/hamming.h +21 -10
data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +535 -0
data/vendor/faiss/faiss/utils/hamming_distance/common.h +48 -0
data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +519 -0
data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +26 -0
data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +614 -0
data/vendor/faiss/faiss/utils/partitioning.cpp +21 -25
data/vendor/faiss/faiss/utils/simdlib_avx2.h +344 -3
data/vendor/faiss/faiss/utils/simdlib_emulated.h +390 -0
data/vendor/faiss/faiss/utils/simdlib_neon.h +655 -130
data/vendor/faiss/faiss/utils/sorting.cpp +692 -0
data/vendor/faiss/faiss/utils/sorting.h +71 -0
data/vendor/faiss/faiss/utils/transpose/transpose-avx2-inl.h +165 -0
data/vendor/faiss/faiss/utils/utils.cpp +4 -176
data/vendor/faiss/faiss/utils/utils.h +2 -9
metadata +30 -4
data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +0 -26

data/vendor/faiss/faiss/IndexIVFPQ.cpp CHANGED Viewed

@@ -33,9 +33,7 @@
 #include <faiss/impl/ProductQuantizer.h>
-#ifdef __AVX2__
-#include <immintrin.h>
-#endif
+#include <faiss/impl/code_distance/code_distance.h>
 namespace faiss {
@@ -51,7 +49,6 @@ IndexIVFPQ::IndexIVFPQ(
         size_t nbits_per_idx,
         MetricType metric)
         : IndexIVF(quantizer, d, nlist, 0, metric), pq(d, M, nbits_per_idx) {
-    FAISS_THROW_IF_NOT(nbits_per_idx <= 8);
     code_size = pq.code_size;
     invlists->code_size = code_size;
     is_trained = false;
@@ -198,9 +195,9 @@ void IndexIVFPQ::add_core(
 static float* compute_residuals(
         const Index* quantizer,
-        Index::idx_t n,
+        idx_t n,
         const float* x,
-        const Index::idx_t* list_nos) {
+        const idx_t* list_nos) {
     size_t d = quantizer->d;
     float* residuals = new float[n * d];
     // TODO: parallelize?
@@ -423,6 +420,7 @@ void initialize_IVFPQ_precomputed_table(
         const Index* quantizer,
         const ProductQuantizer& pq,
         AlignedTable<float>& precomputed_table,
+        bool by_residual,
         bool verbose) {
     size_t nlist = quantizer->ntotal;
     size_t d = quantizer->d;
@@ -434,10 +432,10 @@ void initialize_IVFPQ_precomputed_table(
     }
     if (use_precomputed_table == 0) { // then choose the type of table
-        if (quantizer->metric_type == METRIC_INNER_PRODUCT) {
+        if (!(quantizer->metric_type == METRIC_L2 && by_residual)) {
             if (verbose) {
                 printf("IndexIVFPQ::precompute_table: precomputed "
-                       "tables not needed for inner product quantizers\n");
+                       "tables needed only for L2 metric and by_residual is enabled\n");
             }
             precomputed_table.resize(0);
             return;
@@ -516,13 +514,16 @@ void initialize_IVFPQ_precomputed_table(
 void IndexIVFPQ::precompute_table() {
     initialize_IVFPQ_precomputed_table(
-            use_precomputed_table, quantizer, pq, precomputed_table, verbose);
+            use_precomputed_table,
+            quantizer,
+            pq,
+            precomputed_table,
+            by_residual,
+            verbose);
 }
 namespace {
-using idx_t = Index::idx_t;
 #define TIC t0 = get_cycles()
 #define TOC get_cycles() - t0
@@ -623,7 +624,7 @@ struct QueryTables {
      *****************************************************/
     // fields specific to list
-    Index::idx_t key;
+    idx_t key;
     float coarse_dis;
     std::vector<uint8_t> q_code;
@@ -886,140 +887,29 @@ struct IVFPQScannerT : QueryTables {
      * Scanning the codes: simple PQ scan.
      *****************************************************/
-#ifdef __AVX2__
-    /// Returns the distance to a single code.
-    /// General-purpose version.
-    template <class SearchResultType, typename T = PQDecoder>
-    typename std::enable_if<!(std::is_same<T, PQDecoder8>::value), float>::
-            type inline distance_single_code(const uint8_t* code) const {
-        PQDecoder decoder(code, pq.nbits);
-        const float* tab = sim_table;
-        float result = 0;
-        for (size_t m = 0; m < pq.M; m++) {
-            result += tab[decoder.decode()];
-            tab += pq.ksub;
-        }
-        return result;
-    }
-    /// Returns the distance to a single code.
-    /// Specialized AVX2 PQDecoder8 version.
-    template <class SearchResultType, typename T = PQDecoder>
-    typename std::enable_if<(std::is_same<T, PQDecoder8>::value), float>::
-            type inline distance_single_code(const uint8_t* code) const {
-        float result = 0;
-        size_t m = 0;
-        const size_t pqM16 = pq.M / 16;
-        const float* tab = sim_table;
-        if (pqM16 > 0) {
-            // process 16 values per loop
-            const __m256i ksub = _mm256_set1_epi32(pq.ksub);
-            __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
-            offsets_0 = _mm256_mullo_epi32(offsets_0, ksub);
-            // accumulators of partial sums
-            __m256 partialSum = _mm256_setzero_ps();
-            // loop
-            for (m = 0; m < pqM16 * 16; m += 16) {
-                // load 16 uint8 values
-                const __m128i mm1 =
-                        _mm_loadu_si128((const __m128i_u*)(code + m));
-                {
-                    // convert uint8 values (low part of __m128i) to int32
-                    // values
-                    const __m256i idx1 = _mm256_cvtepu8_epi32(mm1);
-                    // add offsets
-                    const __m256i indices_to_read_from =
-                            _mm256_add_epi32(idx1, offsets_0);
-                    // gather 8 values, similar to 8 operations of tab[idx]
-                    __m256 collected = _mm256_i32gather_ps(
-                            tab, indices_to_read_from, sizeof(float));
-                    tab += pq.ksub * 8;
-                    // collect partial sums
-                    partialSum = _mm256_add_ps(partialSum, collected);
-                }
-                // move high 8 uint8 to low ones
-                const __m128i mm2 =
-                        _mm_unpackhi_epi64(mm1, _mm_setzero_si128());
-                {
-                    // convert uint8 values (low part of __m128i) to int32
-                    // values
-                    const __m256i idx1 = _mm256_cvtepu8_epi32(mm2);
-                    // add offsets
-                    const __m256i indices_to_read_from =
-                            _mm256_add_epi32(idx1, offsets_0);
-                    // gather 8 values, similar to 8 operations of tab[idx]
-                    __m256 collected = _mm256_i32gather_ps(
-                            tab, indices_to_read_from, sizeof(float));
-                    tab += pq.ksub * 8;
-                    // collect partial sums
-                    partialSum = _mm256_add_ps(partialSum, collected);
-                }
-            }
-            // horizontal sum for partialSum
-            const __m256 h0 = _mm256_hadd_ps(partialSum, partialSum);
-            const __m256 h1 = _mm256_hadd_ps(h0, h0);
-            // extract high and low __m128 regs from __m256
-            const __m128 h2 = _mm256_extractf128_ps(h1, 1);
-            const __m128 h3 = _mm256_castps256_ps128(h1);
-            // get a final hsum into all 4 regs
-            const __m128 h4 = _mm_add_ss(h2, h3);
-            // extract f[0] from __m128
-            const float hsum = _mm_cvtss_f32(h4);
-            result += hsum;
-        }
-        //
-        if (m < pq.M) {
-            // process leftovers
-            PQDecoder decoder(code + m, pq.nbits);
-            for (; m < pq.M; m++) {
-                result += tab[decoder.decode()];
-                tab += pq.ksub;
-            }
-        }
-        return result;
-    }
-#else
-    /// Returns the distance to a single code.
-    /// General-purpose version.
-    template <class SearchResultType>
-    inline float distance_single_code(const uint8_t* code) const {
-        PQDecoder decoder(code, pq.nbits);
-        const float* tab = sim_table;
-        float result = 0;
-        for (size_t m = 0; m < pq.M; m++) {
-            result += tab[decoder.decode()];
-            tab += pq.ksub;
-        }
-        return result;
-    }
-#endif
+    // This is the baseline version of scan_list_with_table().
+    // It demonstrates what this function actually does.
+    //
+    // /// version of the scan where we use precomputed tables.
+    // template <class SearchResultType>
+    // void scan_list_with_table(
+    //         size_t ncode,
+    //         const uint8_t* codes,
+    //         SearchResultType& res) const {
+    //
+    //     for (size_t j = 0; j < ncode; j++, codes += pq.code_size) {
+    //         if (res.skip_entry(j)) {
+    //             continue;
+    //         }
+    //         float dis = dis0 + distance_single_code<PQDecoder>(
+    //             pq, sim_table, codes);
+    //         res.add(j, dis);
+    //     }
+    // }
+    // This is the modified version of scan_list_with_table().
+    // It was observed that manually unrolling the loop that
+    //    calls distance_single_code() speeds up the computations.
     /// version of the scan where we use precomputed tables.
     template <class SearchResultType>
@@ -1027,12 +917,65 @@ struct IVFPQScannerT : QueryTables {
             size_t ncode,
             const uint8_t* codes,
             SearchResultType& res) const {
-        for (size_t j = 0; j < ncode; j++, codes += pq.code_size) {
+        int counter = 0;
+        size_t saved_j[4] = {0, 0, 0, 0};
+        for (size_t j = 0; j < ncode; j++) {
             if (res.skip_entry(j)) {
                 continue;
             }
-            float dis = dis0 + distance_single_code<SearchResultType>(codes);
-            res.add(j, dis);
+            saved_j[0] = (counter == 0) ? j : saved_j[0];
+            saved_j[1] = (counter == 1) ? j : saved_j[1];
+            saved_j[2] = (counter == 2) ? j : saved_j[2];
+            saved_j[3] = (counter == 3) ? j : saved_j[3];
+            counter += 1;
+            if (counter == 4) {
+                float distance_0 = 0;
+                float distance_1 = 0;
+                float distance_2 = 0;
+                float distance_3 = 0;
+                distance_four_codes<PQDecoder>(
+                        pq,
+                        sim_table,
+                        codes + saved_j[0] * pq.code_size,
+                        codes + saved_j[1] * pq.code_size,
+                        codes + saved_j[2] * pq.code_size,
+                        codes + saved_j[3] * pq.code_size,
+                        distance_0,
+                        distance_1,
+                        distance_2,
+                        distance_3);
+                res.add(saved_j[0], dis0 + distance_0);
+                res.add(saved_j[1], dis0 + distance_1);
+                res.add(saved_j[2], dis0 + distance_2);
+                res.add(saved_j[3], dis0 + distance_3);
+                counter = 0;
+            }
+        }
+        if (counter >= 1) {
+            float dis =
+                    dis0 +
+                    distance_single_code<PQDecoder>(
+                            pq, sim_table, codes + saved_j[0] * pq.code_size);
+            res.add(saved_j[0], dis);
+        }
+        if (counter >= 2) {
+            float dis =
+                    dis0 +
+                    distance_single_code<PQDecoder>(
+                            pq, sim_table, codes + saved_j[1] * pq.code_size);
+            res.add(saved_j[1], dis);
+        }
+        if (counter >= 3) {
+            float dis =
+                    dis0 +
+                    distance_single_code<PQDecoder>(
+                            pq, sim_table, codes + saved_j[2] * pq.code_size);
+            res.add(saved_j[2], dis);
         }
     }
@@ -1101,6 +1044,46 @@ struct IVFPQScannerT : QueryTables {
      * Scanning codes with polysemous filtering
      *****************************************************/
+    // This is the baseline version of scan_list_polysemous_hc().
+    // It demonstrates what this function actually does.
+    //     template <class HammingComputer, class SearchResultType>
+    //     void scan_list_polysemous_hc(
+    //             size_t ncode,
+    //             const uint8_t* codes,
+    //             SearchResultType& res) const {
+    //         int ht = ivfpq.polysemous_ht;
+    //         size_t n_hamming_pass = 0, nup = 0;
+    //
+    //         int code_size = pq.code_size;
+    //
+    //         HammingComputer hc(q_code.data(), code_size);
+    //
+    //         for (size_t j = 0; j < ncode; j++, codes += code_size) {
+    //             if (res.skip_entry(j)) {
+    //                 continue;
+    //             }
+    //             const uint8_t* b_code = codes;
+    //             int hd = hc.hamming(b_code);
+    //             if (hd < ht) {
+    //                 n_hamming_pass++;
+    //
+    //                 float dis =
+    //                         dis0 +
+    //                         distance_single_code<PQDecoder>(
+    //                             pq, sim_table, codes);
+    //
+    //                 res.add(j, dis);
+    //             }
+    //         }
+    // #pragma omp critical
+    //         { indexIVFPQ_stats.n_hamming_pass += n_hamming_pass; }
+    //     }
+    // This is the modified version of scan_list_polysemous_hc().
+    // It was observed that manually unrolling the loop that
+    //    calls distance_single_code() speeds up the computations.
     template <class HammingComputer, class SearchResultType>
     void scan_list_polysemous_hc(
             size_t ncode,
@@ -1111,23 +1094,103 @@ struct IVFPQScannerT : QueryTables {
         int code_size = pq.code_size;
+        size_t saved_j[8];
+        int counter = 0;
         HammingComputer hc(q_code.data(), code_size);
-        for (size_t j = 0; j < ncode; j++, codes += code_size) {
+        for (size_t j = 0; j < (ncode / 4) * 4; j += 4) {
+            const uint8_t* b_code = codes + j * code_size;
+            // Unrolling is key. Basically, doing multiple popcount
+            // operations one after another speeds things up.
+            // 99999999 is just an arbitrarily large number
+            int hd0 = (res.skip_entry(j + 0))
+                    ? 99999999
+                    : hc.hamming(b_code + 0 * code_size);
+            int hd1 = (res.skip_entry(j + 1))
+                    ? 99999999
+                    : hc.hamming(b_code + 1 * code_size);
+            int hd2 = (res.skip_entry(j + 2))
+                    ? 99999999
+                    : hc.hamming(b_code + 2 * code_size);
+            int hd3 = (res.skip_entry(j + 3))
+                    ? 99999999
+                    : hc.hamming(b_code + 3 * code_size);
+            saved_j[counter] = j + 0;
+            counter = (hd0 < ht) ? (counter + 1) : counter;
+            saved_j[counter] = j + 1;
+            counter = (hd1 < ht) ? (counter + 1) : counter;
+            saved_j[counter] = j + 2;
+            counter = (hd2 < ht) ? (counter + 1) : counter;
+            saved_j[counter] = j + 3;
+            counter = (hd3 < ht) ? (counter + 1) : counter;
+            if (counter >= 4) {
+                // process four codes at the same time
+                n_hamming_pass += 4;
+                float distance_0 = dis0;
+                float distance_1 = dis0;
+                float distance_2 = dis0;
+                float distance_3 = dis0;
+                distance_four_codes<PQDecoder>(
+                        pq,
+                        sim_table,
+                        codes + saved_j[0] * pq.code_size,
+                        codes + saved_j[1] * pq.code_size,
+                        codes + saved_j[2] * pq.code_size,
+                        codes + saved_j[3] * pq.code_size,
+                        distance_0,
+                        distance_1,
+                        distance_2,
+                        distance_3);
+                res.add(saved_j[0], dis0 + distance_0);
+                res.add(saved_j[1], dis0 + distance_1);
+                res.add(saved_j[2], dis0 + distance_2);
+                res.add(saved_j[3], dis0 + distance_3);
+                //
+                counter -= 4;
+                saved_j[0] = saved_j[4];
+                saved_j[1] = saved_j[5];
+                saved_j[2] = saved_j[6];
+                saved_j[3] = saved_j[7];
+            }
+        }
+        for (size_t kk = 0; kk < counter; kk++) {
+            n_hamming_pass++;
+            float dis =
+                    dis0 +
+                    distance_single_code<PQDecoder>(
+                            pq, sim_table, codes + saved_j[kk] * pq.code_size);
+            res.add(saved_j[kk], dis);
+        }
+        // process leftovers
+        for (size_t j = (ncode / 4) * 4; j < ncode; j++) {
             if (res.skip_entry(j)) {
                 continue;
             }
-            const uint8_t* b_code = codes;
+            const uint8_t* b_code = codes + j * code_size;
             int hd = hc.hamming(b_code);
             if (hd < ht) {
                 n_hamming_pass++;
-                float dis =
-                        dis0 + distance_single_code<SearchResultType>(codes);
+                float dis = dis0 +
+                        distance_single_code<PQDecoder>(
+                                    pq, sim_table, codes + j * code_size);
                 res.add(j, dis);
             }
         }
 #pragma omp critical
         { indexIVFPQ_stats.n_hamming_pass += n_hamming_pass; }
     }
@@ -1171,7 +1234,7 @@ struct IVFPQScannerT : QueryTables {
  * use_sel: store or ignore the IDSelector
  */
 template <MetricType METRIC_TYPE, class C, class PQDecoder, bool use_sel>
-struct IVFPQScanner : IVFPQScannerT<Index::idx_t, METRIC_TYPE, PQDecoder>,
+struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQDecoder>,
                       InvertedListScanner {
     int precompute_mode;
     const IDSelector* sel;
@@ -1181,9 +1244,7 @@ struct IVFPQScanner : IVFPQScannerT<Index::idx_t, METRIC_TYPE, PQDecoder>,
             bool store_pairs,
             int precompute_mode,
             const IDSelector* sel)
-            : IVFPQScannerT<Index::idx_t, METRIC_TYPE, PQDecoder>(
-                      ivfpq,
-                      nullptr),
+            : IVFPQScannerT<idx_t, METRIC_TYPE, PQDecoder>(ivfpq, nullptr),
               precompute_mode(precompute_mode),
               sel(sel) {
         this->store_pairs = store_pairs;
@@ -1200,14 +1261,9 @@ struct IVFPQScanner : IVFPQScannerT<Index::idx_t, METRIC_TYPE, PQDecoder>,
     float distance_to_code(const uint8_t* code) const override {
         assert(precompute_mode == 2);
-        float dis = this->dis0;
-        const float* tab = this->sim_table;
-        PQDecoder decoder(code, this->pq.nbits);
-        for (size_t m = 0; m < this->pq.M; m++) {
-            dis += tab[decoder.decode()];
-            tab += this->pq.ksub;
-        }
+        float dis = this->dis0 +
+                distance_single_code<PQDecoder>(
+                            this->pq, this->sim_table, code);
         return dis;
     }

data/vendor/faiss/faiss/IndexIVFPQ.h CHANGED Viewed

@@ -162,6 +162,7 @@ void initialize_IVFPQ_precomputed_table(
         const Index* quantizer,
         const ProductQuantizer& pq,
         AlignedTable<float>& precomputed_table,
+        bool by_residual,
         bool verbose);
 /// statistics are robust to internal threading, but not if

data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp CHANGED Viewed

@@ -156,7 +156,12 @@ void IndexIVFPQFastScan::train_residual(idx_t n, const float* x_in) {
 void IndexIVFPQFastScan::precompute_table() {
     initialize_IVFPQ_precomputed_table(
-            use_precomputed_table, quantizer, pq, precomputed_table, verbose);
+            use_precomputed_table,
+            quantizer,
+            pq,
+            precomputed_table,
+            by_residual,
+            verbose);
 }
 /*********************************************************

data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp CHANGED Viewed

@@ -213,8 +213,6 @@ struct IVFScanner : InvertedListScanner {
     std::vector<uint8_t> qcode;
     HammingComputer hc;
-    using idx_t = Index::idx_t;
     IVFScanner(const IndexIVFSpectralHash* index, bool store_pairs)
             : index(index),
               nbit(index->nbit),

data/vendor/faiss/faiss/IndexNNDescent.cpp CHANGED Viewed

@@ -50,7 +50,6 @@ int sgemm_(
 namespace faiss {
-using idx_t = Index::idx_t;
 using storage_idx_t = NNDescent::storage_idx_t;
 /**************************************************************
@@ -89,7 +88,7 @@ struct NegativeDistanceComputer : DistanceComputer {
 };
 DistanceComputer* storage_distance_computer(const Index* storage) {
-    if (storage->metric_type == METRIC_INNER_PRODUCT) {
+    if (is_similarity_metric(storage->metric_type)) {
         return new NegativeDistanceComputer(storage->get_distance_computer());
     } else {
         return storage->get_distance_computer();

data/vendor/faiss/faiss/IndexNNDescent.h CHANGED Viewed

@@ -25,7 +25,6 @@ struct IndexNNDescent : Index {
     using storage_idx_t = NNDescent::storage_idx_t;
     /// Faiss results are 64-bit
-    using idx_t = Index::idx_t;
     // the link structure
     NNDescent nndescent;

data/vendor/faiss/faiss/IndexNSG.cpp CHANGED Viewed

@@ -23,7 +23,6 @@
 namespace faiss {
-using idx_t = Index::idx_t;
 using namespace nsg;
 /**************************************************************
@@ -113,7 +112,7 @@ void IndexNSG::search(
         InterruptCallback::check();
     }
-    if (metric_type == METRIC_INNER_PRODUCT) {
+    if (is_similarity_metric(metric_type)) {
         // we need to revert the negated distances
         for (size_t i = 0; i < k * n; i++) {
             distances[i] = -distances[i];

data/vendor/faiss/faiss/IndexPQ.cpp CHANGED Viewed

@@ -19,6 +19,8 @@
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/utils/hamming.h>
+#include <faiss/impl/code_distance/code_distance.h>
 namespace faiss {
 /*********************************************************
@@ -74,22 +76,18 @@ template <class PQDecoder>
 struct PQDistanceComputer : FlatCodesDistanceComputer {
     size_t d;
     MetricType metric;
-    Index::idx_t nb;
+    idx_t nb;
     const ProductQuantizer& pq;
     const float* sdc;
     std::vector<float> precomputed_table;
     size_t ndis;
     float distance_to_code(const uint8_t* code) final {
-        const float* dt = precomputed_table.data();
-        PQDecoder decoder(code, pq.nbits);
-        float accu = 0;
-        for (int j = 0; j < pq.M; j++) {
-            accu += dt[decoder.decode()];
-            dt += 1 << decoder.nbits;
-        }
         ndis++;
-        return accu;
+        float dis = distance_single_code<PQDecoder>(
+                pq, precomputed_table.data(), code);
+        return dis;
     }
     float symmetric_dis(idx_t i, idx_t j) override {

data/vendor/faiss/faiss/IndexRefine.cpp CHANGED Viewed

@@ -62,7 +62,7 @@ void IndexRefine::reset() {
 namespace {
-typedef faiss::Index::idx_t idx_t;
+typedef faiss::idx_t idx_t;
 template <class C>
 static void reorder_2_heaps(

data/vendor/faiss/faiss/IndexReplicas.cpp CHANGED Viewed

@@ -123,14 +123,13 @@ void IndexReplicasTemplate<IndexT>::search(
     size_t componentsPerVec = sizeof(component_t) == 1 ? (dim + 7) / 8 : dim;
     // Partition the query by the number of indices we have
-    faiss::Index::idx_t queriesPerIndex =
-            (faiss::Index::idx_t)(n + this->count() - 1) /
-            (faiss::Index::idx_t)this->count();
+    faiss::idx_t queriesPerIndex =
+            (faiss::idx_t)(n + this->count() - 1) / (faiss::idx_t)this->count();
     FAISS_ASSERT(n / queriesPerIndex <= this->count());
     auto fn = [queriesPerIndex, componentsPerVec, n, x, k, distances, labels](
                       int i, const IndexT* index) {
-        faiss::Index::idx_t base = (faiss::Index::idx_t)i * queriesPerIndex;
+        faiss::idx_t base = (faiss::idx_t)i * queriesPerIndex;
         if (base < n) {
             auto numForIndex = std::min(queriesPerIndex, n - base);

data/vendor/faiss/faiss/IndexReplicas.h CHANGED Viewed

@@ -20,7 +20,6 @@ namespace faiss {
 template <typename IndexT>
 class IndexReplicasTemplate : public ThreadedIndex<IndexT> {
    public:
-    using idx_t = typename IndexT::idx_t;
     using component_t = typename IndexT::component_t;
     using distance_t = typename IndexT::distance_t;

data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp CHANGED Viewed

@@ -1,3 +1,10 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
 #include <faiss/IndexRowwiseMinMax.h>
 #include <cstdint>
@@ -11,7 +18,7 @@ namespace faiss {
 namespace {
-using idx_t = faiss::Index::idx_t;
+using idx_t = faiss::idx_t;
 struct StorageMinMaxFP16 {
     uint16_t scaler;