faiss 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +1 -1
  5. data/ext/faiss/extconf.rb +1 -1
  6. data/lib/faiss/version.rb +1 -1
  7. data/vendor/faiss/faiss/AutoTune.cpp +36 -33
  8. data/vendor/faiss/faiss/AutoTune.h +6 -3
  9. data/vendor/faiss/faiss/Clustering.cpp +16 -12
  10. data/vendor/faiss/faiss/Index.cpp +3 -4
  11. data/vendor/faiss/faiss/Index.h +3 -3
  12. data/vendor/faiss/faiss/IndexBinary.cpp +3 -4
  13. data/vendor/faiss/faiss/IndexBinary.h +1 -1
  14. data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -12
  15. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +1 -2
  16. data/vendor/faiss/faiss/IndexFlat.cpp +0 -148
  17. data/vendor/faiss/faiss/IndexFlat.h +0 -51
  18. data/vendor/faiss/faiss/IndexHNSW.cpp +4 -5
  19. data/vendor/faiss/faiss/IndexIVF.cpp +118 -31
  20. data/vendor/faiss/faiss/IndexIVF.h +22 -15
  21. data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -3
  22. data/vendor/faiss/faiss/IndexIVFFlat.h +2 -1
  23. data/vendor/faiss/faiss/IndexIVFPQ.cpp +39 -15
  24. data/vendor/faiss/faiss/IndexIVFPQ.h +25 -9
  25. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +1116 -0
  26. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +166 -0
  27. data/vendor/faiss/faiss/IndexIVFPQR.cpp +8 -9
  28. data/vendor/faiss/faiss/IndexIVFPQR.h +2 -1
  29. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +1 -2
  30. data/vendor/faiss/faiss/IndexPQ.cpp +34 -18
  31. data/vendor/faiss/faiss/IndexPQFastScan.cpp +536 -0
  32. data/vendor/faiss/faiss/IndexPQFastScan.h +111 -0
  33. data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -0
  34. data/vendor/faiss/faiss/IndexPreTransform.h +2 -0
  35. data/vendor/faiss/faiss/IndexRefine.cpp +256 -0
  36. data/vendor/faiss/faiss/IndexRefine.h +73 -0
  37. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -2
  38. data/vendor/faiss/faiss/IndexScalarQuantizer.h +1 -1
  39. data/vendor/faiss/faiss/gpu/GpuDistance.h +1 -1
  40. data/vendor/faiss/faiss/gpu/GpuIndex.h +16 -9
  41. data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +8 -1
  42. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +11 -11
  43. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +19 -2
  44. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +28 -2
  45. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +24 -14
  46. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +29 -2
  47. data/vendor/faiss/faiss/gpu/GpuResources.h +4 -0
  48. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +60 -27
  49. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +28 -6
  50. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +547 -0
  51. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +51 -0
  52. data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +3 -3
  53. data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +3 -2
  54. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +274 -0
  55. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +7 -2
  56. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +5 -1
  57. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +231 -0
  58. data/vendor/faiss/faiss/gpu/test/TestUtils.h +33 -0
  59. data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +1 -0
  60. data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +6 -0
  61. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +5 -6
  62. data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
  63. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +5 -4
  64. data/vendor/faiss/faiss/impl/HNSW.cpp +2 -4
  65. data/vendor/faiss/faiss/impl/PolysemousTraining.h +4 -4
  66. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +22 -12
  67. data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -0
  68. data/vendor/faiss/faiss/impl/ResultHandler.h +452 -0
  69. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +29 -19
  70. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +6 -0
  71. data/vendor/faiss/faiss/impl/index_read.cpp +64 -96
  72. data/vendor/faiss/faiss/impl/index_write.cpp +34 -25
  73. data/vendor/faiss/faiss/impl/io.cpp +33 -2
  74. data/vendor/faiss/faiss/impl/io.h +7 -2
  75. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -15
  76. data/vendor/faiss/faiss/impl/platform_macros.h +44 -0
  77. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +272 -0
  78. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +169 -0
  79. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +180 -0
  80. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +354 -0
  81. data/vendor/faiss/faiss/impl/simd_result_handlers.h +559 -0
  82. data/vendor/faiss/faiss/index_factory.cpp +112 -7
  83. data/vendor/faiss/faiss/index_io.h +1 -48
  84. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +151 -0
  85. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +76 -0
  86. data/vendor/faiss/faiss/{DirectMap.cpp → invlists/DirectMap.cpp} +1 -1
  87. data/vendor/faiss/faiss/{DirectMap.h → invlists/DirectMap.h} +1 -1
  88. data/vendor/faiss/faiss/{InvertedLists.cpp → invlists/InvertedLists.cpp} +72 -1
  89. data/vendor/faiss/faiss/{InvertedLists.h → invlists/InvertedLists.h} +32 -1
  90. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +107 -0
  91. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +63 -0
  92. data/vendor/faiss/faiss/{OnDiskInvertedLists.cpp → invlists/OnDiskInvertedLists.cpp} +21 -6
  93. data/vendor/faiss/faiss/{OnDiskInvertedLists.h → invlists/OnDiskInvertedLists.h} +5 -2
  94. data/vendor/faiss/faiss/python/python_callbacks.h +8 -1
  95. data/vendor/faiss/faiss/utils/AlignedTable.h +141 -0
  96. data/vendor/faiss/faiss/utils/Heap.cpp +2 -4
  97. data/vendor/faiss/faiss/utils/Heap.h +61 -50
  98. data/vendor/faiss/faiss/utils/distances.cpp +164 -319
  99. data/vendor/faiss/faiss/utils/distances.h +28 -20
  100. data/vendor/faiss/faiss/utils/distances_simd.cpp +277 -49
  101. data/vendor/faiss/faiss/utils/extra_distances.cpp +1 -2
  102. data/vendor/faiss/faiss/utils/hamming-inl.h +4 -4
  103. data/vendor/faiss/faiss/utils/hamming.cpp +3 -6
  104. data/vendor/faiss/faiss/utils/hamming.h +2 -7
  105. data/vendor/faiss/faiss/utils/ordered_key_value.h +98 -0
  106. data/vendor/faiss/faiss/utils/partitioning.cpp +1256 -0
  107. data/vendor/faiss/faiss/utils/partitioning.h +69 -0
  108. data/vendor/faiss/faiss/utils/quantize_lut.cpp +277 -0
  109. data/vendor/faiss/faiss/utils/quantize_lut.h +80 -0
  110. data/vendor/faiss/faiss/utils/simdlib.h +31 -0
  111. data/vendor/faiss/faiss/utils/simdlib_avx2.h +461 -0
  112. data/vendor/faiss/faiss/utils/simdlib_emulated.h +589 -0
  113. metadata +43 -141
  114. data/vendor/faiss/benchs/bench_6bit_codec.cpp +0 -80
  115. data/vendor/faiss/c_api/AutoTune_c.cpp +0 -83
  116. data/vendor/faiss/c_api/AutoTune_c.h +0 -66
  117. data/vendor/faiss/c_api/Clustering_c.cpp +0 -145
  118. data/vendor/faiss/c_api/Clustering_c.h +0 -123
  119. data/vendor/faiss/c_api/IndexFlat_c.cpp +0 -140
  120. data/vendor/faiss/c_api/IndexFlat_c.h +0 -115
  121. data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +0 -64
  122. data/vendor/faiss/c_api/IndexIVFFlat_c.h +0 -58
  123. data/vendor/faiss/c_api/IndexIVF_c.cpp +0 -99
  124. data/vendor/faiss/c_api/IndexIVF_c.h +0 -142
  125. data/vendor/faiss/c_api/IndexLSH_c.cpp +0 -37
  126. data/vendor/faiss/c_api/IndexLSH_c.h +0 -40
  127. data/vendor/faiss/c_api/IndexPreTransform_c.cpp +0 -21
  128. data/vendor/faiss/c_api/IndexPreTransform_c.h +0 -32
  129. data/vendor/faiss/c_api/IndexShards_c.cpp +0 -38
  130. data/vendor/faiss/c_api/IndexShards_c.h +0 -39
  131. data/vendor/faiss/c_api/Index_c.cpp +0 -105
  132. data/vendor/faiss/c_api/Index_c.h +0 -183
  133. data/vendor/faiss/c_api/MetaIndexes_c.cpp +0 -49
  134. data/vendor/faiss/c_api/MetaIndexes_c.h +0 -49
  135. data/vendor/faiss/c_api/clone_index_c.cpp +0 -23
  136. data/vendor/faiss/c_api/clone_index_c.h +0 -32
  137. data/vendor/faiss/c_api/error_c.h +0 -42
  138. data/vendor/faiss/c_api/error_impl.cpp +0 -27
  139. data/vendor/faiss/c_api/error_impl.h +0 -16
  140. data/vendor/faiss/c_api/faiss_c.h +0 -58
  141. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +0 -98
  142. data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +0 -56
  143. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +0 -52
  144. data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +0 -68
  145. data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +0 -17
  146. data/vendor/faiss/c_api/gpu/GpuIndex_c.h +0 -30
  147. data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +0 -38
  148. data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +0 -86
  149. data/vendor/faiss/c_api/gpu/GpuResources_c.h +0 -66
  150. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +0 -54
  151. data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +0 -53
  152. data/vendor/faiss/c_api/gpu/macros_impl.h +0 -42
  153. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +0 -220
  154. data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +0 -149
  155. data/vendor/faiss/c_api/index_factory_c.cpp +0 -26
  156. data/vendor/faiss/c_api/index_factory_c.h +0 -30
  157. data/vendor/faiss/c_api/index_io_c.cpp +0 -42
  158. data/vendor/faiss/c_api/index_io_c.h +0 -50
  159. data/vendor/faiss/c_api/macros_impl.h +0 -110
  160. data/vendor/faiss/demos/demo_imi_flat.cpp +0 -154
  161. data/vendor/faiss/demos/demo_imi_pq.cpp +0 -203
  162. data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +0 -151
  163. data/vendor/faiss/demos/demo_sift1M.cpp +0 -252
  164. data/vendor/faiss/demos/demo_weighted_kmeans.cpp +0 -185
  165. data/vendor/faiss/misc/test_blas.cpp +0 -87
  166. data/vendor/faiss/tests/test_binary_flat.cpp +0 -62
  167. data/vendor/faiss/tests/test_dealloc_invlists.cpp +0 -188
  168. data/vendor/faiss/tests/test_ivfpq_codec.cpp +0 -70
  169. data/vendor/faiss/tests/test_ivfpq_indexing.cpp +0 -100
  170. data/vendor/faiss/tests/test_lowlevel_ivf.cpp +0 -573
  171. data/vendor/faiss/tests/test_merge.cpp +0 -260
  172. data/vendor/faiss/tests/test_omp_threads.cpp +0 -14
  173. data/vendor/faiss/tests/test_ondisk_ivf.cpp +0 -225
  174. data/vendor/faiss/tests/test_pairs_decoding.cpp +0 -193
  175. data/vendor/faiss/tests/test_params_override.cpp +0 -236
  176. data/vendor/faiss/tests/test_pq_encoding.cpp +0 -98
  177. data/vendor/faiss/tests/test_sliding_ivf.cpp +0 -246
  178. data/vendor/faiss/tests/test_threaded_index.cpp +0 -253
  179. data/vendor/faiss/tests/test_transfer_invlists.cpp +0 -159
  180. data/vendor/faiss/tutorial/cpp/1-Flat.cpp +0 -104
  181. data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +0 -85
  182. data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +0 -98
  183. data/vendor/faiss/tutorial/cpp/4-GPU.cpp +0 -122
  184. data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +0 -104
data/vendor/faiss/faiss/IndexIVFPQ.cpp
@@ -371,7 +371,7 @@ void IndexIVFPQ::reconstruct_from_offset (int64_t list_no, int64_t offset,
 
 
  /// 2G by default, accommodates tables up to PQ32 w/ 65536 centroids
- size_t IndexIVFPQ::precomputed_table_max_bytes = ((size_t)1) << 31;
+ size_t precomputed_table_max_bytes = ((size_t)1) << 31;
 
  /** Precomputed tables for residuals
   *
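Note: with this hunk the table budget is no longer the static member IndexIVFPQ::precomputed_table_max_bytes but a namespace-level global (its FAISS_API extern declaration appears in the IndexIVFPQ.h hunk below), so the limit is shared with the new IndexIVFPQFastScan. A minimal sketch of overriding it from application code; the 4 GB value is illustrative, not a recommendation:

    #include <faiss/IndexIVFPQ.h>

    int main() {
        // raise the budget so that larger type-1 tables are still precomputed
        faiss::precomputed_table_max_bytes = (size_t)1 << 32; // 4 GB
    }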
@@ -403,10 +403,22 @@ size_t IndexIVFPQ::precomputed_table_max_bytes = ((size_t)1) << 31;
   * is faster when the length of the lists is > ksub * M.
   */
 
- void IndexIVFPQ::precompute_table ()
+ void initialize_IVFPQ_precomputed_table (
+         int &use_precomputed_table,
+         const Index *quantizer,
+         const ProductQuantizer &pq,
+         AlignedTable<float> & precomputed_table,
+         bool verbose
+ )
  {
-     if (use_precomputed_table == -1)
+     size_t nlist = quantizer->ntotal;
+     size_t d = quantizer->d;
+     FAISS_THROW_IF_NOT(d == pq.d);
+ 
+     if (use_precomputed_table == -1) {
+         precomputed_table.resize (0);
          return;
+     }
 
      if (use_precomputed_table == 0) { // then choose the type of table
          if (quantizer->metric_type == METRIC_INNER_PRODUCT) {
@@ -414,6 +426,7 @@ void IndexIVFPQ::precompute_table ()
              printf("IndexIVFPQ::precompute_table: precomputed "
                     "tables not needed for inner product quantizers\n");
          }
+         precomputed_table.resize (0);
          return;
      }
      const MultiIndexQuantizer *miq =
@@ -492,6 +505,16 @@ void IndexIVFPQ::precompute_table ()
 
  }
 
+ void IndexIVFPQ::precompute_table ()
+ {
+     initialize_IVFPQ_precomputed_table (
+             use_precomputed_table, quantizer, pq, precomputed_table,
+             verbose
+     );
+ }
+ 
+ 
+ 
  namespace {
 
  using idx_t = Index::idx_t;
@@ -676,11 +699,12 @@ struct QueryTables {
          } else if (use_precomputed_table == 1) {
              dis0 = coarse_dis;
 
-             fvec_madd (pq.M * pq.ksub,
-                        &ivfpq.precomputed_table [key * pq.ksub * pq.M],
-                        -2.0, sim_table_2,
-                        sim_table);
- 
+             fvec_madd (
+                 pq.M * pq.ksub,
+                 ivfpq.precomputed_table.data() + key * pq.ksub * pq.M,
+                 -2.0, sim_table_2,
+                 sim_table
+             );
 
              if (polysemous_ht != 0) {
                  ivfpq.quantizer->compute_residual (qi, residual_vec, key);
@@ -706,8 +730,8 @@ struct QueryTables {
                  k >>= cpq.nbits;
 
                  // get corresponding table
-                 const float *pc = &ivfpq.precomputed_table
-                     [(ki * pq.M + cm * Mf) * pq.ksub];
+                 const float *pc = ivfpq.precomputed_table.data() +
+                     (ki * pq.M + cm * Mf) * pq.ksub;
 
                  if (polysemous_ht == 0) {
 
@@ -741,7 +765,8 @@ struct QueryTables {
          if (use_precomputed_table == 1) {
              dis0 = coarse_dis;
 
-             const float * s = &ivfpq.precomputed_table [key * pq.ksub * pq.M];
+             const float * s = ivfpq.precomputed_table.data() +
+                 key * pq.ksub * pq.M;
              for (int m = 0; m < pq.M; m++) {
                  sim_table_ptrs [m] = s;
                  s += pq.ksub;
@@ -761,8 +786,8 @@ struct QueryTables {
              int ki = k & ((uint64_t(1) << cpq.nbits) - 1);
              k >>= cpq.nbits;
 
-             const float *pc = &ivfpq.precomputed_table
-                 [(ki * pq.M + cm * Mf) * pq.ksub];
+             const float *pc = ivfpq.precomputed_table.data() +
+                 (ki * pq.M + cm * Mf) * pq.ksub;
 
              for (int m = m0; m < m0 + Mf; m++) {
                  sim_table_ptrs [m] = pc;
@@ -803,9 +828,8 @@ struct KnnSearchResults {
 
      inline void add (idx_t j, float dis) {
          if (C::cmp (heap_sim[0], dis)) {
-             heap_pop<C> (k, heap_sim, heap_ids);
              idx_t id = ids ? ids[j] : lo_build (key, j);
-             heap_push<C> (k, heap_sim, heap_ids, dis, id);
+             heap_replace_top<C> (k, heap_sim, heap_ids, dis, id);
              nup++;
          }
      }
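This hunk swaps a heap_pop immediately followed by a heap_push for the new heap_replace_top (utils/Heap.h is reworked in this release, +61 -50 above): the root of the bounded heap is overwritten and sifted down once instead of paying for two traversals. A self-contained sketch of the idea, using a plain array max-heap rather than the faiss implementation:

    #include <cstdio>
    #include <vector>

    // Max-heap of the k smallest distances seen so far (0-based layout).
    // Overwrite the root with `dis` and sift down once; equivalent to
    // pop + push, but with a single traversal, as in heap_replace_top.
    static void replace_top(std::vector<float>& h, float dis) {
        size_t k = h.size(), i = 0;
        for (;;) {
            size_t l = 2 * i + 1, r = l + 1;
            if (l >= k) break;
            size_t child = (r < k && h[r] > h[l]) ? r : l; // larger child
            if (h[child] <= dis) break;  // heap property already holds
            h[i] = h[child];             // pull the larger child up
            i = child;
        }
        h[i] = dis;  // the new value lands where the sift-down stopped
    }

    int main() {
        std::vector<float> heap = {9, 7, 8, 3, 5};    // a valid max-heap
        if (4.0f < heap[0]) replace_top(heap, 4.0f);  // 9 is evicted
        for (float v : heap) printf("%g ", v);        // prints: 8 7 4 3 5
        printf("\n");
    }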
data/vendor/faiss/faiss/IndexIVFPQ.h
@@ -16,7 +16,7 @@
  #include <faiss/IndexIVF.h>
  #include <faiss/IndexPQ.h>
  #include <faiss/impl/platform_macros.h>
- 
+ #include <faiss/utils/AlignedTable.h>
 
  namespace faiss {
 
@@ -28,10 +28,14 @@ struct IVFPQSearchParameters: IVFSearchParameters {
  };
 
 
+ 
+ FAISS_API extern size_t precomputed_table_max_bytes;
+ 
+ 
  /** Inverted file with Product Quantizer encoding. Each residual
   * vector is encoded as a product quantizer code.
   */
- struct FAISS_API IndexIVFPQ: IndexIVF {
+ struct IndexIVFPQ: IndexIVF {
      bool by_residual;       ///< Encode residual or plain vector?
 
      ProductQuantizer pq;    ///< produces the codes
@@ -45,18 +49,12 @@ struct FAISS_API IndexIVFPQ: IndexIVF {
 
      /** Precompute table that speed up query preprocessing at some
       * memory cost (used only for by_residual with L2 metric)
-      * =-1: force disable
-      * =0: decide heuristically (default: use tables only if they are
-      *     < precomputed_tables_max_bytes)
-      * =1: tables that work for all quantizers (size 256 * nlist * M)
-      * =2: specific version for MultiIndexQuantizer (much more compact)
       */
      int use_precomputed_table;
-     static size_t precomputed_table_max_bytes;
 
      /// if use_precompute_table
      /// size nlist * pq.M * pq.ksub
-     std::vector <float> precomputed_table;
+     AlignedTable<float> precomputed_table;
 
      IndexIVFPQ (
          Index * quantizer, size_t d, size_t nlist,
@@ -133,6 +131,24 @@ struct FAISS_API IndexIVFPQ: IndexIVF {
 
  };
 
+ /** Pre-compute distance tables for IVFPQ with by-residual and METRIC_L2
+  *
+  * @param use_precomputed_table (I/O)
+  *        =-1: force disable
+  *        =0: decide heuristically (default: use tables only if they are
+  *            < precomputed_tables_max_bytes), set use_precomputed_table on output
+  *        =1: tables that work for all quantizers (size 256 * nlist * M)
+  *        =2: specific version for MultiIndexQuantizer (much more compact)
+  * @param precomputed_table precomputed table to initialize
+  */
+ 
+ void initialize_IVFPQ_precomputed_table(
+         int &use_precomputed_table,
+         const Index *quantizer,
+         const ProductQuantizer &pq,
+         AlignedTable<float> & precomputed_table,
+         bool verbose
+ );
 
  /// statistics are robust to internal threading, but not if
  /// IndexIVFPQ::search_preassigned is called by multiple threads
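Since table construction is now a free function, it can be driven outside IndexIVFPQ::precompute_table(), which is how the new IndexIVFPQFastScan below reuses it. A hedged sketch of calling it directly; the data is random and the sizes are small illustrative values:

    #include <cstdlib>
    #include <vector>

    #include <faiss/IndexFlat.h>
    #include <faiss/IndexIVFPQ.h>

    int main() {
        size_t d = 32, nlist = 64, nt = 4096;
        std::vector<float> xt(nt * d);
        for (float& v : xt) v = rand() / float(RAND_MAX);

        faiss::IndexFlatL2 coarse(d);
        faiss::IndexIVFPQ index(&coarse, d, nlist, /*M=*/4, /*nbits=*/8);
        index.train(nt, xt.data()); // trains the coarse centroids and the PQ

        // same computation as index.precompute_table(), but the caller owns
        // the output table and the (I/O) table-type flag
        int table_type = 0; // 0 = decide heuristically, set on output
        faiss::AlignedTable<float> tab;
        faiss::initialize_IVFPQ_precomputed_table(
                table_type, &coarse, index.pq, tab, /*verbose=*/true);
    }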
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp (new file)
@@ -0,0 +1,1116 @@
+ /**
+  * Copyright (c) Facebook, Inc. and its affiliates.
+  *
+  * This source code is licensed under the MIT license found in the
+  * LICENSE file in the root directory of this source tree.
+  */
+ 
+ #include <faiss/IndexIVFPQFastScan.h>
+ 
+ #include <cassert>
+ #include <cstdio>
+ #include <inttypes.h>
+ 
+ #include <omp.h>
+ 
+ #include <memory>
+ 
+ #include <faiss/impl/FaissAssert.h>
+ #include <faiss/utils/utils.h>
+ #include <faiss/utils/distances.h>
+ #include <faiss/utils/simdlib.h>
+ #include <faiss/impl/AuxIndexStructures.h>
+ 
+ #include <faiss/invlists/BlockInvertedLists.h>
+ 
+ #include <faiss/impl/simd_result_handlers.h>
+ #include <faiss/utils/quantize_lut.h>
+ #include <faiss/impl/pq4_fast_scan.h>
+ 
+ namespace faiss {
+ 
+ using namespace simd_result_handlers;
+ 
+ 
+ inline size_t roundup(size_t a, size_t b) {
+     return (a + b - 1) / b * b;
+ }
+ 
+ 
+ IndexIVFPQFastScan::IndexIVFPQFastScan (
+         Index * quantizer, size_t d, size_t nlist,
+         size_t M, size_t nbits_per_idx,
+         MetricType metric, int bbs):
+     IndexIVF (quantizer, d, nlist, 0, metric),
+     pq (d, M, nbits_per_idx),
+     bbs (bbs)
+ {
+     FAISS_THROW_IF_NOT(nbits_per_idx == 4);
+     M2 = roundup(pq.M, 2);
+     by_residual = false; // set to false by default because it's much faster
+     is_trained = false;
+     code_size = pq.code_size;
+ 
+     replace_invlists(
+         new BlockInvertedLists(nlist, bbs, bbs * M2 / 2),
+         true
+     );
+ }
+ 
+ IndexIVFPQFastScan::IndexIVFPQFastScan ()
+ {
+     by_residual = false;
+     bbs = 0;
+     M2 = 0;
+ }
+ 
+ 
+ IndexIVFPQFastScan::IndexIVFPQFastScan(const IndexIVFPQ & orig, int bbs):
+     IndexIVF(
+         orig.quantizer, orig.d, orig.nlist,
+         orig.pq.code_size, orig.metric_type),
+     pq(orig.pq),
+     bbs(bbs)
+ {
+     FAISS_THROW_IF_NOT(orig.pq.nbits == 4);
+ 
+     by_residual = orig.by_residual;
+     ntotal = orig.ntotal;
+     is_trained = orig.is_trained;
+     nprobe = orig.nprobe;
+     size_t M = pq.M;
+ 
+     M2 = roundup(M, 2);
+ 
+     replace_invlists(
+         new BlockInvertedLists(orig.nlist, bbs, bbs * M2 / 2),
+         true
+     );
+ 
+     precomputed_table.resize(orig.precomputed_table.size());
+ 
+     if (precomputed_table.nbytes() > 0) {
+         memcpy(precomputed_table.get(), orig.precomputed_table.data(),
+                precomputed_table.nbytes()
+         );
+     }
+ 
+     for(size_t i = 0; i < nlist; i++) {
+         size_t nb = orig.invlists->list_size(i);
+         size_t nb2 = roundup(nb, bbs);
+         AlignedTable<uint8_t> tmp(nb2 * M2 / 2);
+         pq4_pack_codes(
+             InvertedLists::ScopedCodes(orig.invlists, i).get(),
+             nb, M, nb2, bbs, M2,
+             tmp.get()
+         );
+         invlists->add_entries(
+             i, nb,
+             InvertedLists::ScopedIds(orig.invlists, i).get(),
+             tmp.get()
+         );
+     }
+ 
+     orig_invlists = orig.invlists;
+ }
+ 
+ 
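The conversion constructor above repacks the codes of an existing IndexIVFPQ into the blocked layout (bbs vectors per block, two 4-bit codes per byte). A hedged sketch of upgrading a trained 4-bit IVFPQ index; the data is random, the sizes are illustrative, and only pq.nbits == 4 is accepted:

    #include <cstdlib>
    #include <vector>

    #include <faiss/IndexFlat.h>
    #include <faiss/IndexIVFPQ.h>
    #include <faiss/IndexIVFPQFastScan.h>

    int main() {
        size_t d = 32, nlist = 64, nb = 10000;
        std::vector<float> xb(nb * d);
        for (float& v : xb) v = rand() / float(RAND_MAX);

        faiss::IndexFlatL2 coarse(d);
        // nbits_per_idx must be 4: the kernels scan 4-bit codes with SIMD
        faiss::IndexIVFPQ index(&coarse, d, nlist, /*M=*/8, /*nbits=*/4);
        index.train(nb, xb.data());
        index.add(nb, xb.data());

        // repack the inverted lists into blocks of bbs = 32 vectors
        faiss::IndexIVFPQFastScan fast(index, /*bbs=*/32);
        fast.nprobe = 8;

        std::vector<float> dis(5);
        std::vector<faiss::Index::idx_t> ids(5);
        fast.search(1, xb.data(), 5, dis.data(), ids.data());
    }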
+ 
+ /*********************************************************
+  * Training
+  *********************************************************/
+ 
+ void IndexIVFPQFastScan::train_residual (idx_t n, const float *x_in)
+ {
+ 
+     const float * x = fvecs_maybe_subsample (
+         d, (size_t*)&n, pq.cp.max_points_per_centroid * pq.ksub,
+         x_in, verbose, pq.cp.seed);
+ 
+     std::unique_ptr<float []> del_x;
+     if (x != x_in) {
+         del_x.reset((float*)x);
+     }
+ 
+     const float *trainset;
+     AlignedTable<float> residuals;
+ 
+     if (by_residual) {
+         if(verbose) printf("computing residuals\n");
+         std::vector<idx_t> assign(n);
+         quantizer->assign (n, x, assign.data());
+         residuals.resize(n * d);
+         for (idx_t i = 0; i < n; i++) {
+             quantizer->compute_residual (
+                 x + i * d,
+                 residuals.data() + i * d,
+                 assign[i]
+             );
+         }
+         trainset = residuals.data();
+     } else {
+         trainset = x;
+     }
+ 
+     if (verbose) {
+         printf ("training %zdx%zd product quantizer on %zd vectors in %dD\n",
+                 pq.M, pq.ksub, long(n), d);
+     }
+     pq.verbose = verbose;
+     pq.train (n, trainset);
+ 
+     if (by_residual && metric_type == METRIC_L2) {
+         precompute_table();
+     }
+ 
+ }
+ 
+ void IndexIVFPQFastScan::precompute_table ()
+ {
+     initialize_IVFPQ_precomputed_table(
+         use_precomputed_table,
+         quantizer, pq, precomputed_table, verbose
+     );
+ }
+ 
+ 
+ /*********************************************************
+  * Code management functions
+  *********************************************************/
+ 
+ 
+ 
+ void IndexIVFPQFastScan::encode_vectors(
+         idx_t n, const float* x, const idx_t *list_nos,
+         uint8_t * codes, bool include_listnos) const
+ {
+ 
+     if (by_residual) {
+         AlignedTable<float> residuals (n * d);
+         for (size_t i = 0; i < n; i++) {
+             if (list_nos[i] < 0) {
+                 memset (residuals.data() + i * d, 0, sizeof(residuals[0]) * d);
+             } else {
+                 quantizer->compute_residual (
+                     x + i * d, residuals.data() + i * d, list_nos[i]);
+             }
+         }
+         pq.compute_codes (residuals.data(), codes, n);
+     } else {
+         pq.compute_codes (x, codes, n);
+     }
+ 
+     if (include_listnos) {
+         size_t coarse_size = coarse_code_size();
+         for (idx_t i = n - 1; i >= 0; i--) {
+             uint8_t * code = codes + i * (coarse_size + code_size);
+             memmove (code + coarse_size,
+                      codes + i * code_size, code_size);
+             encode_listno (list_nos[i], code);
+         }
+     }
+ }
+ 
+ 
+ 
+ void IndexIVFPQFastScan::add_with_ids (
+         idx_t n, const float * x, const idx_t *xids) {
+ 
+     // copied from IndexIVF::add_with_ids --->
+ 
+     // do some blocking to avoid excessive allocs
+     idx_t bs = 65536;
+     if (n > bs) {
+         for (idx_t i0 = 0; i0 < n; i0 += bs) {
+             idx_t i1 = std::min (n, i0 + bs);
+             if (verbose) {
+                 printf(" IndexIVFPQFastScan::add_with_ids %zd: %zd",
+                        size_t(i0), size_t(i1));
+             }
+             add_with_ids (i1 - i0, x + i0 * d,
+                           xids ? xids + i0 : nullptr);
+         }
+         return;
+     }
+     InterruptCallback::check();
+ 
+     AlignedTable<uint8_t> codes(n * code_size);
+ 
+     FAISS_THROW_IF_NOT (is_trained);
+     direct_map.check_can_add (xids);
+ 
+     std::unique_ptr<idx_t []> idx(new idx_t[n]);
+     quantizer->assign (n, x, idx.get());
+     size_t nadd = 0, nminus1 = 0;
+ 
+     for (size_t i = 0; i < n; i++) {
+         if (idx[i] < 0) nminus1++;
+     }
+ 
+     AlignedTable<uint8_t> flat_codes(n * code_size);
+     encode_vectors (n, x, idx.get(), flat_codes.get());
+ 
+     DirectMapAdd dm_adder(direct_map, n, xids);
+ 
+     // <---
+ 
+     BlockInvertedLists *bil = dynamic_cast<BlockInvertedLists*>(invlists);
+     FAISS_THROW_IF_NOT_MSG (bil, "only block inverted lists supported");
+ 
+     // prepare batches
+     std::vector<idx_t> order(n);
+     for(idx_t i = 0; i < n ; i++) { order[i] = i; }
+ 
+     // TODO should not need stable
+     std::stable_sort(order.begin(), order.end(),
+         [&idx](idx_t a, idx_t b) {
+             return idx[a] < idx[b];
+         }
+     );
+ 
+     // TODO parallelize
+     idx_t i0 = 0;
+     while (i0 < n) {
+         idx_t list_no = idx[order[i0]];
+         idx_t i1 = i0 + 1;
+         while (i1 < n && idx[order[i1]] == list_no) {
+             i1 ++;
+         }
+ 
+         if (list_no == -1) {
+             i0 = i1;
+             continue;
+         }
+ 
+         // make linear array
+         AlignedTable<uint8_t> list_codes((i1 - i0) * code_size);
+         size_t list_size = bil->list_size(list_no);
+ 
+         bil->resize(list_no, list_size + i1 - i0);
+ 
+         for(idx_t i = i0; i < i1; i++) {
+             size_t ofs = list_size + i - i0;
+             idx_t id = xids ? xids[order[i]] : ntotal + order[i];
+             dm_adder.add (order[i], list_no, ofs);
+             bil->ids[list_no][ofs] = id;
+             memcpy(
+                 list_codes.data() + (i - i0) * code_size,
+                 flat_codes.data() + order[i] * code_size,
+                 code_size
+             );
+             nadd++;
+         }
+         pq4_pack_codes_range(
+             list_codes.data(), pq.M,
+             list_size, list_size + i1 - i0,
+             bbs, M2, bil->codes[list_no].data()
+         );
+ 
+         i0 = i1;
+     }
+ 
+     ntotal += n;
+ 
+ }
+ 
+ 
+ 
+ /*********************************************************
+  * search
+  *********************************************************/
+ 
+ 
+ namespace {
+ 
+ // from impl/ProductQuantizer.cpp
+ template <class C, typename dis_t>
+ void pq_estimators_from_tables_generic(
+         const ProductQuantizer& pq, size_t nbits,
+         const uint8_t *codes, size_t ncodes,
+         const dis_t *dis_table, const int64_t * ids,
+         float dis0,
+         size_t k, typename C::T *heap_dis, int64_t *heap_ids)
+ {
+     using accu_t = typename C::T;
+     const size_t M = pq.M;
+     const size_t ksub = pq.ksub;
+     for (size_t j = 0; j < ncodes; ++j) {
+         PQDecoderGeneric decoder(
+             codes + j * pq.code_size, nbits
+         );
+         accu_t dis = dis0;
+         const dis_t * dt = dis_table;
+         for (size_t m = 0; m < M; m++) {
+             uint64_t c = decoder.decode();
+             dis += dt[c];
+             dt += ksub;
+         }
+ 
+         if (C::cmp(heap_dis[0], dis)) {
+             heap_pop<C>(k, heap_dis, heap_ids);
+             heap_push<C>(k, heap_dis, heap_ids, dis, ids[j]);
+         }
+     }
+ }
+ 
+ using idx_t = Index::idx_t;
+ using namespace quantize_lut;
+ 
+ void fvec_madd_avx (
+         size_t n, const float *a,
+         float bf, const float *b, float *c)
+ {
+     assert(is_aligned_pointer(a));
+     assert(is_aligned_pointer(b));
+     assert(is_aligned_pointer(c));
+     assert(n % 8 == 0);
+     simd8float32 bf8(bf);
+     n /= 8;
+     for(size_t i = 0; i < n; i++) {
+         simd8float32 ai(a);
+         simd8float32 bi(b);
+ 
+         simd8float32 ci = fmadd(bf8, bi, ai);
+         ci.store(c);
+         c += 8;
+         a += 8;
+         b += 8;
+     }
+ 
+ }
+ 
+ } // anonymous namespace
+ 
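fvec_madd_avx above is written against the simdlib wrappers (faiss/utils/simdlib.h, also new in this release) rather than raw intrinsics: simd8float32 holds 8 floats and maps to AVX2 or to the emulated fallback. A hedged sketch of the same fused-multiply-add pattern in isolation, using only the operations the code above relies on:

    #include <cstdio>

    #include <faiss/utils/simdlib.h>

    int main() {
        float a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        float b[8] = {8, 7, 6, 5, 4, 3, 2, 1};
        float c[8];

        faiss::simd8float32 av(a);  // load 8 floats
        faiss::simd8float32 bv(b);
        // c[i] = 2 * b[i] + a[i], as in fvec_madd_avx's inner loop
        faiss::simd8float32 cv = faiss::fmadd(faiss::simd8float32(2.0f), bv, av);
        cv.store(c);

        for (float v : c) printf("%g ", v); // 17 16 15 14 13 12 11 10
        printf("\n");
    }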
+ /*********************************************************
+  * Look-Up Table functions
+  *********************************************************/
+ 
+ 
+ void IndexIVFPQFastScan::compute_LUT(
+         size_t n, const float *x,
+         const idx_t *coarse_ids, const float *coarse_dis,
+         AlignedTable<float> & dis_tables,
+         AlignedTable<float> & biases
+ ) const
+ {
+     const IndexIVFPQFastScan & ivfpq = *this;
+     size_t dim12 = pq.ksub * pq.M;
+     size_t d = pq.d;
+     size_t nprobe = ivfpq.nprobe;
+ 
+     if (ivfpq.by_residual) {
+ 
+         if (ivfpq.metric_type == METRIC_L2) {
+ 
+             dis_tables.resize(n * nprobe * dim12);
+ 
+             if (ivfpq.use_precomputed_table == 1) {
+                 biases.resize(n * nprobe);
+                 memcpy(biases.get(), coarse_dis, sizeof(float) * n * nprobe);
+ 
+                 AlignedTable<float> ip_table(n * dim12);
+                 pq.compute_inner_prod_tables (n, x, ip_table.get());
+ 
+ #pragma omp parallel for if (n * nprobe > 8000)
+                 for(idx_t ij = 0; ij < n * nprobe; ij++) {
+                     idx_t i = ij / nprobe;
+                     float *tab = dis_tables.get() + ij * dim12;
+                     idx_t cij = coarse_ids[ij];
+ 
+                     if (cij >= 0) {
+                         fvec_madd_avx (
+                             dim12,
+                             precomputed_table.get() + cij * dim12,
+                             -2, ip_table.get() + i * dim12,
+                             tab
+                         );
+                     } else {
+                         // fill with NaNs so that they are ignored during
+                         // LUT quantization
+                         memset (tab, -1, sizeof(float) * dim12);
+                     }
+                 }
+ 
+             } else {
+ 
+                 std::unique_ptr<float[]> xrel(new float[n * nprobe * d]);
+                 biases.resize(n * nprobe);
+                 memset(biases.get(), 0, sizeof(float) * n * nprobe);
+ 
+ #pragma omp parallel for if (n * nprobe > 8000)
+                 for(idx_t ij = 0; ij < n * nprobe; ij++) {
+                     idx_t i = ij / nprobe;
+                     float *xij = &xrel[ij * d];
+                     idx_t cij = coarse_ids[ij];
+ 
+                     if (cij >= 0) {
+                         ivfpq.quantizer->compute_residual(
+                             x + i * d, xij, cij);
+                     } else {
+                         // will fill with NaNs
+                         memset(xij, -1, sizeof(float) * d);
+                     }
+                 }
+ 
+                 pq.compute_distance_tables (
+                     n * nprobe, xrel.get(), dis_tables.get());
+ 
+             }
+ 
+         } else if (ivfpq.metric_type == METRIC_INNER_PRODUCT) {
+             dis_tables.resize(n * dim12);
+             pq.compute_inner_prod_tables (n, x, dis_tables.get());
+             // compute_inner_prod_tables(pq, n, x, dis_tables.get());
+ 
+             biases.resize(n * nprobe);
+             memcpy(biases.get(), coarse_dis, sizeof(float) * n * nprobe);
+         } else {
+             FAISS_THROW_FMT("metric %d not supported", ivfpq.metric_type);
+         }
+ 
+     } else {
+         dis_tables.resize(n * dim12);
+         if (ivfpq.metric_type == METRIC_L2) {
+             pq.compute_distance_tables (n, x, dis_tables.get());
+         } else if (ivfpq.metric_type == METRIC_INNER_PRODUCT) {
+             pq.compute_inner_prod_tables (n, x, dis_tables.get());
+         } else {
+             FAISS_THROW_FMT("metric %d not supported", ivfpq.metric_type);
+         }
+     }
+ 
+ }
+ 
+ void IndexIVFPQFastScan::compute_LUT_uint8(
+         size_t n, const float *x,
+         const idx_t *coarse_ids, const float *coarse_dis,
+         AlignedTable<uint8_t> & dis_tables,
+         AlignedTable<uint16_t> & biases,
+         float * normalizers
+ ) const {
+     const IndexIVFPQFastScan & ivfpq = *this;
+     AlignedTable<float> dis_tables_float;
+     AlignedTable<float> biases_float;
+ 
+     uint64_t t0 = get_cy();
+     compute_LUT(
+         n, x,
+         coarse_ids, coarse_dis,
+         dis_tables_float, biases_float
+     );
+     IVFFastScan_stats.t_compute_distance_tables += get_cy() - t0;
+ 
+     bool lut_is_3d = ivfpq.by_residual && ivfpq.metric_type == METRIC_L2;
+     size_t dim123 = pq.ksub * pq.M;
+     size_t dim123_2 = pq.ksub * M2;
+     if (lut_is_3d) {
+         dim123 *= nprobe;
+         dim123_2 *= nprobe;
+     }
+     dis_tables.resize(n * dim123_2);
+     if (biases_float.get()) {
+         biases.resize(n * nprobe);
+     }
+     uint64_t t1 = get_cy();
+ 
+ #pragma omp parallel for if (n > 100)
+     for(int64_t i = 0; i < n; i++) {
+         const float *t_in = dis_tables_float.get() + i * dim123;
+         const float *b_in = nullptr;
+         uint8_t *t_out = dis_tables.get() + i * dim123_2;
+         uint16_t *b_out = nullptr;
+         if (biases_float.get()) {
+             b_in = biases_float.get() + i * nprobe;
+             b_out = biases.get() + i * nprobe;
+         }
+ 
+         quantize_LUT_and_bias(
+             nprobe, pq.M, pq.ksub, lut_is_3d,
+             t_in, b_in,
+             t_out, M2, b_out,
+             normalizers + 2 * i, normalizers + 2 * i + 1
+         );
+     }
+     IVFFastScan_stats.t_round += get_cy() - t1;
+ 
+ }
+ 
+ 
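compute_LUT_uint8 compresses each query's float LUT to uint8 with a per-query scale a and offset b (the two normalizers written per query), so the SIMD kernels can accumulate distances in uint16. The mapping back to float is the one search_implem_2 applies verbatim: dis_float = b + dis_u16 * (1 / a). A tiny sketch of that decode step; the normalizer values are made up for illustration:

    #include <cstdint>
    #include <cstdio>

    // Decode a uint16 fast-scan accumulator back to a float distance using
    // the per-query normalizers (scale a, offset b), as search_implem_2 does.
    float decode_distance(uint16_t accu, float a, float b) {
        float one_a = 1.0f / a;
        return b + accu * one_a;
    }

    int main() {
        printf("%.4f\n", decode_distance(1234, 12.5f, 0.4f)); // 99.1200
    }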
+ /*********************************************************
+  * Search functions
+  *********************************************************/
+ 
+ template<bool is_max>
+ void IndexIVFPQFastScan::search_dispatch_implem(
+         idx_t n,
+         const float* x,
+         idx_t k,
+         float* distances,
+         idx_t* labels) const
+ {
+     using Cfloat = typename std::conditional<is_max,
+         CMax<float, int64_t>, CMin<float, int64_t> >::type;
+ 
+     using C = typename std::conditional<is_max,
+         CMax<uint16_t, int64_t>, CMin<uint16_t, int64_t> >::type;
+ 
+     if (n == 0) {
+         return;
+     }
+ 
+     // actual implementation used
+     int impl = implem;
+ 
+     if (impl == 0) {
+         if (bbs == 32) {
+             impl = 12;
+         } else {
+             impl = 10;
+         }
+         if (k > 20) {
+             impl ++;
+         }
+     }
+ 
+     if (impl == 1) {
+         search_implem_1<Cfloat>(n, x, k, distances, labels);
+     } else if (impl == 2) {
+         search_implem_2<C>(n, x, k, distances, labels);
+ 
+     } else if (impl >= 10 && impl <= 13) {
+         size_t ndis = 0, nlist_visited = 0;
+ 
+         if (n < 2) {
+             if (impl == 12 || impl == 13) {
+                 search_implem_12<C>
+                     (n, x, k, distances, labels, impl, &ndis, &nlist_visited);
+             } else {
+                 search_implem_10<C>
+                     (n, x, k, distances, labels, impl, &ndis, &nlist_visited);
+             }
+         } else {
+             // explicitly slice over threads
+             int nslice;
+             if (n <= omp_get_max_threads()) {
+                 nslice = n;
+             } else if (by_residual && metric_type == METRIC_L2) {
+                 // make sure we don't make too big LUT tables
+                 size_t lut_size_per_query =
+                     pq.M * pq.ksub * nprobe * (sizeof(float) + sizeof(uint8_t));
+ 
+                 size_t max_lut_size = precomputed_table_max_bytes;
+                 // how many queries we can handle within mem budget
+                 size_t nq_ok = std::max(max_lut_size / lut_size_per_query, size_t(1));
+                 nslice = roundup(std::max(size_t(n / nq_ok), size_t(1)), omp_get_max_threads());
+             } else {
+                 // LUTs unlikely to be a limiting factor
+                 nslice = omp_get_max_threads();
+             }
+ 
+ #pragma omp parallel for reduction(+: ndis, nlist_visited)
+             for (int slice = 0; slice < nslice; slice++) {
+                 idx_t i0 = n * slice / nslice;
+                 idx_t i1 = n * (slice + 1) / nslice;
+                 float *dis_i = distances + i0 * k;
+                 idx_t *lab_i = labels + i0 * k;
+                 if (impl == 12 || impl == 13) {
+                     search_implem_12<C>(
+                         i1 - i0, x + i0 * d, k, dis_i, lab_i,
+                         impl, &ndis, &nlist_visited
+                     );
+                 } else {
+                     search_implem_10<C>(
+                         i1 - i0, x + i0 * d, k, dis_i, lab_i,
+                         impl, &ndis, &nlist_visited
+                     );
+                 }
+             }
+         }
+         indexIVF_stats.nq += n;
+         indexIVF_stats.ndis += ndis;
+         indexIVF_stats.nlist += nlist_visited;
+     } else {
+         FAISS_THROW_FMT("implem %d does not exist", implem);
+     }
+ 
+ }
+ 
+ 
+ void IndexIVFPQFastScan::search(
+         idx_t n, const float* x, idx_t k,
+         float* distances, idx_t* labels) const
+ {
+     if (metric_type == METRIC_L2) {
+         search_dispatch_implem<true>(n, x, k, distances, labels);
+     } else {
+         search_dispatch_implem<false>(n, x, k, distances, labels);
+     }
+ }
+ 
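For reference, implem = 0 above lets the dispatcher choose: 12/13 when bbs == 32, 10/11 otherwise, with the +1 reservoir variants picked when k > 20. A hedged end-to-end sketch that forces one implementation; the data is random, the sizes and the implem value are illustrative, and the implem field is assumed public, as the dispatcher reads it:

    #include <cstdlib>
    #include <vector>

    #include <faiss/IndexFlat.h>
    #include <faiss/IndexIVFPQFastScan.h>

    int main() {
        size_t d = 32, nlist = 64, nb = 20000, k = 10;
        std::vector<float> xb(nb * d);
        for (float& v : xb) v = rand() / float(RAND_MAX);

        faiss::IndexFlatL2 coarse(d);
        faiss::IndexIVFPQFastScan index(&coarse, d, nlist, /*M=*/8,
                                        /*nbits_per_idx=*/4,
                                        faiss::METRIC_L2, /*bbs=*/32);
        index.train(nb, xb.data());
        index.add(nb, xb.data());
        index.nprobe = 8;

        index.implem = 10; // e.g. force the single-query kernel to compare

        std::vector<float> dis(k);
        std::vector<faiss::Index::idx_t> ids(k);
        index.search(1, xb.data(), k, dis.data(), ids.data());
    }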
+ template<class C>
+ void IndexIVFPQFastScan::search_implem_1(
+         idx_t n, const float* x, idx_t k,
+         float* distances, idx_t* labels) const
+ {
+     FAISS_THROW_IF_NOT(orig_invlists);
+ 
+     std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
+     std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
+ 
+     quantizer->search (n, x, nprobe, coarse_dis.get(), coarse_ids.get());
+ 
+     size_t dim12 = pq.ksub * pq.M;
+     AlignedTable<float> dis_tables;
+     AlignedTable<float> biases;
+ 
+     compute_LUT (
+         n, x,
+         coarse_ids.get(), coarse_dis.get(),
+         dis_tables, biases
+     );
+ 
+     bool single_LUT = !(by_residual && metric_type == METRIC_L2);
+ 
+     size_t ndis = 0, nlist_visited = 0;
+ 
+ #pragma omp parallel for reduction(+: ndis, nlist_visited)
+     for(idx_t i = 0; i < n; i++) {
+         int64_t *heap_ids = labels + i * k;
+         float *heap_dis = distances + i * k;
+         heap_heapify<C> (k, heap_dis, heap_ids);
+         float *LUT = nullptr;
+ 
+         if (single_LUT) {
+             LUT = dis_tables.get() + i * dim12;
+         }
+         for(idx_t j = 0; j < nprobe; j++) {
+             if (!single_LUT) {
+                 LUT = dis_tables.get() + (i * nprobe + j) * dim12;
+             }
+             idx_t list_no = coarse_ids[i * nprobe + j];
+             if (list_no < 0) continue;
+             size_t ls = orig_invlists->list_size(list_no);
+             if (ls == 0) continue;
+             InvertedLists::ScopedCodes codes(orig_invlists, list_no);
+             InvertedLists::ScopedIds ids(orig_invlists, list_no);
+ 
+             float bias = biases.get() ? biases[i * nprobe + j] : 0;
+ 
+             pq_estimators_from_tables_generic<C>(
+                 pq, pq.nbits, codes.get(), ls,
+                 LUT, ids.get(), bias,
+                 k, heap_dis, heap_ids
+             );
+             nlist_visited ++;
+             ndis ++;
+         }
+         heap_reorder<C> (k, heap_dis, heap_ids);
+     }
+     indexIVF_stats.nq += n;
+     indexIVF_stats.ndis += ndis;
+     indexIVF_stats.nlist += nlist_visited;
+ }
+ 
+ template<class C>
+ void IndexIVFPQFastScan::search_implem_2(
+         idx_t n, const float* x, idx_t k,
+         float* distances, idx_t* labels) const
+ {
+     FAISS_THROW_IF_NOT(orig_invlists);
+ 
+     std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
+     std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
+ 
+     quantizer->search (n, x, nprobe, coarse_dis.get(), coarse_ids.get());
+ 
+     size_t dim12 = pq.ksub * M2;
+     AlignedTable<uint8_t> dis_tables;
+     AlignedTable<uint16_t> biases;
+     std::unique_ptr<float[]> normalizers(new float[2 * n]);
+ 
+     compute_LUT_uint8 (
+         n, x,
+         coarse_ids.get(), coarse_dis.get(),
+         dis_tables, biases,
+         normalizers.get()
+     );
+ 
+ 
+     bool single_LUT = !(by_residual && metric_type == METRIC_L2);
+ 
+     size_t ndis = 0, nlist_visited = 0;
+ 
+ #pragma omp parallel for reduction(+: ndis, nlist_visited)
+     for(idx_t i = 0; i < n; i++) {
+         std::vector<uint16_t> tmp_dis(k);
+         int64_t *heap_ids = labels + i * k;
+         uint16_t *heap_dis = tmp_dis.data();
+         heap_heapify<C> (k, heap_dis, heap_ids);
+         const uint8_t *LUT = nullptr;
+ 
+         if (single_LUT) {
+             LUT = dis_tables.get() + i * dim12;
+         }
+         for(idx_t j = 0; j < nprobe; j++) {
+             if (!single_LUT) {
+                 LUT = dis_tables.get() + (i * nprobe + j) * dim12;
+             }
+             idx_t list_no = coarse_ids[i * nprobe + j];
+             if (list_no < 0) continue;
+             size_t ls = orig_invlists->list_size(list_no);
+             if (ls == 0) continue;
+             InvertedLists::ScopedCodes codes(orig_invlists, list_no);
+             InvertedLists::ScopedIds ids(orig_invlists, list_no);
+ 
+             uint16_t bias = biases.get() ? biases[i * nprobe + j] : 0;
+ 
+             pq_estimators_from_tables_generic<C>(
+                 pq, pq.nbits, codes.get(), ls,
+                 LUT, ids.get(), bias,
+                 k, heap_dis, heap_ids
+             );
+ 
+             nlist_visited++;
+             ndis += ls;
+         }
+         heap_reorder<C> (k, heap_dis, heap_ids);
+         // convert distances to float
+         {
+             float one_a = 1 / normalizers[2 * i], b = normalizers[2 * i + 1];
+             if (skip & 16) {
+                 one_a = 1;
+                 b = 0;
+             }
+             float *heap_dis_float = distances + i * k;
+             for (int j = 0; j < k; j++) {
+                 heap_dis_float[j] = b + heap_dis[j] * one_a;
+             }
+         }
+     }
+     indexIVF_stats.nq += n;
+     indexIVF_stats.ndis += ndis;
+     indexIVF_stats.nlist += nlist_visited;
+ }
+ 
+ 
+ 
+ template<class C>
+ void IndexIVFPQFastScan::search_implem_10(
+         idx_t n, const float* x, idx_t k,
+         float* distances, idx_t* labels,
+         int impl, size_t *ndis_out, size_t *nlist_out) const
+ {
+     memset(distances, -1, sizeof(float) * k * n);
+     memset(labels, -1, sizeof(idx_t) * k * n);
+ 
+     using HeapHC = HeapHandler<C, true>;
+     using ReservoirHC = ReservoirHandler<C, true>;
+     using SingleResultHC = SingleResultHandler<C, true>;
+ 
+ 
+     std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
+     std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
+ 
+     uint64_t times[10];
+     memset(times, 0, sizeof(times));
+     int ti = 0;
+ #define TIC times[ti++] = get_cy()
+     TIC;
+ 
+     quantizer->search (n, x, nprobe, coarse_dis.get(), coarse_ids.get());
+ 
+     TIC;
+ 
+     size_t dim12 = pq.ksub * M2;
+     AlignedTable<uint8_t> dis_tables;
+     AlignedTable<uint16_t> biases;
+     std::unique_ptr<float[]> normalizers (new float[2 * n]);
+ 
+     compute_LUT_uint8 (
+         n, x,
+         coarse_ids.get(), coarse_dis.get(),
+         dis_tables, biases, normalizers.get()
+     );
+ 
+     TIC;
+ 
+     bool single_LUT = !(by_residual && metric_type == METRIC_L2);
+ 
+     TIC;
+     size_t ndis = 0, nlist_visited = 0;
+ 
+     {
+         AlignedTable<uint16_t> tmp_distances(k);
+         for(idx_t i = 0; i < n; i++) {
+             const uint8_t *LUT = nullptr;
+             int qmap1[1] = {0};
+             std::unique_ptr<SIMDResultHandler<C, true> > handler;
+ 
+             if (k == 1) {
+                 handler.reset(new SingleResultHC(1, 0));
+             } else if (impl == 10) {
+                 handler.reset(new HeapHC(1, tmp_distances.get(), labels + i * k, k, 0));
+             } else if (impl == 11) {
+                 handler.reset(new ReservoirHC(1, 0, k, 2 * k));
+             } else {
+                 FAISS_THROW_MSG("invalid");
+             }
+ 
+             handler->q_map = qmap1;
+ 
+             if (single_LUT) {
+                 LUT = dis_tables.get() + i * dim12;
+             }
+             for(idx_t j = 0; j < nprobe; j++) {
+                 size_t ij = i * nprobe + j;
+                 if (!single_LUT) {
+                     LUT = dis_tables.get() + ij * dim12;
+                 }
+                 if (biases.get()) {
+                     handler->dbias = biases.get() + ij;
+                 }
+ 
+                 idx_t list_no = coarse_ids[ij];
+                 if (list_no < 0) continue;
+                 size_t ls = invlists->list_size(list_no);
+                 if (ls == 0) continue;
+ 
+                 InvertedLists::ScopedCodes codes(invlists, list_no);
+                 InvertedLists::ScopedIds ids(invlists, list_no);
+ 
+                 handler->ntotal = ls;
+                 handler->id_map = ids.get();
+ 
+ #define DISPATCH(classHC) \
+                 if(auto *res = dynamic_cast<classHC* > (handler.get())) { \
+                     pq4_accumulate_loop( \
+                         1, roundup(ls, bbs), bbs, M2, \
+                         codes.get(), LUT, \
+                         *res \
+                     ); \
+                 }
+                 DISPATCH(HeapHC)
+                 else DISPATCH(ReservoirHC)
+                 else DISPATCH(SingleResultHC)
+ #undef DISPATCH
+ 
+                 nlist_visited ++;
+                 ndis ++;
+             }
+ 
+             handler->to_flat_arrays(
+                 distances + i * k, labels + i * k,
+                 skip & 16 ? nullptr : normalizers.get() + i * 2
+             );
+         }
+     }
+     *ndis_out = ndis;
+     *nlist_out = nlist;
+ }
+ 
+ 
+ 
+ template<class C>
+ void IndexIVFPQFastScan::search_implem_12(
+         idx_t n, const float* x, idx_t k,
+         float* distances, idx_t* labels,
+         int impl, size_t *ndis_out, size_t *nlist_out) const
+ {
+     if (n == 0) { // does not work well with reservoir
+         return;
+     }
+     FAISS_THROW_IF_NOT(bbs == 32);
+ 
+     std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
+     std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
+ 
+     uint64_t times[10];
+     memset(times, 0, sizeof(times));
+     int ti = 0;
+ #define TIC times[ti++] = get_cy()
+     TIC;
+ 
+     quantizer->search (n, x, nprobe, coarse_dis.get(), coarse_ids.get());
+ 
+     TIC;
+ 
+     size_t dim12 = pq.ksub * M2;
+     AlignedTable<uint8_t> dis_tables;
+     AlignedTable<uint16_t> biases;
+     std::unique_ptr<float[]> normalizers (new float[2 * n]);
+ 
+     compute_LUT_uint8 (
+         n, x,
+         coarse_ids.get(), coarse_dis.get(),
+         dis_tables, biases, normalizers.get()
+     );
+ 
+     TIC;
+ 
+     struct QC {
+         int qno;     // sequence number of the query
+         int list_no; // list to visit
+         int rank;    // this is the rank'th result of the coarse quantizer
+     };
+     bool single_LUT = !(by_residual && metric_type == METRIC_L2);
+ 
+     std::vector<QC> qcs;
+     {
+         int ij = 0;
+         for(int i = 0; i < n; i++) {
+             for(int j = 0; j < nprobe; j++) {
+                 if (coarse_ids[ij] >= 0) {
+                     qcs.push_back(QC{i, int(coarse_ids[ij]), int(j)});
+                 }
+                 ij++;
+             }
+         }
+         std::sort(
+             qcs.begin(), qcs.end(),
+             [](const QC &a, const QC & b) {
+                 return a.list_no < b.list_no;
+             }
+         );
+     }
+     TIC;
+ 
+     // prepare the result handlers
+ 
+     std::unique_ptr<SIMDResultHandler<C, true> > handler;
+     AlignedTable<uint16_t> tmp_distances;
+ 
+     using HeapHC = HeapHandler<C, true>;
+     using ReservoirHC = ReservoirHandler<C, true>;
+     using SingleResultHC = SingleResultHandler<C, true>;
+ 
+     if (k == 1) {
+         handler.reset(new SingleResultHC(n, 0));
+     } else if (impl == 12) {
+         tmp_distances.resize(n * k);
+         handler.reset(new HeapHC(n, tmp_distances.get(), labels, k, 0));
+     } else if (impl == 13) {
+         handler.reset(new ReservoirHC(n, 0, k, 2 * k));
+     }
+ 
+     int qbs2 = this->qbs2 ? this->qbs2 : 11;
+ 
+     std::vector<uint16_t> tmp_bias;
+     if (biases.get()) {
+         tmp_bias.resize(qbs2);
+         handler->dbias = tmp_bias.data();
+     }
+     TIC;
+ 
+     size_t ndis = 0;
+ 
+     size_t i0 = 0;
+     uint64_t t_copy_pack = 0, t_scan = 0;
+     while (i0 < qcs.size()) {
+         uint64_t tt0 = get_cy();
+ 
+         // find all queries that access this inverted list
+         int list_no = qcs[i0].list_no;
+         size_t i1 = i0 + 1;
+ 
+         while(i1 < qcs.size() && i1 < i0 + qbs2) {
+             if (qcs[i1].list_no != list_no) {
+                 break;
+             }
+             i1++;
+         }
+ 
+         size_t list_size = invlists->list_size(list_no);
+ 
+         if (list_size == 0) {
+             i0 = i1;
+             continue;
+         }
+ 
+         // re-organize LUTs and biases into the right order
+         int nc = i1 - i0;
+ 
+         std::vector<int> q_map(nc), lut_entries(nc);
+         AlignedTable<uint8_t> LUT(nc * dim12);
+         memset(LUT.get(), -1, nc * dim12);
+         int qbs = pq4_preferred_qbs(nc);
+ 
+         for(size_t i = i0; i < i1; i++) {
+             const QC & qc = qcs[i];
+             q_map[i - i0] = qc.qno;
+             int ij = qc.qno * nprobe + qc.rank;
+             lut_entries[i - i0] = single_LUT ? qc.qno : ij;
+             if (biases.get()) {
+                 tmp_bias[i - i0] = biases[ij];
+             }
+         }
+         pq4_pack_LUT_qbs_q_map(
+             qbs, M2, dis_tables.get(), lut_entries.data(),
+             LUT.get()
+         );
+ 
+         // access the inverted list
+ 
+         ndis += (i1 - i0) * list_size;
+ 
+         InvertedLists::ScopedCodes codes(invlists, list_no);
+         InvertedLists::ScopedIds ids(invlists, list_no);
+ 
+         // prepare the handler
+ 
+         handler->ntotal = list_size;
+         handler->q_map = q_map.data();
+         handler->id_map = ids.get();
+         uint64_t tt1 = get_cy();
+ 
+ #define DISPATCH(classHC) \
+         if(auto *res = dynamic_cast<classHC* > (handler.get())) { \
+             pq4_accumulate_loop_qbs( \
+                 qbs, list_size, M2, \
+                 codes.get(), LUT.get(), \
+                 *res \
+             ); \
+         }
+         DISPATCH(HeapHC)
+         else DISPATCH(ReservoirHC)
+         else DISPATCH(SingleResultHC)
+ 
+         // prepare for next loop
+         i0 = i1;
+ 
+         uint64_t tt2 = get_cy();
+         t_copy_pack += tt1 - tt0;
+         t_scan += tt2 - tt1;
+     }
+     TIC;
+ 
+     // labels is in-place for HeapHC
+     handler->to_flat_arrays(
+         distances, labels,
+         skip & 16 ? nullptr : normalizers.get()
+     );
+ 
+     TIC;
+ 
+     // these stats are not thread-safe
+ 
+     for(int i = 1; i < ti; i++) {
+         IVFFastScan_stats.times[i] += times[i] - times[i-1];
+     }
+     IVFFastScan_stats.t_copy_pack += t_copy_pack;
+     IVFFastScan_stats.t_scan += t_scan;
+ 
+     if (auto *rh = dynamic_cast<ReservoirHC*> (handler.get())) {
+         for (int i = 0; i < 4; i++) {
+             IVFFastScan_stats.reservoir_times[i] += rh->times[i];
+         }
+     }
+ 
+     *ndis_out = ndis;
+     *nlist_out = nlist;
+ 
+ }
+ 
+ 
+ IVFFastScanStats IVFFastScan_stats;
+ 
+ } // namespace faiss