RubyGems - faiss - Versions diffs - 0.2.7 → 0.3.1 - Mend

faiss 0.2.7 → 0.3.1

Files changed (172) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +10 -0
data/LICENSE.txt +1 -1
data/README.md +1 -1
data/ext/faiss/extconf.rb +9 -2
data/ext/faiss/index.cpp +1 -1
data/ext/faiss/index_binary.cpp +2 -2
data/ext/faiss/product_quantizer.cpp +1 -1
data/lib/faiss/version.rb +1 -1
data/lib/faiss.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +7 -7
data/vendor/faiss/faiss/AutoTune.h +0 -1
data/vendor/faiss/faiss/Clustering.cpp +4 -18
data/vendor/faiss/faiss/Clustering.h +31 -21
data/vendor/faiss/faiss/IVFlib.cpp +22 -11
data/vendor/faiss/faiss/Index.cpp +1 -1
data/vendor/faiss/faiss/Index.h +20 -5
data/vendor/faiss/faiss/Index2Layer.cpp +7 -7
data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +176 -166
data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +15 -15
data/vendor/faiss/faiss/IndexBinary.cpp +9 -4
data/vendor/faiss/faiss/IndexBinary.h +8 -19
data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -1
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +24 -31
data/vendor/faiss/faiss/IndexBinaryHash.cpp +25 -50
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +106 -187
data/vendor/faiss/faiss/IndexFastScan.cpp +90 -159
data/vendor/faiss/faiss/IndexFastScan.h +9 -8
data/vendor/faiss/faiss/IndexFlat.cpp +195 -3
data/vendor/faiss/faiss/IndexFlat.h +20 -1
data/vendor/faiss/faiss/IndexFlatCodes.cpp +11 -0
data/vendor/faiss/faiss/IndexFlatCodes.h +3 -1
data/vendor/faiss/faiss/IndexHNSW.cpp +112 -316
data/vendor/faiss/faiss/IndexHNSW.h +12 -48
data/vendor/faiss/faiss/IndexIDMap.cpp +69 -28
data/vendor/faiss/faiss/IndexIDMap.h +24 -2
data/vendor/faiss/faiss/IndexIVF.cpp +159 -53
data/vendor/faiss/faiss/IndexIVF.h +37 -5
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +18 -26
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +3 -2
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +19 -46
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -3
data/vendor/faiss/faiss/IndexIVFFastScan.cpp +433 -405
data/vendor/faiss/faiss/IndexIVFFastScan.h +56 -26
data/vendor/faiss/faiss/IndexIVFFlat.cpp +15 -5
data/vendor/faiss/faiss/IndexIVFFlat.h +3 -2
data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +172 -0
data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +56 -0
data/vendor/faiss/faiss/IndexIVFPQ.cpp +78 -122
data/vendor/faiss/faiss/IndexIVFPQ.h +6 -7
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +18 -50
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +4 -3
data/vendor/faiss/faiss/IndexIVFPQR.cpp +45 -29
data/vendor/faiss/faiss/IndexIVFPQR.h +5 -2
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +25 -27
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +6 -6
data/vendor/faiss/faiss/IndexLSH.cpp +14 -16
data/vendor/faiss/faiss/IndexNNDescent.cpp +3 -4
data/vendor/faiss/faiss/IndexNSG.cpp +11 -27
data/vendor/faiss/faiss/IndexNSG.h +10 -10
data/vendor/faiss/faiss/IndexPQ.cpp +72 -88
data/vendor/faiss/faiss/IndexPQ.h +1 -4
data/vendor/faiss/faiss/IndexPQFastScan.cpp +1 -1
data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -31
data/vendor/faiss/faiss/IndexRefine.cpp +49 -19
data/vendor/faiss/faiss/IndexRefine.h +7 -0
data/vendor/faiss/faiss/IndexReplicas.cpp +23 -26
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +22 -16
data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -4
data/vendor/faiss/faiss/IndexShards.cpp +21 -29
data/vendor/faiss/faiss/IndexShardsIVF.cpp +1 -2
data/vendor/faiss/faiss/MatrixStats.cpp +17 -32
data/vendor/faiss/faiss/MatrixStats.h +21 -9
data/vendor/faiss/faiss/MetaIndexes.cpp +35 -35
data/vendor/faiss/faiss/VectorTransform.cpp +13 -26
data/vendor/faiss/faiss/VectorTransform.h +7 -7
data/vendor/faiss/faiss/clone_index.cpp +15 -10
data/vendor/faiss/faiss/clone_index.h +3 -0
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +87 -4
data/vendor/faiss/faiss/gpu/GpuCloner.h +22 -0
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +7 -0
data/vendor/faiss/faiss/gpu/GpuDistance.h +46 -38
data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +4 -4
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +8 -9
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +18 -3
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -11
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +1 -3
data/vendor/faiss/faiss/gpu/GpuResources.cpp +24 -3
data/vendor/faiss/faiss/gpu/GpuResources.h +39 -11
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +117 -17
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +57 -3
data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +1 -1
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +25 -0
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +129 -9
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +267 -40
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +299 -208
data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +1 -0
data/vendor/faiss/faiss/gpu/utils/RaftUtils.h +75 -0
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +3 -1
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +5 -5
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +1 -1
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +1 -2
data/vendor/faiss/faiss/impl/DistanceComputer.h +24 -1
data/vendor/faiss/faiss/impl/FaissException.h +13 -34
data/vendor/faiss/faiss/impl/HNSW.cpp +321 -70
data/vendor/faiss/faiss/impl/HNSW.h +9 -8
data/vendor/faiss/faiss/impl/IDSelector.h +4 -4
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +3 -1
data/vendor/faiss/faiss/impl/NNDescent.cpp +29 -19
data/vendor/faiss/faiss/impl/NSG.h +1 -1
data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +14 -12
data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +1 -1
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +24 -22
data/vendor/faiss/faiss/impl/ProductQuantizer.h +1 -1
data/vendor/faiss/faiss/impl/Quantizer.h +1 -1
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +27 -1015
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +5 -63
data/vendor/faiss/faiss/impl/ResultHandler.h +232 -176
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +444 -104
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +0 -8
data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +280 -42
data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +21 -14
data/vendor/faiss/faiss/impl/code_distance/code_distance.h +22 -12
data/vendor/faiss/faiss/impl/index_read.cpp +45 -19
data/vendor/faiss/faiss/impl/index_write.cpp +60 -41
data/vendor/faiss/faiss/impl/io.cpp +10 -10
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
data/vendor/faiss/faiss/impl/platform_macros.h +18 -1
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +3 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +7 -6
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +52 -38
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +40 -49
data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +960 -0
data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +176 -0
data/vendor/faiss/faiss/impl/simd_result_handlers.h +374 -202
data/vendor/faiss/faiss/index_factory.cpp +10 -7
data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -1
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +27 -9
data/vendor/faiss/faiss/invlists/InvertedLists.h +12 -3
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +3 -3
data/vendor/faiss/faiss/python/python_callbacks.cpp +1 -1
data/vendor/faiss/faiss/utils/Heap.cpp +3 -1
data/vendor/faiss/faiss/utils/WorkerThread.h +1 -0
data/vendor/faiss/faiss/utils/distances.cpp +128 -74
data/vendor/faiss/faiss/utils/distances.h +81 -4
data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +5 -5
data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +1 -1
data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +5 -5
data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +1 -1
data/vendor/faiss/faiss/utils/distances_simd.cpp +428 -70
data/vendor/faiss/faiss/utils/fp16-arm.h +29 -0
data/vendor/faiss/faiss/utils/fp16.h +2 -0
data/vendor/faiss/faiss/utils/hamming.cpp +162 -110
data/vendor/faiss/faiss/utils/hamming.h +58 -0
data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +16 -89
data/vendor/faiss/faiss/utils/hamming_distance/common.h +1 -0
data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +15 -87
data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +57 -0
data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +14 -104
data/vendor/faiss/faiss/utils/partitioning.cpp +3 -4
data/vendor/faiss/faiss/utils/prefetch.h +77 -0
data/vendor/faiss/faiss/utils/quantize_lut.cpp +0 -14
data/vendor/faiss/faiss/utils/simdlib_avx2.h +0 -6
data/vendor/faiss/faiss/utils/simdlib_neon.h +72 -77
data/vendor/faiss/faiss/utils/sorting.cpp +140 -5
data/vendor/faiss/faiss/utils/sorting.h +27 -0
data/vendor/faiss/faiss/utils/utils.cpp +112 -6
data/vendor/faiss/faiss/utils/utils.h +57 -20
metadata +11 -4

data/vendor/faiss/faiss/impl/simd_result_handlers.h CHANGED Viewed

@@ -14,40 +14,86 @@
 #include <faiss/utils/Heap.h>
 #include <faiss/utils/simdlib.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/ResultHandler.h>
 #include <faiss/impl/platform_macros.h>
 #include <faiss/utils/AlignedTable.h>
 #include <faiss/utils/partitioning.h>
 /** This file contains callbacks for kernels that compute distances.
- *
- * The SIMDResultHandler object is intended to be templated and inlined.
- * Methods:
- * - handle(): called when 32 distances are computed and provided in two
- *   simd16uint16. (q, b) indicate which entry it is in the block.
- * - set_block_origin(): set the sub-matrix that is being computed
  */
 namespace faiss {
+struct SIMDResultHandler {
+    // used to dispatch templates
+    bool is_CMax = false;
+    uint8_t sizeof_ids = 0;
+    bool with_fields = false;
+    /**  called when 32 distances are computed and provided in two
+     *   simd16uint16. (q, b) indicate which entry it is in the block. */
+    virtual void handle(
+            size_t q,
+            size_t b,
+            simd16uint16 d0,
+            simd16uint16 d1) = 0;
+    /// set the sub-matrix that is being computed
+    virtual void set_block_origin(size_t i0, size_t j0) = 0;
+    virtual ~SIMDResultHandler() {}
+};
+/* Result handler that will return float results eventually */
+struct SIMDResultHandlerToFloat : SIMDResultHandler {
+    size_t nq;     // number of queries
+    size_t ntotal; // ignore excess elements after ntotal
+    /// these fields are used mainly for the IVF variants (with_id_map=true)
+    const idx_t* id_map = nullptr; // map offset in invlist to vector id
+    const int* q_map = nullptr;    // map q to global query
+    const uint16_t* dbias =
+            nullptr; // table of biases to add to each query (for IVF L2 search)
+    const float* normalizers = nullptr; // size 2 * nq, to convert
+    SIMDResultHandlerToFloat(size_t nq, size_t ntotal)
+            : nq(nq), ntotal(ntotal) {}
+    virtual void begin(const float* norms) {
+        normalizers = norms;
+    }
+    // called at end of search to convert int16 distances to float, before
+    // normalizers are deallocated
+    virtual void end() {
+        normalizers = nullptr;
+    }
+};
+FAISS_API extern bool simd_result_handlers_accept_virtual;
 namespace simd_result_handlers {
-/** Dummy structure that just computes a checksum on results
+/** Dummy structure that just computes a checksum on results
  * (to avoid the computation to be optimized away) */
-struct DummyResultHandler {
+struct DummyResultHandler : SIMDResultHandler {
     size_t cs = 0;
-    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         cs += q * 123 + b * 789 + d0.get_scalar_0() + d1.get_scalar_0();
     }
-    void set_block_origin(size_t, size_t) {}
+    void set_block_origin(size_t, size_t) final {}
+    ~DummyResultHandler() {}
 };
 /** memorize results in a nq-by-nb matrix.
  *
  * j0 is the current upper-left block of the matrix
  */
-struct StoreResultHandler {
+struct StoreResultHandler : SIMDResultHandler {
     uint16_t* data;
     size_t ld; // total number of columns
     size_t i0 = 0;
@@ -55,32 +101,32 @@ struct StoreResultHandler {
     StoreResultHandler(uint16_t* data, size_t ld) : data(data), ld(ld) {}
-    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         size_t ofs = (q + i0) * ld + j0 + b * 32;
         d0.store(data + ofs);
         d1.store(data + ofs + 16);
     }
-    void set_block_origin(size_t i0, size_t j0) {
-        this->i0 = i0;
-        this->j0 = j0;
+    void set_block_origin(size_t i0_in, size_t j0_in) final {
+        this->i0 = i0_in;
+        this->j0 = j0_in;
     }
 };
 /** stores results in fixed-size matrix. */
 template <int NQ, int BB>
-struct FixedStorageHandler {
+struct FixedStorageHandler : SIMDResultHandler {
     simd16uint16 dis[NQ][BB];
     int i0 = 0;
-    void handle(int q, int b, simd16uint16 d0, simd16uint16 d1) {
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         dis[q + i0][2 * b] = d0;
         dis[q + i0][2 * b + 1] = d1;
     }
-    void set_block_origin(size_t i0, size_t j0) {
-        this->i0 = i0;
-        assert(j0 == 0);
+    void set_block_origin(size_t i0_in, size_t j0_in) final {
+        this->i0 = i0_in;
+        assert(j0_in == 0);
     }
     template <class OtherResultHandler>
@@ -91,30 +137,29 @@ struct FixedStorageHandler {
             }
         }
     }
+    virtual ~FixedStorageHandler() {}
 };
-/** Record origin of current block  */
+/** Result handler that compares distances to check if they need to be kept */
 template <class C, bool with_id_map>
-struct SIMDResultHandler {
+struct ResultHandlerCompare : SIMDResultHandlerToFloat {
     using TI = typename C::TI;
     bool disable = false;
     int64_t i0 = 0; // query origin
     int64_t j0 = 0; // db origin
-    size_t ntotal;  // ignore excess elements after ntotal
-    /// these fields are used mainly for the IVF variants (with_id_map=true)
-    const TI* id_map;      // map offset in invlist to vector id
-    const int* q_map;      // map q to global query
-    const uint16_t* dbias; // table of biases to add to each query
-    explicit SIMDResultHandler(size_t ntotal)
-            : ntotal(ntotal), id_map(nullptr), q_map(nullptr), dbias(nullptr) {}
+    ResultHandlerCompare(size_t nq, size_t ntotal)
+            : SIMDResultHandlerToFloat(nq, ntotal) {
+        this->is_CMax = C::is_max;
+        this->sizeof_ids = sizeof(typename C::TI);
+        this->with_fields = with_id_map;
+    }
-    void set_block_origin(size_t i0, size_t j0) {
-        this->i0 = i0;
-        this->j0 = j0;
+    void set_block_origin(size_t i0_in, size_t j0_in) final {
+        this->i0 = i0_in;
+        this->j0 = j0_in;
     }
     // adjust handler data for IVF.
@@ -172,43 +217,37 @@ struct SIMDResultHandler {
         return lt_mask;
     }
-    virtual void to_flat_arrays(
-            float* distances,
-            int64_t* labels,
-            const float* normalizers = nullptr) = 0;
-    virtual ~SIMDResultHandler() {}
+    virtual ~ResultHandlerCompare() {}
 };
 /** Special version for k=1 */
 template <class C, bool with_id_map = false>
-struct SingleResultHandler : SIMDResultHandler<C, with_id_map> {
+struct SingleResultHandler : ResultHandlerCompare<C, with_id_map> {
     using T = typename C::T;
     using TI = typename C::TI;
+    using RHC = ResultHandlerCompare<C, with_id_map>;
+    using RHC::normalizers;
-    struct Result {
-        T val;
-        TI id;
-    };
-    std::vector<Result> results;
+    std::vector<int16_t> idis;
+    float* dis;
+    int64_t* ids;
-    SingleResultHandler(size_t nq, size_t ntotal)
-            : SIMDResultHandler<C, with_id_map>(ntotal), results(nq) {
+    SingleResultHandler(size_t nq, size_t ntotal, float* dis, int64_t* ids)
+            : RHC(nq, ntotal), idis(nq), dis(dis), ids(ids) {
         for (int i = 0; i < nq; i++) {
-            Result res = {C::neutral(), -1};
-            results[i] = res;
+            ids[i] = -1;
+            idis[i] = C::neutral();
         }
     }
-    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         if (this->disable) {
             return;
         }
         this->adjust_with_origin(q, d0, d1);
-        Result& res = results[q];
-        uint32_t lt_mask = this->get_lt_mask(res.val, b, d0, d1);
+        uint32_t lt_mask = this->get_lt_mask(idis[q], b, d0, d1);
         if (!lt_mask) {
             return;
         }
@@ -221,70 +260,61 @@ struct SingleResultHandler : SIMDResultHandler<C, with_id_map> {
             // find first non-zero
             int j = __builtin_ctz(lt_mask);
             lt_mask -= 1 << j;
-            T dis = d32tab[j];
-            if (C::cmp(res.val, dis)) {
-                res.val = dis;
-                res.id = this->adjust_id(b, j);
+            T d = d32tab[j];
+            if (C::cmp(idis[q], d)) {
+                idis[q] = d;
+                ids[q] = this->adjust_id(b, j);
             }
         }
     }
-    void to_flat_arrays(
-            float* distances,
-            int64_t* labels,
-            const float* normalizers = nullptr) override {
-        for (int q = 0; q < results.size(); q++) {
+    void end() {
+        for (int q = 0; q < this->nq; q++) {
             if (!normalizers) {
-                distances[q] = results[q].val;
+                dis[q] = idis[q];
             } else {
                 float one_a = 1 / normalizers[2 * q];
                 float b = normalizers[2 * q + 1];
-                distances[q] = b + results[q].val * one_a;
+                dis[q] = b + idis[q] * one_a;
             }
-            labels[q] = results[q].id;
         }
     }
 };
 /** Structure that collects results in a min- or max-heap */
 template <class C, bool with_id_map = false>
-struct HeapHandler : SIMDResultHandler<C, with_id_map> {
+struct HeapHandler : ResultHandlerCompare<C, with_id_map> {
     using T = typename C::T;
     using TI = typename C::TI;
+    using RHC = ResultHandlerCompare<C, with_id_map>;
+    using RHC::normalizers;
-    int nq;
-    T* heap_dis_tab;
-    TI* heap_ids_tab;
+    std::vector<uint16_t> idis;
+    std::vector<TI> iids;
+    float* dis;
+    int64_t* ids;
     int64_t k; // number of results to keep
-    HeapHandler(
-            int nq,
-            T* heap_dis_tab,
-            TI* heap_ids_tab,
-            size_t k,
-            size_t ntotal)
-            : SIMDResultHandler<C, with_id_map>(ntotal),
-              nq(nq),
-              heap_dis_tab(heap_dis_tab),
-              heap_ids_tab(heap_ids_tab),
+    HeapHandler(size_t nq, size_t ntotal, int64_t k, float* dis, int64_t* ids)
+            : RHC(nq, ntotal),
+              idis(nq * k),
+              iids(nq * k),
+              dis(dis),
+              ids(ids),
               k(k) {
-        for (int q = 0; q < nq; q++) {
-            T* heap_dis_in = heap_dis_tab + q * k;
-            TI* heap_ids_in = heap_ids_tab + q * k;
-            heap_heapify<C>(k, heap_dis_in, heap_ids_in);
-        }
+        heap_heapify<C>(k * nq, idis.data(), iids.data());
     }
-    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         if (this->disable) {
             return;
         }
         this->adjust_with_origin(q, d0, d1);
-        T* heap_dis = heap_dis_tab + q * k;
-        TI* heap_ids = heap_ids_tab + q * k;
+        T* heap_dis = idis.data() + q * k;
+        TI* heap_ids = iids.data() + q * k;
         uint16_t cur_thresh =
                 heap_dis[0] < 65536 ? (uint16_t)(heap_dis[0]) : 0xffff;
@@ -313,16 +343,13 @@ struct HeapHandler : SIMDResultHandler<C, with_id_map> {
         }
     }
-    void to_flat_arrays(
-            float* distances,
-            int64_t* labels,
-            const float* normalizers = nullptr) override {
-        for (int q = 0; q < nq; q++) {
-            T* heap_dis_in = heap_dis_tab + q * k;
-            TI* heap_ids_in = heap_ids_tab + q * k;
+    void end() override {
+        for (int q = 0; q < this->nq; q++) {
+            T* heap_dis_in = idis.data() + q * k;
+            TI* heap_ids_in = iids.data() + q * k;
             heap_reorder<C>(k, heap_dis_in, heap_ids_in);
-            int64_t* heap_ids = labels + q * k;
-            float* heap_dis = distances + q * k;
+            float* heap_dis = dis + q * k;
+            int64_t* heap_ids = ids + q * k;
             float one_a = 1.0, b = 0.0;
             if (normalizers) {
@@ -330,8 +357,8 @@ struct HeapHandler : SIMDResultHandler<C, with_id_map> {
                 b = normalizers[2 * q + 1];
             }
             for (int j = 0; j < k; j++) {
-                heap_ids[j] = heap_ids_in[j];
                 heap_dis[j] = heap_dis_in[j] * one_a + b;
+                heap_ids[j] = heap_ids_in[j];
             }
         }
     }
@@ -342,114 +369,45 @@ struct HeapHandler : SIMDResultHandler<C, with_id_map> {
  * Results are stored when they are below the threshold until the capacity is
  * reached. Then a partition sort is used to update the threshold. */
-namespace {
-uint64_t get_cy() {
-#ifdef MICRO_BENCHMARK
-    uint32_t high, low;
-    asm volatile("rdtsc \n\t" : "=a"(low), "=d"(high));
-    return ((uint64_t)high << 32) | (low);
-#else
-    return 0;
-#endif
-}
-} // anonymous namespace
-template <class C>
-struct ReservoirTopN {
-    using T = typename C::T;
-    using TI = typename C::TI;
-    T* vals;
-    TI* ids;
-    size_t i;        // number of stored elements
-    size_t n;        // number of requested elements
-    size_t capacity; // size of storage
-    size_t cycles = 0;
-    T threshold; // current threshold
-    ReservoirTopN(size_t n, size_t capacity, T* vals, TI* ids)
-            : vals(vals), ids(ids), i(0), n(n), capacity(capacity) {
-        assert(n < capacity);
-        threshold = C::neutral();
-    }
-    void add(T val, TI id) {
-        if (C::cmp(threshold, val)) {
-            if (i == capacity) {
-                shrink_fuzzy();
-            }
-            vals[i] = val;
-            ids[i] = id;
-            i++;
-        }
-    }
-    /// shrink number of stored elements to n
-    void shrink_xx() {
-        uint64_t t0 = get_cy();
-        qselect(vals, ids, i, n);
-        i = n; // forget all elements above i = n
-        threshold = C::Crev::neutral();
-        for (size_t j = 0; j < n; j++) {
-            if (C::cmp(vals[j], threshold)) {
-                threshold = vals[j];
-            }
-        }
-        cycles += get_cy() - t0;
-    }
-    void shrink() {
-        uint64_t t0 = get_cy();
-        threshold = partition<C>(vals, ids, i, n);
-        i = n;
-        cycles += get_cy() - t0;
-    }
-    void shrink_fuzzy() {
-        uint64_t t0 = get_cy();
-        assert(i == capacity);
-        threshold = partition_fuzzy<C>(
-                vals, ids, capacity, n, (capacity + n) / 2, &i);
-        cycles += get_cy() - t0;
-    }
-};
 /** Handler built from several ReservoirTopN (one per query) */
 template <class C, bool with_id_map = false>
-struct ReservoirHandler : SIMDResultHandler<C, with_id_map> {
+struct ReservoirHandler : ResultHandlerCompare<C, with_id_map> {
     using T = typename C::T;
     using TI = typename C::TI;
+    using RHC = ResultHandlerCompare<C, with_id_map>;
+    using RHC::normalizers;
     size_t capacity; // rounded up to multiple of 16
+    // where the final results will be written
+    float* dis;
+    int64_t* ids;
     std::vector<TI> all_ids;
     AlignedTable<T> all_vals;
     std::vector<ReservoirTopN<C>> reservoirs;
-    uint64_t times[4];
-    ReservoirHandler(size_t nq, size_t ntotal, size_t n, size_t capacity_in)
-            : SIMDResultHandler<C, with_id_map>(ntotal),
-              capacity((capacity_in + 15) & ~15),
-              all_ids(nq * capacity),
-              all_vals(nq * capacity) {
+    ReservoirHandler(
+            size_t nq,
+            size_t ntotal,
+            size_t k,
+            size_t cap,
+            float* dis,
+            int64_t* ids)
+            : RHC(nq, ntotal), capacity((cap + 15) & ~15), dis(dis), ids(ids) {
         assert(capacity % 16 == 0);
-        for (size_t i = 0; i < nq; i++) {
+        all_ids.resize(nq * capacity);
+        all_vals.resize(nq * capacity);
+        for (size_t q = 0; q < nq; q++) {
             reservoirs.emplace_back(
-                    n,
+                    k,
                     capacity,
-                    all_vals.get() + i * capacity,
-                    all_ids.data() + i * capacity);
+                    all_vals.get() + q * capacity,
+                    all_ids.data() + q * capacity);
         }
-        times[0] = times[1] = times[2] = times[3] = 0;
     }
-    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
-        uint64_t t0 = get_cy();
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         if (this->disable) {
             return;
         }
@@ -457,8 +415,6 @@ struct ReservoirHandler : SIMDResultHandler<C, with_id_map> {
         ReservoirTopN<C>& res = reservoirs[q];
         uint32_t lt_mask = this->get_lt_mask(res.threshold, b, d0, d1);
-        uint64_t t1 = get_cy();
-        times[0] += t1 - t0;
         if (!lt_mask) {
             return;
@@ -474,20 +430,14 @@ struct ReservoirHandler : SIMDResultHandler<C, with_id_map> {
             T dis = d32tab[j];
             res.add(dis, this->adjust_id(b, j));
         }
-        times[1] += get_cy() - t1;
     }
-    void to_flat_arrays(
-            float* distances,
-            int64_t* labels,
-            const float* normalizers = nullptr) override {
+    void end() override {
         using Cf = typename std::conditional<
                 C::is_max,
                 CMax<float, int64_t>,
                 CMin<float, int64_t>>::type;
-        uint64_t t0 = get_cy();
-        uint64_t t3 = 0;
         std::vector<int> perm(reservoirs[0].n);
         for (int q = 0; q < reservoirs.size(); q++) {
             ReservoirTopN<C>& res = reservoirs[q];
@@ -496,8 +446,8 @@ struct ReservoirHandler : SIMDResultHandler<C, with_id_map> {
             if (res.i > res.n) {
                 res.shrink();
             }
-            int64_t* heap_ids = labels + q * n;
-            float* heap_dis = distances + q * n;
+            int64_t* heap_ids = ids + q * n;
+            float* heap_dis = dis + q * n;
             float one_a = 1.0, b = 0.0;
             if (normalizers) {
@@ -518,14 +468,236 @@ struct ReservoirHandler : SIMDResultHandler<C, with_id_map> {
             // possibly add empty results
             heap_heapify<Cf>(n - res.i, heap_dis + res.i, heap_ids + res.i);
+        }
+    }
+};
+/** Result handler for range search. The difficulty is that the range distances
+ * have to be scaled using the scaler.
+ */
+template <class C, bool with_id_map = false>
+struct RangeHandler : ResultHandlerCompare<C, with_id_map> {
+    using T = typename C::T;
+    using TI = typename C::TI;
+    using RHC = ResultHandlerCompare<C, with_id_map>;
+    using RHC::normalizers;
+    using RHC::nq;
+    RangeSearchResult& rres;
+    float radius;
+    std::vector<uint16_t> thresholds;
+    std::vector<size_t> n_per_query;
+    size_t q0 = 0;
+    // we cannot use the RangeSearchPartialResult interface because queries can
+    // be performed in batches
+    struct Triplet {
+        idx_t q;
+        idx_t b;
+        uint16_t dis;
+    };
+    std::vector<Triplet> triplets;
+    RangeHandler(RangeSearchResult& rres, float radius, size_t ntotal)
+            : RHC(rres.nq, ntotal), rres(rres), radius(radius) {
+        thresholds.resize(nq);
+        n_per_query.resize(nq + 1);
+    }
+    virtual void begin(const float* norms) {
+        normalizers = norms;
+        for (int q = 0; q < nq; ++q) {
+            thresholds[q] =
+                    normalizers[2 * q] * (radius - normalizers[2 * q + 1]);
+        }
+    }
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
+        if (this->disable) {
+            return;
+        }
+        this->adjust_with_origin(q, d0, d1);
+        uint32_t lt_mask = this->get_lt_mask(thresholds[q], b, d0, d1);
+        if (!lt_mask) {
+            return;
+        }
+        ALIGNED(32) uint16_t d32tab[32];
+        d0.store(d32tab);
+        d1.store(d32tab + 16);
+        while (lt_mask) {
+            // find first non-zero
+            int j = __builtin_ctz(lt_mask);
+            lt_mask -= 1 << j;
+            T dis = d32tab[j];
+            n_per_query[q]++;
+            triplets.push_back({idx_t(q + q0), this->adjust_id(b, j), dis});
+        }
+    }
-            t3 += res.cycles;
+    void end() override {
+        memcpy(rres.lims, n_per_query.data(), sizeof(n_per_query[0]) * nq);
+        rres.do_allocation();
+        for (auto it = triplets.begin(); it != triplets.end(); ++it) {
+            size_t& l = rres.lims[it->q];
+            rres.distances[l] = it->dis;
+            rres.labels[l] = it->b;
+            l++;
+        }
+        memmove(rres.lims + 1, rres.lims, sizeof(*rres.lims) * rres.nq);
+        rres.lims[0] = 0;
+        for (int q = 0; q < nq; q++) {
+            float one_a = 1 / normalizers[2 * q];
+            float b = normalizers[2 * q + 1];
+            for (size_t i = rres.lims[q]; i < rres.lims[q + 1]; i++) {
+                rres.distances[i] = rres.distances[i] * one_a + b;
+            }
         }
-        times[2] += get_cy() - t0;
-        times[3] += t3;
     }
 };
+#ifndef SWIG
+// handler for a subset of queries
+template <class C, bool with_id_map = false>
+struct PartialRangeHandler : RangeHandler<C, with_id_map> {
+    using T = typename C::T;
+    using TI = typename C::TI;
+    using RHC = RangeHandler<C, with_id_map>;
+    using RHC::normalizers;
+    using RHC::nq, RHC::q0, RHC::triplets, RHC::n_per_query;
+    RangeSearchPartialResult& pres;
+    PartialRangeHandler(
+            RangeSearchPartialResult& pres,
+            float radius,
+            size_t ntotal,
+            size_t q0,
+            size_t q1)
+            : RangeHandler<C, with_id_map>(*pres.res, radius, ntotal),
+              pres(pres) {
+        nq = q1 - q0;
+        this->q0 = q0;
+    }
+    // shift n_per_query right by one slot and zero the first entry
+    void shift_n_per_query() {
+        memmove(n_per_query.data() + 1,
+                n_per_query.data(),
+                nq * sizeof(n_per_query[0]));
+        n_per_query[0] = 0;
+    }
+    // commit to the partial result instead of the full RangeSearchResult
+    void end() override {
+        std::vector<typename RHC::Triplet> sorted_triplets(triplets.size());
+        for (int q = 0; q < nq; q++) {
+            n_per_query[q + 1] += n_per_query[q];
+        }
+        shift_n_per_query();
+        for (size_t i = 0; i < triplets.size(); i++) {
+            sorted_triplets[n_per_query[triplets[i].q - q0]++] = triplets[i];
+        }
+        shift_n_per_query();
+        size_t* lims = n_per_query.data();
+        for (int q = 0; q < nq; q++) {
+            float one_a = 1 / normalizers[2 * q];
+            float b = normalizers[2 * q + 1];
+            RangeQueryResult& qres = pres.new_result(q + q0);
+            for (size_t i = lims[q]; i < lims[q + 1]; i++) {
+                qres.add(
+                        sorted_triplets[i].dis * one_a + b,
+                        sorted_triplets[i].b);
+            }
+        }
+    }
+};
+#endif
+/********************************************************************************
+ * Dynamic dispatching function. The consumer should have a templatized method f
+ * that will be instantiated with the concrete SIMDResultHandler subclass, which
+ * is determined dynamically.
+ */
+template <class C, bool W, class Consumer, class... Types>
+void dispatch_SIMDResultHanlder_fixedCW(
+        SIMDResultHandler& res,
+        Consumer& consumer,
+        Types... args) {
+    if (auto resh = dynamic_cast<SingleResultHandler<C, W>*>(&res)) {
+        consumer.template f<SingleResultHandler<C, W>>(*resh, args...);
+    } else if (auto resh = dynamic_cast<HeapHandler<C, W>*>(&res)) {
+        consumer.template f<HeapHandler<C, W>>(*resh, args...);
+    } else if (auto resh = dynamic_cast<ReservoirHandler<C, W>*>(&res)) {
+        consumer.template f<ReservoirHandler<C, W>>(*resh, args...);
+    } else { // generic handler -- will not be inlined
+        FAISS_THROW_IF_NOT_FMT(
+                simd_result_handlers_accept_virtual,
+                "Running vitrual handler for %s",
+                typeid(res).name());
+        consumer.template f<SIMDResultHandler>(res, args...);
+    }
+}
+template <class C, class Consumer, class... Types>
+void dispatch_SIMDResultHanlder_fixedC(
+        SIMDResultHandler& res,
+        Consumer& consumer,
+        Types... args) {
+    if (res.with_fields) {
+        dispatch_SIMDResultHanlder_fixedCW<C, true>(res, consumer, args...);
+    } else {
+        dispatch_SIMDResultHanlder_fixedCW<C, false>(res, consumer, args...);
+    }
+}
+template <class Consumer, class... Types>
+void dispatch_SIMDResultHanlder(
+        SIMDResultHandler& res,
+        Consumer& consumer,
+        Types... args) {
+    if (res.sizeof_ids == 0) {
+        if (auto resh = dynamic_cast<StoreResultHandler*>(&res)) {
+            consumer.template f<StoreResultHandler>(*resh, args...);
+        } else if (auto resh = dynamic_cast<DummyResultHandler*>(&res)) {
+            consumer.template f<DummyResultHandler>(*resh, args...);
+        } else { // generic path
+            FAISS_THROW_IF_NOT_FMT(
+                    simd_result_handlers_accept_virtual,
+                    "Running vitrual handler for %s",
+                    typeid(res).name());
+            consumer.template f<SIMDResultHandler>(res, args...);
+        }
+    } else if (res.sizeof_ids == sizeof(int)) {
+        if (res.is_CMax) {
+            dispatch_SIMDResultHanlder_fixedC<CMax<uint16_t, int>>(
+                    res, consumer, args...);
+        } else {
+            dispatch_SIMDResultHanlder_fixedC<CMin<uint16_t, int>>(
+                    res, consumer, args...);
+        }
+    } else if (res.sizeof_ids == sizeof(int64_t)) {
+        if (res.is_CMax) {
+            dispatch_SIMDResultHanlder_fixedC<CMax<uint16_t, int64_t>>(
+                    res, consumer, args...);
+        } else {
+            dispatch_SIMDResultHanlder_fixedC<CMin<uint16_t, int64_t>>(
+                    res, consumer, args...);
+        }
+    } else {
+        FAISS_THROW_FMT("Unknown id size %d", res.sizeof_ids);
+    }
+}
 } // namespace simd_result_handlers
 } // namespace faiss