RubyGems - faiss - Versions diffs - 0.3.0 → 0.3.1 - Mend

faiss 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (171)

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/LICENSE.txt +1 -1
data/README.md +1 -1
data/ext/faiss/extconf.rb +9 -2
data/ext/faiss/index.cpp +1 -1
data/ext/faiss/index_binary.cpp +2 -2
data/ext/faiss/product_quantizer.cpp +1 -1
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +7 -7
data/vendor/faiss/faiss/AutoTune.h +0 -1
data/vendor/faiss/faiss/Clustering.cpp +4 -18
data/vendor/faiss/faiss/Clustering.h +31 -21
data/vendor/faiss/faiss/IVFlib.cpp +22 -11
data/vendor/faiss/faiss/Index.cpp +1 -1
data/vendor/faiss/faiss/Index.h +20 -5
data/vendor/faiss/faiss/Index2Layer.cpp +7 -7
data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +176 -166
data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +15 -15
data/vendor/faiss/faiss/IndexBinary.cpp +9 -4
data/vendor/faiss/faiss/IndexBinary.h +8 -19
data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -1
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +24 -31
data/vendor/faiss/faiss/IndexBinaryHash.cpp +25 -50
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +106 -187
data/vendor/faiss/faiss/IndexFastScan.cpp +90 -159
data/vendor/faiss/faiss/IndexFastScan.h +9 -8
data/vendor/faiss/faiss/IndexFlat.cpp +195 -3
data/vendor/faiss/faiss/IndexFlat.h +20 -1
data/vendor/faiss/faiss/IndexFlatCodes.cpp +11 -0
data/vendor/faiss/faiss/IndexFlatCodes.h +3 -1
data/vendor/faiss/faiss/IndexHNSW.cpp +112 -316
data/vendor/faiss/faiss/IndexHNSW.h +12 -48
data/vendor/faiss/faiss/IndexIDMap.cpp +69 -28
data/vendor/faiss/faiss/IndexIDMap.h +24 -2
data/vendor/faiss/faiss/IndexIVF.cpp +159 -53
data/vendor/faiss/faiss/IndexIVF.h +37 -5
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +18 -26
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +3 -2
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +19 -46
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -3
data/vendor/faiss/faiss/IndexIVFFastScan.cpp +433 -405
data/vendor/faiss/faiss/IndexIVFFastScan.h +56 -26
data/vendor/faiss/faiss/IndexIVFFlat.cpp +15 -5
data/vendor/faiss/faiss/IndexIVFFlat.h +3 -2
data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +172 -0
data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +56 -0
data/vendor/faiss/faiss/IndexIVFPQ.cpp +78 -122
data/vendor/faiss/faiss/IndexIVFPQ.h +6 -7
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +18 -50
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +4 -3
data/vendor/faiss/faiss/IndexIVFPQR.cpp +45 -29
data/vendor/faiss/faiss/IndexIVFPQR.h +5 -2
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +25 -27
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +6 -6
data/vendor/faiss/faiss/IndexLSH.cpp +14 -16
data/vendor/faiss/faiss/IndexNNDescent.cpp +3 -4
data/vendor/faiss/faiss/IndexNSG.cpp +11 -27
data/vendor/faiss/faiss/IndexNSG.h +10 -10
data/vendor/faiss/faiss/IndexPQ.cpp +72 -88
data/vendor/faiss/faiss/IndexPQ.h +1 -4
data/vendor/faiss/faiss/IndexPQFastScan.cpp +1 -1
data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -31
data/vendor/faiss/faiss/IndexRefine.cpp +49 -19
data/vendor/faiss/faiss/IndexRefine.h +7 -0
data/vendor/faiss/faiss/IndexReplicas.cpp +23 -26
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +22 -16
data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -4
data/vendor/faiss/faiss/IndexShards.cpp +21 -29
data/vendor/faiss/faiss/IndexShardsIVF.cpp +1 -2
data/vendor/faiss/faiss/MatrixStats.cpp +17 -32
data/vendor/faiss/faiss/MatrixStats.h +21 -9
data/vendor/faiss/faiss/MetaIndexes.cpp +35 -35
data/vendor/faiss/faiss/VectorTransform.cpp +13 -26
data/vendor/faiss/faiss/VectorTransform.h +7 -7
data/vendor/faiss/faiss/clone_index.cpp +15 -10
data/vendor/faiss/faiss/clone_index.h +3 -0
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +87 -4
data/vendor/faiss/faiss/gpu/GpuCloner.h +22 -0
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +7 -0
data/vendor/faiss/faiss/gpu/GpuDistance.h +46 -38
data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +4 -4
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +8 -9
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +18 -3
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -11
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +1 -3
data/vendor/faiss/faiss/gpu/GpuResources.cpp +24 -3
data/vendor/faiss/faiss/gpu/GpuResources.h +39 -11
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +117 -17
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +57 -3
data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +1 -1
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +25 -0
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +129 -9
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +267 -40
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +299 -208
data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +1 -0
data/vendor/faiss/faiss/gpu/utils/RaftUtils.h +75 -0
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +3 -1
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +5 -5
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +1 -1
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +1 -2
data/vendor/faiss/faiss/impl/DistanceComputer.h +24 -1
data/vendor/faiss/faiss/impl/FaissException.h +13 -34
data/vendor/faiss/faiss/impl/HNSW.cpp +321 -70
data/vendor/faiss/faiss/impl/HNSW.h +9 -8
data/vendor/faiss/faiss/impl/IDSelector.h +4 -4
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +3 -1
data/vendor/faiss/faiss/impl/NNDescent.cpp +29 -19
data/vendor/faiss/faiss/impl/NSG.h +1 -1
data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +14 -12
data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +1 -1
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +24 -22
data/vendor/faiss/faiss/impl/ProductQuantizer.h +1 -1
data/vendor/faiss/faiss/impl/Quantizer.h +1 -1
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +27 -1015
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +5 -63
data/vendor/faiss/faiss/impl/ResultHandler.h +232 -176
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +444 -104
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +0 -8
data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +280 -42
data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +21 -14
data/vendor/faiss/faiss/impl/code_distance/code_distance.h +22 -12
data/vendor/faiss/faiss/impl/index_read.cpp +45 -19
data/vendor/faiss/faiss/impl/index_write.cpp +60 -41
data/vendor/faiss/faiss/impl/io.cpp +10 -10
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
data/vendor/faiss/faiss/impl/platform_macros.h +18 -1
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +3 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +7 -6
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +52 -38
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +40 -49
data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +960 -0
data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +176 -0
data/vendor/faiss/faiss/impl/simd_result_handlers.h +374 -202
data/vendor/faiss/faiss/index_factory.cpp +10 -7
data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -1
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +27 -9
data/vendor/faiss/faiss/invlists/InvertedLists.h +12 -3
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +3 -3
data/vendor/faiss/faiss/python/python_callbacks.cpp +1 -1
data/vendor/faiss/faiss/utils/Heap.cpp +3 -1
data/vendor/faiss/faiss/utils/WorkerThread.h +1 -0
data/vendor/faiss/faiss/utils/distances.cpp +128 -74
data/vendor/faiss/faiss/utils/distances.h +81 -4
data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +5 -5
data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +1 -1
data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +5 -5
data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +1 -1
data/vendor/faiss/faiss/utils/distances_simd.cpp +428 -70
data/vendor/faiss/faiss/utils/fp16-arm.h +29 -0
data/vendor/faiss/faiss/utils/fp16.h +2 -0
data/vendor/faiss/faiss/utils/hamming.cpp +162 -110
data/vendor/faiss/faiss/utils/hamming.h +58 -0
data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +16 -89
data/vendor/faiss/faiss/utils/hamming_distance/common.h +1 -0
data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +15 -87
data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +57 -0
data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +14 -104
data/vendor/faiss/faiss/utils/partitioning.cpp +3 -4
data/vendor/faiss/faiss/utils/prefetch.h +77 -0
data/vendor/faiss/faiss/utils/quantize_lut.cpp +0 -14
data/vendor/faiss/faiss/utils/simdlib_avx2.h +0 -6
data/vendor/faiss/faiss/utils/simdlib_neon.h +72 -77
data/vendor/faiss/faiss/utils/sorting.cpp +140 -5
data/vendor/faiss/faiss/utils/sorting.h +27 -0
data/vendor/faiss/faiss/utils/utils.cpp +112 -6
data/vendor/faiss/faiss/utils/utils.h +57 -20
metadata +10 -3

data/vendor/faiss/faiss/impl/simd_result_handlers.h CHANGED Viewed

@@ -14,40 +14,86 @@
 #include <faiss/utils/Heap.h>
 #include <faiss/utils/simdlib.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/ResultHandler.h>
 #include <faiss/impl/platform_macros.h>
 #include <faiss/utils/AlignedTable.h>
 #include <faiss/utils/partitioning.h>
 /** This file contains callbacks for kernels that compute distances.
- *
- * The SIMDResultHandler object is intended to be templated and inlined.
- * Methods:
- * - handle(): called when 32 distances are computed and provided in two
- *   simd16uint16. (q, b) indicate which entry it is in the block.
- * - set_block_origin(): set the sub-matrix that is being computed
  */
 namespace faiss {
+struct SIMDResultHandler {
+    // used to dispatch templates
+    bool is_CMax = false;
+    uint8_t sizeof_ids = 0;
+    bool with_fields = false;
+    /**  called when 32 distances are computed and provided in two
+     *   simd16uint16. (q, b) indicate which entry it is in the block. */
+    virtual void handle(
+            size_t q,
+            size_t b,
+            simd16uint16 d0,
+            simd16uint16 d1) = 0;
+    /// set the sub-matrix that is being computed
+    virtual void set_block_origin(size_t i0, size_t j0) = 0;
+    virtual ~SIMDResultHandler() {}
+};
+/* Result handler that will return float results eventually */
+struct SIMDResultHandlerToFloat : SIMDResultHandler {
+    size_t nq;     // number of queries
+    size_t ntotal; // ignore excess elements after ntotal
+    /// these fields are used mainly for the IVF variants (with_id_map=true)
+    const idx_t* id_map = nullptr; // map offset in invlist to vector id
+    const int* q_map = nullptr;    // map q to global query
+    const uint16_t* dbias =
+            nullptr; // table of biases to add to each query (for IVF L2 search)
+    const float* normalizers = nullptr; // size 2 * nq: (scale, offset) per query, to convert to float
+    SIMDResultHandlerToFloat(size_t nq, size_t ntotal)
+            : nq(nq), ntotal(ntotal) {}
+    virtual void begin(const float* norms) {
+        normalizers = norms;
+    }
+    // called at end of search to convert int16 distances to float, before
+    // normalizers are deallocated
+    virtual void end() {
+        normalizers = nullptr;
+    }
+};
+FAISS_API extern bool simd_result_handlers_accept_virtual;
 namespace simd_result_handlers {
-/** Dummy structure that just computes a checksum on results
+/** Dummy structure that just computes a chqecksum on results
  * (to avoid the computation to be optimized away) */
-struct DummyResultHandler {
+struct DummyResultHandler : SIMDResultHandler {
     size_t cs = 0;
-    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         cs += q * 123 + b * 789 + d0.get_scalar_0() + d1.get_scalar_0();
     }
-    void set_block_origin(size_t, size_t) {}
+    void set_block_origin(size_t, size_t) final {}
+    ~DummyResultHandler() {}
 };
 /** memorize results in a nq-by-nb matrix.
  *
  * j0 is the current upper-left block of the matrix
  */
-struct StoreResultHandler {
+struct StoreResultHandler : SIMDResultHandler {
     uint16_t* data;
     size_t ld; // total number of columns
     size_t i0 = 0;
@@ -55,32 +101,32 @@ struct StoreResultHandler {
     StoreResultHandler(uint16_t* data, size_t ld) : data(data), ld(ld) {}
-    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         size_t ofs = (q + i0) * ld + j0 + b * 32;
         d0.store(data + ofs);
         d1.store(data + ofs + 16);
     }
-    void set_block_origin(size_t i0, size_t j0) {
-        this->i0 = i0;
-        this->j0 = j0;
+    void set_block_origin(size_t i0_in, size_t j0_in) final {
+        this->i0 = i0_in;
+        this->j0 = j0_in;
     }
 };
 /** stores results in fixed-size matrix. */
 template <int NQ, int BB>
-struct FixedStorageHandler {
+struct FixedStorageHandler : SIMDResultHandler {
     simd16uint16 dis[NQ][BB];
     int i0 = 0;
-    void handle(int q, int b, simd16uint16 d0, simd16uint16 d1) {
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         dis[q + i0][2 * b] = d0;
         dis[q + i0][2 * b + 1] = d1;
     }
-    void set_block_origin(size_t i0, size_t j0) {
-        this->i0 = i0;
-        assert(j0 == 0);
+    void set_block_origin(size_t i0_in, size_t j0_in) final {
+        this->i0 = i0_in;
+        assert(j0_in == 0);
     }
     template <class OtherResultHandler>
@@ -91,30 +137,29 @@ struct FixedStorageHandler {
             }
         }
     }
+    virtual ~FixedStorageHandler() {}
 };
-/** Record origin of current block  */
+/** Result handler that compares distances to check if they need to be kept */
 template <class C, bool with_id_map>
-struct SIMDResultHandler {
+struct ResultHandlerCompare : SIMDResultHandlerToFloat {
     using TI = typename C::TI;
     bool disable = false;
     int64_t i0 = 0; // query origin
     int64_t j0 = 0; // db origin
-    size_t ntotal;  // ignore excess elements after ntotal
-    /// these fields are used mainly for the IVF variants (with_id_map=true)
-    const TI* id_map;      // map offset in invlist to vector id
-    const int* q_map;      // map q to global query
-    const uint16_t* dbias; // table of biases to add to each query
-    explicit SIMDResultHandler(size_t ntotal)
-            : ntotal(ntotal), id_map(nullptr), q_map(nullptr), dbias(nullptr) {}
+    ResultHandlerCompare(size_t nq, size_t ntotal)
+            : SIMDResultHandlerToFloat(nq, ntotal) {
+        this->is_CMax = C::is_max;
+        this->sizeof_ids = sizeof(typename C::TI);
+        this->with_fields = with_id_map;
+    }
-    void set_block_origin(size_t i0, size_t j0) {
-        this->i0 = i0;
-        this->j0 = j0;
+    void set_block_origin(size_t i0_in, size_t j0_in) final {
+        this->i0 = i0_in;
+        this->j0 = j0_in;
     }
     // adjust handler data for IVF.
@@ -172,43 +217,37 @@ struct SIMDResultHandler {
         return lt_mask;
     }
-    virtual void to_flat_arrays(
-            float* distances,
-            int64_t* labels,
-            const float* normalizers = nullptr) = 0;
-    virtual ~SIMDResultHandler() {}
+    virtual ~ResultHandlerCompare() {}
 };
 /** Special version for k=1 */
 template <class C, bool with_id_map = false>
-struct SingleResultHandler : SIMDResultHandler<C, with_id_map> {
+struct SingleResultHandler : ResultHandlerCompare<C, with_id_map> {
     using T = typename C::T;
     using TI = typename C::TI;
+    using RHC = ResultHandlerCompare<C, with_id_map>;
+    using RHC::normalizers;
-    struct Result {
-        T val;
-        TI id;
-    };
-    std::vector<Result> results;
+    std::vector<int16_t> idis;
+    float* dis;
+    int64_t* ids;
-    SingleResultHandler(size_t nq, size_t ntotal)
-            : SIMDResultHandler<C, with_id_map>(ntotal), results(nq) {
+    SingleResultHandler(size_t nq, size_t ntotal, float* dis, int64_t* ids)
+            : RHC(nq, ntotal), idis(nq), dis(dis), ids(ids) {
         for (int i = 0; i < nq; i++) {
-            Result res = {C::neutral(), -1};
-            results[i] = res;
+            ids[i] = -1;
+            idis[i] = C::neutral();
         }
     }
-    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         if (this->disable) {
             return;
         }
         this->adjust_with_origin(q, d0, d1);
-        Result& res = results[q];
-        uint32_t lt_mask = this->get_lt_mask(res.val, b, d0, d1);
+        uint32_t lt_mask = this->get_lt_mask(idis[q], b, d0, d1);
         if (!lt_mask) {
             return;
         }
@@ -221,70 +260,61 @@ struct SingleResultHandler : SIMDResultHandler<C, with_id_map> {
             // find first non-zero
             int j = __builtin_ctz(lt_mask);
             lt_mask -= 1 << j;
-            T dis = d32tab[j];
-            if (C::cmp(res.val, dis)) {
-                res.val = dis;
-                res.id = this->adjust_id(b, j);
+            T d = d32tab[j];
+            if (C::cmp(idis[q], d)) {
+                idis[q] = d;
+                ids[q] = this->adjust_id(b, j);
             }
         }
     }
-    void to_flat_arrays(
-            float* distances,
-            int64_t* labels,
-            const float* normalizers = nullptr) override {
-        for (int q = 0; q < results.size(); q++) {
+    void end() {
+        for (int q = 0; q < this->nq; q++) {
             if (!normalizers) {
-                distances[q] = results[q].val;
+                dis[q] = idis[q];
             } else {
                 float one_a = 1 / normalizers[2 * q];
                 float b = normalizers[2 * q + 1];
-                distances[q] = b + results[q].val * one_a;
+                dis[q] = b + idis[q] * one_a;
             }
-            labels[q] = results[q].id;
         }
     }
 };
 /** Structure that collects results in a min- or max-heap */
 template <class C, bool with_id_map = false>
-struct HeapHandler : SIMDResultHandler<C, with_id_map> {
+struct HeapHandler : ResultHandlerCompare<C, with_id_map> {
     using T = typename C::T;
     using TI = typename C::TI;
+    using RHC = ResultHandlerCompare<C, with_id_map>;
+    using RHC::normalizers;
-    int nq;
-    T* heap_dis_tab;
-    TI* heap_ids_tab;
+    std::vector<uint16_t> idis;
+    std::vector<TI> iids;
+    float* dis;
+    int64_t* ids;
     int64_t k; // number of results to keep
-    HeapHandler(
-            int nq,
-            T* heap_dis_tab,
-            TI* heap_ids_tab,
-            size_t k,
-            size_t ntotal)
-            : SIMDResultHandler<C, with_id_map>(ntotal),
-              nq(nq),
-              heap_dis_tab(heap_dis_tab),
-              heap_ids_tab(heap_ids_tab),
+    HeapHandler(size_t nq, size_t ntotal, int64_t k, float* dis, int64_t* ids)
+            : RHC(nq, ntotal),
+              idis(nq * k),
+              iids(nq * k),
+              dis(dis),
+              ids(ids),
               k(k) {
-        for (int q = 0; q < nq; q++) {
-            T* heap_dis_in = heap_dis_tab + q * k;
-            TI* heap_ids_in = heap_ids_tab + q * k;
-            heap_heapify<C>(k, heap_dis_in, heap_ids_in);
-        }
+        heap_heapify<C>(k * nq, idis.data(), iids.data());
     }
-    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         if (this->disable) {
             return;
         }
         this->adjust_with_origin(q, d0, d1);
-        T* heap_dis = heap_dis_tab + q * k;
-        TI* heap_ids = heap_ids_tab + q * k;
+        T* heap_dis = idis.data() + q * k;
+        TI* heap_ids = iids.data() + q * k;
         uint16_t cur_thresh =
                 heap_dis[0] < 65536 ? (uint16_t)(heap_dis[0]) : 0xffff;
@@ -313,16 +343,13 @@ struct HeapHandler : SIMDResultHandler<C, with_id_map> {
         }
     }
-    void to_flat_arrays(
-            float* distances,
-            int64_t* labels,
-            const float* normalizers = nullptr) override {
-        for (int q = 0; q < nq; q++) {
-            T* heap_dis_in = heap_dis_tab + q * k;
-            TI* heap_ids_in = heap_ids_tab + q * k;
+    void end() override {
+        for (int q = 0; q < this->nq; q++) {
+            T* heap_dis_in = idis.data() + q * k;
+            TI* heap_ids_in = iids.data() + q * k;
             heap_reorder<C>(k, heap_dis_in, heap_ids_in);
-            int64_t* heap_ids = labels + q * k;
-            float* heap_dis = distances + q * k;
+            float* heap_dis = dis + q * k;
+            int64_t* heap_ids = ids + q * k;
             float one_a = 1.0, b = 0.0;
             if (normalizers) {
@@ -330,8 +357,8 @@ struct HeapHandler : SIMDResultHandler<C, with_id_map> {
                 b = normalizers[2 * q + 1];
             }
             for (int j = 0; j < k; j++) {
-                heap_ids[j] = heap_ids_in[j];
                 heap_dis[j] = heap_dis_in[j] * one_a + b;
+                heap_ids[j] = heap_ids_in[j];
             }
         }
     }
@@ -342,114 +369,45 @@ struct HeapHandler : SIMDResultHandler<C, with_id_map> {
  * Results are stored when they are below the threshold until the capacity is
  * reached. Then a partition sort is used to update the threshold. */
-namespace {
-uint64_t get_cy() {
-#ifdef MICRO_BENCHMARK
-    uint32_t high, low;
-    asm volatile("rdtsc \n\t" : "=a"(low), "=d"(high));
-    return ((uint64_t)high << 32) | (low);
-#else
-    return 0;
-#endif
-}
-} // anonymous namespace
-template <class C>
-struct ReservoirTopN {
-    using T = typename C::T;
-    using TI = typename C::TI;
-    T* vals;
-    TI* ids;
-    size_t i;        // number of stored elements
-    size_t n;        // number of requested elements
-    size_t capacity; // size of storage
-    size_t cycles = 0;
-    T threshold; // current threshold
-    ReservoirTopN(size_t n, size_t capacity, T* vals, TI* ids)
-            : vals(vals), ids(ids), i(0), n(n), capacity(capacity) {
-        assert(n < capacity);
-        threshold = C::neutral();
-    }
-    void add(T val, TI id) {
-        if (C::cmp(threshold, val)) {
-            if (i == capacity) {
-                shrink_fuzzy();
-            }
-            vals[i] = val;
-            ids[i] = id;
-            i++;
-        }
-    }
-    /// shrink number of stored elements to n
-    void shrink_xx() {
-        uint64_t t0 = get_cy();
-        qselect(vals, ids, i, n);
-        i = n; // forget all elements above i = n
-        threshold = C::Crev::neutral();
-        for (size_t j = 0; j < n; j++) {
-            if (C::cmp(vals[j], threshold)) {
-                threshold = vals[j];
-            }
-        }
-        cycles += get_cy() - t0;
-    }
-    void shrink() {
-        uint64_t t0 = get_cy();
-        threshold = partition<C>(vals, ids, i, n);
-        i = n;
-        cycles += get_cy() - t0;
-    }
-    void shrink_fuzzy() {
-        uint64_t t0 = get_cy();
-        assert(i == capacity);
-        threshold = partition_fuzzy<C>(
-                vals, ids, capacity, n, (capacity + n) / 2, &i);
-        cycles += get_cy() - t0;
-    }
-};
 /** Handler built from several ReservoirTopN (one per query) */
 template <class C, bool with_id_map = false>
-struct ReservoirHandler : SIMDResultHandler<C, with_id_map> {
+struct ReservoirHandler : ResultHandlerCompare<C, with_id_map> {
     using T = typename C::T;
     using TI = typename C::TI;
+    using RHC = ResultHandlerCompare<C, with_id_map>;
+    using RHC::normalizers;
     size_t capacity; // rounded up to multiple of 16
+    // where the final results will be written
+    float* dis;
+    int64_t* ids;
     std::vector<TI> all_ids;
     AlignedTable<T> all_vals;
     std::vector<ReservoirTopN<C>> reservoirs;
-    uint64_t times[4];
-    ReservoirHandler(size_t nq, size_t ntotal, size_t n, size_t capacity_in)
-            : SIMDResultHandler<C, with_id_map>(ntotal),
-              capacity((capacity_in + 15) & ~15),
-              all_ids(nq * capacity),
-              all_vals(nq * capacity) {
+    ReservoirHandler(
+            size_t nq,
+            size_t ntotal,
+            size_t k,
+            size_t cap,
+            float* dis,
+            int64_t* ids)
+            : RHC(nq, ntotal), capacity((cap + 15) & ~15), dis(dis), ids(ids) {
         assert(capacity % 16 == 0);
-        for (size_t i = 0; i < nq; i++) {
+        all_ids.resize(nq * capacity);
+        all_vals.resize(nq * capacity);
+        for (size_t q = 0; q < nq; q++) {
             reservoirs.emplace_back(
-                    n,
+                    k,
                     capacity,
-                    all_vals.get() + i * capacity,
-                    all_ids.data() + i * capacity);
+                    all_vals.get() + q * capacity,
+                    all_ids.data() + q * capacity);
         }
-        times[0] = times[1] = times[2] = times[3] = 0;
     }
-    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
-        uint64_t t0 = get_cy();
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         if (this->disable) {
             return;
         }
@@ -457,8 +415,6 @@ struct ReservoirHandler : SIMDResultHandler<C, with_id_map> {
         ReservoirTopN<C>& res = reservoirs[q];
         uint32_t lt_mask = this->get_lt_mask(res.threshold, b, d0, d1);
-        uint64_t t1 = get_cy();
-        times[0] += t1 - t0;
         if (!lt_mask) {
             return;
@@ -474,20 +430,14 @@ struct ReservoirHandler : SIMDResultHandler<C, with_id_map> {
             T dis = d32tab[j];
             res.add(dis, this->adjust_id(b, j));
         }
-        times[1] += get_cy() - t1;
     }
-    void to_flat_arrays(
-            float* distances,
-            int64_t* labels,
-            const float* normalizers = nullptr) override {
+    void end() override {
         using Cf = typename std::conditional<
                 C::is_max,
                 CMax<float, int64_t>,
                 CMin<float, int64_t>>::type;
-        uint64_t t0 = get_cy();
-        uint64_t t3 = 0;
         std::vector<int> perm(reservoirs[0].n);
         for (int q = 0; q < reservoirs.size(); q++) {
             ReservoirTopN<C>& res = reservoirs[q];
@@ -496,8 +446,8 @@ struct ReservoirHandler : SIMDResultHandler<C, with_id_map> {
             if (res.i > res.n) {
                 res.shrink();
             }
-            int64_t* heap_ids = labels + q * n;
-            float* heap_dis = distances + q * n;
+            int64_t* heap_ids = ids + q * n;
+            float* heap_dis = dis + q * n;
             float one_a = 1.0, b = 0.0;
             if (normalizers) {
@@ -518,14 +468,236 @@ struct ReservoirHandler : SIMDResultHandler<C, with_id_map> {
             // possibly add empty results
             heap_heapify<Cf>(n - res.i, heap_dis + res.i, heap_ids + res.i);
+        }
+    }
+};
+/** Result handler for range search. The difficulty is that the range distances
+ * have to be scaled using the scaler.
+ */
+template <class C, bool with_id_map = false>
+struct RangeHandler : ResultHandlerCompare<C, with_id_map> {
+    using T = typename C::T;
+    using TI = typename C::TI;
+    using RHC = ResultHandlerCompare<C, with_id_map>;
+    using RHC::normalizers;
+    using RHC::nq;
+    RangeSearchResult& rres;
+    float radius;
+    std::vector<uint16_t> thresholds;
+    std::vector<size_t> n_per_query;
+    size_t q0 = 0;
+    // we cannot use the RangeSearchPartialResult interface because queries can
+    // be performed by batches
+    struct Triplet {
+        idx_t q;
+        idx_t b;
+        uint16_t dis;
+    };
+    std::vector<Triplet> triplets;
+    RangeHandler(RangeSearchResult& rres, float radius, size_t ntotal)
+            : RHC(rres.nq, ntotal), rres(rres), radius(radius) {
+        thresholds.resize(nq);
+        n_per_query.resize(nq + 1);
+    }
+    virtual void begin(const float* norms) {
+        normalizers = norms;
+        for (int q = 0; q < nq; ++q) {
+            thresholds[q] =
+                    normalizers[2 * q] * (radius - normalizers[2 * q + 1]);
+        }
+    }
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
+        if (this->disable) {
+            return;
+        }
+        this->adjust_with_origin(q, d0, d1);
+        uint32_t lt_mask = this->get_lt_mask(thresholds[q], b, d0, d1);
+        if (!lt_mask) {
+            return;
+        }
+        ALIGNED(32) uint16_t d32tab[32];
+        d0.store(d32tab);
+        d1.store(d32tab + 16);
+        while (lt_mask) {
+            // find first non-zero
+            int j = __builtin_ctz(lt_mask);
+            lt_mask -= 1 << j;
+            T dis = d32tab[j];
+            n_per_query[q]++;
+            triplets.push_back({idx_t(q + q0), this->adjust_id(b, j), dis});
+        }
+    }
-            t3 += res.cycles;
+    void end() override {
+        memcpy(rres.lims, n_per_query.data(), sizeof(n_per_query[0]) * nq);
+        rres.do_allocation();
+        for (auto it = triplets.begin(); it != triplets.end(); ++it) {
+            size_t& l = rres.lims[it->q];
+            rres.distances[l] = it->dis;
+            rres.labels[l] = it->b;
+            l++;
+        }
+        memmove(rres.lims + 1, rres.lims, sizeof(*rres.lims) * rres.nq);
+        rres.lims[0] = 0;
+        for (int q = 0; q < nq; q++) {
+            float one_a = 1 / normalizers[2 * q];
+            float b = normalizers[2 * q + 1];
+            for (size_t i = rres.lims[q]; i < rres.lims[q + 1]; i++) {
+                rres.distances[i] = rres.distances[i] * one_a + b;
+            }
         }
-        times[2] += get_cy() - t0;
-        times[3] += t3;
     }
 };
+#ifndef SWIG
+// handler for a subset of queries
+template <class C, bool with_id_map = false>
+struct PartialRangeHandler : RangeHandler<C, with_id_map> {
+    using T = typename C::T;
+    using TI = typename C::TI;
+    using RHC = RangeHandler<C, with_id_map>;
+    using RHC::normalizers;
+    using RHC::nq, RHC::q0, RHC::triplets, RHC::n_per_query;
+    RangeSearchPartialResult& pres;
+    PartialRangeHandler(
+            RangeSearchPartialResult& pres,
+            float radius,
+            size_t ntotal,
+            size_t q0,
+            size_t q1)
+            : RangeHandler<C, with_id_map>(*pres.res, radius, ntotal),
+              pres(pres) {
+        nq = q1 - q0;
+        this->q0 = q0;
+    }
+    // shift n_per_query one slot to the right and reset entry 0
+    void shift_n_per_query() {
+        memmove(n_per_query.data() + 1,
+                n_per_query.data(),
+                nq * sizeof(n_per_query[0]));
+        n_per_query[0] = 0;
+    }
+    // commit to partial result instead of full RangeResult
+    void end() override {
+        std::vector<typename RHC::Triplet> sorted_triplets(triplets.size());
+        for (int q = 0; q < nq; q++) {
+            n_per_query[q + 1] += n_per_query[q];
+        }
+        shift_n_per_query();
+        for (size_t i = 0; i < triplets.size(); i++) {
+            sorted_triplets[n_per_query[triplets[i].q - q0]++] = triplets[i];
+        }
+        shift_n_per_query();
+        size_t* lims = n_per_query.data();
+        for (int q = 0; q < nq; q++) {
+            float one_a = 1 / normalizers[2 * q];
+            float b = normalizers[2 * q + 1];
+            RangeQueryResult& qres = pres.new_result(q + q0);
+            for (size_t i = lims[q]; i < lims[q + 1]; i++) {
+                qres.add(
+                        sorted_triplets[i].dis * one_a + b,
+                        sorted_triplets[i].b);
+            }
+        }
+    }
+};
+#endif
+/********************************************************************************
+ * Dynamic dispatching function. The consumer should have a templatized method f
+ * that will be replaced with the actual SIMDResultHandler that is determined
+ * dynamically.
+ */
+template <class C, bool W, class Consumer, class... Types>
+void dispatch_SIMDResultHanlder_fixedCW(
+        SIMDResultHandler& res,
+        Consumer& consumer,
+        Types... args) {
+    if (auto resh = dynamic_cast<SingleResultHandler<C, W>*>(&res)) {
+        consumer.template f<SingleResultHandler<C, W>>(*resh, args...);
+    } else if (auto resh = dynamic_cast<HeapHandler<C, W>*>(&res)) {
+        consumer.template f<HeapHandler<C, W>>(*resh, args...);
+    } else if (auto resh = dynamic_cast<ReservoirHandler<C, W>*>(&res)) {
+        consumer.template f<ReservoirHandler<C, W>>(*resh, args...);
+    } else { // generic handler -- will not be inlined
+        FAISS_THROW_IF_NOT_FMT(
+                simd_result_handlers_accept_virtual,
+                "Running vitrual handler for %s",
+                typeid(res).name());
+        consumer.template f<SIMDResultHandler>(res, args...);
+    }
+}
+template <class C, class Consumer, class... Types>
+void dispatch_SIMDResultHanlder_fixedC(
+        SIMDResultHandler& res,
+        Consumer& consumer,
+        Types... args) {
+    if (res.with_fields) {
+        dispatch_SIMDResultHanlder_fixedCW<C, true>(res, consumer, args...);
+    } else {
+        dispatch_SIMDResultHanlder_fixedCW<C, false>(res, consumer, args...);
+    }
+}
+template <class Consumer, class... Types>
+void dispatch_SIMDResultHanlder(
+        SIMDResultHandler& res,
+        Consumer& consumer,
+        Types... args) {
+    if (res.sizeof_ids == 0) {
+        if (auto resh = dynamic_cast<StoreResultHandler*>(&res)) {
+            consumer.template f<StoreResultHandler>(*resh, args...);
+        } else if (auto resh = dynamic_cast<DummyResultHandler*>(&res)) {
+            consumer.template f<DummyResultHandler>(*resh, args...);
+        } else { // generic path
+            FAISS_THROW_IF_NOT_FMT(
+                    simd_result_handlers_accept_virtual,
+                    "Running vitrual handler for %s",
+                    typeid(res).name());
+            consumer.template f<SIMDResultHandler>(res, args...);
+        }
+    } else if (res.sizeof_ids == sizeof(int)) {
+        if (res.is_CMax) {
+            dispatch_SIMDResultHanlder_fixedC<CMax<uint16_t, int>>(
+                    res, consumer, args...);
+        } else {
+            dispatch_SIMDResultHanlder_fixedC<CMin<uint16_t, int>>(
+                    res, consumer, args...);
+        }
+    } else if (res.sizeof_ids == sizeof(int64_t)) {
+        if (res.is_CMax) {
+            dispatch_SIMDResultHanlder_fixedC<CMax<uint16_t, int64_t>>(
+                    res, consumer, args...);
+        } else {
+            dispatch_SIMDResultHanlder_fixedC<CMin<uint16_t, int64_t>>(
+                    res, consumer, args...);
+        }
+    } else {
+        FAISS_THROW_FMT("Unknown id size %d", res.sizeof_ids);
+    }
+}
 } // namespace simd_result_handlers
 } // namespace faiss