RubyGems - faiss - Versions diffs - 0.1.0 → 0.1.1 - Mend

faiss 0.1.0 → 0.1.1

Files changed (226) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/README.md +103 -3
data/ext/faiss/ext.cpp +99 -32
data/ext/faiss/extconf.rb +12 -2
data/lib/faiss/ext.bundle +0 -0
data/lib/faiss/index.rb +3 -3
data/lib/faiss/index_binary.rb +3 -3
data/lib/faiss/kmeans.rb +1 -1
data/lib/faiss/pca_matrix.rb +2 -2
data/lib/faiss/product_quantizer.rb +3 -3
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/AutoTune.cpp +719 -0
data/vendor/faiss/AutoTune.h +212 -0
data/vendor/faiss/Clustering.cpp +261 -0
data/vendor/faiss/Clustering.h +101 -0
data/vendor/faiss/IVFlib.cpp +339 -0
data/vendor/faiss/IVFlib.h +132 -0
data/vendor/faiss/Index.cpp +171 -0
data/vendor/faiss/Index.h +261 -0
data/vendor/faiss/Index2Layer.cpp +437 -0
data/vendor/faiss/Index2Layer.h +85 -0
data/vendor/faiss/IndexBinary.cpp +77 -0
data/vendor/faiss/IndexBinary.h +163 -0
data/vendor/faiss/IndexBinaryFlat.cpp +83 -0
data/vendor/faiss/IndexBinaryFlat.h +54 -0
data/vendor/faiss/IndexBinaryFromFloat.cpp +78 -0
data/vendor/faiss/IndexBinaryFromFloat.h +52 -0
data/vendor/faiss/IndexBinaryHNSW.cpp +325 -0
data/vendor/faiss/IndexBinaryHNSW.h +56 -0
data/vendor/faiss/IndexBinaryIVF.cpp +671 -0
data/vendor/faiss/IndexBinaryIVF.h +211 -0
data/vendor/faiss/IndexFlat.cpp +508 -0
data/vendor/faiss/IndexFlat.h +175 -0
data/vendor/faiss/IndexHNSW.cpp +1090 -0
data/vendor/faiss/IndexHNSW.h +170 -0
data/vendor/faiss/IndexIVF.cpp +909 -0
data/vendor/faiss/IndexIVF.h +353 -0
data/vendor/faiss/IndexIVFFlat.cpp +502 -0
data/vendor/faiss/IndexIVFFlat.h +118 -0
data/vendor/faiss/IndexIVFPQ.cpp +1207 -0
data/vendor/faiss/IndexIVFPQ.h +161 -0
data/vendor/faiss/IndexIVFPQR.cpp +219 -0
data/vendor/faiss/IndexIVFPQR.h +65 -0
data/vendor/faiss/IndexIVFSpectralHash.cpp +331 -0
data/vendor/faiss/IndexIVFSpectralHash.h +75 -0
data/vendor/faiss/IndexLSH.cpp +225 -0
data/vendor/faiss/IndexLSH.h +87 -0
data/vendor/faiss/IndexLattice.cpp +143 -0
data/vendor/faiss/IndexLattice.h +68 -0
data/vendor/faiss/IndexPQ.cpp +1188 -0
data/vendor/faiss/IndexPQ.h +199 -0
data/vendor/faiss/IndexPreTransform.cpp +288 -0
data/vendor/faiss/IndexPreTransform.h +91 -0
data/vendor/faiss/IndexReplicas.cpp +123 -0
data/vendor/faiss/IndexReplicas.h +76 -0
data/vendor/faiss/IndexScalarQuantizer.cpp +317 -0
data/vendor/faiss/IndexScalarQuantizer.h +127 -0
data/vendor/faiss/IndexShards.cpp +317 -0
data/vendor/faiss/IndexShards.h +100 -0
data/vendor/faiss/InvertedLists.cpp +623 -0
data/vendor/faiss/InvertedLists.h +334 -0
data/vendor/faiss/LICENSE +21 -0
data/vendor/faiss/MatrixStats.cpp +252 -0
data/vendor/faiss/MatrixStats.h +62 -0
data/vendor/faiss/MetaIndexes.cpp +351 -0
data/vendor/faiss/MetaIndexes.h +126 -0
data/vendor/faiss/OnDiskInvertedLists.cpp +674 -0
data/vendor/faiss/OnDiskInvertedLists.h +127 -0
data/vendor/faiss/VectorTransform.cpp +1157 -0
data/vendor/faiss/VectorTransform.h +322 -0
data/vendor/faiss/c_api/AutoTune_c.cpp +83 -0
data/vendor/faiss/c_api/AutoTune_c.h +64 -0
data/vendor/faiss/c_api/Clustering_c.cpp +139 -0
data/vendor/faiss/c_api/Clustering_c.h +117 -0
data/vendor/faiss/c_api/IndexFlat_c.cpp +140 -0
data/vendor/faiss/c_api/IndexFlat_c.h +115 -0
data/vendor/faiss/c_api/IndexIVFFlat_c.cpp +64 -0
data/vendor/faiss/c_api/IndexIVFFlat_c.h +58 -0
data/vendor/faiss/c_api/IndexIVF_c.cpp +92 -0
data/vendor/faiss/c_api/IndexIVF_c.h +135 -0
data/vendor/faiss/c_api/IndexLSH_c.cpp +37 -0
data/vendor/faiss/c_api/IndexLSH_c.h +40 -0
data/vendor/faiss/c_api/IndexShards_c.cpp +44 -0
data/vendor/faiss/c_api/IndexShards_c.h +42 -0
data/vendor/faiss/c_api/Index_c.cpp +105 -0
data/vendor/faiss/c_api/Index_c.h +183 -0
data/vendor/faiss/c_api/MetaIndexes_c.cpp +49 -0
data/vendor/faiss/c_api/MetaIndexes_c.h +49 -0
data/vendor/faiss/c_api/clone_index_c.cpp +23 -0
data/vendor/faiss/c_api/clone_index_c.h +32 -0
data/vendor/faiss/c_api/error_c.h +42 -0
data/vendor/faiss/c_api/error_impl.cpp +27 -0
data/vendor/faiss/c_api/error_impl.h +16 -0
data/vendor/faiss/c_api/faiss_c.h +58 -0
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.cpp +96 -0
data/vendor/faiss/c_api/gpu/GpuAutoTune_c.h +56 -0
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.cpp +52 -0
data/vendor/faiss/c_api/gpu/GpuClonerOptions_c.h +68 -0
data/vendor/faiss/c_api/gpu/GpuIndex_c.cpp +17 -0
data/vendor/faiss/c_api/gpu/GpuIndex_c.h +30 -0
data/vendor/faiss/c_api/gpu/GpuIndicesOptions_c.h +38 -0
data/vendor/faiss/c_api/gpu/GpuResources_c.cpp +86 -0
data/vendor/faiss/c_api/gpu/GpuResources_c.h +66 -0
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.cpp +54 -0
data/vendor/faiss/c_api/gpu/StandardGpuResources_c.h +53 -0
data/vendor/faiss/c_api/gpu/macros_impl.h +42 -0
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.cpp +220 -0
data/vendor/faiss/c_api/impl/AuxIndexStructures_c.h +149 -0
data/vendor/faiss/c_api/index_factory_c.cpp +26 -0
data/vendor/faiss/c_api/index_factory_c.h +30 -0
data/vendor/faiss/c_api/index_io_c.cpp +42 -0
data/vendor/faiss/c_api/index_io_c.h +50 -0
data/vendor/faiss/c_api/macros_impl.h +110 -0
data/vendor/faiss/clone_index.cpp +147 -0
data/vendor/faiss/clone_index.h +38 -0
data/vendor/faiss/demos/demo_imi_flat.cpp +151 -0
data/vendor/faiss/demos/demo_imi_pq.cpp +199 -0
data/vendor/faiss/demos/demo_ivfpq_indexing.cpp +146 -0
data/vendor/faiss/demos/demo_sift1M.cpp +252 -0
data/vendor/faiss/gpu/GpuAutoTune.cpp +95 -0
data/vendor/faiss/gpu/GpuAutoTune.h +27 -0
data/vendor/faiss/gpu/GpuCloner.cpp +403 -0
data/vendor/faiss/gpu/GpuCloner.h +82 -0
data/vendor/faiss/gpu/GpuClonerOptions.cpp +28 -0
data/vendor/faiss/gpu/GpuClonerOptions.h +53 -0
data/vendor/faiss/gpu/GpuDistance.h +52 -0
data/vendor/faiss/gpu/GpuFaissAssert.h +29 -0
data/vendor/faiss/gpu/GpuIndex.h +148 -0
data/vendor/faiss/gpu/GpuIndexBinaryFlat.h +89 -0
data/vendor/faiss/gpu/GpuIndexFlat.h +190 -0
data/vendor/faiss/gpu/GpuIndexIVF.h +89 -0
data/vendor/faiss/gpu/GpuIndexIVFFlat.h +85 -0
data/vendor/faiss/gpu/GpuIndexIVFPQ.h +143 -0
data/vendor/faiss/gpu/GpuIndexIVFScalarQuantizer.h +100 -0
data/vendor/faiss/gpu/GpuIndicesOptions.h +30 -0
data/vendor/faiss/gpu/GpuResources.cpp +52 -0
data/vendor/faiss/gpu/GpuResources.h +73 -0
data/vendor/faiss/gpu/StandardGpuResources.cpp +295 -0
data/vendor/faiss/gpu/StandardGpuResources.h +114 -0
data/vendor/faiss/gpu/impl/RemapIndices.cpp +43 -0
data/vendor/faiss/gpu/impl/RemapIndices.h +24 -0
data/vendor/faiss/gpu/perf/IndexWrapper-inl.h +71 -0
data/vendor/faiss/gpu/perf/IndexWrapper.h +39 -0
data/vendor/faiss/gpu/perf/PerfClustering.cpp +115 -0
data/vendor/faiss/gpu/perf/PerfIVFPQAdd.cpp +139 -0
data/vendor/faiss/gpu/perf/WriteIndex.cpp +102 -0
data/vendor/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +130 -0
data/vendor/faiss/gpu/test/TestGpuIndexFlat.cpp +371 -0
data/vendor/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +550 -0
data/vendor/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +450 -0
data/vendor/faiss/gpu/test/TestGpuMemoryException.cpp +84 -0
data/vendor/faiss/gpu/test/TestUtils.cpp +315 -0
data/vendor/faiss/gpu/test/TestUtils.h +93 -0
data/vendor/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +159 -0
data/vendor/faiss/gpu/utils/DeviceMemory.cpp +77 -0
data/vendor/faiss/gpu/utils/DeviceMemory.h +71 -0
data/vendor/faiss/gpu/utils/DeviceUtils.h +185 -0
data/vendor/faiss/gpu/utils/MemorySpace.cpp +89 -0
data/vendor/faiss/gpu/utils/MemorySpace.h +44 -0
data/vendor/faiss/gpu/utils/StackDeviceMemory.cpp +239 -0
data/vendor/faiss/gpu/utils/StackDeviceMemory.h +129 -0
data/vendor/faiss/gpu/utils/StaticUtils.h +83 -0
data/vendor/faiss/gpu/utils/Timer.cpp +60 -0
data/vendor/faiss/gpu/utils/Timer.h +52 -0
data/vendor/faiss/impl/AuxIndexStructures.cpp +305 -0
data/vendor/faiss/impl/AuxIndexStructures.h +246 -0
data/vendor/faiss/impl/FaissAssert.h +95 -0
data/vendor/faiss/impl/FaissException.cpp +66 -0
data/vendor/faiss/impl/FaissException.h +71 -0
data/vendor/faiss/impl/HNSW.cpp +818 -0
data/vendor/faiss/impl/HNSW.h +275 -0
data/vendor/faiss/impl/PolysemousTraining.cpp +953 -0
data/vendor/faiss/impl/PolysemousTraining.h +158 -0
data/vendor/faiss/impl/ProductQuantizer.cpp +876 -0
data/vendor/faiss/impl/ProductQuantizer.h +242 -0
data/vendor/faiss/impl/ScalarQuantizer.cpp +1628 -0
data/vendor/faiss/impl/ScalarQuantizer.h +120 -0
data/vendor/faiss/impl/ThreadedIndex-inl.h +192 -0
data/vendor/faiss/impl/ThreadedIndex.h +80 -0
data/vendor/faiss/impl/index_read.cpp +793 -0
data/vendor/faiss/impl/index_write.cpp +558 -0
data/vendor/faiss/impl/io.cpp +142 -0
data/vendor/faiss/impl/io.h +98 -0
data/vendor/faiss/impl/lattice_Zn.cpp +712 -0
data/vendor/faiss/impl/lattice_Zn.h +199 -0
data/vendor/faiss/index_factory.cpp +392 -0
data/vendor/faiss/index_factory.h +25 -0
data/vendor/faiss/index_io.h +75 -0
data/vendor/faiss/misc/test_blas.cpp +84 -0
data/vendor/faiss/tests/test_binary_flat.cpp +64 -0
data/vendor/faiss/tests/test_dealloc_invlists.cpp +183 -0
data/vendor/faiss/tests/test_ivfpq_codec.cpp +67 -0
data/vendor/faiss/tests/test_ivfpq_indexing.cpp +98 -0
data/vendor/faiss/tests/test_lowlevel_ivf.cpp +566 -0
data/vendor/faiss/tests/test_merge.cpp +258 -0
data/vendor/faiss/tests/test_omp_threads.cpp +14 -0
data/vendor/faiss/tests/test_ondisk_ivf.cpp +220 -0
data/vendor/faiss/tests/test_pairs_decoding.cpp +189 -0
data/vendor/faiss/tests/test_params_override.cpp +231 -0
data/vendor/faiss/tests/test_pq_encoding.cpp +98 -0
data/vendor/faiss/tests/test_sliding_ivf.cpp +240 -0
data/vendor/faiss/tests/test_threaded_index.cpp +253 -0
data/vendor/faiss/tests/test_transfer_invlists.cpp +159 -0
data/vendor/faiss/tutorial/cpp/1-Flat.cpp +98 -0
data/vendor/faiss/tutorial/cpp/2-IVFFlat.cpp +81 -0
data/vendor/faiss/tutorial/cpp/3-IVFPQ.cpp +93 -0
data/vendor/faiss/tutorial/cpp/4-GPU.cpp +119 -0
data/vendor/faiss/tutorial/cpp/5-Multiple-GPUs.cpp +99 -0
data/vendor/faiss/utils/Heap.cpp +122 -0
data/vendor/faiss/utils/Heap.h +495 -0
data/vendor/faiss/utils/WorkerThread.cpp +126 -0
data/vendor/faiss/utils/WorkerThread.h +61 -0
data/vendor/faiss/utils/distances.cpp +765 -0
data/vendor/faiss/utils/distances.h +243 -0
data/vendor/faiss/utils/distances_simd.cpp +809 -0
data/vendor/faiss/utils/extra_distances.cpp +336 -0
data/vendor/faiss/utils/extra_distances.h +54 -0
data/vendor/faiss/utils/hamming-inl.h +472 -0
data/vendor/faiss/utils/hamming.cpp +792 -0
data/vendor/faiss/utils/hamming.h +220 -0
data/vendor/faiss/utils/random.cpp +192 -0
data/vendor/faiss/utils/random.h +60 -0
data/vendor/faiss/utils/utils.cpp +783 -0
data/vendor/faiss/utils/utils.h +181 -0
metadata +216 -2

data/vendor/faiss/IndexLattice.h ADDED Viewed

@@ -0,0 +1,68 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// -*- c++ -*-
+#ifndef FAISS_INDEX_LATTICE_H
+#define FAISS_INDEX_LATTICE_H
+#include <vector>
+#include <faiss/IndexIVF.h>
+#include <faiss/impl/lattice_Zn.h>
+namespace faiss {
+/** Index that encodes a vector with a series of Zn lattice quantizers
+ */
+struct IndexLattice: Index {
+    /// number of sub-vectors
+    int nsq;
+    /// dimension of sub-vectors
+    size_t dsq;
+    /// the lattice quantizer
+    ZnSphereCodecAlt zn_sphere_codec;
+    /// scale_nbit: nb bits used to encode the scale, per subvector
+    int scale_nbit, lattice_nbit;  // lattice_nbit: presumably nb bits of the lattice code, per subvector -- TODO confirm
+    /// total code size, in bytes
+    size_t code_size;
+    /// mins and maxes of the vector norms, per subquantizer
+    std::vector<float> trained;
+    IndexLattice (idx_t d, int nsq, int scale_nbit, int r2);
+    void train(idx_t n, const float* x) override;
+    /* The standalone codec interface: encode / decode n vectors to / from byte codes */
+    size_t sa_code_size () const override;
+    void sa_encode (idx_t n, const float *x,
+                          uint8_t *bytes) const override;
+    void sa_decode (idx_t n, const uint8_t *bytes,
+                            float *x) const override;
+    /// not implemented (NOTE(review): presumably also applies to search and reset below -- confirm in IndexLattice.cpp)
+    void add(idx_t n, const float* x) override;
+    void search(idx_t n, const float* x, idx_t k,
+                float* distances, idx_t* labels) const override;
+    void reset() override;
+};
+} // namespace faiss
+#endif

data/vendor/faiss/IndexPQ.cpp ADDED Viewed

@@ -0,0 +1,1188 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// -*- c++ -*-
+#include <faiss/IndexPQ.h>
+#include <cstddef>
+#include <cstring>
+#include <cstdio>
+#include <cmath>
+#include <algorithm>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/AuxIndexStructures.h>
+#include <faiss/utils/hamming.h>
+namespace faiss {
+/*********************************************************
+ * IndexPQ implementation
+ ********************************************************/
+/** Build an untrained PQ index.
+ *
+ * @param d       dimension of the indexed vectors
+ * @param M       forwarded to the ProductQuantizer constructor
+ * @param nbits   forwarded to the ProductQuantizer constructor
+ * @param metric  metric type handed to the Index base class
+ */
+IndexPQ::IndexPQ (int d, size_t M, size_t nbits, MetricType metric)
+    : Index (d, metric),
+      pq (d, M, nbits)
+{
+    // the index must be trained before vectors can be added or searched
+    is_trained = false;
+
+    // default search mode, without sign encoding of the queries
+    search_type = ST_PQ;
+    encode_signs = false;
+
+    // polysemous training is off by default; the Hamming threshold is
+    // one more than M * nbits
+    do_polysemous_training = false;
+    polysemous_ht = M * nbits + 1;
+}
+/** Default constructor: same defaults as the main constructor, with the
+ *  L2 metric and a default-constructed product quantizer.
+ */
+IndexPQ::IndexPQ ()
+{
+    is_trained = false;
+    metric_type = METRIC_L2;
+
+    // default search mode, without sign encoding of the queries
+    search_type = ST_PQ;
+    encode_signs = false;
+
+    // polysemous settings, derived from the quantizer's current M and nbits
+    do_polysemous_training = false;
+    polysemous_ht = pq.M * pq.nbits + 1;
+}
+/** Train the product quantizer on n vectors.
+ *
+ * With polysemous training enabled, the last vectors of x (at most a
+ * quarter of them) are held out of the PQ training and passed to the
+ * polysemous optimization instead.
+ *
+ * @param n  number of training vectors
+ * @param x  training vectors, size n * d
+ */
+void IndexPQ::train (idx_t n, const float *x)
+{
+    if (do_polysemous_training) {
+        // cap the held-out set at a quarter of the training set
+        idx_t n_holdout = polysemous_training.ntrain_permutation;
+        const idx_t max_holdout = n / 4;
+        if (n_holdout > max_holdout) {
+            n_holdout = max_holdout;
+        }
+        const idx_t n_pq = n - n_holdout;
+        if (verbose) {
+            printf ("PQ training on %ld points, remains %ld points: "
+                    "training polysemous on %s\n",
+                    n_pq, n_holdout,
+                    n_holdout == 0 ? "centroids" : "these");
+        }
+        // the head of x trains the PQ, the tail feeds the optimization
+        pq.train (n_pq, x);
+        polysemous_training.optimize_pq_for_hamming (
+            pq, n_holdout, x + n_pq * d);
+    } else {
+        // standard training
+        pq.train (n, x);
+    }
+    is_trained = true;
+}
+/** Encode n vectors and append their codes to the index.
+ *
+ * @param n  number of vectors to add
+ * @param x  vectors to add, size n * d
+ * Throws (FAISS_THROW_IF_NOT) when the index is not trained.
+ */
+void IndexPQ::add (idx_t n, const float *x)
+{
+    FAISS_THROW_IF_NOT (is_trained);
+    // byte offset where the new codes start (end of the current codes)
+    const size_t tail_offset = ntotal * pq.code_size;
+    codes.resize (tail_offset + n * pq.code_size);
+    // encode directly into the freshly allocated tail
+    uint8_t *tail = &codes[tail_offset];
+    pq.compute_codes (x, tail, n);
+    ntotal += n;
+}
+/** Remove the vectors selected by sel, compacting the remaining codes in
+ *  place so that the surviving vectors keep their relative order.
+ *
+ * @param sel  selector queried with the sequential ids 0 .. ntotal-1
+ * @return     number of vectors removed
+ */
+size_t IndexPQ::remove_ids (const IDSelector & sel)
+{
+    const size_t cs = pq.code_size;
+    idx_t n_kept = 0;    // also the slot the next surviving code moves to
+    for (idx_t src = 0; src < ntotal; src++) {
+        if (sel.is_member (src)) {
+            continue;    // should be removed: simply not copied
+        }
+        if (src > n_kept) {
+            memmove (&codes[cs * n_kept], &codes[cs * src], cs);
+        }
+        n_kept++;
+    }
+    const size_t nremove = ntotal - n_kept;
+    if (nremove > 0) {
+        ntotal = n_kept;
+        codes.resize (ntotal * cs);
+    }
+    return nremove;
+}
+void IndexPQ::reset()  // empties the index: drops the stored codes and zeroes ntotal
+{
+    codes.clear();
+    ntotal = 0;  // is_trained is not modified here
+}
+/** Decode the stored codes of vectors i0 .. i0 + ni - 1.
+ *
+ * @param i0      first vector to decode
+ * @param ni      number of vectors to decode
+ * @param recons  output, size ni * d
+ * Throws (FAISS_THROW_IF_NOT) when ni != 0 and the range is out of bounds.
+ */
+void IndexPQ::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
+{
+    FAISS_THROW_IF_NOT (ni == 0 || (i0 >= 0 && i0 + ni <= ntotal));
+    for (idx_t i = 0; i < ni; i++) {
+        const idx_t key = i0 + i;
+        float *out = recons + i * d;
+        pq.decode (&codes[key * pq.code_size], out);
+    }
+}
+void IndexPQ::reconstruct (idx_t key, float * recons) const  // decodes the stored code of vector `key` into recons
+{
+    FAISS_THROW_IF_NOT (key >= 0 && key < ntotal);  // key must index a stored vector
+    pq.decode (&codes[key * pq.code_size], recons);  // code of vector key starts at byte key * code_size
+}
+namespace {
+struct PQDis: DistanceComputer {  // distance computer over the PQ codes stored in an IndexPQ
+    size_t d;  // copy of storage.d
+    Index::idx_t nb;  // copy of storage.ntotal at construction time
+    const uint8_t *codes;  // points into storage.codes (storage.codes.data())
+    size_t code_size;  // copy of pq.code_size
+    const ProductQuantizer & pq;  // reference to storage.pq
+    const float *sdc;  // pq.sdc_table.data(), used by symmetric_dis
+    std::vector<float> precomputed_table;  // pq.M * pq.ksub floats, filled by set_query
+    size_t ndis;  // incremented on every operator() call
+    float operator () (idx_t i) override  // distance from the current query to stored code i: sum of one table entry per sub-quantizer
+    {
+        const uint8_t *code = codes + i * code_size;
+        const float *dt = precomputed_table.data();
+        float accu = 0;
+        for (int j = 0; j < pq.M; j++) {
+            accu += dt[*code++];  // one code byte selects the entry of sub-quantizer j
+            dt += 256;  // next sub-table; the constructor asserts pq.ksub == 256
+        }
+        ndis++;
+        return accu;
+    }
+    float symmetric_dis(idx_t i, idx_t j) override  // code-to-code distance between stored codes i and j, from the sdc table
+    {
+        const float * sdci = sdc;
+        float accu = 0;
+        const uint8_t *codei = codes + i * code_size;
+        const uint8_t *codej = codes + j * code_size;
+        for (int l = 0; l < pq.M; l++) {
+            accu += sdci[(*codei++) + (*codej++) * 256];  // entry (codej, codei) of a 256 x 256 block
+            sdci += 256 * 256;  // next sub-quantizer's block
+        }
+        return accu;
+    }
+    explicit PQDis(const IndexPQ& storage, const float* /*q*/ = nullptr)  // the query argument is ignored; use set_query
+        : pq(storage.pq) {
+        precomputed_table.resize(pq.M * pq.ksub);
+        nb = storage.ntotal;
+        d = storage.d;
+        codes = storage.codes.data();
+        code_size = pq.code_size;
+        FAISS_ASSERT(pq.ksub == 256);  // the strides of 256 above depend on this
+        FAISS_ASSERT(pq.sdc_table.size() == pq.ksub * pq.ksub * pq.M);  // sdc table must already have its full size
+        sdc = pq.sdc_table.data();
+        ndis = 0;
+    }
+    void set_query(const float *x) override {  // fills precomputed_table for query x
+        pq.compute_distance_table(x, precomputed_table.data());
+    }
+};
+}  // namespace
+DistanceComputer * IndexPQ::get_distance_computer() const {  // returns a new PQDis bound to this index
+    FAISS_THROW_IF_NOT(pq.nbits == 8);  // only 8-bit codes are supported here
+    return new PQDis(*this);  // allocated with new; NOTE(review): presumably the caller must delete it -- confirm
+}
+/*****************************************
+ * IndexPQ polysemous search routines
+ ******************************************/
+/** Search the k nearest neighbors of n query vectors.
+ *
+ * The strategy depends on search_type:
+ *  - ST_PQ: PQ search on the query vectors (L2, or inner product for
+ *    any other metric type);
+ *  - ST_polysemous / ST_polysemous_generalize: delegated to
+ *    search_core_polysemous (L2 only);
+ *  - otherwise the queries are first turned into codes (PQ-encoded, or
+ *    sign-encoded when encode_signs is set) and compared code-to-code,
+ *    with the SDC tables (ST_SDC) or with (generalized) Hamming
+ *    distances whose integer values are converted to float.
+ *
+ * @param n          number of queries
+ * @param x          query vectors, size n * d
+ * @param k          number of results per query
+ * @param distances  output, size n * k
+ * @param labels     output, size n * k
+ * Throws (FAISS_THROW_IF_NOT) when the index is not trained, when a
+ * polysemous search is requested with a non-L2 metric, or when
+ * encode_signs is set and d != pq.nbits * pq.M.
+ */
+void IndexPQ::search (idx_t n, const float *x, idx_t k,
+                           float *distances, idx_t *labels) const
+{
+    FAISS_THROW_IF_NOT (is_trained);
+    if (search_type == ST_PQ) {  // Simple PQ search
+        if (metric_type == METRIC_L2) {
+            float_maxheap_array_t res = {
+                size_t(n), size_t(k), labels, distances };
+            pq.search (x, n, codes.data(), ntotal, &res, true);
+        } else {
+            float_minheap_array_t res = {
+                size_t(n), size_t(k), labels, distances };
+            pq.search_ip (x, n, codes.data(), ntotal, &res, true);
+        }
+        indexPQ_stats.nq += n;
+        indexPQ_stats.ncode += n * ntotal;
+    } else if (search_type == ST_polysemous ||
+               search_type == ST_polysemous_generalize) {
+        FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
+        search_core_polysemous (n, x, k, distances, labels);
+    } else { // code-to-code distances
+        uint8_t * q_codes = new uint8_t [n * pq.code_size];
+        ScopeDeleter<uint8_t> del (q_codes);
+        if (!encode_signs) {
+            pq.compute_codes (x, q_codes, n);
+        } else {
+            // one bit per input dimension: bit j is set iff x[j] > 0
+            FAISS_THROW_IF_NOT (d == pq.nbits * pq.M);
+            memset (q_codes, 0, n * pq.code_size);
+            for (idx_t i = 0; i < n; i++) {
+                const float *xi = x + i * d;
+                uint8_t *code = q_codes + i * pq.code_size;
+                for (int j = 0; j < d; j++)
+                    if (xi[j] > 0) code [j>>3] |= 1 << (j & 7);
+            }
+        }
+        if (search_type == ST_SDC)  {
+            float_maxheap_array_t res = {
+                size_t(n),  size_t(k), labels, distances};
+            pq.search_sdc (q_codes, n, codes.data(), ntotal, &res, true);
+        } else {
+            int * idistances = new int [n * k];
+            ScopeDeleter<int> del_idistances (idistances);
+            int_maxheap_array_t res = {
+                size_t (n), size_t (k), labels, idistances};
+            if (search_type == ST_HE) {
+                hammings_knn_hc (&res, q_codes, codes.data(),
+                                 ntotal, pq.code_size, true);
+            } else if (search_type == ST_generalized_HE) {
+                generalized_hammings_knn_hc (&res, q_codes, codes.data(),
+                                             ntotal, pq.code_size, true);
+            }
+            // convert distances to floats; the index has the same width
+            // as k * n so that it cannot overflow on large result sets
+            for (idx_t i = 0; i < k * n; i++)
+                distances[i] = idistances[i];
+        }
+        indexPQ_stats.nq += n;
+        indexPQ_stats.ncode += n * ntotal;
+    }
+}
+void IndexPQStats::reset()  // zeroes the nq, ncode and n_hamming_pass counters
+{
+    nq = ncode = n_hamming_pass = 0;
+}
+IndexPQStats indexPQ_stats;  // global counters updated by the IndexPQ search routines in this file
+/** Scan every stored code for one query. Codes whose Hamming distance
+ *  to the query code is below index.polysemous_ht get their distance
+ *  computed from the query's distance table and are offered to the
+ *  result heap.
+ *
+ * @param index         index whose codes are scanned
+ * @param dis_table_qi  distance table of the query, M sub-tables of ksub entries
+ * @param q_code        code of the query
+ * @param k             heap size
+ * @param heap_dis      heap of distances, size k
+ * @param heap_ids      heap of ids, size k
+ * @return              number of codes that passed the Hamming filter
+ */
+template <class HammingComputer>
+static size_t polysemous_inner_loop (
+        const IndexPQ & index,
+        const float *dis_table_qi, const uint8_t *q_code,
+        size_t k, float *heap_dis, int64_t *heap_ids)
+{
+    const int M = index.pq.M;
+    const int code_size = index.pq.code_size;
+    const int ksub = index.pq.ksub;
+    const size_t ntotal = index.ntotal;
+    const int ht = index.polysemous_ht;
+
+    HammingComputer hc (q_code, code_size);
+    size_t n_pass_i = 0;
+    const uint8_t *b_code = index.codes.data();
+    for (int64_t bi = 0; bi < ntotal; bi++, b_code += code_size) {
+        int hd = hc.hamming (b_code);
+        if (!(hd < ht)) {
+            continue;    // rejected by the Hamming filter
+        }
+        n_pass_i++;
+
+        // sum one table entry per sub-quantizer
+        float dis = 0;
+        const float *sub_table = dis_table_qi;
+        for (int m = 0; m < M; m++, sub_table += ksub) {
+            dis += sub_table [b_code[m]];
+        }
+
+        // replace the current worst result if this one is closer
+        if (dis < heap_dis[0]) {
+            maxheap_pop (k, heap_dis, heap_ids);
+            maxheap_push (k, heap_dis, heap_ids, dis, bi);
+        }
+    }
+    return n_pass_i;
+}
+/** Polysemous search: the Hamming distance between the query code and
+ *  each stored code is used as a filter (threshold polysemous_ht)
+ *  before the PQ distance is computed from the query's distance table.
+ *
+ * The code size is validated before the parallel loops start: the
+ * per-query dispatch runs inside an OpenMP parallel region, and an
+ * exception must not be thrown out of such a region.
+ *
+ * @param n          number of queries
+ * @param x          query vectors, size n * d
+ * @param k          number of results per query
+ * @param distances  output, size n * k
+ * @param labels     output, size n * k
+ * Throws when pq.nbits != 8 or (for n > 0) when the code size is not
+ * supported: it must be a multiple of 4 for ST_polysemous and a
+ * multiple of 8 for the generalized variant.
+ */
+void IndexPQ::search_core_polysemous (idx_t n, const float *x, idx_t k,
+                                          float *distances, idx_t *labels) const
+{
+    FAISS_THROW_IF_NOT (pq.nbits == 8);
+    // Same acceptance rule as the dispatch below (4, 8, 16, 20 and 32 are
+    // all multiples of 4), checked outside of the OpenMP region. As
+    // before, nothing is rejected when there is no query to process.
+    const size_t code_size_unit = search_type == ST_polysemous ? 4 : 8;
+    if (n > 0 && pq.code_size % code_size_unit != 0) {
+        FAISS_THROW_FMT(
+             "code size %zd not supported for polysemous",
+             pq.code_size);
+    }
+    // PQ distance tables
+    float * dis_tables = new float [n * pq.ksub * pq.M];
+    ScopeDeleter<float> del (dis_tables);
+    pq.compute_distance_tables (n, x, dis_tables);
+    // Hamming embedding queries, derived from the distance tables
+    uint8_t * q_codes = new uint8_t [n * pq.code_size];
+    ScopeDeleter<uint8_t> del2 (q_codes);
+#pragma omp parallel for
+    for (idx_t qi = 0; qi < n; qi++) {
+        pq.compute_code_from_distance_table
+            (dis_tables + qi * pq.M * pq.ksub,
+             q_codes + qi * pq.code_size);
+    }
+    size_t n_pass = 0;
+#pragma omp parallel for reduction (+: n_pass)
+    for (idx_t qi = 0; qi < n; qi++) {
+        const uint8_t * q_code = q_codes + qi * pq.code_size;
+        const float * dis_table_qi = dis_tables + qi * pq.M * pq.ksub;
+        int64_t * heap_ids = labels + qi * k;
+        float *heap_dis = distances + qi * k;
+        maxheap_heapify (k, heap_dis, heap_ids);
+        if (search_type == ST_polysemous) {
+            switch (pq.code_size) {
+            case 4:
+                n_pass += polysemous_inner_loop<HammingComputer4>
+                    (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
+                break;
+            case 8:
+                n_pass += polysemous_inner_loop<HammingComputer8>
+                    (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
+                break;
+            case 16:
+                n_pass += polysemous_inner_loop<HammingComputer16>
+                    (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
+                break;
+            case 32:
+                n_pass += polysemous_inner_loop<HammingComputer32>
+                    (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
+                break;
+            case 20:
+                n_pass += polysemous_inner_loop<HammingComputer20>
+                    (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
+                break;
+            default:
+                if (pq.code_size % 8 == 0) {
+                    n_pass += polysemous_inner_loop<HammingComputerM8>
+                        (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
+                } else if (pq.code_size % 4 == 0) {
+                    n_pass += polysemous_inner_loop<HammingComputerM4>
+                        (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
+                } else {
+                    // not reachable: rejected by the check at the top
+                    FAISS_THROW_FMT(
+                         "code size %zd not supported for polysemous",
+                         pq.code_size);
+                }
+                break;
+            }
+        } else {
+            switch (pq.code_size) {
+            case 8:
+                n_pass += polysemous_inner_loop<GenHammingComputer8>
+                    (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
+                break;
+            case 16:
+                n_pass += polysemous_inner_loop<GenHammingComputer16>
+                    (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
+                break;
+            case 32:
+                n_pass += polysemous_inner_loop<GenHammingComputer32>
+                    (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
+                break;
+            default:
+                if (pq.code_size % 8 == 0) {
+                    n_pass += polysemous_inner_loop<GenHammingComputerM8>
+                        (*this, dis_table_qi, q_code, k, heap_dis, heap_ids);
+                } else {
+                    // not reachable: rejected by the check at the top
+                    FAISS_THROW_FMT(
+                         "code size %zd not supported for polysemous",
+                         pq.code_size);
+                }
+                break;
+            }
+        }
+        maxheap_reorder (k, heap_dis, heap_ids);
+    }
+    indexPQ_stats.nq += n;
+    indexPQ_stats.ncode += n * ntotal;
+    indexPQ_stats.n_hamming_pass += n_pass;
+}
+/* The standalone codec interface (just remaps to the PQ functions) */
+size_t IndexPQ::sa_code_size () const  // size of one code, as reported by the product quantizer
+{
+    return pq.code_size;
+}
+void IndexPQ::sa_encode (idx_t n, const float *x, uint8_t *bytes) const  // encodes the n vectors of x into bytes
+{
+    pq.compute_codes (x, bytes, n);  // note the argument order: the count comes last
+}
+void IndexPQ::sa_decode (idx_t n, const uint8_t *bytes, float *x) const  // decodes the n codes of bytes into x
+{
+    pq.decode (bytes, x, n);  // note the argument order: the count comes last
+}
+/*****************************************
+ * Stats of IndexPQ codes
+ ******************************************/
+void IndexPQ::hamming_distance_table (idx_t n, const float *x,
+                                      int32_t *dis) const  // Hamming distances between the PQ codes of the n queries x and the stored codes
+{
+    uint8_t * q_codes = new uint8_t [n * pq.code_size];  // temporary buffer for the codes of the queries
+    ScopeDeleter<uint8_t> del (q_codes);  // NOTE(review): presumably releases q_codes on scope exit -- confirm in AuxIndexStructures.h
+    pq.compute_codes (x, q_codes, n);
+    hammings (q_codes, codes.data(), n, ntotal, pq.code_size, dis);  // presumably writes n * ntotal distances to dis -- confirm against hamming.h
+}
+void IndexPQ::hamming_distance_histogram (idx_t n, const float *x,
+                                          idx_t nb, const float *xb,
+                                          int64_t *hist)  // histogram of the Hamming distances between the codes of x and those of xb (or the stored codes when xb is null)
+{
+    FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
+    FAISS_THROW_IF_NOT (pq.code_size % 8 == 0);
+    FAISS_THROW_IF_NOT (pq.nbits == 8);
+    // Hamming embedding queries
+    uint8_t * q_codes = new uint8_t [n * pq.code_size];
+    ScopeDeleter <uint8_t> del (q_codes);
+    pq.compute_codes (x, q_codes, n);
+    uint8_t * b_codes ;  // database-side codes: either freshly encoded from xb or the index's own
+    ScopeDeleter <uint8_t> del_b_codes;  // only armed (via set) when b_codes is allocated here
+    if (xb) {
+        b_codes = new uint8_t [nb * pq.code_size];
+        del_b_codes.set (b_codes);
+        pq.compute_codes (xb, b_codes, nb);
+    } else {
+        nb = ntotal;  // the nb argument is ignored in this case
+        b_codes = codes.data();
+    }
+    int nbits = pq.M * pq.nbits;  // hist has nbits + 1 bins, indexed 0 .. nbits
+    memset (hist, 0, sizeof(*hist) * (nbits + 1));
+    size_t bs = 256;  // queries are processed in batches of bs
+#pragma omp parallel
+    {
+        std::vector<int64_t> histi (nbits + 1);  // per-thread histogram, merged into hist below
+        hamdis_t *distances = new hamdis_t [nb * bs];  // per-thread buffer for one batch of distances
+        ScopeDeleter<hamdis_t> del (distances);
+#pragma omp for
+        for (size_t q0 = 0; q0 < n; q0 += bs) {
+            // printf ("dis stats: %ld/%ld\n", q0, n);
+            size_t q1 = q0 + bs;
+            if (q1 > n) q1 = n;  // the last batch may be shorter
+            hammings (q_codes + q0 * pq.code_size, b_codes,
+                      q1 - q0, nb,
+                      pq.code_size, distances);
+            for (size_t i = 0; i < nb * (q1 - q0); i++)
+                histi [distances [i]]++;  // each distance is used directly as a bin index
+        }
+#pragma omp critical
+        {
+            for (int i = 0; i <= nbits; i++)
+                hist[i] += histi[i];  // merge, one thread at a time
+        }
+    }
+}
+/*****************************************
+ * MultiIndexQuantizer
+ ******************************************/
+namespace {
+/** View over an array in which rank n maps directly to index n, i.e.
+ *  the array is assumed to be sorted already (no permutation is kept).
+ */
+template <typename T>
+struct PreSortedArray {
+    const T * x;
+    int N;
+    explicit PreSortedArray (int N) {
+        this->N = N;
+    }
+    void init (const T *vals) {
+        x = vals;
+    }
+    // get smallest value
+    T get_0 () {
+        return *x;
+    }
+    // get delta between n-smallest and n-1 -smallest
+    T get_diff (int n) {
+        const T *cur = x + n;
+        return cur[0] - cur[-1];
+    }
+    // remap orders counted from smallest to indices in array
+    // (identity here)
+    int get_ord (int n) {
+        return n;
+    }
+};
+/** Comparator on indices: orders two indices by the values they
+ *  designate in x.
+ */
+template <typename T>
+struct ArgSort {
+    const T * x;
+    bool operator() (size_t lhs, size_t rhs) {
+        const T & a = x[lhs];
+        const T & b = x[rhs];
+        return a < b;
+    }
+};
+/** Keeps a permutation of an array's indices such that visiting the
+ *  array through the permutation yields its elements in sorted
+ *  (ascending) order. The array itself is not modified.
+ */
+template <typename T>
+struct SortedArray {
+    const T * x;
+    int N;
+    std::vector<int> perm;
+    explicit SortedArray (int N): N(N), perm(N) {
+    }
+    void init (const T *vals) {
+        x = vals;
+        // start from the identity permutation, then sort it by value
+        int next = 0;
+        for (int & p : perm) {
+            p = next++;
+        }
+        std::sort (perm.begin(), perm.end(),
+                   [vals] (size_t i, size_t j) {
+                       return vals[i] < vals[j];
+                   });
+    }
+    // get smallest value
+    T get_0 () {
+        return x[perm.front()];
+    }
+    // get delta between n-smallest and n-1 -smallest
+    T get_diff (int n) {
+        const int cur = perm[n];
+        const int prev = perm[n - 1];
+        return x[cur] - x[prev];
+    }
+    // remap orders counted from smallest to indices in array
+    int get_ord (int n) {
+        return perm[n];
+    }
+};
+/** Array has n values. Sort the k first ones and copy the other ones
+ *  into elements k..n-1
+ *
+ *  The sort is indirect: vals is never modified, only the n entries of
+ *  perm (indices into vals) are rearranged. It works in three phases,
+ *  using perm[0..k-1] as a heap ordered by the comparison class C.
+ *
+ *  @param k     nb of leading entries of perm to sort
+ *  @param n     total nb of entries in perm
+ *  @param vals  values designated by the entries of perm
+ *  @param perm  indices into vals, rearranged in place
+ */
+template <class C>
+void partial_sort (int k, int n,
+                   const typename C::T * vals, typename C::TI * perm) {
+    // insert first k elts in heap (perm[0] alone is the initial heap, hence i starts at 1)
+    for (int i = 1; i < k; i++) {
+        indirect_heap_push<C> (i + 1, vals, perm, perm[i]);  // presumably grows the heap to i + 1 entries -- confirm against Heap.h
+    }
+    // insert next n - k elts in heap
+    for (int i = k; i < n; i++) {
+        typename C::TI id = perm[i];
+        typename C::TI top = perm[0];  // current heap top
+        if (C::cmp(vals[top], vals[id])) {
+            indirect_heap_pop<C> (k, vals, perm);
+            indirect_heap_push<C> (k, vals, perm, id);
+            perm[i] = top;  // the evicted top takes the slot elt i came from
+        } else {
+            // nothing, elt at i is good where it is.
+        }
+    }
+    // order the k first elements in heap: pop the top repeatedly and store it at the end of the shrinking heap
+    for (int i = k - 1; i > 0; i--) {
+        typename C::TI top = perm[0];
+        indirect_heap_pop<C> (i + 1, vals, perm);
+        perm[i] = top;
+    }
+}
+/** same as SortedArray, but only the k first elements are sorted.
+ *
+ *  The sorted prefix starts at initial_k entries and is extended on
+ *  demand by get_diff, so entries that are never requested are never
+ *  fully sorted.
+ */
+template <typename T>
+struct SemiSortedArray {
+    const T * x;
+    int N;
+    // type of the heap: CMax = sort ascending
+    typedef CMax<T, int> HC;
+    std::vector<int> perm;
+    int k;  // k elements are sorted
+    int initial_k, k_factor;
+    explicit SemiSortedArray (int N) {
+        this->N = N;
+        perm.resize (N);
+        // nothing is sorted until init() is called
+        k = 0;
+        initial_k = 3;
+        k_factor = 4;
+    }
+    /// set the array to work on and sort its initial_k smallest entries
+    void init (const T*x) {
+        this->x = x;
+        for (int n = 0; n < N; n++)
+            perm[n] = n;
+        k = 0;
+        grow (initial_k);
+    }
+    /// grow the sorted part of the array to size next_k
+    void grow (int next_k) {
+        if (next_k < N) {
+            // only the not-yet-sorted tail perm[k..N-1] is touched
+            partial_sort<HC> (next_k - k, N - k, x, &perm[k]);
+            k = next_k;
+        } else { // full sort of remainder of array
+            ArgSort<T> cmp = {x };
+            std::sort (perm.begin() + k, perm.end(), cmp);
+            k = N;
+        }
+    }
+    // get smallest value
+    T get_0 () {
+        return x[perm[0]];
+    }
+    // get delta between n-smallest and n-1 -smallest
+    T get_diff (int n) {
+        if (n >= k) {
+            // want to keep powers of 2 - 1
+            // (with the defaults: 3, 15, 63, ...)
+            int next_k = (k + 1) * k_factor - 1;
+            grow (next_k);
+        }
+        return x[perm[n]] - x[perm[n - 1]];
+    }
+    // remap orders counted from smallest to indices in array
+    int get_ord (int n) {
+        assert (n < k);
+        return perm[n];
+    }
+};
+/*****************************************
+ * Find the k smallest sums of M terms, where each term is taken in a
+ *  table x of n values.
+ *
+ * A combination of terms is encoded as a scalar 0 <= t < n^M. The
+ * combination t0 ... t(M-1) that corresponds to the sum
+ *
+ *           sum = x[0, t0] + x[1, t1] + .... + x[M-1, t(M-1)]
+ *
+ * is encoded as
+ *
+ *           t = t0 + t1 * n + t2 * n^2 + ... + t(M-1) * n^(M-1)
+ *
+ * MinSumK is an object rather than a function, so that storage can be
+ * re-used over several computations with the same sizes. use_seen is
+ * good when there may be ties in the x array and it is a concern if
+ * occasionally several t's are returned.
+ *
+ * @param x       size M * n, values to add up
+ * @param k       nb of results to retrieve
+ * @param M       nb of terms
+ * @param n       nb of distinct values
+ * @param sums    output, size k, sorted
+ * @param terms   output, size k, with encoding as above
+ *
+ ******************************************/
+/** Enumerates the K smallest sums of M terms with a best-first search.
+ *
+ * T is the value type, SSA the per-term sorted-array helper (must provide
+ * init, get_0, get_diff and get_ord), and use_seen selects whether a bit
+ * array is used to suppress duplicate combinations.
+ *
+ * The object owns two raw heap buffers; it is meant to be constructed once
+ * and reused through run(), not copied.
+ */
+template <typename T, class SSA, bool use_seen>
+struct MinSumK {
+    int K;  ///< nb of sums to return
+    int M;  ///< nb of elements to sum up
+    int nbit; ///< nb of bits to encode one entry
+    int N;  ///< nb of possible elements for each of the M terms
+    /** the heap.
+     * We use a heap to maintain a queue of sums, with the associated
+     * terms involved in the sum.
+     */
+    typedef CMin<T, int64_t> HC;
+    size_t heap_capacity, heap_size;
+    T *bh_val;
+    int64_t *bh_ids;
+    std::vector <SSA> ssx;
+    // all results get pushed several times. When there are ties, they
+    // are popped interleaved with others, so it is not easy to
+    // identify them. Therefore, this bit array just marks elements
+    // that were seen before.
+    std::vector <uint8_t> seen;
+    MinSumK (int K, int M, int nbit, int N):
+        K(K), M(M), nbit(nbit), N(N) {
+        // we'll do k steps, each step pushes at most M vals.
+        // Widen before multiplying so K * M cannot overflow int.
+        heap_capacity = size_t(K) * M;
+        assert (int64_t(N) <= (int64_t(1) << nbit));
+        bh_val = new T[heap_capacity];
+        bh_ids = new int64_t[heap_capacity];
+        if (use_seen) {
+            // one bit per possible combination code
+            int64_t n_ids = weight(M);
+            seen.resize ((n_ids + 7) / 8);
+        }
+        for (int m = 0; m < M; m++)
+            ssx.push_back (SSA(N));
+    }
+    /// Value of one unit of term i inside a combination code, i.e.
+    /// 2^(i * nbit). The shift is done on a 64-bit operand: a plain
+    /// int shift is undefined as soon as i * nbit reaches 31.
+    int64_t weight (int i) {
+        return int64_t(1) << (i * nbit);
+    }
+    /// Mask selecting the low nbit bits of a combination code.
+    int64_t term_mask () const {
+        return (int64_t(1) << nbit) - 1;
+    }
+    bool is_seen (int64_t i) {
+        return (seen[i >> 3] >> (i & 7)) & 1;
+    }
+    void mark_seen (int64_t i) {
+        if (use_seen)
+            seen [i >> 3] |= 1 << (i & 7);
+    }
+    /** Compute the K smallest sums.
+     *
+     * @param x      first of M tables of N values each
+     * @param ldx    offset between two consecutive tables in x
+     * @param sums   output, size K, filled in pop order from the min-heap
+     * @param terms  output, size K, combination codes remapped through
+     *               each term's permutation (get_ord)
+     */
+    void run (const T *x, int64_t ldx,
+              T * sums, int64_t * terms) {
+        heap_size = 0;
+        for (int m = 0; m < M; m++) {
+            ssx[m].init(x);
+            x += ldx;
+        }
+        { // initial result: take min for all elements
+            T sum = 0;
+            terms[0] = 0;
+            mark_seen (0);
+            for (int m = 0; m < M; m++) {
+                sum += ssx[m].get_0();
+            }
+            sums[0] = sum;
+            // seed the heap with the M combinations that advance one term
+            for (int m = 0; m < M; m++) {
+                heap_push<HC> (++heap_size, bh_val, bh_ids,
+                               sum + ssx[m].get_diff(1),
+                               weight(m));
+            }
+        }
+        for (int k = 1; k < K; k++) {
+            // pop smallest value from heap
+            if (use_seen) {// skip already seen elements
+                // test heap_size first so bh_ids[0] is never read from
+                // an empty heap and heap_size cannot wrap around
+                while (heap_size > 0 && is_seen (bh_ids[0])) {
+                    heap_pop<HC> (heap_size--, bh_val, bh_ids);
+                }
+            }
+            assert (heap_size > 0);
+            T sum = sums[k] = bh_val[0];
+            int64_t ti = terms[k] = bh_ids[0];
+            if (use_seen) {
+                mark_seen (ti);
+                heap_pop<HC> (heap_size--, bh_val, bh_ids);
+            } else {
+                // drop consecutive duplicates of the code just emitted
+                do {
+                    heap_pop<HC> (heap_size--, bh_val, bh_ids);
+                }  while (heap_size > 0 && bh_ids[0] == ti);
+            }
+            // enqueue followers
+            int64_t ii = ti;
+            for (int m = 0; m < M; m++) {
+                int64_t n = ii & term_mask();
+                ii >>= nbit;
+                // term m is already at its last rank: no follower
+                if (n + 1 >= N) continue;
+                enqueue_follower (ti, m, n, sum);
+            }
+        }
+        /*
+        for (int k = 0; k < K; k++)
+            for (int l = k + 1; l < K; l++)
+                assert (terms[k] != terms[l]);
+        */
+        // convert indices by applying permutation
+        for (int k = 0; k < K; k++) {
+            int64_t ii = terms[k];
+            if (use_seen) {
+                // clear seen for reuse at next loop
+                seen[ii >> 3] = 0;
+            }
+            int64_t ti = 0;
+            for (int m = 0; m < M; m++) {
+                int64_t n = ii & term_mask();
+                ti += int64_t(ssx[m].get_ord(n)) << (nbit * m);
+                ii >>= nbit;
+            }
+            terms[k] = ti;
+        }
+    }
+    /// Push the combination obtained from ti by moving term m from rank n
+    /// to rank n + 1; its sum differs from `sum` by that term's delta.
+    void enqueue_follower (int64_t ti, int m, int n, T sum) {
+        T next_sum = sum + ssx[m].get_diff(n + 1);
+        int64_t next_ti = ti + weight(m);
+        heap_push<HC> (++heap_size, bh_val, bh_ids, next_sum, next_ti);
+    }
+    ~MinSumK () {
+        delete [] bh_ids;
+        delete [] bh_val;
+    }
+};
+} // anonymous namespace
+/// Build an untrained quantizer over d-dimensional vectors; the embedded
+/// product quantizer is constructed from the same (d, M, nbits).
+MultiIndexQuantizer::MultiIndexQuantizer (int d, size_t M, size_t nbits)
+    : Index (d, METRIC_L2),
+      pq (d, M, nbits)
+{
+    // train() has to run before the index is usable
+    is_trained = false;
+    // keep the product quantizer's verbosity in sync with the index
+    pq.verbose = verbose;
+}
+/// Train the product quantizer on the n vectors in x, then set ntotal to
+/// the number of virtual entries, i.e. pq.ksub multiplied pq.M times.
+void MultiIndexQuantizer::train (idx_t n, const float *x)
+{
+    pq.verbose = verbose;
+    pq.train (n, x);
+    is_trained = true;
+    // count virtual elements in index
+    ntotal = 1;
+    int m = 0;
+    while (m < pq.M) {
+        ntotal *= pq.ksub;
+        m++;
+    }
+}
+/** Search the k virtual entries with the smallest summed sub-distances.
+ *
+ * @param n          nb of query vectors
+ * @param x          query vectors, n rows of d floats
+ * @param k          nb of results per query, must be > 0
+ * @param distances  output, size n * k
+ * @param labels     output, size n * k; each label packs one sub-index
+ *                   per sub-quantizer, pq.nbits bits each
+ *
+ * Queries are processed in batches of at most 32768 to bound the size of
+ * the distance-table allocation.
+ */
+void MultiIndexQuantizer::search (idx_t n, const float *x, idx_t k,
+                                  float *distances, idx_t *labels) const {
+    if (n == 0) return;
+    // MinSumK::run always writes result slot 0, so k <= 0 would write
+    // past the (empty) output buffers: reject it explicitly.
+    FAISS_THROW_IF_NOT_MSG (k > 0, "k must be positive");
+    // the allocation just below can be severe...
+    idx_t bs = 32768;
+    if (n > bs) {
+        for (idx_t i0 = 0; i0 < n; i0 += bs) {
+            idx_t i1 = std::min(i0 + bs, n);
+            if (verbose) {
+                // idx_t is not `long` on every platform: cast explicitly
+                // so the arguments always match the conversion specifier
+                printf("MultiIndexQuantizer::search: %lld:%lld / %lld\n",
+                       (long long) i0, (long long) i1, (long long) n);
+            }
+            search (i1 - i0, x + i0 * d, k,
+                    distances + i0 * k,
+                    labels + i0 * k);
+        }
+        return;
+    }
+    // one table of pq.ksub distances per sub-quantizer and per query
+    float * dis_tables = new float [n * pq.ksub * pq.M];
+    ScopeDeleter<float> del (dis_tables);
+    pq.compute_distance_tables (n, x, dis_tables);
+    if (k == 1) {
+        // simple version that just finds the min in each table
+#pragma omp parallel for
+        for (int i = 0; i < n; i++) {
+            const float * dis_table = dis_tables + i * pq.ksub * pq.M;
+            float dis = 0;
+            idx_t label = 0;
+            for (int s = 0; s < pq.M; s++) {
+                float vmin = HUGE_VALF;
+                idx_t lmin = -1;
+                for (idx_t j = 0; j < pq.ksub; j++) {
+                    if (dis_table[j] < vmin) {
+                        vmin = dis_table[j];
+                        lmin = j;
+                    }
+                }
+                dis += vmin;
+                // pack the winning sub-index into its nbits-wide field
+                label |= lmin << (s * pq.nbits);
+                dis_table += pq.ksub;
+            }
+            distances [i] = dis;
+            labels [i] = label;
+        }
+    } else {
+#pragma omp parallel if(n > 1)
+        {
+            // one MinSumK per thread so its buffers are reused across queries
+            MinSumK <float, SemiSortedArray<float>, false>
+                msk(k, pq.M, pq.nbits, pq.ksub);
+#pragma omp for
+            for (int i = 0; i < n; i++) {
+                msk.run (dis_tables + i * pq.ksub * pq.M, pq.ksub,
+                         distances + i * k, labels + i * k);
+            }
+        }
+    }
+}
+/// Rebuild the vector for a virtual entry: key is split into pq.M fields
+/// of pq.nbits bits (lowest field first) and the matching centroid of
+/// each sub-quantizer is copied into consecutive pq.dsub-sized slices
+/// of recons.
+void MultiIndexQuantizer::reconstruct (idx_t key, float * recons) const
+{
+    const int64_t field_mask = (1L << pq.nbits) - 1;
+    int64_t remaining = key;
+    float *out = recons;
+    for (int m = 0; m < pq.M; m++, out += pq.dsub) {
+        int64_t centroid_no = remaining & field_mask;
+        remaining >>= pq.nbits;
+        memcpy (out, pq.get_centroids (m, centroid_no),
+                sizeof(recons[0]) * pq.dsub);
+    }
+}
+/// Not supported: always throws, both arguments are ignored.
+void MultiIndexQuantizer::add (idx_t /*n*/, const float * /*x*/)
+{
+    FAISS_THROW_MSG ("This index has virtual elements, "
+                     "it does not support add");
+}
+/// Not supported: always throws.
+void MultiIndexQuantizer::reset () {
+    FAISS_THROW_MSG (
+        "This index has virtual elements, "
+        "it does not support reset");
+}
+/*****************************************
+ * MultiIndexQuantizer2
+ ******************************************/
+/// Build from an array of M sub-indexes, one per sub-quantizer. Each
+/// sub-index must have dimension pq.dsub. The pointers are stored as is
+/// and own_fields is set to false.
+MultiIndexQuantizer2::MultiIndexQuantizer2 (
+        int d, size_t M, size_t nbits,
+        Index **indexes)
+    : MultiIndexQuantizer (d, M, nbits)
+{
+    assign_indexes.resize (M);
+    for (size_t i = 0; i < M; i++) {
+        // reject a sub-index whose dimension differs from the sub-vector size
+        FAISS_THROW_IF_NOT_MSG (indexes[i]->d == pq.dsub,
+                                "Provided sub-index has incorrect size");
+        assign_indexes[i] = indexes[i];
+    }
+    own_fields = false;
+}
+/// Convenience form for exactly two sub-quantizers. Both sub-indexes must
+/// have dimension pq.dsub. The pointers are stored as is and own_fields
+/// is set to false.
+MultiIndexQuantizer2::MultiIndexQuantizer2 (
+        int d, size_t nbits,
+        Index *assign_index_0,
+        Index *assign_index_1)
+    : MultiIndexQuantizer (d, 2, nbits)
+{
+    FAISS_THROW_IF_NOT_MSG (
+        assign_index_0->d == pq.dsub && assign_index_1->d == pq.dsub,
+        "Provided sub-index has incorrect size");
+    assign_indexes.resize (2);
+    assign_indexes[0] = assign_index_0;
+    assign_indexes[1] = assign_index_1;
+    own_fields = false;
+}
+/// Train as MultiIndexQuantizer does, then add the pq.ksub centroids of
+/// each sub-quantizer to the matching sub-index.
+void MultiIndexQuantizer2::train (idx_t n, const float *x)
+{
+    MultiIndexQuantizer::train (n, x);
+    // add centroids to sub-indexes
+    int m = 0;
+    while (m < pq.M) {
+        const float *centroids_m = pq.get_centroids (m, 0);
+        assign_indexes[m]->add (pq.ksub, centroids_m);
+        m++;
+    }
+}
+/** Search using the sub-indexes instead of full distance tables.
+ *
+ * Each sub-index is queried for the k2 = min(K, pq.ksub) nearest
+ * sub-centroids of the matching slice of every query; the K best
+ * combinations are then enumerated with MinSumK and their per-term ranks
+ * are mapped back to the ids returned by the sub-indexes.
+ *
+ * @param n          nb of query vectors
+ * @param x          query vectors, n rows of d floats
+ * @param K          nb of results per query, must be > 0
+ * @param distances  output, size n * K
+ * @param labels     output, size n * K; pq.nbits bits per sub-quantizer
+ */
+void MultiIndexQuantizer2::search(
+        idx_t n, const float* x, idx_t K,
+        float* distances, idx_t* labels) const
+{
+    if (n == 0) return;
+    // K <= 0 gives k2 <= 0: the work buffers would be empty while
+    // MinSumK::run still writes result slot 0, so reject it explicitly.
+    FAISS_THROW_IF_NOT_MSG (K > 0, "K must be positive");
+    int k2 = std::min(K, int64_t(pq.ksub));
+    int64_t M = pq.M;
+    int64_t dsub = pq.dsub, ksub = pq.ksub;
+    // size (M, n, k2)
+    std::vector<idx_t> sub_ids(n * M * k2);
+    std::vector<float> sub_dis(n * M * k2);
+    std::vector<float> xsub(n * dsub);
+    for (int m = 0; m < M; m++) {
+        // gather slice m of every query into a contiguous (n, dsub) buffer
+        float *xdest = xsub.data();
+        const float *xsrc = x + m * dsub;
+        for (int j = 0; j < n; j++) {
+            memcpy(xdest, xsrc, dsub * sizeof(xdest[0]));
+            xsrc += d;
+            xdest += dsub;
+        }
+        assign_indexes[m]->search(
+              n, xsub.data(), k2,
+              &sub_dis[k2 * n * m],
+              &sub_ids[k2 * n * m]);
+    }
+    if (K == 1) {
+        // simple version that just finds the min in each table
+        assert (k2 == 1);
+        for (int i = 0; i < n; i++) {
+            float dis = 0;
+            idx_t label = 0;
+            for (int m = 0; m < M; m++) {
+                float vmin = sub_dis[i + m * n];
+                idx_t lmin = sub_ids[i + m * n];
+                dis += vmin;
+                label |= lmin << (m * pq.nbits);
+            }
+            distances [i] = dis;
+            labels [i] = label;
+        }
+    } else {
+#pragma omp parallel if(n > 1)
+        {
+            // one MinSumK per thread so its buffers are reused across queries
+            MinSumK <float, PreSortedArray<float>, false>
+                msk(K, pq.M, pq.nbits, k2);
+#pragma omp for
+            for (int i = 0; i < n; i++) {
+                idx_t *li = labels + i * K;
+                msk.run (&sub_dis[i * k2], k2 * n,
+                         distances + i * K, li);
+                // remap ids
+                const idx_t *idmap0 = sub_ids.data() + i * k2;
+                int64_t ld_idmap = k2 * n;
+                int64_t mask1 = ksub - 1L;
+                for (int k = 0; k < K; k++) {
+                    const idx_t *idmap = idmap0;
+                    int64_t vin = li[k];
+                    int64_t vout = 0;
+                    int bs = 0;
+                    for (int m = 0; m < M; m++) {
+                        // s is a rank within sub-index m's result list
+                        int64_t s = vin & mask1;
+                        vin >>= pq.nbits;
+                        vout |= idmap[s] << bs;
+                        bs += pq.nbits;
+                        idmap += ld_idmap;
+                    }
+                    li[k] = vout;
+                }
+            }
+        }
+    }
+}
+} // namespace faiss