RubyGems - faiss - Versions diffs - 0.2.3 → 0.2.4 - Mend

faiss 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/LICENSE.txt +1 -1
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/Clustering.cpp +32 -0
data/vendor/faiss/faiss/Clustering.h +14 -0
data/vendor/faiss/faiss/Index.h +1 -1
data/vendor/faiss/faiss/Index2Layer.cpp +19 -92
data/vendor/faiss/faiss/Index2Layer.h +2 -16
data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +407 -0
data/vendor/faiss/faiss/{IndexResidual.h → IndexAdditiveQuantizer.h} +101 -58
data/vendor/faiss/faiss/IndexFlat.cpp +22 -52
data/vendor/faiss/faiss/IndexFlat.h +9 -15
data/vendor/faiss/faiss/IndexFlatCodes.cpp +67 -0
data/vendor/faiss/faiss/IndexFlatCodes.h +47 -0
data/vendor/faiss/faiss/IndexIVF.cpp +79 -7
data/vendor/faiss/faiss/IndexIVF.h +25 -7
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +316 -0
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +121 -0
data/vendor/faiss/faiss/IndexIVFFlat.cpp +9 -12
data/vendor/faiss/faiss/IndexIVFPQ.cpp +5 -4
data/vendor/faiss/faiss/IndexIVFPQ.h +1 -1
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +60 -39
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +21 -6
data/vendor/faiss/faiss/IndexLSH.cpp +4 -30
data/vendor/faiss/faiss/IndexLSH.h +2 -15
data/vendor/faiss/faiss/IndexNNDescent.cpp +0 -2
data/vendor/faiss/faiss/IndexNSG.cpp +0 -2
data/vendor/faiss/faiss/IndexPQ.cpp +2 -51
data/vendor/faiss/faiss/IndexPQ.h +2 -17
data/vendor/faiss/faiss/IndexRefine.cpp +28 -0
data/vendor/faiss/faiss/IndexRefine.h +10 -0
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -28
data/vendor/faiss/faiss/IndexScalarQuantizer.h +2 -16
data/vendor/faiss/faiss/VectorTransform.cpp +2 -1
data/vendor/faiss/faiss/VectorTransform.h +3 -0
data/vendor/faiss/faiss/clone_index.cpp +3 -2
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +2 -2
data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +257 -24
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +69 -9
data/vendor/faiss/faiss/impl/HNSW.cpp +10 -5
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +393 -210
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +100 -28
data/vendor/faiss/faiss/impl/NSG.cpp +0 -3
data/vendor/faiss/faiss/impl/NSG.h +1 -1
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +357 -47
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +65 -7
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +12 -19
data/vendor/faiss/faiss/impl/index_read.cpp +102 -19
data/vendor/faiss/faiss/impl/index_write.cpp +66 -16
data/vendor/faiss/faiss/impl/io.cpp +1 -1
data/vendor/faiss/faiss/impl/io_macros.h +20 -0
data/vendor/faiss/faiss/impl/kmeans1d.cpp +301 -0
data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
data/vendor/faiss/faiss/index_factory.cpp +585 -414
data/vendor/faiss/faiss/index_factory.h +3 -0
data/vendor/faiss/faiss/utils/distances.cpp +4 -2
data/vendor/faiss/faiss/utils/distances.h +36 -3
data/vendor/faiss/faiss/utils/distances_simd.cpp +50 -0
data/vendor/faiss/faiss/utils/utils.h +1 -1
metadata +12 -5
data/vendor/faiss/faiss/IndexResidual.cpp +0 -291

data/vendor/faiss/faiss/impl/index_read.cpp CHANGED Viewed

@@ -25,9 +25,11 @@
 #include <faiss/invlists/InvertedListsIOHook.h>
 #include <faiss/Index2Layer.h>
+#include <faiss/IndexAdditiveQuantizer.h>
 #include <faiss/IndexFlat.h>
 #include <faiss/IndexHNSW.h>
 #include <faiss/IndexIVF.h>
+#include <faiss/IndexIVFAdditiveQuantizer.h>
 #include <faiss/IndexIVFFlat.h>
 #include <faiss/IndexIVFPQ.h>
 #include <faiss/IndexIVFPQFastScan.h>
@@ -40,7 +42,6 @@
 #include <faiss/IndexPQFastScan.h>
 #include <faiss/IndexPreTransform.h>
 #include <faiss/IndexRefine.h>
-#include <faiss/IndexResidual.h>
 #include <faiss/IndexScalarQuantizer.h>
 #include <faiss/MetaIndexes.h>
 #include <faiss/VectorTransform.h>
@@ -77,16 +78,22 @@ VectorTransform* read_VectorTransform(IOReader* f) {
     VectorTransform* vt = nullptr;
     if (h == fourcc("rrot") || h == fourcc("PCAm") || h == fourcc("LTra") ||
-        h == fourcc("PcAm") || h == fourcc("Viqm")) {
+        h == fourcc("PcAm") || h == fourcc("Viqm") || h == fourcc("Pcam")) {
         LinearTransform* lt = nullptr;
         if (h == fourcc("rrot")) {
             lt = new RandomRotationMatrix();
-        } else if (h == fourcc("PCAm") || h == fourcc("PcAm")) {
+        } else if (
+                h == fourcc("PCAm") || h == fourcc("PcAm") ||
+                h == fourcc("Pcam")) {
             PCAMatrix* pca = new PCAMatrix();
             READ1(pca->eigen_power);
+            if (h == fourcc("Pcam")) {
+                READ1(pca->epsilon);
+            }
             READ1(pca->random_rotation);
-            if (h == fourcc("PcAm"))
+            if (h != fourcc("PCAm")) {
                 READ1(pca->balanced_bins);
+            }
             READVECTOR(pca->mean);
             READVECTOR(pca->eigenvalues);
             READVECTOR(pca->PCAMat);
@@ -139,9 +146,10 @@ VectorTransform* read_VectorTransform(IOReader* f) {
         vt = itqt;
     } else {
         FAISS_THROW_FMT(
-                "fourcc %ud (\"%s\") not recognized",
+                "fourcc %ud (\"%s\") not recognized in %s",
                 h,
-                fourcc_inv_printable(h).c_str());
+                fourcc_inv_printable(h).c_str(),
+                f->name.c_str());
     }
     READ1(vt->d_in);
     READ1(vt->d_out);
@@ -239,15 +247,58 @@ static void read_ProductQuantizer(ProductQuantizer* pq, IOReader* f) {
     READVECTOR(pq->centroids);
 }
-static void read_ResidualQuantizer(ResidualQuantizer* rq, IOReader* f) {
+static void read_ResidualQuantizer_old(ResidualQuantizer* rq, IOReader* f) { // legacy field order; read_index selects it for the "IxRQ" fourcc
     READ1(rq->d);
     READ1(rq->M);
     READVECTOR(rq->nbits);
-    rq->set_derived_values();
     READ1(rq->is_trained);
     READ1(rq->train_type);
     READ1(rq->max_beam_size);
     READVECTOR(rq->codebooks);
+    READ1(rq->search_type); // these three fields follow the codebooks in the old layout; read_index previously read them into the index object
+    READ1(rq->norm_min);
+    READ1(rq->norm_max);
+    rq->set_derived_values(); // now called once every field has been read
+}
+static void read_AdditiveQuantizer(AdditiveQuantizer* aq, IOReader* f) { // common prefix shared by read_ResidualQuantizer and read_LocalSearchQuantizer; field order mirrors write_AdditiveQuantizer
+    READ1(aq->d);
+    READ1(aq->M);
+    READVECTOR(aq->nbits);
+    READ1(aq->is_trained);
+    READVECTOR(aq->codebooks);
+    READ1(aq->search_type);
+    READ1(aq->norm_min);
+    READ1(aq->norm_max);
+    if (aq->search_type == AdditiveQuantizer::ST_norm_cqint8 ||
+        aq->search_type == AdditiveQuantizer::ST_norm_cqint4) { // qnorm.codes is present in the stream only for these two search types
+        READXBVECTOR(aq->qnorm.codes);
+    }
+    aq->set_derived_values(); // called after every serialized field above has been read
+}
+static void read_ResidualQuantizer(ResidualQuantizer* rq, IOReader* f) { // current layout: common AdditiveQuantizer fields first, then the two ResidualQuantizer fields
+    read_AdditiveQuantizer(rq, f);
+    READ1(rq->train_type);
+    READ1(rq->max_beam_size);
+    if (!(rq->train_type & ResidualQuantizer::Skip_codebook_tables)) { // nothing is read from f for the tables; they are recomputed here unless the flag is set
+        rq->compute_codebook_tables();
+    }
+}
+static void read_LocalSearchQuantizer(LocalSearchQuantizer* lsq, IOReader* f) { // common AdditiveQuantizer fields first, then the LSQ fields in the order write_LocalSearchQuantizer emits them
+    read_AdditiveQuantizer(lsq, f);
+    READ1(lsq->K);
+    READ1(lsq->train_iters);
+    READ1(lsq->encode_ils_iters);
+    READ1(lsq->train_ils_iters);
+    READ1(lsq->icm_iters);
+    READ1(lsq->p);
+    READ1(lsq->lambd);
+    READ1(lsq->chunk_size);
+    READ1(lsq->random_seed);
+    READ1(lsq->nperts);
+    READ1(lsq->update_codebooks_with_double);
 }
 static void read_ScalarQuantizer(ScalarQuantizer* ivsc, IOReader* f) {
@@ -422,8 +473,10 @@ Index* read_index(IOReader* f, int io_flags) {
             idxf = new IndexFlat();
         }
         read_index_header(idxf, f);
-        READVECTOR(idxf->xb);
-        FAISS_THROW_IF_NOT(idxf->xb.size() == idxf->ntotal * idxf->d);
+        idxf->code_size = idxf->d * sizeof(float);
+        READXBVECTOR(idxf->codes);
+        FAISS_THROW_IF_NOT(
+                idxf->codes.size() == idxf->ntotal * idxf->code_size);
         // leak!
         idx = idxf;
     } else if (h == fourcc("IxHE") || h == fourcc("IxHe")) {
@@ -433,7 +486,9 @@ Index* read_index(IOReader* f, int io_flags) {
         READ1(idxl->rotate_data);
         READ1(idxl->train_thresholds);
         READVECTOR(idxl->thresholds);
-        READ1(idxl->bytes_per_vec);
+        int code_size_i;
+        READ1(code_size_i);
+        idxl->code_size = code_size_i;
         if (h == fourcc("IxHE")) {
             FAISS_THROW_IF_NOT_FMT(
                     idxl->nbits % 64 == 0,
@@ -441,7 +496,7 @@ Index* read_index(IOReader* f, int io_flags) {
                     "nbits multiple of 64 (got %d)",
                     (int)idxl->nbits);
             // leak
-            idxl->bytes_per_vec *= 8;
+            idxl->code_size *= 8;
         }
         {
             RandomRotationMatrix* rrot = dynamic_cast<RandomRotationMatrix*>(
@@ -454,7 +509,7 @@ Index* read_index(IOReader* f, int io_flags) {
         FAISS_THROW_IF_NOT(
                 idxl->rrot.d_in == idxl->d && idxl->rrot.d_out == idxl->nbits);
         FAISS_THROW_IF_NOT(
-                idxl->codes.size() == idxl->ntotal * idxl->bytes_per_vec);
+                idxl->codes.size() == idxl->ntotal * idxl->code_size);
         idx = idxl;
     } else if (
             h == fourcc("IxPQ") || h == fourcc("IxPo") || h == fourcc("IxPq")) {
@@ -462,6 +517,7 @@ Index* read_index(IOReader* f, int io_flags) {
         IndexPQ* idxp = new IndexPQ();
         read_index_header(idxp, f);
         read_ProductQuantizer(&idxp->pq, f);
+        idxp->code_size = idxp->pq.code_size;
         READVECTOR(idxp->codes);
         if (h == fourcc("IxPo") || h == fourcc("IxPq")) {
             READ1(idxp->search_type);
@@ -475,13 +531,21 @@ Index* read_index(IOReader* f, int io_flags) {
             idxp->metric_type = METRIC_L2;
         }
         idx = idxp;
-    } else if (h == fourcc("IxRQ")) {
-        IndexResidual* idxr = new IndexResidual();
+    } else if (h == fourcc("IxRQ") || h == fourcc("IxRq")) {
+        IndexResidualQuantizer* idxr = new IndexResidualQuantizer();
         read_index_header(idxr, f);
-        read_ResidualQuantizer(&idxr->rq, f);
-        READ1(idxr->search_type);
-        READ1(idxr->norm_min);
-        READ1(idxr->norm_max);
+        if (h == fourcc("IxRQ")) {
+            read_ResidualQuantizer_old(&idxr->rq, f);
+        } else {
+            read_ResidualQuantizer(&idxr->rq, f);
+        }
+        READ1(idxr->code_size);
+        READVECTOR(idxr->codes);
+        idx = idxr;
+    } else if (h == fourcc("IxLS")) {
+        auto idxr = new IndexLocalSearchQuantizer();
+        read_index_header(idxr, f);
+        read_LocalSearchQuantizer(&idxr->lsq, f);
         READ1(idxr->code_size);
         READVECTOR(idxr->codes);
         idx = idxr;
@@ -571,6 +635,25 @@ Index* read_index(IOReader* f, int io_flags) {
         }
         read_InvertedLists(ivsc, f, io_flags);
         idx = ivsc;
+    } else if (h == fourcc("IwLS") || h == fourcc("IwRQ")) {
+        bool is_LSQ = h == fourcc("IwLS");
+        IndexIVFAdditiveQuantizer* iva;
+        if (is_LSQ) {
+            iva = new IndexIVFLocalSearchQuantizer();
+        } else {
+            iva = new IndexIVFResidualQuantizer();
+        }
+        read_ivf_header(iva, f);
+        READ1(iva->code_size);
+        if (is_LSQ) {
+            read_LocalSearchQuantizer((LocalSearchQuantizer*)iva->aq, f);
+        } else {
+            read_ResidualQuantizer((ResidualQuantizer*)iva->aq, f);
+        }
+        READ1(iva->by_residual);
+        READ1(iva->use_precomputed_table);
+        read_InvertedLists(iva, f, io_flags);
+        idx = iva;
     } else if (h == fourcc("IwSh")) {
         IndexIVFSpectralHash* ivsp = new IndexIVFSpectralHash();
         read_ivf_header(ivsp, f);

data/vendor/faiss/faiss/impl/index_write.cpp CHANGED Viewed

@@ -26,9 +26,11 @@
 #include <faiss/utils/hamming.h>
 #include <faiss/Index2Layer.h>
+#include <faiss/IndexAdditiveQuantizer.h>
 #include <faiss/IndexFlat.h>
 #include <faiss/IndexHNSW.h>
 #include <faiss/IndexIVF.h>
+#include <faiss/IndexIVFAdditiveQuantizer.h>
 #include <faiss/IndexIVFFlat.h>
 #include <faiss/IndexIVFPQ.h>
 #include <faiss/IndexIVFPQFastScan.h>
@@ -41,7 +43,6 @@
 #include <faiss/IndexPQFastScan.h>
 #include <faiss/IndexPreTransform.h>
 #include <faiss/IndexRefine.h>
-#include <faiss/IndexResidual.h>
 #include <faiss/IndexScalarQuantizer.h>
 #include <faiss/MetaIndexes.h>
 #include <faiss/VectorTransform.h>
@@ -95,9 +96,10 @@ void write_VectorTransform(const VectorTransform* vt, IOWriter* f) {
             uint32_t h = fourcc("rrot");
             WRITE1(h);
         } else if (const PCAMatrix* pca = dynamic_cast<const PCAMatrix*>(lt)) {
-            uint32_t h = fourcc("PcAm");
+            uint32_t h = fourcc("Pcam");
             WRITE1(h);
             WRITE1(pca->eigen_power);
+            WRITE1(pca->epsilon);
             WRITE1(pca->random_rotation);
             WRITE1(pca->balanced_bins);
             WRITEVECTOR(pca->mean);
@@ -158,14 +160,42 @@ void write_ProductQuantizer(const ProductQuantizer* pq, IOWriter* f) {
     WRITEVECTOR(pq->centroids);
 }
-void write_ResidualQuantizer(const ResidualQuantizer* rq, IOWriter* f) {
-    WRITE1(rq->d);
-    WRITE1(rq->M);
-    WRITEVECTOR(rq->nbits);
-    WRITE1(rq->is_trained);
+static void write_AdditiveQuantizer(const AdditiveQuantizer* aq, IOWriter* f) { // common prefix shared by write_ResidualQuantizer and write_LocalSearchQuantizer; read_AdditiveQuantizer consumes the same field order
+    WRITE1(aq->d);
+    WRITE1(aq->M);
+    WRITEVECTOR(aq->nbits);
+    WRITE1(aq->is_trained);
+    WRITEVECTOR(aq->codebooks);
+    WRITE1(aq->search_type);
+    WRITE1(aq->norm_min);
+    WRITE1(aq->norm_max);
+    if (aq->search_type == AdditiveQuantizer::ST_norm_cqint8 ||
+        aq->search_type == AdditiveQuantizer::ST_norm_cqint4) { // qnorm.codes is emitted only for these two search types
+        WRITEXBVECTOR(aq->qnorm.codes);
+    }
+}
+static void write_ResidualQuantizer(const ResidualQuantizer* rq, IOWriter* f) { // common AdditiveQuantizer fields first, then train_type and max_beam_size
+    write_AdditiveQuantizer(rq, f); // the codebooks are written here, inside the common prefix
     WRITE1(rq->train_type);
     WRITE1(rq->max_beam_size);
-    WRITEVECTOR(rq->codebooks);
+}
+static void write_LocalSearchQuantizer( // common AdditiveQuantizer fields first, then the LSQ fields; read_LocalSearchQuantizer reads them back in this order
+        const LocalSearchQuantizer* lsq,
+        IOWriter* f) {
+    write_AdditiveQuantizer(lsq, f);
+    WRITE1(lsq->K);
+    WRITE1(lsq->train_iters);
+    WRITE1(lsq->encode_ils_iters);
+    WRITE1(lsq->train_ils_iters);
+    WRITE1(lsq->icm_iters);
+    WRITE1(lsq->p);
+    WRITE1(lsq->lambd);
+    WRITE1(lsq->chunk_size);
+    WRITE1(lsq->random_seed);
+    WRITE1(lsq->nperts);
+    WRITE1(lsq->update_codebooks_with_double);
 }
 static void write_ScalarQuantizer(const ScalarQuantizer* ivsc, IOWriter* f) {
@@ -315,7 +345,7 @@ void write_index(const Index* idx, IOWriter* f) {
                                                                  : "IxFl");
         WRITE1(h);
         write_index_header(idx, f);
-        WRITEVECTOR(idxf->xb);
+        WRITEXBVECTOR(idxf->codes);
     } else if (const IndexLSH* idxl = dynamic_cast<const IndexLSH*>(idx)) {
         uint32_t h = fourcc("IxHe");
         WRITE1(h);
@@ -324,7 +354,8 @@ void write_index(const Index* idx, IOWriter* f) {
         WRITE1(idxl->rotate_data);
         WRITE1(idxl->train_thresholds);
         WRITEVECTOR(idxl->thresholds);
-        WRITE1(idxl->bytes_per_vec);
+        int code_size_i = idxl->code_size;
+        WRITE1(code_size_i);
         write_VectorTransform(&idxl->rrot, f);
         WRITEVECTOR(idxl->codes);
     } else if (const IndexPQ* idxp = dynamic_cast<const IndexPQ*>(idx)) {
@@ -338,15 +369,20 @@ void write_index(const Index* idx, IOWriter* f) {
         WRITE1(idxp->encode_signs);
         WRITE1(idxp->polysemous_ht);
     } else if (
-            const IndexResidual* idxr =
-                    dynamic_cast<const IndexResidual*>(idx)) {
-        uint32_t h = fourcc("IxRQ");
+            const IndexResidualQuantizer* idxr =
+                    dynamic_cast<const IndexResidualQuantizer*>(idx)) {
+        uint32_t h = fourcc("IxRq");
         WRITE1(h);
         write_index_header(idx, f);
         write_ResidualQuantizer(&idxr->rq, f);
-        WRITE1(idxr->search_type);
-        WRITE1(idxr->norm_min);
-        WRITE1(idxr->norm_max);
+        WRITE1(idxr->code_size);
+        WRITEVECTOR(idxr->codes);
+    } else if (
+            auto* idxr = dynamic_cast<const IndexLocalSearchQuantizer*>(idx)) {
+        uint32_t h = fourcc("IxLS");
+        WRITE1(h);
+        write_index_header(idx, f);
+        write_LocalSearchQuantizer(&idxr->lsq, f);
         WRITE1(idxr->code_size);
         WRITEVECTOR(idxr->codes);
     } else if (
@@ -421,6 +457,20 @@ void write_index(const Index* idx, IOWriter* f) {
         WRITE1(ivsc->code_size);
         WRITE1(ivsc->by_residual);
         write_InvertedLists(ivsc->invlists, f);
+    } else if (auto iva = dynamic_cast<const IndexIVFAdditiveQuantizer*>(idx)) {
+        bool is_LSQ = dynamic_cast<const IndexIVFLocalSearchQuantizer*>(iva);
+        uint32_t h = fourcc(is_LSQ ? "IwLS" : "IwRQ");
+        WRITE1(h);
+        write_ivf_header(iva, f);
+        WRITE1(iva->code_size);
+        if (is_LSQ) {
+            write_LocalSearchQuantizer((LocalSearchQuantizer*)iva->aq, f);
+        } else {
+            write_ResidualQuantizer((ResidualQuantizer*)iva->aq, f);
+        }
+        WRITE1(iva->by_residual);
+        WRITE1(iva->use_precomputed_table);
+        write_InvertedLists(iva->invlists, f);
     } else if (
             const IndexIVFSpectralHash* ivsp =
                     dynamic_cast<const IndexIVFSpectralHash*>(idx)) {

data/vendor/faiss/faiss/impl/io.cpp CHANGED Viewed

@@ -240,7 +240,7 @@ uint32_t fourcc(const std::string& sx) {
 void fourcc_inv(uint32_t x, char str[5]) {
     *(uint32_t*)str = x;
-    str[5] = 0;
+    str[4] = 0; // terminator goes right after the four bytes copied from x; index 5 is outside a 5-char buffer
 }
 std::string fourcc_inv(uint32_t x) {

data/vendor/faiss/faiss/impl/io_macros.h CHANGED Viewed

@@ -66,3 +66,23 @@
         WRITEANDCHECK(&size, 1);           \
         WRITEANDCHECK((vec).data(), size); \
     }
+/* read/write xb vector for backwards compatibility of IndexFlat
+ *
+ * WRITEXBVECTOR(vec): requires (vec).size() to be a multiple of 4, writes
+ * (vec).size() / 4 as the stored length, then writes all (vec).size()
+ * elements.
+ * READXBVECTOR(vec): reads the stored length, rejects values of 2^40 or
+ * more, multiplies the length by 4, resizes vec to that many elements and
+ * reads them.
+ *
+ * Presumably vec holds bytes and the stored length is the number of floats
+ * of the former xb field -- TODO confirm against IndexFlat.
+ * NOTE(review): size is a size_t, so the `size >= 0` half of the read check
+ * is always true.
+ */
+#define WRITEXBVECTOR(vec)                         \
+    {                                              \
+        FAISS_THROW_IF_NOT((vec).size() % 4 == 0); \
+        size_t size = (vec).size() / 4;            \
+        WRITEANDCHECK(&size, 1);                   \
+        WRITEANDCHECK((vec).data(), size * 4);     \
+    }
+#define READXBVECTOR(vec)                                            \
+    {                                                                \
+        size_t size;                                                 \
+        READANDCHECK(&size, 1);                                      \
+        FAISS_THROW_IF_NOT(size >= 0 && size < (uint64_t{1} << 40)); \
+        size *= 4;                                                   \
+        (vec).resize(size);                                          \
+        READANDCHECK((vec).data(), size);                            \
+    }

data/vendor/faiss/faiss/impl/kmeans1d.cpp ADDED Viewed

@@ -0,0 +1,301 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <numeric>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include <faiss/Index.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/kmeans1d.h>
+namespace faiss {
+using idx_t = Index::idx_t;
+using LookUpFunc = std::function<float(idx_t, idx_t)>;
+void reduce( // REDUCE step used by smawk_impl: keeps at most rows.size() of the candidate columns
+        const std::vector<idx_t>& rows, // row ids of the current sub-matrix
+        const std::vector<idx_t>& input_cols, // candidate column ids, visited in the given order
+        const LookUpFunc& lookup, // lookup(row, col) returns the matrix entry
+        std::vector<idx_t>& output_cols) { // out: surviving column ids; used as a stack and appended to
+    for (idx_t col : input_cols) {
+        while (!output_cols.empty()) {
+            idx_t row = rows[output_cols.size() - 1]; // compare on the row paired with the current stack top
+            float a = lookup(row, col);
+            float b = lookup(row, output_cols.back());
+            if (a >= b) { // defeated
+                break;
+            }
+            output_cols.pop_back(); // col is strictly smaller on this row, so the stack top is dropped
+        }
+        if (output_cols.size() < rows.size()) { // never keep more columns than there are rows
+            output_cols.push_back(col);
+        }
+    }
+}
+void interpolate( // fills argmins for rows[0], rows[2], ... using the argmins already stored for rows[1], rows[3], ... as scan limits
+        const std::vector<idx_t>& rows, // row ids of the current sub-matrix
+        const std::vector<idx_t>& cols, // candidate column ids
+        const LookUpFunc& lookup, // lookup(row, col) returns the matrix entry
+        idx_t* argmins) { // in/out, indexed by row id
+    std::unordered_map<idx_t, idx_t> idx_to_col; // column id -> position inside cols
+    for (idx_t idx = 0; idx < cols.size(); ++idx) {
+        idx_to_col[cols[idx]] = idx;
+    }
+    idx_t start = 0;
+    for (idx_t r = 0; r < rows.size(); r += 2) {
+        idx_t row = rows[r];
+        idx_t end = cols.size() - 1;
+        if (r < rows.size() - 1) { // a following row exists: stop the scan at the position of its argmin column
+            idx_t idx = argmins[rows[r + 1]];
+            end = idx_to_col[idx];
+        }
+        idx_t argmin = cols[start];
+        float min = lookup(row, argmin);
+        for (idx_t c = start + 1; c <= end; c++) {
+            float value = lookup(row, cols[c]);
+            if (value < min) { // strict comparison: on ties the earliest position is kept
+                argmin = cols[c];
+                min = value;
+            }
+        }
+        argmins[row] = argmin;
+        start = end; // the next processed row resumes from where this scan stopped
+    }
+}
+/** SMAWK algorithm. Find the row minima of a monotone matrix.
+ *
+ * The matrix is never materialized: entries are obtained on demand through
+ * lookup(row, col). The function first prunes the columns with reduce() when
+ * there are more columns than rows, then recurses on the odd-indexed rows and
+ * finally fills the even-indexed rows with interpolate().
+ *
+ * @param rows        row ids of the sub-matrix; nothing is done when empty
+ * @param input_cols  candidate column ids
+ * @param lookup      lookup(row, col) returns the matrix entry
+ * @param argmins     output array indexed by row id, receives the column id
+ *                    selected for each row in rows
+ *
+ * References:
+ *   1. http://web.cs.unlv.edu/larmore/Courses/CSC477/monge.pdf
+ *   2. https://gist.github.com/dstein64/8e94a6a25efc1335657e910ff525f405
+ *   3. https://github.com/dstein64/kmeans1d
+ */
+void smawk_impl(
+        const std::vector<idx_t>& rows,
+        const std::vector<idx_t>& input_cols,
+        const LookUpFunc& lookup,
+        idx_t* argmins) {
+    if (rows.size() == 0) { // recursion stops here
+        return;
+    }
+    /**********************************
+     * REDUCE
+     **********************************/
+    auto ptr = &input_cols;
+    std::vector<idx_t> survived_cols; // survived columns
+    if (rows.size() < input_cols.size()) { // pruning is only attempted when there are more columns than rows
+        reduce(rows, input_cols, lookup, survived_cols);
+        ptr = &survived_cols;
+    }
+    auto& cols = *ptr; // avoid memory copy
+    /**********************************
+     * INTERPOLATE
+     **********************************/
+    // call recursively on odd-indexed rows
+    std::vector<idx_t> odd_rows;
+    for (idx_t i = 1; i < rows.size(); i += 2) {
+        odd_rows.push_back(rows[i]);
+    }
+    smawk_impl(odd_rows, cols, lookup, argmins);
+    // interpolate the even-indexed rows
+    interpolate(rows, cols, lookup, argmins);
+}
+void smawk( // entry point for an implicit nrows x ncols matrix described by lookup
+        const idx_t nrows,
+        const idx_t ncols,
+        const LookUpFunc& lookup, // lookup(i, j) returns the entry at row i, column j
+        idx_t* argmins) { // out: written through smawk_impl, indexed by row id
+    std::vector<idx_t> rows(nrows);
+    std::vector<idx_t> cols(ncols);
+    std::iota(std::begin(rows), std::end(rows), 0); // rows = 0, 1, ..., nrows - 1
+    std::iota(std::begin(cols), std::end(cols), 0); // cols = 0, 1, ..., ncols - 1
+    smawk_impl(rows, cols, lookup, argmins);
+}
+void smawk( // overload for an explicit matrix held in a flat float array
+        const idx_t nrows,
+        const idx_t ncols,
+        const float* x, // read as x[i * ncols + j], i.e. row-major with ncols entries per row
+        idx_t* argmins) {
+    auto lookup = [&x, &ncols](idx_t i, idx_t j) { return x[i * ncols + j]; };
+    smawk(nrows, ncols, lookup, argmins); // forwards to the lookup-based overload
+}
+namespace {
+class CostCalculator { // answers "cost of putting elements i..j into one cluster" from two prefix-sum tables
+    // Prefix sums are kept in double: the result would be inaccurate if we use float
+    std::vector<double> cumsum; // cumsum[k] = vec[0] + ... + vec[k - 1]
+    std::vector<double> cumsum2; // cumsum2[k] = vec[0]^2 + ... + vec[k - 1]^2
+   public:
+    CostCalculator(const std::vector<float>& vec, idx_t n) { // only the first n entries of vec are used
+        cumsum.push_back(0.0);
+        cumsum2.push_back(0.0);
+        for (idx_t i = 0; i < n; ++i) {
+            float x = vec[i];
+            cumsum.push_back(x + cumsum[i]);
+            cumsum2.push_back(x * x + cumsum2[i]);
+        }
+    }
+    float operator()(idx_t i, idx_t j) { // sum over t in [i, j] of (vec[t] - mu)^2, mu being the mean of that range
+        if (j < i) { // empty range costs nothing
+            return 0.0f;
+        }
+        auto mu = (cumsum[j + 1] - cumsum[i]) / (j - i + 1);
+        auto result = cumsum2[j + 1] - cumsum2[i]; // sum of squares
+        result += (j - i + 1) * (mu * mu); // + count * mu^2
+        result -= (2 * mu) * (cumsum[j + 1] - cumsum[i]); // - 2 * mu * sum
+        return float(result);
+    }
+};
+template <class T>
+class Matrix { // nrows x ncols table of T kept in a single vector
+    std::vector<T> data;
+    idx_t nrows;
+    idx_t ncols;
+   public:
+    Matrix(idx_t nrows, idx_t ncols) {
+        this->nrows = nrows;
+        this->ncols = ncols;
+        data.resize(nrows * ncols);
+    }
+    inline T& at(idx_t i, idx_t j) { // element at row i, column j; the index is not range-checked
+        return data[i * ncols + j];
+    }
+};
+} // anonymous namespace
+double kmeans1d(const float* x, size_t n, size_t nclusters, float* centroids) { // clusters the n scalars of x into nclusters groups, writes nclusters means to centroids and returns the imbalance factor computed at the end
+    FAISS_THROW_IF_NOT(n >= nclusters); // NOTE(review): nclusters == 0 with n > 0 passes this check, after which D has no storage and D.at(0, m) below is out of range
+    // corner case: one point per cluster, x is copied to centroids as is (unsorted) and 0 is returned
+    if (n == nclusters) {
+        memcpy(centroids, x, n * sizeof(*x));
+        return 0.0f;
+    }
+    /***************************************************
+     * sort in ascending order, O(NlogN) in time
+     * (a copy is sorted, x itself is left untouched)
+     ***************************************************/
+    std::vector<float> arr(x, x + n);
+    std::sort(arr.begin(), arr.end());
+    /***************************************************
+    dynamic programming algorithm
+    Reference: https://arxiv.org/abs/1701.07204
+    -------------------------------
+    Assume x is already sorted in ascending order.
+    N: number of points
+    K: number of clusters
+    CC(i, j): the cost of grouping xi,...,xj into one cluster
+    D[k][m]:  the cost of optimally clustering x1,...,xm into k clusters
+    T[k][m]:  the start index of the k-th cluster
+    The DP process is as follows:
+        D[k][m] = min_i D[k − 1][i − 1] + CC(i, m)
+        T[k][m] = argmin_i D[k − 1][i − 1] + CC(i, m)
+    This could be solved in O(KN^2) time and O(KN) space.
+    To further reduce the time complexity, we use the SMAWK algorithm to
+    solve the argmin problem as follows:
+    For each k:
+        C[m][i] = D[k − 1][i − 1] + CC(i, m)
+        Here C is an n x n totally monotone matrix.
+        We could find the row minima by SMAWK in O(N) time.
+    Now the time complexity is reduced from O(KN^2) to O(KN).
+    ****************************************************/
+    CostCalculator CC(arr, n);
+    Matrix<float> D(nclusters, n);
+    Matrix<idx_t> T(nclusters, n);
+    for (idx_t m = 0; m < n; m++) { // row 0: a single cluster covering arr[0..m]
+        D.at(0, m) = CC(0, m);
+        T.at(0, m) = 0;
+    }
+    std::vector<idx_t> indices(nclusters, 0); // NOTE(review): not referenced again in this function
+    for (idx_t k = 1; k < nclusters; ++k) {
+        // we define C here
+        auto C = [&D, &CC, &k](idx_t m, idx_t i) {
+            if (i == 0) { // the last cluster starts at 0, so there is no previous row to add
+                return CC(i, m);
+            }
+            idx_t col = std::min(m, i - 1); // clamped so that i > m still indexes inside row k - 1 (CC(i, m) is 0 in that case)
+            return D.at(k - 1, col) + CC(i, m);
+        };
+        std::vector<idx_t> argmins(n); // argmin of each row
+        smawk(n, n, C, argmins.data());
+        for (idx_t m = 0; m < argmins.size(); m++) {
+            idx_t idx = argmins[m];
+            D.at(k, m) = C(m, idx);
+            T.at(k, m) = idx;
+        }
+    }
+    /***************************************************
+    compute centroids by backtracking
+           T[K - 1][T[K][N] - 1]        T[K][N]        N
+    --------------|------------------------|-----------|
+                  |     cluster K - 1      | cluster K |
+    ****************************************************/
+    // for imbalance factor: tot accumulates the cluster sizes, uf their squares
+    double tot = 0.0, uf = 0.0;
+    idx_t end = n;
+    for (idx_t k = nclusters - 1; k >= 0; k--) { // idx_t is compared against 0 from above, walking clusters from last to first
+        idx_t start = T.at(k, end - 1);
+        float sum = std::accumulate(&arr[start], &arr[end], 0.0f); // NOTE(review): on the first iteration end == n, so &arr[end] indexes one past the last element; arr.data() + end would avoid that
+        idx_t size = end - start;
+        FAISS_THROW_IF_NOT_FMT(
+                size > 0, "Cluster %d: size %d", int(k), int(size));
+        centroids[k] = sum / size; // mean of the sorted range [start, end)
+        end = start;
+        tot += size;
+        uf += size * double(size);
+    }
+    uf = uf * nclusters / (tot * tot); // sum(size^2) * nclusters / (sum(size))^2, which is 1 when all clusters have the same size
+    return uf;
+}
+} // namespace faiss