RubyGems - faiss - Versions diffs - 0.2.3 → 0.2.4 - Mend

faiss 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63)

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/LICENSE.txt +1 -1
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/Clustering.cpp +32 -0
data/vendor/faiss/faiss/Clustering.h +14 -0
data/vendor/faiss/faiss/Index.h +1 -1
data/vendor/faiss/faiss/Index2Layer.cpp +19 -92
data/vendor/faiss/faiss/Index2Layer.h +2 -16
data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +407 -0
data/vendor/faiss/faiss/{IndexResidual.h → IndexAdditiveQuantizer.h} +101 -58
data/vendor/faiss/faiss/IndexFlat.cpp +22 -52
data/vendor/faiss/faiss/IndexFlat.h +9 -15
data/vendor/faiss/faiss/IndexFlatCodes.cpp +67 -0
data/vendor/faiss/faiss/IndexFlatCodes.h +47 -0
data/vendor/faiss/faiss/IndexIVF.cpp +79 -7
data/vendor/faiss/faiss/IndexIVF.h +25 -7
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +316 -0
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +121 -0
data/vendor/faiss/faiss/IndexIVFFlat.cpp +9 -12
data/vendor/faiss/faiss/IndexIVFPQ.cpp +5 -4
data/vendor/faiss/faiss/IndexIVFPQ.h +1 -1
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +60 -39
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +21 -6
data/vendor/faiss/faiss/IndexLSH.cpp +4 -30
data/vendor/faiss/faiss/IndexLSH.h +2 -15
data/vendor/faiss/faiss/IndexNNDescent.cpp +0 -2
data/vendor/faiss/faiss/IndexNSG.cpp +0 -2
data/vendor/faiss/faiss/IndexPQ.cpp +2 -51
data/vendor/faiss/faiss/IndexPQ.h +2 -17
data/vendor/faiss/faiss/IndexRefine.cpp +28 -0
data/vendor/faiss/faiss/IndexRefine.h +10 -0
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -28
data/vendor/faiss/faiss/IndexScalarQuantizer.h +2 -16
data/vendor/faiss/faiss/VectorTransform.cpp +2 -1
data/vendor/faiss/faiss/VectorTransform.h +3 -0
data/vendor/faiss/faiss/clone_index.cpp +3 -2
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +2 -2
data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +257 -24
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +69 -9
data/vendor/faiss/faiss/impl/HNSW.cpp +10 -5
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +393 -210
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +100 -28
data/vendor/faiss/faiss/impl/NSG.cpp +0 -3
data/vendor/faiss/faiss/impl/NSG.h +1 -1
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +357 -47
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +65 -7
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +12 -19
data/vendor/faiss/faiss/impl/index_read.cpp +102 -19
data/vendor/faiss/faiss/impl/index_write.cpp +66 -16
data/vendor/faiss/faiss/impl/io.cpp +1 -1
data/vendor/faiss/faiss/impl/io_macros.h +20 -0
data/vendor/faiss/faiss/impl/kmeans1d.cpp +301 -0
data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
data/vendor/faiss/faiss/index_factory.cpp +585 -414
data/vendor/faiss/faiss/index_factory.h +3 -0
data/vendor/faiss/faiss/utils/distances.cpp +4 -2
data/vendor/faiss/faiss/utils/distances.h +36 -3
data/vendor/faiss/faiss/utils/distances_simd.cpp +50 -0
data/vendor/faiss/faiss/utils/utils.h +1 -1
metadata +12 -5
data/vendor/faiss/faiss/IndexResidual.cpp +0 -291

data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp CHANGED

@@ -5,9 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */
-// -*- c++ -*-
-#include <faiss/impl/FaissAssert.h>
 #include <faiss/impl/LocalSearchQuantizer.h>
 #include <cstddef>
@@ -18,6 +15,9 @@
 #include <algorithm>
+#include <faiss/Clustering.h>
+#include <faiss/impl/AuxIndexStructures.h>
+#include <faiss/impl/FaissAssert.h>
 #include <faiss/utils/distances.h>
 #include <faiss/utils/hamming.h> // BitstringWriter
 #include <faiss/utils/utils.h>
@@ -42,18 +42,6 @@ void sgetri_(
         FINTEGER* lwork,
         FINTEGER* info);
-// solves a system of linear equations
-void sgetrs_(
-        const char* trans,
-        FINTEGER* n,
-        FINTEGER* nrhs,
-        float* A,
-        FINTEGER* lda,
-        FINTEGER* ipiv,
-        float* b,
-        FINTEGER* ldb,
-        FINTEGER* info);
 // general matrix multiplication
 int sgemm_(
         const char* transa,
@@ -69,26 +57,73 @@ int sgemm_(
         float* beta,
         float* c,
         FINTEGER* ldc);
+// LU decomposition of a general matrix
+void dgetrf_(
+        FINTEGER* m,
+        FINTEGER* n,
+        double* a,
+        FINTEGER* lda,
+        FINTEGER* ipiv,
+        FINTEGER* info);
+// generate inverse of a matrix given its LU decomposition
+void dgetri_(
+        FINTEGER* n,
+        double* a,
+        FINTEGER* lda,
+        FINTEGER* ipiv,
+        double* work,
+        FINTEGER* lwork,
+        FINTEGER* info);
+// general matrix multiplication
+int dgemm_(
+        const char* transa,
+        const char* transb,
+        FINTEGER* m,
+        FINTEGER* n,
+        FINTEGER* k,
+        const double* alpha,
+        const double* a,
+        FINTEGER* lda,
+        const double* b,
+        FINTEGER* ldb,
+        double* beta,
+        double* c,
+        FINTEGER* ldc);
 }
 namespace {
+void fmat_inverse(float* a, int n) {
+    int info;
+    int lwork = n * n;
+    std::vector<int> ipiv(n);
+    std::vector<float> workspace(lwork);
+    sgetrf_(&n, &n, a, &n, ipiv.data(), &info);
+    FAISS_THROW_IF_NOT(info == 0);
+    sgetri_(&n, a, &n, ipiv.data(), workspace.data(), &lwork, &info);
+    FAISS_THROW_IF_NOT(info == 0);
+}
 // c and a and b can overlap
-void fvec_add(size_t d, const float* a, const float* b, float* c) {
+void dfvec_add(size_t d, const double* a, const float* b, double* c) {
     for (size_t i = 0; i < d; i++) {
         c[i] = a[i] + b[i];
     }
 }
-void fmat_inverse(float* a, int n) {
+void dmat_inverse(double* a, int n) {
     int info;
     int lwork = n * n;
     std::vector<int> ipiv(n);
-    std::vector<float> workspace(lwork);
+    std::vector<double> workspace(lwork);
-    sgetrf_(&n, &n, a, &n, ipiv.data(), &info);
+    dgetrf_(&n, &n, a, &n, ipiv.data(), &info);
     FAISS_THROW_IF_NOT(info == 0);
-    sgetri_(&n, a, &n, ipiv.data(), workspace.data(), &lwork, &info);
+    dgetri_(&n, a, &n, ipiv.data(), workspace.data(), &lwork, &info);
     FAISS_THROW_IF_NOT(info == 0);
 }
@@ -107,18 +142,15 @@ void random_int32(
 namespace faiss {
-LSQTimer lsq_timer;
-LocalSearchQuantizer::LocalSearchQuantizer(size_t d, size_t M, size_t nbits) {
-    FAISS_THROW_IF_NOT((M * nbits) % 8 == 0);
-    this->d = d;
-    this->M = M;
-    this->nbits = std::vector<size_t>(M, nbits);
-    // set derived values
-    set_derived_values();
+lsq::LSQTimer lsq_timer;
+using lsq::LSQTimerScope;
+LocalSearchQuantizer::LocalSearchQuantizer(
+        size_t d,
+        size_t M,
+        size_t nbits,
+        Search_type_t search_type)
+        : AdditiveQuantizer(d, std::vector<size_t>(M, nbits), search_type) {
     is_trained = false;
     verbose = false;
@@ -138,15 +170,23 @@ LocalSearchQuantizer::LocalSearchQuantizer(size_t d, size_t M, size_t nbits) {
     random_seed = 0x12345;
     std::srand(random_seed);
+    icm_encoder_factory = nullptr;
 }
+LocalSearchQuantizer::~LocalSearchQuantizer() {
+    delete icm_encoder_factory;
+}
+LocalSearchQuantizer::LocalSearchQuantizer() : LocalSearchQuantizer(0, 0, 0) {}
 void LocalSearchQuantizer::train(size_t n, const float* x) {
     FAISS_THROW_IF_NOT(K == (1 << nbits[0]));
     FAISS_THROW_IF_NOT(nperts <= M);
     lsq_timer.reset();
+    LSQTimerScope scope(&lsq_timer, "train");
     if (verbose) {
-        lsq_timer.start("train");
         printf("Training LSQ, with %zd subcodes on %zd %zdD vectors\n",
                M,
                n,
@@ -209,7 +249,7 @@ void LocalSearchQuantizer::train(size_t n, const float* x) {
         }
         // refine codes
-        icm_encode(x, codes.data(), n, train_ils_iters, gen);
+        icm_encode(codes.data(), x, n, train_ils_iters, gen);
         if (verbose) {
             float obj = evaluate(codes.data(), x, n);
@@ -217,25 +257,52 @@ void LocalSearchQuantizer::train(size_t n, const float* x) {
         }
     }
+    is_trained = true;
+    {
+        std::vector<float> x_recons(n * d);
+        std::vector<float> norms(n);
+        decode_unpacked(codes.data(), x_recons.data(), n);
+        fvec_norms_L2sqr(norms.data(), x_recons.data(), d, n);
+        norm_min = HUGE_VALF;
+        norm_max = -HUGE_VALF;
+        for (idx_t i = 0; i < n; i++) {
+            if (norms[i] < norm_min) {
+                norm_min = norms[i];
+            }
+            if (norms[i] > norm_max) {
+                norm_max = norms[i];
+            }
+        }
+        if (search_type == ST_norm_cqint8 || search_type == ST_norm_cqint4) {
+            size_t k = (1 << 8);
+            if (search_type == ST_norm_cqint4) {
+                k = (1 << 4);
+            }
+            Clustering1D clus(k);
+            clus.train_exact(n, norms.data());
+            qnorm.add(clus.k, clus.centroids.data());
+        }
+    }
     if (verbose) {
-        lsq_timer.end("train");
         float obj = evaluate(codes.data(), x, n);
+        scope.finish();
         printf("After training: obj = %lf\n", obj);
         printf("Time statistic:\n");
-        for (const auto& it : lsq_timer.duration) {
-            printf("\t%s time: %lf s\n", it.first.data(), it.second);
+        for (const auto& it : lsq_timer.t) {
+            printf("\t%s time: %lf s\n", it.first.data(), it.second / 1000);
         }
     }
-    is_trained = true;
 }
 void LocalSearchQuantizer::perturb_codebooks(
         float T,
         const std::vector<float>& stddev,
         std::mt19937& gen) {
-    lsq_timer.start("perturb_codebooks");
+    LSQTimerScope scope(&lsq_timer, "perturb_codebooks");
     std::vector<std::normal_distribution<float>> distribs;
     for (size_t i = 0; i < d; i++) {
@@ -249,8 +316,6 @@ void LocalSearchQuantizer::perturb_codebooks(
             }
         }
     }
-    lsq_timer.end("perturb_codebooks");
 }
 void LocalSearchQuantizer::compute_codes(
@@ -258,23 +323,26 @@ void LocalSearchQuantizer::compute_codes(
         uint8_t* codes_out,
         size_t n) const {
     FAISS_THROW_IF_NOT_MSG(is_trained, "LSQ is not trained yet.");
+    lsq_timer.reset();
+    LSQTimerScope scope(&lsq_timer, "encode");
     if (verbose) {
-        lsq_timer.reset();
         printf("Encoding %zd vectors...\n", n);
-        lsq_timer.start("encode");
     }
     std::vector<int32_t> codes(n * M);
     std::mt19937 gen(random_seed);
     random_int32(codes, 0, K - 1, gen);
-    icm_encode(x, codes.data(), n, encode_ils_iters, gen);
+    icm_encode(codes.data(), x, n, encode_ils_iters, gen);
     pack_codes(n, codes.data(), codes_out);
     if (verbose) {
-        lsq_timer.end("encode");
-        double t = lsq_timer.get("encode");
-        printf("Time to encode %zd vectors: %lf s\n", n, t);
+        scope.finish();
+        printf("Time statistic:\n");
+        for (const auto& it : lsq_timer.t) {
+            printf("\t%s time: %lf s\n", it.first.data(), it.second / 1000);
+        }
     }
 }
@@ -298,73 +366,144 @@ void LocalSearchQuantizer::update_codebooks(
         const float* x,
         const int32_t* codes,
         size_t n) {
-    lsq_timer.start("update_codebooks");
+    LSQTimerScope scope(&lsq_timer, "update_codebooks");
+    if (!update_codebooks_with_double) {
+        // allocate memory
+        // bb = B'B, bx = BX
+        std::vector<float> bb(M * K * M * K, 0.0f); // [M * K, M * K]
+        std::vector<float> bx(M * K * d, 0.0f);     // [M * K, d]
+        // compute B'B
+        for (size_t i = 0; i < n; i++) {
+            for (size_t m = 0; m < M; m++) {
+                int32_t code1 = codes[i * M + m];
+                int32_t idx1 = m * K + code1;
+                bb[idx1 * M * K + idx1] += 1;
+                for (size_t m2 = m + 1; m2 < M; m2++) {
+                    int32_t code2 = codes[i * M + m2];
+                    int32_t idx2 = m2 * K + code2;
+                    bb[idx1 * M * K + idx2] += 1;
+                    bb[idx2 * M * K + idx1] += 1;
+                }
+            }
+        }
-    // allocate memory
-    // bb = B'B, bx = BX
-    std::vector<float> bb(M * K * M * K, 0.0f); // [M * K, M * K]
-    std::vector<float> bx(M * K * d, 0.0f);     // [M * K, d]
+        // add a regularization term to B'B
+        for (int64_t i = 0; i < M * K; i++) {
+            bb[i * (M * K) + i] += lambd;
+        }
-    // compute B'B
-    for (size_t i = 0; i < n; i++) {
-        for (size_t m = 0; m < M; m++) {
-            int32_t code1 = codes[i * M + m];
-            int32_t idx1 = m * K + code1;
-            bb[idx1 * M * K + idx1] += 1;
-            for (size_t m2 = m + 1; m2 < M; m2++) {
-                int32_t code2 = codes[i * M + m2];
-                int32_t idx2 = m2 * K + code2;
-                bb[idx1 * M * K + idx2] += 1;
-                bb[idx2 * M * K + idx1] += 1;
+        // compute (B'B)^(-1)
+        fmat_inverse(bb.data(), M * K); // [M*K, M*K]
+        // compute BX
+        for (size_t i = 0; i < n; i++) {
+            for (size_t m = 0; m < M; m++) {
+                int32_t code = codes[i * M + m];
+                float* data = bx.data() + (m * K + code) * d;
+                fvec_add(d, data, x + i * d, data);
             }
         }
-    }
-    // add a regularization term to B'B
-    for (int64_t i = 0; i < M * K; i++) {
-        bb[i * (M * K) + i] += lambd;
-    }
+        // compute C = (B'B)^(-1) @ BX
+        //
+        // NOTE: LAPACK use column major order
+        // out = alpha * op(A) * op(B) + beta * C
+        FINTEGER nrows_A = d;
+        FINTEGER ncols_A = M * K;
+        FINTEGER nrows_B = M * K;
+        FINTEGER ncols_B = M * K;
+        float alpha = 1.0f;
+        float beta = 0.0f;
+        sgemm_("Not Transposed",
+               "Not Transposed",
+               &nrows_A, // nrows of op(A)
+               &ncols_B, // ncols of op(B)
+               &ncols_A, // ncols of op(A)
+               &alpha,
+               bx.data(),
+               &nrows_A, // nrows of A
+               bb.data(),
+               &nrows_B, // nrows of B
+               &beta,
+               codebooks.data(),
+               &nrows_A); // nrows of output
+    } else {
+        // allocate memory
+        // bb = B'B, bx = BX
+        std::vector<double> bb(M * K * M * K, 0.0f); // [M * K, M * K]
+        std::vector<double> bx(M * K * d, 0.0f);     // [M * K, d]
+        // compute B'B
+        for (size_t i = 0; i < n; i++) {
+            for (size_t m = 0; m < M; m++) {
+                int32_t code1 = codes[i * M + m];
+                int32_t idx1 = m * K + code1;
+                bb[idx1 * M * K + idx1] += 1;
+                for (size_t m2 = m + 1; m2 < M; m2++) {
+                    int32_t code2 = codes[i * M + m2];
+                    int32_t idx2 = m2 * K + code2;
+                    bb[idx1 * M * K + idx2] += 1;
+                    bb[idx2 * M * K + idx1] += 1;
+                }
+            }
+        }
-    // compute (B'B)^(-1)
-    fmat_inverse(bb.data(), M * K); // [M*K, M*K]
+        // add a regularization term to B'B
+        for (int64_t i = 0; i < M * K; i++) {
+            bb[i * (M * K) + i] += lambd;
+        }
-    // compute BX
-    for (size_t i = 0; i < n; i++) {
-        for (size_t m = 0; m < M; m++) {
-            int32_t code = codes[i * M + m];
-            float* data = bx.data() + (m * K + code) * d;
-            fvec_add(d, data, x + i * d, data);
+        // compute (B'B)^(-1)
+        dmat_inverse(bb.data(), M * K); // [M*K, M*K]
+        // compute BX
+        for (size_t i = 0; i < n; i++) {
+            for (size_t m = 0; m < M; m++) {
+                int32_t code = codes[i * M + m];
+                double* data = bx.data() + (m * K + code) * d;
+                dfvec_add(d, data, x + i * d, data);
+            }
         }
-    }
-    // compute C = (B'B)^(-1) @ BX
-    //
-    // NOTE: LAPACK use column major order
-    // out = alpha * op(A) * op(B) + beta * C
-    FINTEGER nrows_A = d;
-    FINTEGER ncols_A = M * K;
-    FINTEGER nrows_B = M * K;
-    FINTEGER ncols_B = M * K;
-    float alpha = 1.0f;
-    float beta = 0.0f;
-    sgemm_("Not Transposed",
-           "Not Transposed",
-           &nrows_A, // nrows of op(A)
-           &ncols_B, // ncols of op(B)
-           &ncols_A, // ncols of op(A)
-           &alpha,
-           bx.data(),
-           &nrows_A, // nrows of A
-           bb.data(),
-           &nrows_B, // nrows of B
-           &beta,
-           codebooks.data(),
-           &nrows_A); // nrows of output
-    lsq_timer.end("update_codebooks");
+        // compute C = (B'B)^(-1) @ BX
+        //
+        // NOTE: LAPACK use column major order
+        // out = alpha * op(A) * op(B) + beta * C
+        FINTEGER nrows_A = d;
+        FINTEGER ncols_A = M * K;
+        FINTEGER nrows_B = M * K;
+        FINTEGER ncols_B = M * K;
+        std::vector<double> d_codebooks(M * K * d);
+        double alpha = 1.0f;
+        double beta = 0.0f;
+        dgemm_("Not Transposed",
+               "Not Transposed",
+               &nrows_A, // nrows of op(A)
+               &ncols_B, // ncols of op(B)
+               &ncols_A, // ncols of op(A)
+               &alpha,
+               bx.data(),
+               &nrows_A, // nrows of A
+               bb.data(),
+               &nrows_B, // nrows of B
+               &beta,
+               d_codebooks.data(),
+               &nrows_A); // nrows of output
+        for (size_t i = 0; i < M * K * d; i++) {
+            codebooks[i] = (float)d_codebooks[i];
+        }
+    }
 }
 /** encode using iterative conditional mode
@@ -386,15 +525,23 @@ void LocalSearchQuantizer::update_codebooks(
  * These two terms can be precomputed and store in a look up table.
  */
 void LocalSearchQuantizer::icm_encode(
-        const float* x,
         int32_t* codes,
+        const float* x,
         size_t n,
         size_t ils_iters,
         std::mt19937& gen) const {
-    lsq_timer.start("icm_encode");
+    LSQTimerScope scope(&lsq_timer, "icm_encode");
+    auto factory = icm_encoder_factory;
+    std::unique_ptr<lsq::IcmEncoder> icm_encoder;
+    if (factory == nullptr) {
+        icm_encoder.reset(lsq::IcmEncoderFactory().get(this));
+    } else {
+        icm_encoder.reset(factory->get(this));
+    }
-    std::vector<float> binaries(M * M * K * K); // [M, M, K, K]
-    compute_binary_terms(binaries.data());
+    // precompute binary terms for all chunks
+    icm_encoder->set_binary_term();
     const size_t n_chunks = (n + chunk_size - 1) / chunk_size;
     for (size_t i = 0; i < n_chunks; i++) {
@@ -410,21 +557,20 @@ void LocalSearchQuantizer::icm_encode(
         const float* xi = x + i * chunk_size * d;
         int32_t* codesi = codes + i * chunk_size * M;
-        icm_encode_partial(i, xi, codesi, ni, binaries.data(), ils_iters, gen);
+        icm_encoder->verbose = (verbose && i == 0);
+        icm_encoder->encode(codesi, xi, gen, ni, ils_iters);
     }
-    lsq_timer.end("icm_encode");
 }
-void LocalSearchQuantizer::icm_encode_partial(
-        size_t index,
-        const float* x,
+void LocalSearchQuantizer::icm_encode_impl(
         int32_t* codes,
-        size_t n,
+        const float* x,
         const float* binaries,
+        std::mt19937& gen,
+        size_t n,
         size_t ils_iters,
-        std::mt19937& gen) const {
-    std::vector<float> unaries(n * M * K); // [n, M, K]
+        bool verbose) const {
+    std::vector<float> unaries(n * M * K); // [M, n, K]
     compute_unary_terms(x, unaries.data(), n);
     std::vector<int32_t> best_codes;
@@ -438,9 +584,7 @@ void LocalSearchQuantizer::icm_encode_partial(
         // add perturbation to codes
         perturb_codes(codes, n, gen);
-        for (size_t iter2 = 0; iter2 < icm_iters; iter2++) {
-            icm_encode_step(unaries.data(), binaries, codes, n);
-        }
+        icm_encode_step(codes, unaries.data(), binaries, n, icm_iters);
         std::vector<float> icm_objs(n, 0.0f);
         evaluate(codes, x, n, icm_objs.data());
@@ -463,7 +607,7 @@ void LocalSearchQuantizer::icm_encode_partial(
         memcpy(codes, best_codes.data(), sizeof(int32_t) * n * M);
-        if (verbose && index == 0) {
+        if (verbose) {
             printf("\tils_iter %zd: obj = %lf, n_betters/n = %zd/%zd\n",
                    iter1,
                    mean_obj,
@@ -474,61 +618,67 @@ void LocalSearchQuantizer::icm_encode_partial(
 }
 void LocalSearchQuantizer::icm_encode_step(
+        int32_t* codes,
         const float* unaries,
         const float* binaries,
-        int32_t* codes,
-        size_t n) const {
-    // condition on the m-th subcode
-    for (size_t m = 0; m < M; m++) {
-        std::vector<float> objs(n * K);
-#pragma omp parallel for
-        for (int64_t i = 0; i < n; i++) {
-            auto u = unaries + i * (M * K) + m * K;
-            memcpy(objs.data() + i * K, u, sizeof(float) * K);
-        }
+        size_t n,
+        size_t n_iters) const {
+    FAISS_THROW_IF_NOT(M != 0 && K != 0);
+    FAISS_THROW_IF_NOT(binaries != nullptr);
-        // compute objective function by adding unary
-        // and binary terms together
-        for (size_t other_m = 0; other_m < M; other_m++) {
-            if (other_m == m) {
-                continue;
+    for (size_t iter = 0; iter < n_iters; iter++) {
+        // condition on the m-th subcode
+        for (size_t m = 0; m < M; m++) {
+            std::vector<float> objs(n * K);
+#pragma omp parallel for
+            for (int64_t i = 0; i < n; i++) {
+                auto u = unaries + m * n * K + i * K;
+                memcpy(objs.data() + i * K, u, sizeof(float) * K);
             }
+            // compute objective function by adding unary
+            // and binary terms together
+            for (size_t other_m = 0; other_m < M; other_m++) {
+                if (other_m == m) {
+                    continue;
+                }
 #pragma omp parallel for
-            for (int64_t i = 0; i < n; i++) {
-                for (int32_t code = 0; code < K; code++) {
-                    int32_t code2 = codes[i * M + other_m];
-                    size_t binary_idx =
-                            m * M * K * K + other_m * K * K + code * K + code2;
-                    // binaries[m, other_m, code, code2]
-                    objs[i * K + code] += binaries[binary_idx];
+                for (int64_t i = 0; i < n; i++) {
+                    for (int32_t code = 0; code < K; code++) {
+                        int32_t code2 = codes[i * M + other_m];
+                        size_t binary_idx = m * M * K * K + other_m * K * K +
+                                code * K + code2;
+                        // binaries[m, other_m, code, code2]
+                        objs[i * K + code] += binaries[binary_idx];
+                    }
                 }
             }
-        }
-        // find the optimal value of the m-th subcode
+            // find the optimal value of the m-th subcode
 #pragma omp parallel for
-        for (int64_t i = 0; i < n; i++) {
-            float best_obj = HUGE_VALF;
-            int32_t best_code = 0;
-            for (size_t code = 0; code < K; code++) {
-                float obj = objs[i * K + code];
-                if (obj < best_obj) {
-                    best_obj = obj;
-                    best_code = code;
+            for (int64_t i = 0; i < n; i++) {
+                float best_obj = HUGE_VALF;
+                int32_t best_code = 0;
+                for (size_t code = 0; code < K; code++) {
+                    float obj = objs[i * K + code];
+                    if (obj < best_obj) {
+                        best_obj = obj;
+                        best_code = code;
+                    }
                 }
+                codes[i * M + m] = best_code;
             }
-            codes[i * M + m] = best_code;
-        }
-    } // loop M
+        } // loop M
+    }
 }
 void LocalSearchQuantizer::perturb_codes(
         int32_t* codes,
         size_t n,
         std::mt19937& gen) const {
-    lsq_timer.start("perturb_codes");
+    LSQTimerScope scope(&lsq_timer, "perturb_codes");
     std::uniform_int_distribution<size_t> m_distrib(0, M - 1);
     std::uniform_int_distribution<int32_t> k_distrib(0, K - 1);
@@ -539,12 +689,10 @@ void LocalSearchQuantizer::perturb_codes(
             codes[i * M + m] = k_distrib(gen);
         }
     }
-    lsq_timer.end("perturb_codes");
 }
 void LocalSearchQuantizer::compute_binary_terms(float* binaries) const {
-    lsq_timer.start("compute_binary_terms");
+    LSQTimerScope scope(&lsq_timer, "compute_binary_terms");
 #pragma omp parallel for
     for (int64_t m12 = 0; m12 < M * M; m12++) {
@@ -562,52 +710,53 @@ void LocalSearchQuantizer::compute_binary_terms(float* binaries) const {
             }
         }
     }
-    lsq_timer.end("compute_binary_terms");
 }
 void LocalSearchQuantizer::compute_unary_terms(
         const float* x,
-        float* unaries,
+        float* unaries, // [M, n, K]
         size_t n) const {
-    lsq_timer.start("compute_unary_terms");
+    LSQTimerScope scope(&lsq_timer, "compute_unary_terms");
-    // compute x * codebooks^T
+    // compute x * codebook^T for each codebook
     //
     // NOTE: LAPACK use column major order
     // out = alpha * op(A) * op(B) + beta * C
-    FINTEGER nrows_A = M * K;
-    FINTEGER ncols_A = d;
-    FINTEGER nrows_B = d;
-    FINTEGER ncols_B = n;
-    float alpha = -2.0f;
-    float beta = 0.0f;
-    sgemm_("Transposed",
-           "Not Transposed",
-           &nrows_A, // nrows of op(A)
-           &ncols_B, // ncols of op(B)
-           &ncols_A, // ncols of op(A)
-           &alpha,
-           codebooks.data(),
-           &ncols_A, // nrows of A
-           x,
-           &nrows_B, // nrows of B
-           &beta,
-           unaries,
-           &nrows_A); // nrows of output
+    for (size_t m = 0; m < M; m++) {
+        FINTEGER nrows_A = K;
+        FINTEGER ncols_A = d;
+        FINTEGER nrows_B = d;
+        FINTEGER ncols_B = n;
+        float alpha = -2.0f;
+        float beta = 0.0f;
+        sgemm_("Transposed",
+               "Not Transposed",
+               &nrows_A, // nrows of op(A)
+               &ncols_B, // ncols of op(B)
+               &ncols_A, // ncols of op(A)
+               &alpha,
+               codebooks.data() + m * K * d,
+               &ncols_A, // nrows of A
+               x,
+               &nrows_B, // nrows of B
+               &beta,
+               unaries + m * n * K,
+               &nrows_A); // nrows of output
+    }
     std::vector<float> norms(M * K);
     fvec_norms_L2sqr(norms.data(), codebooks.data(), d, M * K);
 #pragma omp parallel for
     for (int64_t i = 0; i < n; i++) {
-        float* u = unaries + i * (M * K);
-        fvec_add(M * K, u, norms.data(), u);
+        for (size_t m = 0; m < M; m++) {
+            float* u = unaries + m * n * K + i * K;
+            fvec_add(K, u, norms.data() + m * K, u);
+        }
     }
-    lsq_timer.end("compute_unary_terms");
 }
 float LocalSearchQuantizer::evaluate(
@@ -615,7 +764,7 @@ float LocalSearchQuantizer::evaluate(
         const float* x,
         size_t n,
         float* objs) const {
-    lsq_timer.start("evaluate");
+    LSQTimerScope scope(&lsq_timer, "evaluate");
     // decode
     std::vector<float> decoded_x(n * d, 0.0f);
@@ -631,7 +780,7 @@ float LocalSearchQuantizer::evaluate(
             fvec_add(d, decoded_i, c, decoded_i);
         }
-        float err = fvec_L2sqr(x + i * d, decoded_i, d);
+        float err = faiss::fvec_L2sqr(x + i * d, decoded_i, d);
         obj += err;
         if (objs) {
@@ -639,34 +788,68 @@ float LocalSearchQuantizer::evaluate(
         }
     }
-    lsq_timer.end("evaluate");
     obj = obj / n;
     return obj;
 }
-double LSQTimer::get(const std::string& name) {
-    return duration[name];
+namespace lsq {
+IcmEncoder::IcmEncoder(const LocalSearchQuantizer* lsq)
+        : verbose(false), lsq(lsq) {}
+void IcmEncoder::set_binary_term() {
+    auto M = lsq->M;
+    auto K = lsq->K;
+    binaries.resize(M * M * K * K);
+    lsq->compute_binary_terms(binaries.data());
 }
-void LSQTimer::start(const std::string& name) {
-    FAISS_THROW_IF_NOT_MSG(!started[name], " timer is already running");
-    started[name] = true;
-    t0[name] = getmillisecs();
+void IcmEncoder::encode(
+        int32_t* codes,
+        const float* x,
+        std::mt19937& gen,
+        size_t n,
+        size_t ils_iters) const {
+    lsq->icm_encode_impl(codes, x, binaries.data(), gen, n, ils_iters, verbose);
 }
-void LSQTimer::end(const std::string& name) {
-    FAISS_THROW_IF_NOT_MSG(started[name], " timer is not running");
-    double t1 = getmillisecs();
-    double sec = (t1 - t0[name]) / 1000;
-    duration[name] += sec;
-    started[name] = false;
+double LSQTimer::get(const std::string& name) {
+    if (t.count(name) == 0) {
+        return 0.0;
+    } else {
+        return t[name];
+    }
+}
+void LSQTimer::add(const std::string& name, double delta) {
+    if (t.count(name) == 0) {
+        t[name] = delta;
+    } else {
+        t[name] += delta;
+    }
 }
 void LSQTimer::reset() {
-    duration.clear();
-    t0.clear();
-    started.clear();
+    t.clear();
+}
+LSQTimerScope::LSQTimerScope(LSQTimer* timer, std::string name)
+        : timer(timer), name(name), finished(false) {
+    t0 = getmillisecs();
 }
+void LSQTimerScope::finish() {
+    if (!finished) {
+        auto delta = getmillisecs() - t0;
+        timer->add(name, delta);
+        finished = true;
+    }
+}
+LSQTimerScope::~LSQTimerScope() {
+    finish();
+}
+} // namespace lsq
 } // namespace faiss