RubyGems - faiss - Versions diffs - 0.2.3 → 0.2.5 - Mend

faiss 0.2.3 → 0.2.5

Files changed (189) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +9 -0
data/LICENSE.txt +1 -1
data/README.md +23 -21
data/ext/faiss/extconf.rb +11 -0
data/ext/faiss/index.cpp +4 -4
data/ext/faiss/index_binary.cpp +6 -6
data/ext/faiss/product_quantizer.cpp +4 -4
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +13 -0
data/vendor/faiss/faiss/Clustering.cpp +32 -0
data/vendor/faiss/faiss/Clustering.h +14 -0
data/vendor/faiss/faiss/IVFlib.cpp +101 -2
data/vendor/faiss/faiss/IVFlib.h +26 -2
data/vendor/faiss/faiss/Index.cpp +36 -3
data/vendor/faiss/faiss/Index.h +43 -6
data/vendor/faiss/faiss/Index2Layer.cpp +24 -93
data/vendor/faiss/faiss/Index2Layer.h +8 -17
data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +610 -0
data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +253 -0
data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +299 -0
data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +199 -0
data/vendor/faiss/faiss/IndexBinary.cpp +20 -4
data/vendor/faiss/faiss/IndexBinary.h +18 -3
data/vendor/faiss/faiss/IndexBinaryFlat.cpp +9 -2
data/vendor/faiss/faiss/IndexBinaryFlat.h +4 -2
data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -1
data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -1
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +5 -1
data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -1
data/vendor/faiss/faiss/IndexBinaryHash.cpp +17 -4
data/vendor/faiss/faiss/IndexBinaryHash.h +8 -4
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +28 -13
data/vendor/faiss/faiss/IndexBinaryIVF.h +10 -7
data/vendor/faiss/faiss/IndexFastScan.cpp +626 -0
data/vendor/faiss/faiss/IndexFastScan.h +145 -0
data/vendor/faiss/faiss/IndexFlat.cpp +52 -69
data/vendor/faiss/faiss/IndexFlat.h +16 -19
data/vendor/faiss/faiss/IndexFlatCodes.cpp +101 -0
data/vendor/faiss/faiss/IndexFlatCodes.h +59 -0
data/vendor/faiss/faiss/IndexHNSW.cpp +66 -138
data/vendor/faiss/faiss/IndexHNSW.h +4 -2
data/vendor/faiss/faiss/IndexIDMap.cpp +247 -0
data/vendor/faiss/faiss/IndexIDMap.h +107 -0
data/vendor/faiss/faiss/IndexIVF.cpp +200 -40
data/vendor/faiss/faiss/IndexIVF.h +59 -22
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +393 -0
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +183 -0
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +590 -0
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +171 -0
data/vendor/faiss/faiss/IndexIVFFastScan.cpp +1290 -0
data/vendor/faiss/faiss/IndexIVFFastScan.h +213 -0
data/vendor/faiss/faiss/IndexIVFFlat.cpp +43 -26
data/vendor/faiss/faiss/IndexIVFFlat.h +4 -2
data/vendor/faiss/faiss/IndexIVFPQ.cpp +238 -53
data/vendor/faiss/faiss/IndexIVFPQ.h +6 -2
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +23 -852
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +7 -112
data/vendor/faiss/faiss/IndexIVFPQR.cpp +3 -3
data/vendor/faiss/faiss/IndexIVFPQR.h +1 -1
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +63 -40
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +23 -7
data/vendor/faiss/faiss/IndexLSH.cpp +8 -32
data/vendor/faiss/faiss/IndexLSH.h +4 -16
data/vendor/faiss/faiss/IndexLattice.cpp +7 -1
data/vendor/faiss/faiss/IndexLattice.h +3 -1
data/vendor/faiss/faiss/IndexNNDescent.cpp +4 -5
data/vendor/faiss/faiss/IndexNNDescent.h +2 -1
data/vendor/faiss/faiss/IndexNSG.cpp +37 -5
data/vendor/faiss/faiss/IndexNSG.h +25 -1
data/vendor/faiss/faiss/IndexPQ.cpp +108 -120
data/vendor/faiss/faiss/IndexPQ.h +21 -22
data/vendor/faiss/faiss/IndexPQFastScan.cpp +15 -450
data/vendor/faiss/faiss/IndexPQFastScan.h +15 -78
data/vendor/faiss/faiss/IndexPreTransform.cpp +47 -8
data/vendor/faiss/faiss/IndexPreTransform.h +15 -3
data/vendor/faiss/faiss/IndexRefine.cpp +36 -4
data/vendor/faiss/faiss/IndexRefine.h +14 -2
data/vendor/faiss/faiss/IndexReplicas.cpp +4 -2
data/vendor/faiss/faiss/IndexReplicas.h +2 -1
data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +438 -0
data/vendor/faiss/faiss/IndexRowwiseMinMax.h +92 -0
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +28 -43
data/vendor/faiss/faiss/IndexScalarQuantizer.h +8 -23
data/vendor/faiss/faiss/IndexShards.cpp +4 -1
data/vendor/faiss/faiss/IndexShards.h +2 -1
data/vendor/faiss/faiss/MetaIndexes.cpp +5 -178
data/vendor/faiss/faiss/MetaIndexes.h +3 -81
data/vendor/faiss/faiss/VectorTransform.cpp +45 -1
data/vendor/faiss/faiss/VectorTransform.h +25 -4
data/vendor/faiss/faiss/clone_index.cpp +26 -3
data/vendor/faiss/faiss/clone_index.h +3 -0
data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +300 -0
data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +24 -0
data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +195 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2058 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +408 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2147 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +460 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +465 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +1618 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +251 -0
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +1452 -0
data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +1 -0
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +2 -6
data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
data/vendor/faiss/faiss/gpu/GpuIndex.h +28 -4
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -1
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +10 -8
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +75 -14
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +19 -32
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -31
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +22 -28
data/vendor/faiss/faiss/gpu/GpuResources.cpp +14 -0
data/vendor/faiss/faiss/gpu/GpuResources.h +16 -3
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +3 -3
data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +32 -0
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +1 -0
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +311 -75
data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +10 -0
data/vendor/faiss/faiss/gpu/test/TestUtils.h +3 -0
data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +5 -4
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +331 -29
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +110 -19
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +0 -54
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +0 -76
data/vendor/faiss/faiss/impl/DistanceComputer.h +64 -0
data/vendor/faiss/faiss/impl/HNSW.cpp +133 -32
data/vendor/faiss/faiss/impl/HNSW.h +19 -16
data/vendor/faiss/faiss/impl/IDSelector.cpp +125 -0
data/vendor/faiss/faiss/impl/IDSelector.h +135 -0
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +378 -217
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +106 -29
data/vendor/faiss/faiss/impl/LookupTableScaler.h +77 -0
data/vendor/faiss/faiss/impl/NNDescent.cpp +1 -0
data/vendor/faiss/faiss/impl/NSG.cpp +1 -4
data/vendor/faiss/faiss/impl/NSG.h +1 -1
data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +383 -0
data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +154 -0
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +225 -145
data/vendor/faiss/faiss/impl/ProductQuantizer.h +29 -10
data/vendor/faiss/faiss/impl/Quantizer.h +43 -0
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +521 -55
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +94 -16
data/vendor/faiss/faiss/impl/ResultHandler.h +96 -0
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +108 -191
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +18 -18
data/vendor/faiss/faiss/impl/index_read.cpp +338 -24
data/vendor/faiss/faiss/impl/index_write.cpp +300 -18
data/vendor/faiss/faiss/impl/io.cpp +1 -1
data/vendor/faiss/faiss/impl/io_macros.h +20 -0
data/vendor/faiss/faiss/impl/kmeans1d.cpp +303 -0
data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +56 -16
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +25 -8
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +66 -25
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +75 -27
data/vendor/faiss/faiss/index_factory.cpp +772 -412
data/vendor/faiss/faiss/index_factory.h +3 -0
data/vendor/faiss/faiss/index_io.h +5 -0
data/vendor/faiss/faiss/invlists/DirectMap.cpp +1 -0
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +4 -1
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -1
data/vendor/faiss/faiss/python/python_callbacks.cpp +27 -0
data/vendor/faiss/faiss/python/python_callbacks.h +15 -0
data/vendor/faiss/faiss/utils/Heap.h +31 -15
data/vendor/faiss/faiss/utils/distances.cpp +384 -58
data/vendor/faiss/faiss/utils/distances.h +149 -18
data/vendor/faiss/faiss/utils/distances_simd.cpp +776 -6
data/vendor/faiss/faiss/utils/extra_distances.cpp +12 -7
data/vendor/faiss/faiss/utils/extra_distances.h +3 -1
data/vendor/faiss/faiss/utils/fp16-fp16c.h +21 -0
data/vendor/faiss/faiss/utils/fp16-inl.h +101 -0
data/vendor/faiss/faiss/utils/fp16.h +11 -0
data/vendor/faiss/faiss/utils/hamming-inl.h +54 -0
data/vendor/faiss/faiss/utils/hamming.cpp +0 -48
data/vendor/faiss/faiss/utils/ordered_key_value.h +10 -0
data/vendor/faiss/faiss/utils/quantize_lut.cpp +62 -0
data/vendor/faiss/faiss/utils/quantize_lut.h +20 -0
data/vendor/faiss/faiss/utils/random.cpp +53 -0
data/vendor/faiss/faiss/utils/random.h +5 -0
data/vendor/faiss/faiss/utils/simdlib_avx2.h +4 -0
data/vendor/faiss/faiss/utils/simdlib_emulated.h +6 -1
data/vendor/faiss/faiss/utils/simdlib_neon.h +7 -2
data/vendor/faiss/faiss/utils/utils.h +1 -1
metadata +46 -5
data/vendor/faiss/faiss/IndexResidual.cpp +0 -291
data/vendor/faiss/faiss/IndexResidual.h +0 -152

data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp CHANGED Viewed

@@ -5,9 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */
-// -*- c++ -*-
-#include <faiss/impl/FaissAssert.h>
 #include <faiss/impl/LocalSearchQuantizer.h>
 #include <cstddef>
@@ -18,6 +15,8 @@
 #include <algorithm>
+#include <faiss/impl/AuxIndexStructures.h>
+#include <faiss/impl/FaissAssert.h>
 #include <faiss/utils/distances.h>
 #include <faiss/utils/hamming.h> // BitstringWriter
 #include <faiss/utils/utils.h>
@@ -42,18 +41,6 @@ void sgetri_(
         FINTEGER* lwork,
         FINTEGER* info);
-// solves a system of linear equations
-void sgetrs_(
-        const char* trans,
-        FINTEGER* n,
-        FINTEGER* nrhs,
-        float* A,
-        FINTEGER* lda,
-        FINTEGER* ipiv,
-        float* b,
-        FINTEGER* ldb,
-        FINTEGER* info);
 // general matrix multiplication
 int sgemm_(
         const char* transa,
@@ -69,26 +56,73 @@ int sgemm_(
         float* beta,
         float* c,
         FINTEGER* ldc);
+// LU decomposition of a general matrix
+void dgetrf_(
+        FINTEGER* m,
+        FINTEGER* n,
+        double* a,
+        FINTEGER* lda,
+        FINTEGER* ipiv,
+        FINTEGER* info);
+// generate inverse of a matrix given its LU decomposition
+void dgetri_(
+        FINTEGER* n,
+        double* a,
+        FINTEGER* lda,
+        FINTEGER* ipiv,
+        double* work,
+        FINTEGER* lwork,
+        FINTEGER* info);
+// general matrix multiplication
+int dgemm_(
+        const char* transa,
+        const char* transb,
+        FINTEGER* m,
+        FINTEGER* n,
+        FINTEGER* k,
+        const double* alpha,
+        const double* a,
+        FINTEGER* lda,
+        const double* b,
+        FINTEGER* ldb,
+        double* beta,
+        double* c,
+        FINTEGER* ldc);
 }
 namespace {
+void fmat_inverse(float* a, int n) {
+    int info;
+    int lwork = n * n;
+    std::vector<int> ipiv(n);
+    std::vector<float> workspace(lwork);
+    sgetrf_(&n, &n, a, &n, ipiv.data(), &info);
+    FAISS_THROW_IF_NOT(info == 0);
+    sgetri_(&n, a, &n, ipiv.data(), workspace.data(), &lwork, &info);
+    FAISS_THROW_IF_NOT(info == 0);
+}
 // c and a and b can overlap
-void fvec_add(size_t d, const float* a, const float* b, float* c) {
+void dfvec_add(size_t d, const double* a, const float* b, double* c) {
     for (size_t i = 0; i < d; i++) {
         c[i] = a[i] + b[i];
     }
 }
-void fmat_inverse(float* a, int n) {
+void dmat_inverse(double* a, int n) {
     int info;
     int lwork = n * n;
     std::vector<int> ipiv(n);
-    std::vector<float> workspace(lwork);
+    std::vector<double> workspace(lwork);
-    sgetrf_(&n, &n, a, &n, ipiv.data(), &info);
+    dgetrf_(&n, &n, a, &n, ipiv.data(), &info);
     FAISS_THROW_IF_NOT(info == 0);
-    sgetri_(&n, a, &n, ipiv.data(), workspace.data(), &lwork, &info);
+    dgetri_(&n, a, &n, ipiv.data(), workspace.data(), &lwork, &info);
     FAISS_THROW_IF_NOT(info == 0);
 }
@@ -107,21 +141,15 @@ void random_int32(
 namespace faiss {
-LSQTimer lsq_timer;
-LocalSearchQuantizer::LocalSearchQuantizer(size_t d, size_t M, size_t nbits) {
-    FAISS_THROW_IF_NOT((M * nbits) % 8 == 0);
-    this->d = d;
-    this->M = M;
-    this->nbits = std::vector<size_t>(M, nbits);
-    // set derived values
-    set_derived_values();
-    is_trained = false;
-    verbose = false;
+lsq::LSQTimer lsq_timer;
+using lsq::LSQTimerScope;
+LocalSearchQuantizer::LocalSearchQuantizer(
+        size_t d,
+        size_t M,
+        size_t nbits,
+        Search_type_t search_type)
+        : AdditiveQuantizer(d, std::vector<size_t>(M, nbits), search_type) {
     K = (1 << nbits);
     train_iters = 25;
@@ -138,15 +166,23 @@ LocalSearchQuantizer::LocalSearchQuantizer(size_t d, size_t M, size_t nbits) {
     random_seed = 0x12345;
     std::srand(random_seed);
+    icm_encoder_factory = nullptr;
+}
+LocalSearchQuantizer::~LocalSearchQuantizer() {
+    delete icm_encoder_factory;
 }
+LocalSearchQuantizer::LocalSearchQuantizer() : LocalSearchQuantizer(0, 0, 0) {}
 void LocalSearchQuantizer::train(size_t n, const float* x) {
     FAISS_THROW_IF_NOT(K == (1 << nbits[0]));
-    FAISS_THROW_IF_NOT(nperts <= M);
+    nperts = std::min(nperts, M);
     lsq_timer.reset();
+    LSQTimerScope scope(&lsq_timer, "train");
     if (verbose) {
-        lsq_timer.start("train");
         printf("Training LSQ, with %zd subcodes on %zd %zdD vectors\n",
                M,
                n,
@@ -209,7 +245,7 @@ void LocalSearchQuantizer::train(size_t n, const float* x) {
         }
         // refine codes
-        icm_encode(x, codes.data(), n, train_ils_iters, gen);
+        icm_encode(codes.data(), x, n, train_ils_iters, gen);
         if (verbose) {
             float obj = evaluate(codes.data(), x, n);
@@ -217,25 +253,33 @@ void LocalSearchQuantizer::train(size_t n, const float* x) {
         }
     }
+    is_trained = true;
+    {
+        std::vector<float> x_recons(n * d);
+        std::vector<float> norms(n);
+        decode_unpacked(codes.data(), x_recons.data(), n);
+        fvec_norms_L2sqr(norms.data(), x_recons.data(), d, n);
+        train_norm(n, norms.data());
+    }
     if (verbose) {
-        lsq_timer.end("train");
         float obj = evaluate(codes.data(), x, n);
+        scope.finish();
         printf("After training: obj = %lf\n", obj);
         printf("Time statistic:\n");
-        for (const auto& it : lsq_timer.duration) {
-            printf("\t%s time: %lf s\n", it.first.data(), it.second);
+        for (const auto& it : lsq_timer.t) {
+            printf("\t%s time: %lf s\n", it.first.data(), it.second / 1000);
         }
     }
-    is_trained = true;
 }
 void LocalSearchQuantizer::perturb_codebooks(
         float T,
         const std::vector<float>& stddev,
         std::mt19937& gen) {
-    lsq_timer.start("perturb_codebooks");
+    LSQTimerScope scope(&lsq_timer, "perturb_codebooks");
     std::vector<std::normal_distribution<float>> distribs;
     for (size_t i = 0; i < d; i++) {
@@ -249,32 +293,34 @@ void LocalSearchQuantizer::perturb_codebooks(
             }
         }
     }
-    lsq_timer.end("perturb_codebooks");
 }
-void LocalSearchQuantizer::compute_codes(
+void LocalSearchQuantizer::compute_codes_add_centroids(
         const float* x,
         uint8_t* codes_out,
-        size_t n) const {
+        size_t n,
+        const float* centroids) const {
     FAISS_THROW_IF_NOT_MSG(is_trained, "LSQ is not trained yet.");
+    lsq_timer.reset();
+    LSQTimerScope scope(&lsq_timer, "encode");
     if (verbose) {
-        lsq_timer.reset();
         printf("Encoding %zd vectors...\n", n);
-        lsq_timer.start("encode");
     }
     std::vector<int32_t> codes(n * M);
     std::mt19937 gen(random_seed);
     random_int32(codes, 0, K - 1, gen);
-    icm_encode(x, codes.data(), n, encode_ils_iters, gen);
-    pack_codes(n, codes.data(), codes_out);
+    icm_encode(codes.data(), x, n, encode_ils_iters, gen);
+    pack_codes(n, codes.data(), codes_out, -1, nullptr, centroids);
     if (verbose) {
-        lsq_timer.end("encode");
-        double t = lsq_timer.get("encode");
-        printf("Time to encode %zd vectors: %lf s\n", n, t);
+        scope.finish();
+        printf("Time statistic:\n");
+        for (const auto& it : lsq_timer.t) {
+            printf("\t%s time: %lf s\n", it.first.data(), it.second / 1000);
+        }
     }
 }
@@ -298,73 +344,144 @@ void LocalSearchQuantizer::update_codebooks(
         const float* x,
         const int32_t* codes,
         size_t n) {
-    lsq_timer.start("update_codebooks");
+    LSQTimerScope scope(&lsq_timer, "update_codebooks");
+    if (!update_codebooks_with_double) {
+        // allocate memory
+        // bb = B'B, bx = BX
+        std::vector<float> bb(M * K * M * K, 0.0f); // [M * K, M * K]
+        std::vector<float> bx(M * K * d, 0.0f);     // [M * K, d]
+        // compute B'B
+        for (size_t i = 0; i < n; i++) {
+            for (size_t m = 0; m < M; m++) {
+                int32_t code1 = codes[i * M + m];
+                int32_t idx1 = m * K + code1;
+                bb[idx1 * M * K + idx1] += 1;
+                for (size_t m2 = m + 1; m2 < M; m2++) {
+                    int32_t code2 = codes[i * M + m2];
+                    int32_t idx2 = m2 * K + code2;
+                    bb[idx1 * M * K + idx2] += 1;
+                    bb[idx2 * M * K + idx1] += 1;
+                }
+            }
+        }
-    // allocate memory
-    // bb = B'B, bx = BX
-    std::vector<float> bb(M * K * M * K, 0.0f); // [M * K, M * K]
-    std::vector<float> bx(M * K * d, 0.0f);     // [M * K, d]
+        // add a regularization term to B'B
+        for (int64_t i = 0; i < M * K; i++) {
+            bb[i * (M * K) + i] += lambd;
+        }
-    // compute B'B
-    for (size_t i = 0; i < n; i++) {
-        for (size_t m = 0; m < M; m++) {
-            int32_t code1 = codes[i * M + m];
-            int32_t idx1 = m * K + code1;
-            bb[idx1 * M * K + idx1] += 1;
-            for (size_t m2 = m + 1; m2 < M; m2++) {
-                int32_t code2 = codes[i * M + m2];
-                int32_t idx2 = m2 * K + code2;
-                bb[idx1 * M * K + idx2] += 1;
-                bb[idx2 * M * K + idx1] += 1;
+        // compute (B'B)^(-1)
+        fmat_inverse(bb.data(), M * K); // [M*K, M*K]
+        // compute BX
+        for (size_t i = 0; i < n; i++) {
+            for (size_t m = 0; m < M; m++) {
+                int32_t code = codes[i * M + m];
+                float* data = bx.data() + (m * K + code) * d;
+                fvec_add(d, data, x + i * d, data);
             }
         }
-    }
-    // add a regularization term to B'B
-    for (int64_t i = 0; i < M * K; i++) {
-        bb[i * (M * K) + i] += lambd;
-    }
+        // compute C = (B'B)^(-1) @ BX
+        //
+        // NOTE: LAPACK use column major order
+        // out = alpha * op(A) * op(B) + beta * C
+        FINTEGER nrows_A = d;
+        FINTEGER ncols_A = M * K;
+        FINTEGER nrows_B = M * K;
+        FINTEGER ncols_B = M * K;
+        float alpha = 1.0f;
+        float beta = 0.0f;
+        sgemm_("Not Transposed",
+               "Not Transposed",
+               &nrows_A, // nrows of op(A)
+               &ncols_B, // ncols of op(B)
+               &ncols_A, // ncols of op(A)
+               &alpha,
+               bx.data(),
+               &nrows_A, // nrows of A
+               bb.data(),
+               &nrows_B, // nrows of B
+               &beta,
+               codebooks.data(),
+               &nrows_A); // nrows of output
+    } else {
+        // allocate memory
+        // bb = B'B, bx = BX
+        std::vector<double> bb(M * K * M * K, 0.0f); // [M * K, M * K]
+        std::vector<double> bx(M * K * d, 0.0f);     // [M * K, d]
+        // compute B'B
+        for (size_t i = 0; i < n; i++) {
+            for (size_t m = 0; m < M; m++) {
+                int32_t code1 = codes[i * M + m];
+                int32_t idx1 = m * K + code1;
+                bb[idx1 * M * K + idx1] += 1;
+                for (size_t m2 = m + 1; m2 < M; m2++) {
+                    int32_t code2 = codes[i * M + m2];
+                    int32_t idx2 = m2 * K + code2;
+                    bb[idx1 * M * K + idx2] += 1;
+                    bb[idx2 * M * K + idx1] += 1;
+                }
+            }
+        }
-    // compute (B'B)^(-1)
-    fmat_inverse(bb.data(), M * K); // [M*K, M*K]
+        // add a regularization term to B'B
+        for (int64_t i = 0; i < M * K; i++) {
+            bb[i * (M * K) + i] += lambd;
+        }
-    // compute BX
-    for (size_t i = 0; i < n; i++) {
-        for (size_t m = 0; m < M; m++) {
-            int32_t code = codes[i * M + m];
-            float* data = bx.data() + (m * K + code) * d;
-            fvec_add(d, data, x + i * d, data);
+        // compute (B'B)^(-1)
+        dmat_inverse(bb.data(), M * K); // [M*K, M*K]
+        // compute BX
+        for (size_t i = 0; i < n; i++) {
+            for (size_t m = 0; m < M; m++) {
+                int32_t code = codes[i * M + m];
+                double* data = bx.data() + (m * K + code) * d;
+                dfvec_add(d, data, x + i * d, data);
+            }
         }
-    }
-    // compute C = (B'B)^(-1) @ BX
-    //
-    // NOTE: LAPACK use column major order
-    // out = alpha * op(A) * op(B) + beta * C
-    FINTEGER nrows_A = d;
-    FINTEGER ncols_A = M * K;
-    FINTEGER nrows_B = M * K;
-    FINTEGER ncols_B = M * K;
-    float alpha = 1.0f;
-    float beta = 0.0f;
-    sgemm_("Not Transposed",
-           "Not Transposed",
-           &nrows_A, // nrows of op(A)
-           &ncols_B, // ncols of op(B)
-           &ncols_A, // ncols of op(A)
-           &alpha,
-           bx.data(),
-           &nrows_A, // nrows of A
-           bb.data(),
-           &nrows_B, // nrows of B
-           &beta,
-           codebooks.data(),
-           &nrows_A); // nrows of output
-    lsq_timer.end("update_codebooks");
+        // compute C = (B'B)^(-1) @ BX
+        //
+        // NOTE: LAPACK use column major order
+        // out = alpha * op(A) * op(B) + beta * C
+        FINTEGER nrows_A = d;
+        FINTEGER ncols_A = M * K;
+        FINTEGER nrows_B = M * K;
+        FINTEGER ncols_B = M * K;
+        std::vector<double> d_codebooks(M * K * d);
+        double alpha = 1.0f;
+        double beta = 0.0f;
+        dgemm_("Not Transposed",
+               "Not Transposed",
+               &nrows_A, // nrows of op(A)
+               &ncols_B, // ncols of op(B)
+               &ncols_A, // ncols of op(A)
+               &alpha,
+               bx.data(),
+               &nrows_A, // nrows of A
+               bb.data(),
+               &nrows_B, // nrows of B
+               &beta,
+               d_codebooks.data(),
+               &nrows_A); // nrows of output
+        for (size_t i = 0; i < M * K * d; i++) {
+            codebooks[i] = (float)d_codebooks[i];
+        }
+    }
 }
 /** encode using iterative conditional mode
@@ -386,15 +503,23 @@ void LocalSearchQuantizer::update_codebooks(
  * These two terms can be precomputed and stored in a lookup table.
  */
 void LocalSearchQuantizer::icm_encode(
-        const float* x,
         int32_t* codes,
+        const float* x,
         size_t n,
         size_t ils_iters,
         std::mt19937& gen) const {
-    lsq_timer.start("icm_encode");
+    LSQTimerScope scope(&lsq_timer, "icm_encode");
+    auto factory = icm_encoder_factory;
+    std::unique_ptr<lsq::IcmEncoder> icm_encoder;
+    if (factory == nullptr) {
+        icm_encoder.reset(lsq::IcmEncoderFactory().get(this));
+    } else {
+        icm_encoder.reset(factory->get(this));
+    }
-    std::vector<float> binaries(M * M * K * K); // [M, M, K, K]
-    compute_binary_terms(binaries.data());
+    // precompute binary terms for all chunks
+    icm_encoder->set_binary_term();
     const size_t n_chunks = (n + chunk_size - 1) / chunk_size;
     for (size_t i = 0; i < n_chunks; i++) {
@@ -410,21 +535,20 @@ void LocalSearchQuantizer::icm_encode(
         const float* xi = x + i * chunk_size * d;
         int32_t* codesi = codes + i * chunk_size * M;
-        icm_encode_partial(i, xi, codesi, ni, binaries.data(), ils_iters, gen);
+        icm_encoder->verbose = (verbose && i == 0);
+        icm_encoder->encode(codesi, xi, gen, ni, ils_iters);
     }
-    lsq_timer.end("icm_encode");
 }
-void LocalSearchQuantizer::icm_encode_partial(
-        size_t index,
-        const float* x,
+void LocalSearchQuantizer::icm_encode_impl(
         int32_t* codes,
-        size_t n,
+        const float* x,
         const float* binaries,
+        std::mt19937& gen,
+        size_t n,
         size_t ils_iters,
-        std::mt19937& gen) const {
-    std::vector<float> unaries(n * M * K); // [n, M, K]
+        bool verbose) const {
+    std::vector<float> unaries(n * M * K); // [M, n, K]
     compute_unary_terms(x, unaries.data(), n);
     std::vector<int32_t> best_codes;
@@ -438,9 +562,7 @@ void LocalSearchQuantizer::icm_encode_partial(
         // add perturbation to codes
         perturb_codes(codes, n, gen);
-        for (size_t iter2 = 0; iter2 < icm_iters; iter2++) {
-            icm_encode_step(unaries.data(), binaries, codes, n);
-        }
+        icm_encode_step(codes, unaries.data(), binaries, n, icm_iters);
         std::vector<float> icm_objs(n, 0.0f);
         evaluate(codes, x, n, icm_objs.data());
@@ -463,7 +585,7 @@ void LocalSearchQuantizer::icm_encode_partial(
         memcpy(codes, best_codes.data(), sizeof(int32_t) * n * M);
-        if (verbose && index == 0) {
+        if (verbose) {
             printf("\tils_iter %zd: obj = %lf, n_betters/n = %zd/%zd\n",
                    iter1,
                    mean_obj,
@@ -474,61 +596,67 @@ void LocalSearchQuantizer::icm_encode_partial(
 }
 void LocalSearchQuantizer::icm_encode_step(
+        int32_t* codes,
         const float* unaries,
         const float* binaries,
-        int32_t* codes,
-        size_t n) const {
-    // condition on the m-th subcode
-    for (size_t m = 0; m < M; m++) {
-        std::vector<float> objs(n * K);
-#pragma omp parallel for
-        for (int64_t i = 0; i < n; i++) {
-            auto u = unaries + i * (M * K) + m * K;
-            memcpy(objs.data() + i * K, u, sizeof(float) * K);
-        }
+        size_t n,
+        size_t n_iters) const {
+    FAISS_THROW_IF_NOT(M != 0 && K != 0);
+    FAISS_THROW_IF_NOT(binaries != nullptr);
-        // compute objective function by adding unary
-        // and binary terms together
-        for (size_t other_m = 0; other_m < M; other_m++) {
-            if (other_m == m) {
-                continue;
+    for (size_t iter = 0; iter < n_iters; iter++) {
+        // condition on the m-th subcode
+        for (size_t m = 0; m < M; m++) {
+            std::vector<float> objs(n * K);
+#pragma omp parallel for
+            for (int64_t i = 0; i < n; i++) {
+                auto u = unaries + m * n * K + i * K;
+                memcpy(objs.data() + i * K, u, sizeof(float) * K);
             }
+            // compute objective function by adding unary
+            // and binary terms together
+            for (size_t other_m = 0; other_m < M; other_m++) {
+                if (other_m == m) {
+                    continue;
+                }
 #pragma omp parallel for
-            for (int64_t i = 0; i < n; i++) {
-                for (int32_t code = 0; code < K; code++) {
-                    int32_t code2 = codes[i * M + other_m];
-                    size_t binary_idx =
-                            m * M * K * K + other_m * K * K + code * K + code2;
-                    // binaries[m, other_m, code, code2]
-                    objs[i * K + code] += binaries[binary_idx];
+                for (int64_t i = 0; i < n; i++) {
+                    for (int32_t code = 0; code < K; code++) {
+                        int32_t code2 = codes[i * M + other_m];
+                        size_t binary_idx = m * M * K * K + other_m * K * K +
+                                code * K + code2;
+                        // binaries[m, other_m, code, code2]
+                        objs[i * K + code] += binaries[binary_idx];
+                    }
                 }
             }
-        }
-        // find the optimal value of the m-th subcode
+            // find the optimal value of the m-th subcode
 #pragma omp parallel for
-        for (int64_t i = 0; i < n; i++) {
-            float best_obj = HUGE_VALF;
-            int32_t best_code = 0;
-            for (size_t code = 0; code < K; code++) {
-                float obj = objs[i * K + code];
-                if (obj < best_obj) {
-                    best_obj = obj;
-                    best_code = code;
+            for (int64_t i = 0; i < n; i++) {
+                float best_obj = HUGE_VALF;
+                int32_t best_code = 0;
+                for (size_t code = 0; code < K; code++) {
+                    float obj = objs[i * K + code];
+                    if (obj < best_obj) {
+                        best_obj = obj;
+                        best_code = code;
+                    }
                 }
+                codes[i * M + m] = best_code;
             }
-            codes[i * M + m] = best_code;
-        }
-    } // loop M
+        } // loop M
+    }
 }
 void LocalSearchQuantizer::perturb_codes(
         int32_t* codes,
         size_t n,
         std::mt19937& gen) const {
-    lsq_timer.start("perturb_codes");
+    LSQTimerScope scope(&lsq_timer, "perturb_codes");
     std::uniform_int_distribution<size_t> m_distrib(0, M - 1);
     std::uniform_int_distribution<int32_t> k_distrib(0, K - 1);
@@ -539,12 +667,10 @@ void LocalSearchQuantizer::perturb_codes(
             codes[i * M + m] = k_distrib(gen);
         }
     }
-    lsq_timer.end("perturb_codes");
 }
 void LocalSearchQuantizer::compute_binary_terms(float* binaries) const {
-    lsq_timer.start("compute_binary_terms");
+    LSQTimerScope scope(&lsq_timer, "compute_binary_terms");
 #pragma omp parallel for
     for (int64_t m12 = 0; m12 < M * M; m12++) {
@@ -562,52 +688,53 @@ void LocalSearchQuantizer::compute_binary_terms(float* binaries) const {
             }
         }
     }
-    lsq_timer.end("compute_binary_terms");
 }
 void LocalSearchQuantizer::compute_unary_terms(
         const float* x,
-        float* unaries,
+        float* unaries, // [M, n, K]
         size_t n) const {
-    lsq_timer.start("compute_unary_terms");
+    LSQTimerScope scope(&lsq_timer, "compute_unary_terms");
-    // compute x * codebooks^T
+    // compute x * codebook^T for each codebook
     //
     // NOTE: LAPACK use column major order
     // out = alpha * op(A) * op(B) + beta * C
-    FINTEGER nrows_A = M * K;
-    FINTEGER ncols_A = d;
-    FINTEGER nrows_B = d;
-    FINTEGER ncols_B = n;
-    float alpha = -2.0f;
-    float beta = 0.0f;
-    sgemm_("Transposed",
-           "Not Transposed",
-           &nrows_A, // nrows of op(A)
-           &ncols_B, // ncols of op(B)
-           &ncols_A, // ncols of op(A)
-           &alpha,
-           codebooks.data(),
-           &ncols_A, // nrows of A
-           x,
-           &nrows_B, // nrows of B
-           &beta,
-           unaries,
-           &nrows_A); // nrows of output
+    for (size_t m = 0; m < M; m++) {
+        FINTEGER nrows_A = K;
+        FINTEGER ncols_A = d;
+        FINTEGER nrows_B = d;
+        FINTEGER ncols_B = n;
+        float alpha = -2.0f;
+        float beta = 0.0f;
+        sgemm_("Transposed",
+               "Not Transposed",
+               &nrows_A, // nrows of op(A)
+               &ncols_B, // ncols of op(B)
+               &ncols_A, // ncols of op(A)
+               &alpha,
+               codebooks.data() + m * K * d,
+               &ncols_A, // nrows of A
+               x,
+               &nrows_B, // nrows of B
+               &beta,
+               unaries + m * n * K,
+               &nrows_A); // nrows of output
+    }
     std::vector<float> norms(M * K);
     fvec_norms_L2sqr(norms.data(), codebooks.data(), d, M * K);
 #pragma omp parallel for
     for (int64_t i = 0; i < n; i++) {
-        float* u = unaries + i * (M * K);
-        fvec_add(M * K, u, norms.data(), u);
+        for (size_t m = 0; m < M; m++) {
+            float* u = unaries + m * n * K + i * K;
+            fvec_add(K, u, norms.data() + m * K, u);
+        }
     }
-    lsq_timer.end("compute_unary_terms");
 }
 float LocalSearchQuantizer::evaluate(
@@ -615,7 +742,7 @@ float LocalSearchQuantizer::evaluate(
         const float* x,
         size_t n,
         float* objs) const {
-    lsq_timer.start("evaluate");
+    LSQTimerScope scope(&lsq_timer, "evaluate");
     // decode
     std::vector<float> decoded_x(n * d, 0.0f);
@@ -631,7 +758,7 @@ float LocalSearchQuantizer::evaluate(
             fvec_add(d, decoded_i, c, decoded_i);
         }
-        float err = fvec_L2sqr(x + i * d, decoded_i, d);
+        float err = faiss::fvec_L2sqr(x + i * d, decoded_i, d);
         obj += err;
         if (objs) {
@@ -639,34 +766,68 @@ float LocalSearchQuantizer::evaluate(
         }
     }
-    lsq_timer.end("evaluate");
     obj = obj / n;
     return obj;
 }
-double LSQTimer::get(const std::string& name) {
-    return duration[name];
+namespace lsq {
+IcmEncoder::IcmEncoder(const LocalSearchQuantizer* lsq)
+        : verbose(false), lsq(lsq) {}
+void IcmEncoder::set_binary_term() {
+    auto M = lsq->M;
+    auto K = lsq->K;
+    binaries.resize(M * M * K * K);
+    lsq->compute_binary_terms(binaries.data());
 }
-void LSQTimer::start(const std::string& name) {
-    FAISS_THROW_IF_NOT_MSG(!started[name], " timer is already running");
-    started[name] = true;
-    t0[name] = getmillisecs();
+void IcmEncoder::encode(
+        int32_t* codes,
+        const float* x,
+        std::mt19937& gen,
+        size_t n,
+        size_t ils_iters) const {
+    lsq->icm_encode_impl(codes, x, binaries.data(), gen, n, ils_iters, verbose);
 }
-void LSQTimer::end(const std::string& name) {
-    FAISS_THROW_IF_NOT_MSG(started[name], " timer is not running");
-    double t1 = getmillisecs();
-    double sec = (t1 - t0[name]) / 1000;
-    duration[name] += sec;
-    started[name] = false;
+double LSQTimer::get(const std::string& name) {
+    if (t.count(name) == 0) {
+        return 0.0;
+    } else {
+        return t[name];
+    }
+}
+void LSQTimer::add(const std::string& name, double delta) {
+    if (t.count(name) == 0) {
+        t[name] = delta;
+    } else {
+        t[name] += delta;
+    }
 }
 void LSQTimer::reset() {
-    duration.clear();
-    t0.clear();
-    started.clear();
+    t.clear();
+}
+LSQTimerScope::LSQTimerScope(LSQTimer* timer, std::string name)
+        : timer(timer), name(name), finished(false) {
+    t0 = getmillisecs();
 }
+void LSQTimerScope::finish() {
+    if (!finished) {
+        auto delta = getmillisecs() - t0;
+        timer->add(name, delta);
+        finished = true;
+    }
+}
+LSQTimerScope::~LSQTimerScope() {
+    finish();
+}
+} // namespace lsq
 } // namespace faiss