RubyGems - faiss - Versions diffs - 0.2.0 → 0.2.1 - Mend

faiss 0.2.0 → 0.2.1

Files changed (202) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +292 -291
data/vendor/faiss/faiss/AutoTune.h +55 -56
data/vendor/faiss/faiss/Clustering.cpp +334 -195
data/vendor/faiss/faiss/Clustering.h +88 -35
data/vendor/faiss/faiss/IVFlib.cpp +171 -195
data/vendor/faiss/faiss/IVFlib.h +48 -51
data/vendor/faiss/faiss/Index.cpp +85 -103
data/vendor/faiss/faiss/Index.h +54 -48
data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
data/vendor/faiss/faiss/Index2Layer.h +22 -22
data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
data/vendor/faiss/faiss/IndexBinary.h +140 -132
data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
data/vendor/faiss/faiss/IndexFlat.h +35 -46
data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
data/vendor/faiss/faiss/IndexHNSW.h +57 -41
data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
data/vendor/faiss/faiss/IndexIVF.h +146 -113
data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
data/vendor/faiss/faiss/IndexLSH.h +21 -26
data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
data/vendor/faiss/faiss/IndexLattice.h +11 -16
data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
data/vendor/faiss/faiss/IndexNSG.h +85 -0
data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
data/vendor/faiss/faiss/IndexPQ.h +64 -67
data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
data/vendor/faiss/faiss/IndexRefine.h +22 -23
data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
data/vendor/faiss/faiss/IndexReplicas.h +62 -56
data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
data/vendor/faiss/faiss/IndexResidual.h +152 -0
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
data/vendor/faiss/faiss/IndexShards.cpp +256 -240
data/vendor/faiss/faiss/IndexShards.h +85 -73
data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
data/vendor/faiss/faiss/MatrixStats.h +7 -10
data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
data/vendor/faiss/faiss/MetaIndexes.h +40 -34
data/vendor/faiss/faiss/MetricType.h +7 -7
data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
data/vendor/faiss/faiss/VectorTransform.h +61 -89
data/vendor/faiss/faiss/clone_index.cpp +77 -73
data/vendor/faiss/faiss/clone_index.h +4 -9
data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
data/vendor/faiss/faiss/impl/FaissException.h +41 -29
data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
data/vendor/faiss/faiss/impl/HNSW.h +179 -200
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
data/vendor/faiss/faiss/impl/NSG.h +199 -0
data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
data/vendor/faiss/faiss/impl/io.cpp +75 -94
data/vendor/faiss/faiss/impl/io.h +31 -41
data/vendor/faiss/faiss/impl/io_macros.h +40 -29
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
data/vendor/faiss/faiss/index_factory.cpp +269 -218
data/vendor/faiss/faiss/index_factory.h +6 -7
data/vendor/faiss/faiss/index_io.h +23 -26
data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
data/vendor/faiss/faiss/utils/Heap.h +186 -209
data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
data/vendor/faiss/faiss/utils/distances.cpp +301 -310
data/vendor/faiss/faiss/utils/distances.h +133 -118
data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
data/vendor/faiss/faiss/utils/hamming.h +62 -85
data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
data/vendor/faiss/faiss/utils/partitioning.h +26 -21
data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
data/vendor/faiss/faiss/utils/random.cpp +39 -63
data/vendor/faiss/faiss/utils/random.h +13 -16
data/vendor/faiss/faiss/utils/simdlib.h +4 -2
data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
data/vendor/faiss/faiss/utils/utils.cpp +304 -287
data/vendor/faiss/faiss/utils/utils.h +53 -48
metadata +20 -2

data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp ADDED Viewed

@@ -0,0 +1,672 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// -*- c++ -*-
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/LocalSearchQuantizer.h>
+#include <cstddef>
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <random>
+#include <algorithm>
+#include <faiss/utils/distances.h>
+#include <faiss/utils/hamming.h> // BitstringWriter
+#include <faiss/utils/utils.h>
+extern "C" {
+// LAPACK sgetrf: LU decomposition of a general matrix
+void sgetrf_(
+        FINTEGER* m,
+        FINTEGER* n,
+        float* a,
+        FINTEGER* lda,
+        FINTEGER* ipiv,
+        FINTEGER* info);
+// LAPACK sgetri: inverse of a matrix, given its LU decomposition from sgetrf
+void sgetri_(
+        FINTEGER* n,
+        float* a,
+        FINTEGER* lda,
+        FINTEGER* ipiv,
+        float* work,
+        FINTEGER* lwork,
+        FINTEGER* info);
+// LAPACK sgetrs: solves a system of linear equations (declared here, but not called in the code visible below)
+void sgetrs_(
+        const char* trans,
+        FINTEGER* n,
+        FINTEGER* nrhs,
+        float* A,
+        FINTEGER* lda,
+        FINTEGER* ipiv,
+        float* b,
+        FINTEGER* ldb,
+        FINTEGER* info);
+// BLAS sgemm: general matrix multiplication, out = alpha * op(A) * op(B) + beta * C (column-major operands)
+int sgemm_(
+        const char* transa,
+        const char* transb,
+        FINTEGER* m,
+        FINTEGER* n,
+        FINTEGER* k,
+        const float* alpha,
+        const float* a,
+        FINTEGER* lda,
+        const float* b,
+        FINTEGER* ldb,
+        float* beta,
+        float* c,
+        FINTEGER* ldc);
+}
+namespace {
+// Element-wise sum of two d-dimensional vectors: c[i] = a[i] + b[i].
+// The output c is allowed to alias a and/or b.
+void fvec_add(size_t d, const float* a, const float* b, float* c) {
+    const float* const a_end = a + d;
+    while (a != a_end) {
+        *c++ = *a++ + *b++;
+    }
+}
+/** Invert a square matrix in place with LAPACK (sgetrf followed by sgetri).
+ *
+ * @param a  n x n matrix, overwritten with its inverse
+ * @param n  number of rows (and columns) of a
+ *
+ * Throws through FAISS_THROW_IF_NOT_MSG when either LAPACK routine reports
+ * a non-zero info code.
+ */
+void fmat_inverse(float* a, int n) {
+    // The LAPACK prototypes take FINTEGER*, so every integer argument is held
+    // in a FINTEGER instead of assuming that FINTEGER is int.
+    FINTEGER dim = n;
+    FINTEGER lwork = dim * dim;
+    FINTEGER info = 0;
+    std::vector<FINTEGER> ipiv(n);
+    std::vector<float> workspace(lwork);
+    // LU factorization, pivots are stored in ipiv
+    sgetrf_(&dim, &dim, a, &dim, ipiv.data(), &info);
+    FAISS_THROW_IF_NOT_MSG(info == 0, "sgetrf failed (LU factorization)");
+    // inverse computed from the LU factors
+    sgetri_(&dim, a, &dim, ipiv.data(), workspace.data(), &lwork, &info);
+    FAISS_THROW_IF_NOT_MSG(info == 0, "sgetri failed (matrix inversion)");
+}
+// Fill every slot of x with a draw from the uniform integer distribution over
+// the closed range [min, max], consuming gen in index order.
+void random_int32(
+        std::vector<int32_t>& x,
+        int32_t min,
+        int32_t max,
+        std::mt19937& gen) {
+    std::uniform_int_distribution<int32_t> pick(min, max);
+    for (int32_t& slot : x) {
+        slot = pick(gen);
+    }
+}
+} // anonymous namespace
+namespace faiss {
+LSQTimer lsq_timer; // namespace-scope timer used by the LocalSearchQuantizer methods below; NOTE(review): no locking is visible in this file - confirm before timing from several threads
+/** Build an untrained quantizer for d-dimensional vectors with M subcodes
+ * of nbits bits each, and install the default hyper-parameters.
+ *
+ * Throws if M * nbits is not a multiple of 8.
+ */
+LocalSearchQuantizer::LocalSearchQuantizer(size_t d, size_t M, size_t nbits) {
+    FAISS_THROW_IF_NOT((M * nbits) % 8 == 0);
+    // shape of the quantizer: every subcode uses the same number of bits
+    this->d = d;
+    this->M = M;
+    this->nbits = std::vector<size_t>(M, nbits);
+    set_derived_values();
+    // codebook size (nbits is the constructor argument here, not the member)
+    K = (1 << nbits);
+    // state flags
+    verbose = false;
+    is_trained = false;
+    // seed handed to the std::mt19937 generators created in train/compute_codes
+    random_seed = 0x12345;
+    std::srand(random_seed);
+    // training: iteration counts, temperature exponent, regularization weight
+    train_iters = 25;
+    train_ils_iters = 8;
+    p = 0.5f;
+    lambd = 1e-2f;
+    // encoding: ICM / ILS iteration counts, perturbations per vector, chunking
+    icm_iters = 4;
+    encode_ils_iters = 16;
+    nperts = 4;
+    chunk_size = 10000;
+}
+/** Train the codebooks on n vectors of dimension d stored contiguously in x.
+ *
+ * Codes start from a random assignment; every training iteration then
+ *   1. re-estimates the codebooks from x and the current codes,
+ *   2. perturbs the codebooks (SR-D) with noise scaled by a decaying
+ *      temperature and by the per-dimension standard deviation of x,
+ *   3. refines the codes with ICM.
+ *
+ * @param n  number of training vectors, must be > 0
+ * @param x  training vectors, size [n, d]
+ */
+void LocalSearchQuantizer::train(size_t n, const float* x) {
+    FAISS_THROW_IF_NOT(K == (1 << nbits[0]));
+    FAISS_THROW_IF_NOT(nperts <= M);
+    // the per-dimension mean and variance below divide by n
+    FAISS_THROW_IF_NOT_MSG(n > 0, "LSQ needs at least one training vector.");
+    lsq_timer.reset();
+    if (verbose) {
+        lsq_timer.start("train");
+        // M, n and d are unsigned sizes, hence %zu rather than %zd
+        printf("Training LSQ, with %zu subcodes on %zu %zuD vectors\n",
+               M,
+               n,
+               d);
+    }
+    // allocate memory for codebooks, size [M, K, d]
+    codebooks.resize(M * K * d);
+    // randomly initialize codes
+    std::mt19937 gen(random_seed);
+    std::vector<int32_t> codes(n * M); // [n, M]
+    random_int32(codes, 0, K - 1, gen);
+    // in verbose mode, evaluate the current codes and print the objective
+    const auto report_obj = [&](const char* label) {
+        if (verbose) {
+            float obj = evaluate(codes.data(), x, n);
+            printf("%s: obj = %lf\n", label, obj);
+        }
+    };
+    // compute the standard deviation of each dimension
+    std::vector<float> stddev(d, 0);
+#pragma omp parallel for
+    for (int64_t i = 0; i < static_cast<int64_t>(d); i++) {
+        float mean = 0;
+        for (size_t j = 0; j < n; j++) {
+            mean += x[j * d + i];
+        }
+        mean = mean / n;
+        float sum = 0;
+        for (size_t j = 0; j < n; j++) {
+            float xi = x[j * d + i] - mean;
+            sum += xi * xi;
+        }
+        stddev[i] = sqrtf(sum / n);
+    }
+    report_obj("Before training");
+    for (size_t i = 0; i < train_iters; i++) {
+        // 1. update codebooks given x and codes
+        update_codebooks(x, codes.data(), n);
+        if (verbose) {
+            printf("iter %zu:\n", i);
+        }
+        report_obj("\tafter updating codebooks");
+        // 2. SR-D: perturb codebooks, the temperature T decays to 0 with i
+        float T = pow((1.0f - (i + 1.0f) / train_iters), p);
+        perturb_codebooks(T, stddev, gen);
+        report_obj("\tafter perturbing codebooks");
+        // 3. refine codes given x and codebooks using icm
+        icm_encode(x, codes.data(), n, train_ils_iters, gen);
+        report_obj("\tafter updating codes");
+    }
+    if (verbose) {
+        lsq_timer.end("train");
+        report_obj("After training");
+        printf("Time statistic:\n");
+        for (const auto& it : lsq_timer.duration) {
+            printf("\t%s time: %lf s\n", it.first.data(), it.second);
+        }
+    }
+    is_trained = true;
+}
+/** Add Gaussian noise to every codebook entry (SR-D perturbation).
+ *
+ * Component i of each codeword receives T * N(0, stddev[i]) / M.
+ *
+ * @param T       temperature multiplying the noise
+ * @param stddev  per-dimension standard deviation, indexed by dimension
+ * @param gen     random generator the noise is drawn from
+ */
+void LocalSearchQuantizer::perturb_codebooks(
+        float T,
+        const std::vector<float>& stddev,
+        std::mt19937& gen) {
+    lsq_timer.start("perturb_codebooks");
+    // one zero-mean normal distribution per dimension
+    std::vector<std::normal_distribution<float>> noise;
+    noise.reserve(d);
+    for (size_t dim = 0; dim < d; dim++) {
+        noise.emplace_back(0.0f, stddev[dim]);
+    }
+    // codebooks is laid out as [M, K, d]: walk the M * K codewords in order
+    float* entry = codebooks.data();
+    for (size_t word = 0; word < M * K; word++) {
+        for (size_t dim = 0; dim < d; dim++, entry++) {
+            *entry += T * noise[dim](gen) / M;
+        }
+    }
+    lsq_timer.end("perturb_codebooks");
+}
+/** Encode n vectors into packed codes.
+ *
+ * Codes start from a random assignment drawn from a generator seeded with
+ * random_seed, are refined with icm_encode (encode_ils_iters rounds) and
+ * are finally written to codes_out by pack_codes.
+ *
+ * @param x          vectors to encode, size [n, d]
+ * @param codes_out  destination buffer handed to pack_codes
+ * @param n          number of vectors
+ *
+ * Throws if the quantizer has not been trained.
+ */
+void LocalSearchQuantizer::compute_codes(
+        const float* x,
+        uint8_t* codes_out,
+        size_t n) const {
+    FAISS_THROW_IF_NOT_MSG(is_trained, "LSQ is not trained yet.");
+    if (verbose) {
+        lsq_timer.reset();
+        // n is a size_t, hence %zu rather than %zd
+        printf("Encoding %zu vectors...\n", n);
+        lsq_timer.start("encode");
+    }
+    std::vector<int32_t> codes(n * M);
+    std::mt19937 gen(random_seed);
+    random_int32(codes, 0, K - 1, gen);
+    icm_encode(x, codes.data(), n, encode_ils_iters, gen);
+    pack_codes(n, codes.data(), codes_out);
+    if (verbose) {
+        lsq_timer.end("encode");
+        double t = lsq_timer.get("encode");
+        printf("Time to encode %zu vectors: %lf s\n", n, t);
+    }
+}
+/** Re-estimate the codebooks from the training vectors and their codes.
+ *
+ * Let B denote the sparse matrix of codes, size [n, M * K].
+ * Let C denote the codebooks, size [M * K, d].
+ * Let X denote the training vectors, size [n, d].
+ *
+ * Objective function:
+ *     L = (X - BC)^2
+ *
+ * L is minimized by
+ *     C = (B'B)^(-1) B'X
+ * where ' denotes the transpose.
+ *
+ * A regularization term makes B'B invertible:
+ *     C = (B'B + lambd * I)^(-1) B'X
+ */
+void LocalSearchQuantizer::update_codebooks(
+        const float* x,
+        const int32_t* codes,
+        size_t n) {
+    lsq_timer.start("update_codebooks");
+    const size_t MK = M * K; // number of rows of C
+    std::vector<float> gram(MK * MK, 0.0f); // B'B, [M * K, M * K]
+    std::vector<float> btx(MK * d, 0.0f);   // B'X, [M * K, d]
+    for (size_t i = 0; i < n; i++) {
+        const int32_t* code = codes + i * M;
+        const float* xi = x + i * d;
+        for (size_t m1 = 0; m1 < M; m1++) {
+            const int32_t row = m1 * K + code[m1];
+            // B'B: vector i adds 1 for every pair of codewords it selects
+            for (size_t m2 = 0; m2 < M; m2++) {
+                const int32_t col = m2 * K + code[m2];
+                gram[row * MK + col] += 1;
+            }
+            // B'X: vector i is accumulated into the row of each selected
+            // codeword
+            float* acc = btx.data() + row * d;
+            for (size_t j = 0; j < d; j++) {
+                acc[j] = acc[j] + xi[j];
+            }
+        }
+    }
+    // regularize the diagonal, then invert in place
+    for (size_t k = 0; k < MK; k++) {
+        gram[k * MK + k] += lambd;
+    }
+    fmat_inverse(gram.data(), MK); // [M*K, M*K]
+    // C = (B'B)^(-1) @ B'X
+    //
+    // NOTE: BLAS uses column major order, so the row-major [M * K, d]
+    // buffers are seen as [d, M * K] matrices:
+    // out = alpha * op(A) * op(B) + beta * C
+    FINTEGER dim = d;
+    FINTEGER n_words = MK;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+    sgemm_("Not Transposed",
+           "Not Transposed",
+           &dim,     // nrows of op(A)
+           &n_words, // ncols of op(B)
+           &n_words, // ncols of op(A)
+           &alpha,
+           btx.data(),
+           &dim, // nrows of A
+           gram.data(),
+           &n_words, // nrows of B
+           &beta,
+           codebooks.data(),
+           &dim); // nrows of output
+    lsq_timer.end("update_codebooks");
+}
+/** Encode using iterative conditional mode (ICM).
+ *
+ * Iterative conditional mode:
+ *     For every subcode ci (i = 1, ..., M) of a vector, the other subcodes
+ *     cj (j != i) are fixed and ci is set to the value that minimizes the
+ *     objective function.
+ * Objective function:
+ *     L = (X - \sum cj)^2, j = 1, ..., M
+ *     L = X^2 - 2X * \sum cj + (\sum cj)^2
+ *
+ * X^2 is ignored since it is the same for all possible values k of the
+ * m-th subcode.
+ *
+ * The unary term of a codeword c is -2 X * c + c^2, the binary term of a
+ * pair of codewords (c1, c2) is 2 c1 * c2. Both are precomputed and stored
+ * in look-up tables.
+ *
+ * The n vectors are processed in chunks of at most chunk_size vectors.
+ *
+ * @param x          vectors to encode, size [n, d]
+ * @param codes      initial codes on input, refined codes on output, [n, M]
+ * @param n          number of vectors
+ * @param ils_iters  number of iterated-local-search rounds per chunk
+ * @param gen        random generator used to perturb the codes
+ */
+void LocalSearchQuantizer::icm_encode(
+        const float* x,
+        int32_t* codes,
+        size_t n,
+        size_t ils_iters,
+        std::mt19937& gen) const {
+    // checked before the timer starts: chunk_size is a divisor below
+    FAISS_THROW_IF_NOT_MSG(chunk_size > 0, "chunk_size must be positive.");
+    lsq_timer.start("icm_encode");
+    std::vector<float> binaries(M * M * K * K); // [M, M, K, K]
+    compute_binary_terms(binaries.data());
+    const size_t n_chunks = (n + chunk_size - 1) / chunk_size;
+    for (size_t i = 0; i < n_chunks; i++) {
+        size_t ni = std::min(chunk_size, n - i * chunk_size);
+        if (verbose) {
+            // progress counters are size_t, hence %zu rather than %zd
+            printf("\r\ticm encoding %zu/%zu ...", i * chunk_size + ni, n);
+            fflush(stdout);
+            if (i == n_chunks - 1 || i == 0) {
+                printf("\n");
+            }
+        }
+        const float* xi = x + i * chunk_size * d;
+        int32_t* codesi = codes + i * chunk_size * M;
+        icm_encode_partial(i, xi, codesi, ni, binaries.data(), ils_iters, gen);
+    }
+    lsq_timer.end("icm_encode");
+}
+/** Run iterated local search with ICM on one chunk of vectors.
+ *
+ * Every ILS round perturbs the codes, runs icm_iters ICM steps, and keeps,
+ * for each vector, whichever of the new and the best-so-far codes has the
+ * lower reconstruction error. On return codes holds the best codes found.
+ *
+ * @param index      chunk number, progress is printed only for chunk 0
+ * @param x          vectors of the chunk, size [n, d]
+ * @param codes      codes of the chunk, updated in place, size [n, M]
+ * @param n          number of vectors in the chunk
+ * @param binaries   precomputed binary terms, size [M, M, K, K]
+ * @param ils_iters  number of ILS rounds
+ * @param gen        random generator used to perturb the codes
+ */
+void LocalSearchQuantizer::icm_encode_partial(
+        size_t index,
+        const float* x,
+        int32_t* codes,
+        size_t n,
+        const float* binaries,
+        size_t ils_iters,
+        std::mt19937& gen) const {
+    // validate before doing any work
+    FAISS_THROW_IF_NOT(nperts <= M);
+    std::vector<float> unaries(n * M * K); // [n, M, K]
+    compute_unary_terms(x, unaries.data(), n);
+    std::vector<int32_t> best_codes;
+    best_codes.assign(codes, codes + n * M);
+    std::vector<float> best_objs(n, 0.0f);
+    evaluate(codes, x, n, best_objs.data());
+    // reused across rounds: evaluate() overwrites every entry
+    std::vector<float> icm_objs(n, 0.0f);
+    for (size_t iter1 = 0; iter1 < ils_iters; iter1++) {
+        // add perturbation to codes
+        perturb_codes(codes, n, gen);
+        for (size_t iter2 = 0; iter2 < icm_iters; iter2++) {
+            icm_encode_step(unaries.data(), binaries, codes, n);
+        }
+        evaluate(codes, x, n, icm_objs.data());
+        size_t n_betters = 0;
+        float mean_obj = 0.0f;
+        // select the best code for every vector xi
+#pragma omp parallel for reduction(+ : n_betters, mean_obj)
+        for (int64_t i = 0; i < n; i++) {
+            if (icm_objs[i] < best_objs[i]) {
+                best_objs[i] = icm_objs[i];
+                memcpy(best_codes.data() + i * M,
+                       codes + i * M,
+                       sizeof(int32_t) * M);
+                n_betters += 1;
+            }
+            mean_obj += best_objs[i];
+        }
+        mean_obj /= n;
+        // the next round restarts from the best codes found so far
+        memcpy(codes, best_codes.data(), sizeof(int32_t) * n * M);
+        if (verbose && index == 0) {
+            // counters are size_t, hence %zu rather than %zd
+            printf("\tils_iter %zu: obj = %lf, n_betters/n = %zu/%zu\n",
+                   iter1,
+                   mean_obj,
+                   n_betters,
+                   n);
+        }
+    } // loop ils_iters
+}
+/** One ICM sweep: for m = 0, ..., M - 1, set the m-th subcode of every
+ * vector to the value with the lowest cost, all other subcodes being fixed.
+ *
+ * @param unaries   unary terms, size [n, M, K]
+ * @param binaries  binary terms, size [M, M, K, K]
+ * @param codes     codes updated in place, size [n, M]
+ * @param n         number of vectors
+ */
+void LocalSearchQuantizer::icm_encode_step(
+        const float* unaries,
+        const float* binaries,
+        int32_t* codes,
+        size_t n) const {
+    const size_t KK = K * K;
+    for (size_t m = 0; m < M; m++) {
+        std::vector<float> costs(n * K); // [n, K]
+        // vectors are independent of each other for a fixed m
+#pragma omp parallel for
+        for (int64_t i = 0; i < n; i++) {
+            float* cost = costs.data() + i * K;
+            int32_t* code_i = codes + i * M;
+            // start from the unary terms of the m-th subcode
+            memcpy(cost, unaries + i * (M * K) + m * K, sizeof(float) * K);
+            // add binaries[m, other, k, code_i[other]] for each other subcode
+            for (size_t other = 0; other < M; other++) {
+                if (other == m) {
+                    continue;
+                }
+                const float* pair =
+                        binaries + (m * M + other) * KK + code_i[other];
+                for (size_t k = 0; k < K; k++) {
+                    cost[k] += pair[k * K];
+                }
+            }
+            // keep the first value whose cost is strictly the smallest
+            float lowest = HUGE_VALF;
+            int32_t winner = 0;
+            for (size_t k = 0; k < K; k++) {
+                if (cost[k] < lowest) {
+                    lowest = cost[k];
+                    winner = k;
+                }
+            }
+            code_i[m] = winner;
+        }
+    } // loop M
+}
+/** Randomly overwrite subcodes: for every vector, nperts times, a subcode
+ * position drawn uniformly in [0, M - 1] is set to a value drawn uniformly
+ * in [0, K - 1]. The same position may be drawn more than once.
+ *
+ * @param codes  codes perturbed in place, size [n, M]
+ * @param n      number of vectors
+ * @param gen    random generator the draws come from
+ */
+void LocalSearchQuantizer::perturb_codes(
+        int32_t* codes,
+        size_t n,
+        std::mt19937& gen) const {
+    lsq_timer.start("perturb_codes");
+    std::uniform_int_distribution<size_t> pick_subcode(0, M - 1);
+    std::uniform_int_distribution<int32_t> pick_value(0, K - 1);
+    int32_t* row = codes;
+    for (size_t i = 0; i < n; i++, row += M) {
+        for (size_t j = 0; j < nperts; j++) {
+            // position first, then value: the draw order fixes the result
+            const size_t m = pick_subcode(gen);
+            row[m] = pick_value(gen);
+        }
+    }
+    lsq_timer.end("perturb_codes");
+}
+/** Fill the table of binary terms:
+ *     binaries[m1, m2, code1, code2] = 2 * <codebooks[m1, code1],
+ *                                          codebooks[m2, code2]>
+ *
+ * @param binaries  output table, size [M, M, K, K]
+ */
+void LocalSearchQuantizer::compute_binary_terms(float* binaries) const {
+    lsq_timer.start("compute_binary_terms");
+#pragma omp parallel for
+    for (int64_t m12 = 0; m12 < M * M; m12++) {
+        // m12 enumerates the (m1, m2) pairs of codebooks
+        const size_t m1 = m12 / M;
+        const size_t m2 = m12 % M;
+        const float* book1 = codebooks.data() + m1 * K * d;
+        const float* book2 = codebooks.data() + m2 * K * d;
+        float* table = binaries + (m1 * M + m2) * K * K; // [K, K]
+        for (size_t k1 = 0; k1 < K; k1++) {
+            const float* c1 = book1 + k1 * d;
+            for (size_t k2 = 0; k2 < K; k2++) {
+                const float* c2 = book2 + k2 * d;
+                table[k1 * K + k2] = fvec_inner_product(c1, c2, d) * 2;
+            }
+        }
+    }
+    lsq_timer.end("compute_binary_terms");
+}
+/** Fill the table of unary terms:
+ *     unaries[i, m, k] = -2 * <x[i], codebooks[m, k]> + ||codebooks[m, k]||^2
+ *
+ * @param x        vectors, size [n, d]
+ * @param unaries  output table, size [n, M, K]
+ * @param n        number of vectors
+ */
+void LocalSearchQuantizer::compute_unary_terms(
+        const float* x,
+        float* unaries,
+        size_t n) const {
+    lsq_timer.start("compute_unary_terms");
+    // unaries = -2 * x @ codebooks^T
+    //
+    // NOTE: BLAS uses column major order
+    // out = alpha * op(A) * op(B) + beta * C
+    FINTEGER n_words = M * K;
+    FINTEGER dim = d;
+    FINTEGER n_vecs = n;
+    float alpha = -2.0f;
+    float beta = 0.0f;
+    sgemm_("Transposed",
+           "Not Transposed",
+           &n_words, // nrows of op(A)
+           &n_vecs,  // ncols of op(B)
+           &dim,     // ncols of op(A)
+           &alpha,
+           codebooks.data(),
+           &dim, // nrows of A
+           x,
+           &dim, // nrows of B
+           &beta,
+           unaries,
+           &n_words); // nrows of output
+    // add the squared norm of every codeword to each row of the table
+    std::vector<float> sq_norms(M * K);
+    fvec_norms_L2sqr(sq_norms.data(), codebooks.data(), d, M * K);
+#pragma omp parallel for
+    for (int64_t i = 0; i < n; i++) {
+        float* row = unaries + i * (M * K);
+        fvec_add(M * K, row, sq_norms.data(), row);
+    }
+    lsq_timer.end("compute_unary_terms");
+}
+float LocalSearchQuantizer::evaluate(
+        const int32_t* codes,
+        const float* x,
+        size_t n,
+        float* objs) const {
+    lsq_timer.start("evaluate");
+    // decode
+    std::vector<float> decoded_x(n * d, 0.0f);
+    float obj = 0.0f;
+#pragma omp parallel for reduction(+ : obj)
+    for (int64_t i = 0; i < n; i++) {
+        const auto code = codes + i * M;
+        const auto decoded_i = decoded_x.data() + i * d;
+        for (size_t m = 0; m < M; m++) {
+            // c = codebooks[m, code[m]]
+            const auto c = codebooks.data() + m * K * d + code[m] * d;
+            fvec_add(d, decoded_i, c, decoded_i);
+        }
+        float err = fvec_L2sqr(x + i * d, decoded_i, d);
+        obj += err;
+        if (objs) {
+            objs[i] = err;
+        }
+    }
+    lsq_timer.end("evaluate");
+    obj = obj / n;
+    return obj;
+}
+// Accumulated duration, in seconds, recorded under name. The lookup goes
+// through operator[] on the duration container.
+double LSQTimer::get(const std::string& name) {
+    const double seconds = duration[name];
+    return seconds;
+}
+// Start the timer called name; throws if that timer is already running.
+void LSQTimer::start(const std::string& name) {
+    FAISS_THROW_IF_NOT_MSG(!started[name], " timer is already running");
+    // mark as running, then record the start time in milliseconds
+    started[name] = true;
+    const double now = getmillisecs();
+    t0[name] = now;
+}
+// Stop the timer called name and add the elapsed time, converted from
+// milliseconds to seconds, to its total; throws if the timer is not running.
+void LSQTimer::end(const std::string& name) {
+    FAISS_THROW_IF_NOT_MSG(started[name], " timer is not running");
+    const double now = getmillisecs();
+    const double elapsed_sec = (now - t0[name]) / 1000;
+    duration[name] += elapsed_sec;
+    started[name] = false;
+}
+// Forget every timer: running flags, start times and accumulated durations.
+void LSQTimer::reset() {
+    started.clear();
+    t0.clear();
+    duration.clear();
+}
+} // namespace faiss