RubyGems - faiss - Versions diffs - 0.2.0 → 0.2.4 - Mend

faiss 0.2.0 → 0.2.4

Files changed (215) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +16 -0
data/LICENSE.txt +1 -1
data/README.md +7 -7
data/ext/faiss/extconf.rb +6 -3
data/ext/faiss/numo.hpp +4 -4
data/ext/faiss/utils.cpp +1 -1
data/ext/faiss/utils.h +1 -1
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +292 -291
data/vendor/faiss/faiss/AutoTune.h +55 -56
data/vendor/faiss/faiss/Clustering.cpp +365 -194
data/vendor/faiss/faiss/Clustering.h +102 -35
data/vendor/faiss/faiss/IVFlib.cpp +171 -195
data/vendor/faiss/faiss/IVFlib.h +48 -51
data/vendor/faiss/faiss/Index.cpp +85 -103
data/vendor/faiss/faiss/Index.h +54 -48
data/vendor/faiss/faiss/Index2Layer.cpp +126 -224
data/vendor/faiss/faiss/Index2Layer.h +22 -36
data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +407 -0
data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +195 -0
data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
data/vendor/faiss/faiss/IndexBinary.h +140 -132
data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
data/vendor/faiss/faiss/IndexFlat.cpp +115 -176
data/vendor/faiss/faiss/IndexFlat.h +42 -59
data/vendor/faiss/faiss/IndexFlatCodes.cpp +67 -0
data/vendor/faiss/faiss/IndexFlatCodes.h +47 -0
data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
data/vendor/faiss/faiss/IndexHNSW.h +57 -41
data/vendor/faiss/faiss/IndexIVF.cpp +545 -453
data/vendor/faiss/faiss/IndexIVF.h +169 -118
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +316 -0
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +121 -0
data/vendor/faiss/faiss/IndexIVFFlat.cpp +247 -252
data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
data/vendor/faiss/faiss/IndexIVFPQ.cpp +459 -517
data/vendor/faiss/faiss/IndexIVFPQ.h +75 -67
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +163 -150
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +38 -25
data/vendor/faiss/faiss/IndexLSH.cpp +66 -113
data/vendor/faiss/faiss/IndexLSH.h +20 -38
data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
data/vendor/faiss/faiss/IndexLattice.h +11 -16
data/vendor/faiss/faiss/IndexNNDescent.cpp +229 -0
data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
data/vendor/faiss/faiss/IndexNSG.cpp +301 -0
data/vendor/faiss/faiss/IndexNSG.h +85 -0
data/vendor/faiss/faiss/IndexPQ.cpp +387 -495
data/vendor/faiss/faiss/IndexPQ.h +64 -82
data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
data/vendor/faiss/faiss/IndexRefine.cpp +139 -127
data/vendor/faiss/faiss/IndexRefine.h +32 -23
data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
data/vendor/faiss/faiss/IndexReplicas.h +62 -56
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +111 -172
data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -59
data/vendor/faiss/faiss/IndexShards.cpp +256 -240
data/vendor/faiss/faiss/IndexShards.h +85 -73
data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
data/vendor/faiss/faiss/MatrixStats.h +7 -10
data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
data/vendor/faiss/faiss/MetaIndexes.h +40 -34
data/vendor/faiss/faiss/MetricType.h +7 -7
data/vendor/faiss/faiss/VectorTransform.cpp +654 -475
data/vendor/faiss/faiss/VectorTransform.h +64 -89
data/vendor/faiss/faiss/clone_index.cpp +78 -73
data/vendor/faiss/faiss/clone_index.h +4 -9
data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +198 -171
data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +503 -0
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +175 -0
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
data/vendor/faiss/faiss/impl/FaissException.h +41 -29
data/vendor/faiss/faiss/impl/HNSW.cpp +606 -617
data/vendor/faiss/faiss/impl/HNSW.h +179 -200
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +855 -0
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +244 -0
data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
data/vendor/faiss/faiss/impl/NSG.cpp +679 -0
data/vendor/faiss/faiss/impl/NSG.h +199 -0
data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +758 -0
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +188 -0
data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +647 -707
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
data/vendor/faiss/faiss/impl/index_read.cpp +631 -480
data/vendor/faiss/faiss/impl/index_write.cpp +547 -407
data/vendor/faiss/faiss/impl/io.cpp +76 -95
data/vendor/faiss/faiss/impl/io.h +31 -41
data/vendor/faiss/faiss/impl/io_macros.h +60 -29
data/vendor/faiss/faiss/impl/kmeans1d.cpp +301 -0
data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
data/vendor/faiss/faiss/index_factory.cpp +619 -397
data/vendor/faiss/faiss/index_factory.h +8 -6
data/vendor/faiss/faiss/index_io.h +23 -26
data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
data/vendor/faiss/faiss/utils/Heap.h +186 -209
data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
data/vendor/faiss/faiss/utils/distances.cpp +305 -312
data/vendor/faiss/faiss/utils/distances.h +170 -122
data/vendor/faiss/faiss/utils/distances_simd.cpp +498 -508
data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
data/vendor/faiss/faiss/utils/hamming.h +62 -85
data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
data/vendor/faiss/faiss/utils/partitioning.h +26 -21
data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
data/vendor/faiss/faiss/utils/random.cpp +39 -63
data/vendor/faiss/faiss/utils/random.h +13 -16
data/vendor/faiss/faiss/utils/simdlib.h +4 -2
data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
data/vendor/faiss/faiss/utils/utils.cpp +304 -287
data/vendor/faiss/faiss/utils/utils.h +54 -49
metadata +29 -4

data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp ADDED Viewed

@@ -0,0 +1,855 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <faiss/impl/LocalSearchQuantizer.h>
+#include <cstddef>
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <random>
+#include <algorithm>
+#include <faiss/Clustering.h>
+#include <faiss/impl/AuxIndexStructures.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/utils/distances.h>
+#include <faiss/utils/hamming.h> // BitstringWriter
+#include <faiss/utils/utils.h>
+// Fortran-style declarations of the linear-algebra routines used in this
+// file. Every argument is passed by address and integer arguments are
+// declared as FINTEGER. The factorization/inversion routines report their
+// status through `info`; the callers in this file require info == 0.
+extern "C" {
+// LU decomposition of a general matrix (single precision)
+void sgetrf_(
+        FINTEGER* m,
+        FINTEGER* n,
+        float* a,
+        FINTEGER* lda,
+        FINTEGER* ipiv,
+        FINTEGER* info);
+// generate inverse of a matrix given its LU decomposition (single precision)
+void sgetri_(
+        FINTEGER* n,
+        float* a,
+        FINTEGER* lda,
+        FINTEGER* ipiv,
+        float* work,
+        FINTEGER* lwork,
+        FINTEGER* info);
+// general matrix multiplication (single precision):
+// c = alpha * op(a) * op(b) + beta * c, column-major storage
+int sgemm_(
+        const char* transa,
+        const char* transb,
+        FINTEGER* m,
+        FINTEGER* n,
+        FINTEGER* k,
+        const float* alpha,
+        const float* a,
+        FINTEGER* lda,
+        const float* b,
+        FINTEGER* ldb,
+        float* beta,
+        float* c,
+        FINTEGER* ldc);
+// LU decomposition of a general matrix (double precision)
+void dgetrf_(
+        FINTEGER* m,
+        FINTEGER* n,
+        double* a,
+        FINTEGER* lda,
+        FINTEGER* ipiv,
+        FINTEGER* info);
+// generate inverse of a matrix given its LU decomposition (double precision)
+void dgetri_(
+        FINTEGER* n,
+        double* a,
+        FINTEGER* lda,
+        FINTEGER* ipiv,
+        double* work,
+        FINTEGER* lwork,
+        FINTEGER* info);
+// general matrix multiplication (double precision):
+// c = alpha * op(a) * op(b) + beta * c, column-major storage
+int dgemm_(
+        const char* transa,
+        const char* transb,
+        FINTEGER* m,
+        FINTEGER* n,
+        FINTEGER* k,
+        const double* alpha,
+        const double* a,
+        FINTEGER* lda,
+        const double* b,
+        FINTEGER* ldb,
+        double* beta,
+        double* c,
+        FINTEGER* ldc);
+}
+namespace {
+/// Invert the n x n single-precision matrix `a` in place.
+///
+/// The matrix is LU-factorized with sgetrf_ and then inverted with sgetri_.
+/// Throws (FAISS_THROW_IF_NOT) when either routine reports info != 0.
+///
+/// @param a  matrix storage, n * n floats, overwritten with the inverse
+/// @param n  matrix order
+void fmat_inverse(float* a, int n) {
+    // The routines are declared with FINTEGER* parameters, so every integer
+    // handed over by address has to be a real FINTEGER object. Passing the
+    // address of an int only compiles when FINTEGER happens to be int.
+    FINTEGER order = n;
+    // zero-initialized so that a routine writing fewer bytes than
+    // sizeof(FINTEGER) cannot leave stale upper bits behind
+    FINTEGER info = 0;
+    FINTEGER lwork = order * order;
+    std::vector<FINTEGER> ipiv(n);
+    std::vector<float> workspace(lwork);
+    sgetrf_(&order, &order, a, &order, ipiv.data(), &info);
+    FAISS_THROW_IF_NOT(info == 0);
+    sgetri_(&order,
+            a,
+            &order,
+            ipiv.data(),
+            workspace.data(),
+            &lwork,
+            &info);
+    FAISS_THROW_IF_NOT(info == 0);
+}
+// Element-wise sum c[i] = a[i] + b[i] for i in [0, d); the float operand is
+// widened to double. c, a and b can overlap.
+void dfvec_add(size_t d, const double* a, const float* b, double* c) {
+    const double* const a_end = a + d;
+    while (a != a_end) {
+        const double sum = *a + *b;
+        *c = sum;
+        ++a;
+        ++b;
+        ++c;
+    }
+}
+/// Invert the n x n double-precision matrix `a` in place.
+///
+/// The matrix is LU-factorized with dgetrf_ and then inverted with dgetri_.
+/// Throws (FAISS_THROW_IF_NOT) when either routine reports info != 0.
+///
+/// @param a  matrix storage, n * n doubles, overwritten with the inverse
+/// @param n  matrix order
+void dmat_inverse(double* a, int n) {
+    // The routines are declared with FINTEGER* parameters, so every integer
+    // handed over by address has to be a real FINTEGER object. Passing the
+    // address of an int only compiles when FINTEGER happens to be int.
+    FINTEGER order = n;
+    // zero-initialized so that a routine writing fewer bytes than
+    // sizeof(FINTEGER) cannot leave stale upper bits behind
+    FINTEGER info = 0;
+    FINTEGER lwork = order * order;
+    std::vector<FINTEGER> ipiv(n);
+    std::vector<double> workspace(lwork);
+    dgetrf_(&order, &order, a, &order, ipiv.data(), &info);
+    FAISS_THROW_IF_NOT(info == 0);
+    dgetri_(&order,
+            a,
+            &order,
+            ipiv.data(),
+            workspace.data(),
+            &lwork,
+            &info);
+    FAISS_THROW_IF_NOT(info == 0);
+}
+/// Overwrite every element of x with an integer drawn uniformly from the
+/// closed range [min, max]. Values are drawn from gen in index order.
+void random_int32(
+        std::vector<int32_t>& x,
+        int32_t min,
+        int32_t max,
+        std::mt19937& gen) {
+    std::uniform_int_distribution<int32_t> pick(min, max);
+    for (int32_t& value : x) {
+        value = pick(gen);
+    }
+}
+} // anonymous namespace
+namespace faiss {
+// Timer object shared by the routines in this file: train() and
+// compute_codes() reset it, and each LSQTimerScope is constructed with a
+// pointer to it plus a label naming the measured section.
+lsq::LSQTimer lsq_timer;
+using lsq::LSQTimerScope;
+/// Construct an untrained quantizer for d-dimensional vectors that uses M
+/// codebooks of 2^nbits entries each, and install the default
+/// hyper-parameters.
+LocalSearchQuantizer::LocalSearchQuantizer(
+        size_t d,
+        size_t M,
+        size_t nbits,
+        Search_type_t search_type)
+        : AdditiveQuantizer(d, std::vector<size_t>(M, nbits), search_type) {
+    // state flags
+    verbose = false;
+    is_trained = false;
+    // number of entries per codebook
+    K = (1 << nbits);
+    // iteration budgets for training and encoding
+    train_iters = 25;
+    train_ils_iters = 8;
+    encode_ils_iters = 16;
+    icm_iters = 4;
+    // codebook perturbation exponent and regularization weight
+    p = 0.5f;
+    lambd = 1e-2f;
+    // number of vectors encoded per chunk, subcodes perturbed per vector
+    chunk_size = 10000;
+    nperts = 4;
+    // seeding; the C library generator is seeded as well
+    random_seed = 0x12345;
+    std::srand(random_seed);
+    // no custom encoder factory until the caller installs one
+    icm_encoder_factory = nullptr;
+}
+// Releases the encoder factory held in icm_encoder_factory. The constructor
+// sets that pointer to nullptr, in which case the delete is a no-op.
+LocalSearchQuantizer::~LocalSearchQuantizer() {
+    delete icm_encoder_factory;
+}
+// Default constructor: delegates to the main constructor with d = 0, M = 0
+// and nbits = 0. search_type is not passed here, so it relies on a default
+// argument declared outside this file.
+LocalSearchQuantizer::LocalSearchQuantizer() : LocalSearchQuantizer(0, 0, 0) {}
+/** Train the codebooks on n vectors.
+ *
+ * Starting from random codes, each of the train_iters iterations
+ *   1. re-estimates the codebooks from x and the current codes,
+ *   2. perturbs the codebooks with noise scaled by T (SR-D),
+ *   3. refines the codes with ICM.
+ * Afterwards the squared norms of the reconstructed vectors are used to set
+ * norm_min / norm_max and, for the ST_norm_cqint8 / ST_norm_cqint4 search
+ * types, to train a 1-D clustering whose centroids are added to qnorm.
+ *
+ * Throws when K != (1 << nbits[0]) or nperts > M.
+ *
+ * @param n  number of training vectors
+ * @param x  training vectors, read as n rows of d floats
+ */
+void LocalSearchQuantizer::train(size_t n, const float* x) {
+    FAISS_THROW_IF_NOT(K == (1 << nbits[0]));
+    FAISS_THROW_IF_NOT(nperts <= M);
+    lsq_timer.reset();
+    LSQTimerScope scope(&lsq_timer, "train");
+    if (verbose) {
+        printf("Training LSQ, with %zd subcodes on %zd %zdD vectors\n",
+               M,
+               n,
+               d);
+    }
+    // allocate memory for codebooks, size [M, K, d]
+    codebooks.resize(M * K * d);
+    // randomly initialize codes
+    std::mt19937 gen(random_seed);
+    std::vector<int32_t> codes(n * M); // [n, M]
+    random_int32(codes, 0, K - 1, gen);
+    // compute standard deviations of each dimension
+    std::vector<float> stddev(d, 0);
+#pragma omp parallel for
+    for (int64_t i = 0; i < d; i++) {
+        float mean = 0;
+        for (size_t j = 0; j < n; j++) {
+            mean += x[j * d + i];
+        }
+        mean = mean / n;
+        float sum = 0;
+        for (size_t j = 0; j < n; j++) {
+            float xi = x[j * d + i] - mean;
+            sum += xi * xi;
+        }
+        stddev[i] = sqrtf(sum / n);
+    }
+    if (verbose) {
+        float obj = evaluate(codes.data(), x, n);
+        printf("Before training: obj = %lf\n", obj);
+    }
+    for (size_t i = 0; i < train_iters; i++) {
+        // 1. update codebooks given x and codes
+        // 2. add perturbation to codebooks (SR-D)
+        // 3. refine codes given x and codebooks using icm
+        // update codebooks
+        update_codebooks(x, codes.data(), n);
+        if (verbose) {
+            float obj = evaluate(codes.data(), x, n);
+            printf("iter %zd:\n", i);
+            printf("\tafter updating codebooks: obj = %lf\n", obj);
+        }
+        // SR-D: perturb codebooks
+        // T = (1 - (i + 1) / train_iters)^p, which reaches 0 on the last
+        // iteration, so the final codebooks are left unperturbed
+        float T = pow((1.0f - (i + 1.0f) / train_iters), p);
+        perturb_codebooks(T, stddev, gen);
+        if (verbose) {
+            float obj = evaluate(codes.data(), x, n);
+            printf("\tafter perturbing codebooks: obj = %lf\n", obj);
+        }
+        // refine codes
+        icm_encode(codes.data(), x, n, train_ils_iters, gen);
+        if (verbose) {
+            float obj = evaluate(codes.data(), x, n);
+            printf("\tafter updating codes: obj = %lf\n", obj);
+        }
+    }
+    is_trained = true;
+    {
+        // reconstruct the training vectors from their codes and record the
+        // range of their squared L2 norms
+        std::vector<float> x_recons(n * d);
+        std::vector<float> norms(n);
+        decode_unpacked(codes.data(), x_recons.data(), n);
+        fvec_norms_L2sqr(norms.data(), x_recons.data(), d, n);
+        norm_min = HUGE_VALF;
+        norm_max = -HUGE_VALF;
+        for (idx_t i = 0; i < n; i++) {
+            if (norms[i] < norm_min) {
+                norm_min = norms[i];
+            }
+            if (norms[i] > norm_max) {
+                norm_max = norms[i];
+            }
+        }
+        // for the cqint search types, cluster the norms into 2^8 or 2^4
+        // centroids and store them in qnorm
+        if (search_type == ST_norm_cqint8 || search_type == ST_norm_cqint4) {
+            size_t k = (1 << 8);
+            if (search_type == ST_norm_cqint4) {
+                k = (1 << 4);
+            }
+            Clustering1D clus(k);
+            clus.train_exact(n, norms.data());
+            qnorm.add(clus.k, clus.centroids.data());
+        }
+    }
+    if (verbose) {
+        float obj = evaluate(codes.data(), x, n);
+        // close the "train" scope before printing the timer entries
+        scope.finish();
+        printf("After training: obj = %lf\n", obj);
+        printf("Time statistic:\n");
+        for (const auto& it : lsq_timer.t) {
+            printf("\t%s time: %lf s\n", it.first.data(), it.second / 1000);
+        }
+    }
+}
+/// Add zero-mean Gaussian noise to every codebook entry. The noise for
+/// dimension i is drawn with standard deviation stddev[i], then scaled by
+/// T and divided by M.
+void LocalSearchQuantizer::perturb_codebooks(
+        float T,
+        const std::vector<float>& stddev,
+        std::mt19937& gen) {
+    LSQTimerScope scope(&lsq_timer, "perturb_codebooks");
+    // one noise source per dimension
+    std::vector<std::normal_distribution<float>> noise;
+    for (size_t dim = 0; dim < d; dim++) {
+        noise.push_back(std::normal_distribution<float>(0.0f, stddev[dim]));
+    }
+    // codebooks is indexed as [m, k, i], so a linear walk over the M * K
+    // rows of d floats draws from gen in the same (m, k, i) order
+    float* entry = codebooks.data();
+    const size_t n_rows = M * K;
+    for (size_t row = 0; row < n_rows; row++) {
+        for (size_t dim = 0; dim < d; dim++, entry++) {
+            *entry += T * noise[dim](gen) / M;
+        }
+    }
+}
+/// Encode n vectors: start from random codes seeded with random_seed,
+/// refine them with icm_encode() for encode_ils_iters rounds and pack the
+/// result into codes_out. Throws when the quantizer is not trained.
+void LocalSearchQuantizer::compute_codes(
+        const float* x,
+        uint8_t* codes_out,
+        size_t n) const {
+    FAISS_THROW_IF_NOT_MSG(is_trained, "LSQ is not trained yet.");
+    lsq_timer.reset();
+    LSQTimerScope scope(&lsq_timer, "encode");
+    if (verbose) {
+        printf("Encoding %zd vectors...\n", n);
+    }
+    // random starting point, [n, M]
+    std::mt19937 rng(random_seed);
+    std::vector<int32_t> unpacked(n * M);
+    random_int32(unpacked, 0, K - 1, rng);
+    // refine, then pack into the output buffer
+    icm_encode(unpacked.data(), x, n, encode_ils_iters, rng);
+    pack_codes(n, unpacked.data(), codes_out);
+    if (!verbose) {
+        return;
+    }
+    scope.finish();
+    printf("Time statistic:\n");
+    for (const auto& entry : lsq_timer.t) {
+        printf("\t%s time: %lf s\n", entry.first.data(), entry.second / 1000);
+    }
+}
+/** update codebooks given x and codes
+ *
+ * Let B denote the sparse matrix of codes, size [n, M * K].
+ * Let C denote the codebooks, size [M * K, d].
+ * Let X denote the training vectors, size [n, d]
+ *
+ * objective function:
+ *     L = (X - BC)^2
+ *
+ * To minimize L, we have:
+ *     C = (B'B)^(-1)B'X
+ * where ' denotes the transpose
+ *
+ * Add a regularization term to make B'B invertible:
+ *     C = (B'B + lambd * I)^(-1)B'X
+ *
+ * The computation runs in single precision, or in double precision when
+ * update_codebooks_with_double is set; in the latter case the result is
+ * converted back to float before it is stored in codebooks.
+ */
+void LocalSearchQuantizer::update_codebooks(
+        const float* x,
+        const int32_t* codes,
+        size_t n) {
+    LSQTimerScope scope(&lsq_timer, "update_codebooks");
+    if (!update_codebooks_with_double) {
+        // allocate memory
+        // bb = B'B, bx = B'X
+        std::vector<float> bb(M * K * M * K, 0.0f); // [M * K, M * K]
+        std::vector<float> bx(M * K * d, 0.0f);     // [M * K, d]
+        // compute B'B
+        // each vector contributes 1 on the diagonal for every selected entry
+        // and 1 symmetrically for every pair of selected entries
+        for (size_t i = 0; i < n; i++) {
+            for (size_t m = 0; m < M; m++) {
+                int32_t code1 = codes[i * M + m];
+                int32_t idx1 = m * K + code1;
+                bb[idx1 * M * K + idx1] += 1;
+                for (size_t m2 = m + 1; m2 < M; m2++) {
+                    int32_t code2 = codes[i * M + m2];
+                    int32_t idx2 = m2 * K + code2;
+                    bb[idx1 * M * K + idx2] += 1;
+                    bb[idx2 * M * K + idx1] += 1;
+                }
+            }
+        }
+        // add a regularization term to B'B
+        for (int64_t i = 0; i < M * K; i++) {
+            bb[i * (M * K) + i] += lambd;
+        }
+        // compute (B'B + lambd * I)^(-1), in place
+        fmat_inverse(bb.data(), M * K); // [M*K, M*K]
+        // compute B'X: add each vector to the rows of its selected entries
+        for (size_t i = 0; i < n; i++) {
+            for (size_t m = 0; m < M; m++) {
+                int32_t code = codes[i * M + m];
+                float* data = bx.data() + (m * K + code) * d;
+                fvec_add(d, data, x + i * d, data);
+            }
+        }
+        // compute C = (B'B + lambd * I)^(-1) @ B'X
+        //
+        // NOTE: LAPACK use column major order
+        // out = alpha * op(A) * op(B) + beta * C
+        FINTEGER nrows_A = d;
+        FINTEGER ncols_A = M * K;
+        FINTEGER nrows_B = M * K;
+        FINTEGER ncols_B = M * K;
+        float alpha = 1.0f;
+        float beta = 0.0f;
+        sgemm_("Not Transposed",
+               "Not Transposed",
+               &nrows_A, // nrows of op(A)
+               &ncols_B, // ncols of op(B)
+               &ncols_A, // ncols of op(A)
+               &alpha,
+               bx.data(),
+               &nrows_A, // nrows of A
+               bb.data(),
+               &nrows_B, // nrows of B
+               &beta,
+               codebooks.data(),
+               &nrows_A); // nrows of output
+    } else {
+        // same computation as above, carried out in double precision
+        // allocate memory
+        // bb = B'B, bx = B'X
+        std::vector<double> bb(M * K * M * K, 0.0f); // [M * K, M * K]
+        std::vector<double> bx(M * K * d, 0.0f);     // [M * K, d]
+        // compute B'B
+        for (size_t i = 0; i < n; i++) {
+            for (size_t m = 0; m < M; m++) {
+                int32_t code1 = codes[i * M + m];
+                int32_t idx1 = m * K + code1;
+                bb[idx1 * M * K + idx1] += 1;
+                for (size_t m2 = m + 1; m2 < M; m2++) {
+                    int32_t code2 = codes[i * M + m2];
+                    int32_t idx2 = m2 * K + code2;
+                    bb[idx1 * M * K + idx2] += 1;
+                    bb[idx2 * M * K + idx1] += 1;
+                }
+            }
+        }
+        // add a regularization term to B'B
+        for (int64_t i = 0; i < M * K; i++) {
+            bb[i * (M * K) + i] += lambd;
+        }
+        // compute (B'B + lambd * I)^(-1), in place
+        dmat_inverse(bb.data(), M * K); // [M*K, M*K]
+        // compute B'X: add each vector to the rows of its selected entries
+        for (size_t i = 0; i < n; i++) {
+            for (size_t m = 0; m < M; m++) {
+                int32_t code = codes[i * M + m];
+                double* data = bx.data() + (m * K + code) * d;
+                dfvec_add(d, data, x + i * d, data);
+            }
+        }
+        // compute C = (B'B + lambd * I)^(-1) @ B'X
+        //
+        // NOTE: LAPACK use column major order
+        // out = alpha * op(A) * op(B) + beta * C
+        FINTEGER nrows_A = d;
+        FINTEGER ncols_A = M * K;
+        FINTEGER nrows_B = M * K;
+        FINTEGER ncols_B = M * K;
+        std::vector<double> d_codebooks(M * K * d);
+        double alpha = 1.0f;
+        double beta = 0.0f;
+        dgemm_("Not Transposed",
+               "Not Transposed",
+               &nrows_A, // nrows of op(A)
+               &ncols_B, // ncols of op(B)
+               &ncols_A, // ncols of op(A)
+               &alpha,
+               bx.data(),
+               &nrows_A, // nrows of A
+               bb.data(),
+               &nrows_B, // nrows of B
+               &beta,
+               d_codebooks.data(),
+               &nrows_A); // nrows of output
+        // narrow the double-precision result back into the float codebooks
+        for (size_t i = 0; i < M * K * d; i++) {
+            codebooks[i] = (float)d_codebooks[i];
+        }
+    }
+}
+/** Encode vectors with iterated conditional modes (ICM).
+ *
+ * ICM visits the subcodes ci (i = 1, ..., M) of a vector one at a time:
+ * the other subcodes cj (j != i) are held fixed and ci is set to the value
+ * that minimizes the objective
+ *     L = (X - \sum cj)^2, j = 1, ..., M
+ *       = X^2 - 2X * \sum cj + (\sum cj)^2
+ *
+ * X^2 can be ignored because it is the same for every candidate value of
+ * the subcode being optimized.
+ *
+ * 2X * \sum cj is the unary term and (\sum cj)^2 is the binary term; both
+ * can be precomputed and stored in look-up tables.
+ *
+ * The input is processed in chunks of chunk_size vectors by an encoder
+ * obtained from icm_encoder_factory, or from a default factory when none
+ * is installed.
+ */
+void LocalSearchQuantizer::icm_encode(
+        int32_t* codes,
+        const float* x,
+        size_t n,
+        size_t ils_iters,
+        std::mt19937& gen) const {
+    LSQTimerScope scope(&lsq_timer, "icm_encode");
+    std::unique_ptr<lsq::IcmEncoder> icm_encoder;
+    if (icm_encoder_factory != nullptr) {
+        icm_encoder.reset(icm_encoder_factory->get(this));
+    } else {
+        icm_encoder.reset(lsq::IcmEncoderFactory().get(this));
+    }
+    // the binary terms do not depend on x: compute them once for all chunks
+    icm_encoder->set_binary_term();
+    const size_t n_chunks = (n + chunk_size - 1) / chunk_size;
+    for (size_t chunk = 0; chunk < n_chunks; chunk++) {
+        const size_t offset = chunk * chunk_size;
+        const size_t ni = std::min(chunk_size, n - offset);
+        const bool is_first = (chunk == 0);
+        if (verbose) {
+            printf("\r\ticm encoding %zd/%zd ...", offset + ni, n);
+            fflush(stdout);
+            if (is_first || chunk + 1 == n_chunks) {
+                printf("\n");
+            }
+        }
+        // only the first chunk reports per-iteration progress
+        icm_encoder->verbose = (verbose && is_first);
+        icm_encoder->encode(
+                codes + offset * M, x + offset * d, gen, ni, ils_iters);
+    }
+}
+/// Iterated local search over the codes of n vectors. Each of the ils_iters
+/// rounds perturbs the current codes, refines them with icm_encode_step()
+/// and keeps, per vector, whichever of the new and the best-so-far codes
+/// has the lower objective. On return codes holds the best codes found.
+void LocalSearchQuantizer::icm_encode_impl(
+        int32_t* codes,
+        const float* x,
+        const float* binaries,
+        std::mt19937& gen,
+        size_t n,
+        size_t ils_iters,
+        bool verbose) const {
+    std::vector<float> unaries(n * M * K); // [M, n, K]
+    compute_unary_terms(x, unaries.data(), n);
+    // best codes and objectives seen so far, initialized from the input
+    std::vector<int32_t> best_codes(codes, codes + n * M);
+    std::vector<float> best_objs(n, 0.0f);
+    evaluate(codes, x, n, best_objs.data());
+    FAISS_THROW_IF_NOT(nperts <= M);
+    const size_t row_bytes = sizeof(int32_t) * M;
+    for (size_t round = 0; round < ils_iters; round++) {
+        // perturb, then refine with ICM
+        perturb_codes(codes, n, gen);
+        icm_encode_step(codes, unaries.data(), binaries, n, icm_iters);
+        std::vector<float> trial_objs(n, 0.0f);
+        evaluate(codes, x, n, trial_objs.data());
+        size_t n_betters = 0;
+        float mean_obj = 0.0f;
+        // per vector, keep the trial codes only when they improve
+#pragma omp parallel for reduction(+ : n_betters, mean_obj)
+        for (int64_t i = 0; i < n; i++) {
+            const bool improved = trial_objs[i] < best_objs[i];
+            if (improved) {
+                best_objs[i] = trial_objs[i];
+                memcpy(best_codes.data() + i * M, codes + i * M, row_bytes);
+                n_betters += 1;
+            }
+            mean_obj += best_objs[i];
+        }
+        mean_obj /= n;
+        // the next round restarts from the best codes
+        memcpy(codes, best_codes.data(), row_bytes * n);
+        if (verbose) {
+            printf("\tils_iter %zd: obj = %lf, n_betters/n = %zd/%zd\n",
+                   round,
+                   mean_obj,
+                   n_betters,
+                   n);
+        }
+    }
+}
+/** Run n_iters sweeps of iterated conditional modes over the codes.
+ *
+ * In each sweep the subcodes are visited in order m = 0, ..., M - 1. For
+ * subcode m, every vector's objective for each of the K candidate values
+ * is its unary term plus the binary terms against the current values of
+ * all other subcodes; the candidate with the lowest objective is written
+ * back (the lowest index wins ties).
+ *
+ * Throws when M == 0, K == 0 or binaries is null.
+ *
+ * @param codes     codes, indexed [n, M], updated in place
+ * @param unaries   unary terms, indexed [M, n, K]
+ * @param binaries  binary terms, indexed [M, M, K, K]
+ * @param n         number of vectors
+ * @param n_iters   number of sweeps
+ */
+void LocalSearchQuantizer::icm_encode_step(
+        int32_t* codes,
+        const float* unaries,
+        const float* binaries,
+        size_t n,
+        size_t n_iters) const {
+    FAISS_THROW_IF_NOT(M != 0 && K != 0);
+    FAISS_THROW_IF_NOT(binaries != nullptr);
+    // Scratch table of objectives, [n, K]. It is fully overwritten from the
+    // unary terms at the start of every (iter, m) step, so a single
+    // allocation can be reused instead of reallocating n_iters * M times.
+    std::vector<float> objs(n * K);
+    for (size_t iter = 0; iter < n_iters; iter++) {
+        // condition on the m-th subcode
+        for (size_t m = 0; m < M; m++) {
+            const float* unaries_m = unaries + m * n * K;
+#pragma omp parallel for
+            for (int64_t i = 0; i < n; i++) {
+                memcpy(objs.data() + i * K,
+                       unaries_m + i * K,
+                       sizeof(float) * K);
+            }
+            // compute objective function by adding unary
+            // and binary terms together
+            for (size_t other_m = 0; other_m < M; other_m++) {
+                if (other_m == m) {
+                    continue;
+                }
+                // binaries[m, other_m, :, :]
+                const float* pair_terms =
+                        binaries + m * M * K * K + other_m * K * K;
+#pragma omp parallel for
+                for (int64_t i = 0; i < n; i++) {
+                    // the other subcode's value does not depend on the
+                    // candidate, so it is looked up once per vector
+                    const int32_t code2 = codes[i * M + other_m];
+                    float* objs_i = objs.data() + i * K;
+                    for (size_t code = 0; code < K; code++) {
+                        // binaries[m, other_m, code, code2]
+                        objs_i[code] += pair_terms[code * K + code2];
+                    }
+                }
+            }
+            // find the optimal value of the m-th subcode
+#pragma omp parallel for
+            for (int64_t i = 0; i < n; i++) {
+                float best_obj = HUGE_VALF;
+                int32_t best_code = 0;
+                for (size_t code = 0; code < K; code++) {
+                    float obj = objs[i * K + code];
+                    if (obj < best_obj) {
+                        best_obj = obj;
+                        best_code = code;
+                    }
+                }
+                codes[i * M + m] = best_code;
+            }
+        } // loop M
+    }
+}
+/// For each of the n vectors, perform nperts random replacements: pick a
+/// subcode position uniformly in [0, M - 1] and overwrite it with a value
+/// drawn uniformly from [0, K - 1]. The same position may be picked more
+/// than once.
+void LocalSearchQuantizer::perturb_codes(
+        int32_t* codes,
+        size_t n,
+        std::mt19937& gen) const {
+    LSQTimerScope scope(&lsq_timer, "perturb_codes");
+    std::uniform_int_distribution<size_t> pick_position(0, M - 1);
+    std::uniform_int_distribution<int32_t> pick_value(0, K - 1);
+    int32_t* row = codes;
+    for (size_t i = 0; i < n; i++, row += M) {
+        for (size_t j = 0; j < nperts; j++) {
+            // the position is drawn before the value
+            const size_t position = pick_position(gen);
+            row[position] = pick_value(gen);
+        }
+    }
+}
+/// Fill binaries, indexed [M, M, K, K], with twice the inner product of
+/// every pair of codebook entries:
+///     binaries[m1, m2, code1, code2] = 2 * <C[m1, code1], C[m2, code2]>
+void LocalSearchQuantizer::compute_binary_terms(float* binaries) const {
+    LSQTimerScope scope(&lsq_timer, "compute_binary_terms");
+#pragma omp parallel for
+    for (int64_t m12 = 0; m12 < M * M; m12++) {
+        const size_t m1 = m12 / M;
+        const size_t m2 = m12 % M;
+        const float* book1 = codebooks.data() + m1 * K * d;
+        const float* book2 = codebooks.data() + m2 * K * d;
+        // binaries[m1, m2, :, :]
+        float* table = binaries + m1 * M * K * K + m2 * K * K;
+        for (size_t code1 = 0; code1 < K; code1++) {
+            const float* c1 = book1 + code1 * d;
+            float* table_row = table + code1 * K;
+            for (size_t code2 = 0; code2 < K; code2++) {
+                const float* c2 = book2 + code2 * d;
+                table_row[code2] = fvec_inner_product(c1, c2, d) * 2;
+            }
+        }
+    }
+}
+/// Compute the per-vector, per-codeword unary terms.
+///
+/// After the call, unaries[m, i, k] holds
+///   -2 * <x[i], codebooks[m, k]> + norms[m * K + k]
+/// where `norms` is the output of fvec_norms_L2sqr over all M * K
+/// codewords (squared L2 norms, going by the helper's name).
+///
+/// @param x        n input vectors of dimension d, stored contiguously
+/// @param unaries  output buffer of M * n * K floats
+/// @param n        number of input vectors
+void LocalSearchQuantizer::compute_unary_terms(
+        const float* x,
+        float* unaries, // [M, n, K]
+        size_t n) const {
+    LSQTimerScope scope(&lsq_timer, "compute_unary_terms");
+    // Step 1: unaries[m] = -2 * x * codebooks[m]^T, one GEMM per codebook.
+    //
+    // The BLAS routine works in column-major order, so the [K, d]
+    // codebook buffer is read as a d x K matrix (hence "Transposed") and
+    // the [n, d] input buffer as a d x n matrix. The K x n column-major
+    // result occupies the same memory as an [n, K] row-major slab.
+    FINTEGER codewords = K; // rows of op(A), leading dim of the output
+    FINTEGER dim = d;       // cols of op(A), leading dim of A
+    FINTEGER ld_x = d;      // leading dim of B
+    FINTEGER num_vecs = n;  // cols of op(B)
+    float scale = -2.0f;
+    float discard_previous = 0.0f; // beta = 0: the output is overwritten
+    for (size_t m = 0; m < M; m++) {
+        const float* book = codebooks.data() + m * K * d;
+        float* out = unaries + m * n * K;
+        sgemm_("Transposed",
+               "Not Transposed",
+               &codewords,
+               &num_vecs,
+               &dim,
+               &scale,
+               book,
+               &dim,
+               x,
+               &ld_x,
+               &discard_previous,
+               out,
+               &codewords);
+    }
+    // Step 2: add the per-codeword norm to every row of every slab.
+    std::vector<float> norms(M * K);
+    fvec_norms_L2sqr(norms.data(), codebooks.data(), d, M * K);
+#pragma omp parallel for
+    for (int64_t i = 0; i < n; i++) {
+        for (size_t m = 0; m < M; m++) {
+            float* row = unaries + m * n * K + i * K;
+            const float* book_norms = norms.data() + m * K;
+            fvec_add(K, row, book_norms, row);
+        }
+    }
+}
+float LocalSearchQuantizer::evaluate(
+        const int32_t* codes,
+        const float* x,
+        size_t n,
+        float* objs) const {
+    LSQTimerScope scope(&lsq_timer, "evaluate");
+    // decode
+    std::vector<float> decoded_x(n * d, 0.0f);
+    float obj = 0.0f;
+#pragma omp parallel for reduction(+ : obj)
+    for (int64_t i = 0; i < n; i++) {
+        const auto code = codes + i * M;
+        const auto decoded_i = decoded_x.data() + i * d;
+        for (size_t m = 0; m < M; m++) {
+            // c = codebooks[m, code[m]]
+            const auto c = codebooks.data() + m * K * d + code[m] * d;
+            fvec_add(d, decoded_i, c, decoded_i);
+        }
+        float err = faiss::fvec_L2sqr(x + i * d, decoded_i, d);
+        obj += err;
+        if (objs) {
+            objs[i] = err;
+        }
+    }
+    obj = obj / n;
+    return obj;
+}
+namespace lsq {
+/// Create an encoder bound to `lsq`. Only the pointer is stored.
+IcmEncoder::IcmEncoder(const LocalSearchQuantizer* lsq)
+        : verbose(false), lsq(lsq) {}
+/// Resize `binaries` to M * M * K * K and fill it through
+/// LocalSearchQuantizer::compute_binary_terms.
+void IcmEncoder::set_binary_term() {
+    auto M = lsq->M;
+    auto K = lsq->K;
+    binaries.resize(M * M * K * K);
+    lsq->compute_binary_terms(binaries.data());
+}
+/// Encode n vectors by forwarding to
+/// LocalSearchQuantizer::icm_encode_impl with the terms currently stored
+/// in `binaries` (the buffer filled by set_binary_term()).
+///
+/// @param codes      codes buffer passed through to the implementation
+/// @param x          input vectors
+/// @param gen        random generator passed through
+/// @param n          number of vectors
+/// @param ils_iters  iteration count passed through
+void IcmEncoder::encode(
+        int32_t* codes,
+        const float* x,
+        std::mt19937& gen,
+        size_t n,
+        size_t ils_iters) const {
+    lsq->icm_encode_impl(codes, x, binaries.data(), gen, n, ils_iters, verbose);
+}
+/// Return the value accumulated under `name`, or 0.0 if none was recorded.
+double LSQTimer::get(const std::string& name) {
+    // one lookup instead of count() followed by operator[]
+    auto it = t.find(name);
+    return it == t.end() ? 0.0 : it->second;
+}
+/// Add `delta` to the value accumulated under `name`.
+void LSQTimer::add(const std::string& name, double delta) {
+    // operator[] value-initialises a missing entry to 0.0, so a single
+    // lookup covers both the first and the subsequent additions
+    t[name] += delta;
+}
+/// Drop every accumulated entry.
+void LSQTimer::reset() {
+    t.clear();
+}
+/// Start measuring; the elapsed time is reported to `timer` under `name`
+/// by finish(), or by the destructor if finish() was not called.
+LSQTimerScope::LSQTimerScope(LSQTimer* timer, std::string name)
+        : timer(timer), name(name), finished(false) {
+    t0 = getmillisecs();
+}
+/// Report the time elapsed since construction (as measured by
+/// getmillisecs()). Only the first call has an effect.
+void LSQTimerScope::finish() {
+    if (!finished) {
+        auto delta = getmillisecs() - t0;
+        timer->add(name, delta);
+        finished = true;
+    }
+}
+/// Report the elapsed time unless finish() already did.
+LSQTimerScope::~LSQTimerScope() {
+    finish();
+}
+} // namespace lsq
+} // namespace faiss