faiss 0.1.7 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/README.md +7 -7
- data/ext/faiss/ext.cpp +1 -1
- data/ext/faiss/extconf.rb +8 -2
- data/ext/faiss/index.cpp +102 -69
- data/ext/faiss/index_binary.cpp +24 -30
- data/ext/faiss/kmeans.cpp +20 -16
- data/ext/faiss/numo.hpp +867 -0
- data/ext/faiss/pca_matrix.cpp +13 -14
- data/ext/faiss/product_quantizer.cpp +23 -24
- data/ext/faiss/utils.cpp +10 -37
- data/ext/faiss/utils.h +2 -13
- data/lib/faiss/version.rb +1 -1
- data/lib/faiss.rb +0 -5
- data/vendor/faiss/faiss/AutoTune.cpp +292 -291
- data/vendor/faiss/faiss/AutoTune.h +55 -56
- data/vendor/faiss/faiss/Clustering.cpp +334 -195
- data/vendor/faiss/faiss/Clustering.h +88 -35
- data/vendor/faiss/faiss/IVFlib.cpp +171 -195
- data/vendor/faiss/faiss/IVFlib.h +48 -51
- data/vendor/faiss/faiss/Index.cpp +85 -103
- data/vendor/faiss/faiss/Index.h +54 -48
- data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
- data/vendor/faiss/faiss/Index2Layer.h +22 -22
- data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
- data/vendor/faiss/faiss/IndexBinary.h +140 -132
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
- data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
- data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
- data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
- data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
- data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
- data/vendor/faiss/faiss/IndexFlat.h +35 -46
- data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
- data/vendor/faiss/faiss/IndexHNSW.h +57 -41
- data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
- data/vendor/faiss/faiss/IndexIVF.h +146 -113
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
- data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
- data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
- data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
- data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
- data/vendor/faiss/faiss/IndexLSH.h +21 -26
- data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
- data/vendor/faiss/faiss/IndexLattice.h +11 -16
- data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
- data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
- data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
- data/vendor/faiss/faiss/IndexNSG.h +85 -0
- data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
- data/vendor/faiss/faiss/IndexPQ.h +64 -67
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
- data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
- data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
- data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
- data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
- data/vendor/faiss/faiss/IndexRefine.h +22 -23
- data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
- data/vendor/faiss/faiss/IndexReplicas.h +62 -56
- data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
- data/vendor/faiss/faiss/IndexResidual.h +152 -0
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
- data/vendor/faiss/faiss/IndexShards.cpp +256 -240
- data/vendor/faiss/faiss/IndexShards.h +85 -73
- data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
- data/vendor/faiss/faiss/MatrixStats.h +7 -10
- data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
- data/vendor/faiss/faiss/MetaIndexes.h +40 -34
- data/vendor/faiss/faiss/MetricType.h +7 -7
- data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
- data/vendor/faiss/faiss/VectorTransform.h +61 -89
- data/vendor/faiss/faiss/clone_index.cpp +77 -73
- data/vendor/faiss/faiss/clone_index.h +4 -9
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
- data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
- data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
- data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
- data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
- data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
- data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
- data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
- data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
- data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
- data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
- data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
- data/vendor/faiss/faiss/impl/FaissException.h +41 -29
- data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
- data/vendor/faiss/faiss/impl/HNSW.h +179 -200
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
- data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
- data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
- data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
- data/vendor/faiss/faiss/impl/NSG.h +199 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
- data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
- data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
- data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
- data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
- data/vendor/faiss/faiss/impl/io.cpp +75 -94
- data/vendor/faiss/faiss/impl/io.h +31 -41
- data/vendor/faiss/faiss/impl/io_macros.h +40 -29
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
- data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
- data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
- data/vendor/faiss/faiss/index_factory.cpp +269 -218
- data/vendor/faiss/faiss/index_factory.h +6 -7
- data/vendor/faiss/faiss/index_io.h +23 -26
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
- data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
- data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
- data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
- data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
- data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
- data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
- data/vendor/faiss/faiss/utils/Heap.h +186 -209
- data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
- data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
- data/vendor/faiss/faiss/utils/distances.cpp +301 -310
- data/vendor/faiss/faiss/utils/distances.h +133 -118
- data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
- data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
- data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
- data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
- data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
- data/vendor/faiss/faiss/utils/hamming.h +62 -85
- data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
- data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
- data/vendor/faiss/faiss/utils/partitioning.h +26 -21
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
- data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
- data/vendor/faiss/faiss/utils/random.cpp +39 -63
- data/vendor/faiss/faiss/utils/random.h +13 -16
- data/vendor/faiss/faiss/utils/simdlib.h +4 -2
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
- data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
- data/vendor/faiss/faiss/utils/utils.cpp +304 -287
- data/vendor/faiss/faiss/utils/utils.h +53 -48
- metadata +26 -12
- data/lib/faiss/index.rb +0 -20
- data/lib/faiss/index_binary.rb +0 -20
- data/lib/faiss/kmeans.rb +0 -15
- data/lib/faiss/pca_matrix.rb +0 -15
- data/lib/faiss/product_quantizer.rb +0 -22
data/vendor/faiss/faiss/impl/PolysemousTraining.cpp

@@ -8,18 +8,21 @@
 // -*- c++ -*-

 #include <faiss/impl/PolysemousTraining.h>
+#include "faiss/impl/FaissAssert.h"
+
+#include <omp.h>
+#include <stdint.h>

-#include <cstdlib>
 #include <cmath>
+#include <cstdlib>
 #include <cstring>
-#include <stdint.h>

 #include <algorithm>

-#include <faiss/utils/random.h>
-#include <faiss/utils/utils.h>
 #include <faiss/utils/distances.h>
 #include <faiss/utils/hamming.h>
+#include <faiss/utils/random.h>
+#include <faiss/utils/utils.h>

 #include <faiss/impl/FaissAssert.h>
Hunks @@ -29,16 +32,14 @@ through @@ -567,114 +564,128 @@ only reformat existing code to the vendored library's new clang-format style (no space before the opening parenthesis, re-wrapped argument lists, `Type* ptr` pointer placement). They cover SimulatedAnnealingParameters, PermutationObjective::cost_update, SimulatedAnnealingOptimizer (constructor, destructor, run_optimization, optimize), hamming_dis, ReproduceWithHammingObjective, ReproduceDistancesObjective, the Score3Computer template, IndirectSort, and the RankingScore2 constructor and rank_weight.
@@ -683,271 +694,290 @@
Most of this hunk likewise only reformats accum_gt_weight_diff, init_n_gt, optimize_ranking and optimize_pq_for_hamming. Its functional changes add a max_memory field (default 20 GiB) in the PolysemousTraining constructor, a per-thread memory estimate, and a cap on the number of OpenMP threads used by optimize_reproduce_distances:

-PolysemousTraining::PolysemousTraining ()
-{
+PolysemousTraining::PolysemousTraining() {
     optimization_type = OT_ReproduceDistances_affine;
     ntrain_permutation = 0;
     dis_weight_factor = log(2);
+    // max 20 G RAM
+    max_memory = (size_t)(20) * 1024 * 1024 * 1024;
 }

...

-void PolysemousTraining::optimize_reproduce_distances (
-        ProductQuantizer &pq) const
-{
+void PolysemousTraining::optimize_reproduce_distances(
+        ProductQuantizer& pq) const {
     int dsub = pq.dsub;

     int n = pq.ksub;
     int nbits = pq.nbits;

-#pragma omp parallel for
+    size_t mem1 = memory_usage_per_thread(pq);
+    int nt = std::min(omp_get_max_threads(), int(pq.M));
+    FAISS_THROW_IF_NOT_FMT(
+            mem1 < max_memory,
+            "Polysemous training will use %zd bytes per thread, while the max is set to %zd",
+            mem1,
+            max_memory);
+
+    if (mem1 * nt > max_memory) {
+        nt = max_memory / mem1;
+        fprintf(stderr,
+                "Polysemous training: WARN, reducing number of threads to %d to save memory",
+                nt);
+    }
+
+#pragma omp parallel for num_threads(nt)
     for (int m = 0; m < pq.M; m++) {

...

+size_t PolysemousTraining::memory_usage_per_thread(
+        const ProductQuantizer& pq) const {
+    size_t n = pq.ksub;
+
+    switch (optimization_type) {
+        case OT_None:
+            return 0;
+        case OT_ReproduceDistances_affine:
+            return n * n * sizeof(double) * 3;
+        case OT_Ranking_weighted_diff:
+            return n * n * n * sizeof(float);
+    }
+
+    FAISS_THROW_MSG("Invalid optmization type");
+    return 0;
+}

 } // namespace faiss