RubyGems - faiss - Versions diffs - 0.2.0 → 0.2.1 - Mend

faiss 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (202)

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +292 -291
data/vendor/faiss/faiss/AutoTune.h +55 -56
data/vendor/faiss/faiss/Clustering.cpp +334 -195
data/vendor/faiss/faiss/Clustering.h +88 -35
data/vendor/faiss/faiss/IVFlib.cpp +171 -195
data/vendor/faiss/faiss/IVFlib.h +48 -51
data/vendor/faiss/faiss/Index.cpp +85 -103
data/vendor/faiss/faiss/Index.h +54 -48
data/vendor/faiss/faiss/Index2Layer.cpp +139 -164
data/vendor/faiss/faiss/Index2Layer.h +22 -22
data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
data/vendor/faiss/faiss/IndexBinary.h +140 -132
data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
data/vendor/faiss/faiss/IndexFlat.cpp +116 -147
data/vendor/faiss/faiss/IndexFlat.h +35 -46
data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
data/vendor/faiss/faiss/IndexHNSW.h +57 -41
data/vendor/faiss/faiss/IndexIVF.cpp +474 -454
data/vendor/faiss/faiss/IndexIVF.h +146 -113
data/vendor/faiss/faiss/IndexIVFFlat.cpp +248 -250
data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
data/vendor/faiss/faiss/IndexIVFPQ.cpp +457 -516
data/vendor/faiss/faiss/IndexIVFPQ.h +74 -66
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +125 -133
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +19 -21
data/vendor/faiss/faiss/IndexLSH.cpp +75 -96
data/vendor/faiss/faiss/IndexLSH.h +21 -26
data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
data/vendor/faiss/faiss/IndexLattice.h +11 -16
data/vendor/faiss/faiss/IndexNNDescent.cpp +231 -0
data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
data/vendor/faiss/faiss/IndexNSG.cpp +303 -0
data/vendor/faiss/faiss/IndexNSG.h +85 -0
data/vendor/faiss/faiss/IndexPQ.cpp +405 -464
data/vendor/faiss/faiss/IndexPQ.h +64 -67
data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
data/vendor/faiss/faiss/IndexRefine.cpp +115 -131
data/vendor/faiss/faiss/IndexRefine.h +22 -23
data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
data/vendor/faiss/faiss/IndexReplicas.h +62 -56
data/vendor/faiss/faiss/IndexResidual.cpp +291 -0
data/vendor/faiss/faiss/IndexResidual.h +152 -0
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +120 -155
data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -45
data/vendor/faiss/faiss/IndexShards.cpp +256 -240
data/vendor/faiss/faiss/IndexShards.h +85 -73
data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
data/vendor/faiss/faiss/MatrixStats.h +7 -10
data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
data/vendor/faiss/faiss/MetaIndexes.h +40 -34
data/vendor/faiss/faiss/MetricType.h +7 -7
data/vendor/faiss/faiss/VectorTransform.cpp +652 -474
data/vendor/faiss/faiss/VectorTransform.h +61 -89
data/vendor/faiss/faiss/clone_index.cpp +77 -73
data/vendor/faiss/faiss/clone_index.h +4 -9
data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +197 -170
data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +270 -0
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +115 -0
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
data/vendor/faiss/faiss/impl/FaissException.h +41 -29
data/vendor/faiss/faiss/impl/HNSW.cpp +595 -611
data/vendor/faiss/faiss/impl/HNSW.h +179 -200
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +672 -0
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +172 -0
data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
data/vendor/faiss/faiss/impl/NSG.cpp +682 -0
data/vendor/faiss/faiss/impl/NSG.h +199 -0
data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +448 -0
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +130 -0
data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +648 -701
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
data/vendor/faiss/faiss/impl/index_read.cpp +547 -479
data/vendor/faiss/faiss/impl/index_write.cpp +497 -407
data/vendor/faiss/faiss/impl/io.cpp +75 -94
data/vendor/faiss/faiss/impl/io.h +31 -41
data/vendor/faiss/faiss/impl/io_macros.h +40 -29
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
data/vendor/faiss/faiss/index_factory.cpp +269 -218
data/vendor/faiss/faiss/index_factory.h +6 -7
data/vendor/faiss/faiss/index_io.h +23 -26
data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
data/vendor/faiss/faiss/utils/Heap.h +186 -209
data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
data/vendor/faiss/faiss/utils/distances.cpp +301 -310
data/vendor/faiss/faiss/utils/distances.h +133 -118
data/vendor/faiss/faiss/utils/distances_simd.cpp +456 -516
data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
data/vendor/faiss/faiss/utils/hamming.h +62 -85
data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
data/vendor/faiss/faiss/utils/partitioning.h +26 -21
data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
data/vendor/faiss/faiss/utils/random.cpp +39 -63
data/vendor/faiss/faiss/utils/random.h +13 -16
data/vendor/faiss/faiss/utils/simdlib.h +4 -2
data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
data/vendor/faiss/faiss/utils/utils.cpp +304 -287
data/vendor/faiss/faiss/utils/utils.h +53 -48
metadata +20 -2

data/vendor/faiss/faiss/impl/PolysemousTraining.h CHANGED Viewed

@@ -10,62 +10,56 @@
 #ifndef FAISS_POLYSEMOUS_TRAINING_INCLUDED
 #define FAISS_POLYSEMOUS_TRAINING_INCLUDED
 #include <faiss/impl/ProductQuantizer.h>
 namespace faiss {
 /// parameters used for the simulated annealing method
 struct SimulatedAnnealingParameters {
     // optimization parameters
-    double init_temperature;   // init probability of accepting a bad swap
-    double temperature_decay;  // at each iteration the temp is multiplied by this
-    int n_iter; // nb of iterations
-    int n_redo; // nb of runs of the simulation
-    int seed;   // random seed
+    double init_temperature;  // init probability of accepting a bad swap
+    double temperature_decay; // at each iteration the temp is multiplied by
+                              // this
+    int n_iter;               // nb of iterations
+    int n_redo;               // nb of runs of the simulation
+    int seed;                 // random seed
     int verbose;
     bool only_bit_flips; // restrict permutation changes to bit flips
-    bool init_random; // initialize with a random permutation (not identity)
+    bool init_random;    // initialize with a random permutation (not identity)
     // set reasonable defaults
-    SimulatedAnnealingParameters ();
+    SimulatedAnnealingParameters();
 };
 /// abstract class for the loss function
 struct PermutationObjective {
     int n;
-    virtual double compute_cost (const int *perm) const = 0;
+    virtual double compute_cost(const int* perm) const = 0;
     // what would the cost update be if iw and jw were swapped?
     // default implementation just computes both and computes the difference
-    virtual double cost_update (const int *perm, int iw, int jw) const;
+    virtual double cost_update(const int* perm, int iw, int jw) const;
-    virtual ~PermutationObjective () {}
+    virtual ~PermutationObjective() {}
 };
 struct ReproduceDistancesObjective : PermutationObjective {
     double dis_weight_factor;
-    static double sqr (double x) { return x * x; }
+    static double sqr(double x) {
+        return x * x;
+    }
     // weighting of distances: it is more important to reproduce small
     // distances well
-    double dis_weight (double x) const;
+    double dis_weight(double x) const;
     std::vector<double> source_dis; ///< "real" corrected distances (size n^2)
-    const double *      target_dis; ///< wanted distances (size n^2)
+    const double* target_dis;       ///< wanted distances (size n^2)
     std::vector<double> weights;    ///< weights for each distance (size n^2)
-    double get_source_dis (int i, int j) const;
+    double get_source_dis(int i, int j) const;
     // cost = quadratic difference between actual distance and Hamming distance
     double compute_cost(const int* perm) const override;
@@ -74,16 +68,19 @@ struct ReproduceDistancesObjective : PermutationObjective {
     // computed in O(n) instead of O(n^2) for the full re-computation
     double cost_update(const int* perm, int iw, int jw) const override;
-    ReproduceDistancesObjective (
-           int n,
-           const double *source_dis_in,
-           const double *target_dis_in,
-           double dis_weight_factor);
+    ReproduceDistancesObjective(
+            int n,
+            const double* source_dis_in,
+            const double* target_dis_in,
+            double dis_weight_factor);
-    static void compute_mean_stdev (const double *tab, size_t n2,
-                                    double *mean_out, double *stddev_out);
+    static void compute_mean_stdev(
+            const double* tab,
+            size_t n2,
+            double* mean_out,
+            double* stddev_out);
-    void set_affine_target_dis (const double *source_dis_in);
+    void set_affine_target_dis(const double* source_dis_in);
     ~ReproduceDistancesObjective() override {}
 };
@@ -91,39 +88,36 @@ struct ReproduceDistancesObjective : PermutationObjective {
 struct RandomGenerator;
 /// Simulated annealing optimization algorithm for permutations.
- struct SimulatedAnnealingOptimizer: SimulatedAnnealingParameters {
-    PermutationObjective *obj;
+struct SimulatedAnnealingOptimizer : SimulatedAnnealingParameters {
+    PermutationObjective* obj;
     int n;         ///< size of the permutation
-    FILE *logfile; /// logs values of the cost function
+    FILE* logfile; /// logs values of the cost function
-    SimulatedAnnealingOptimizer (PermutationObjective *obj,
-                                 const SimulatedAnnealingParameters &p);
-    RandomGenerator *rnd;
+    SimulatedAnnealingOptimizer(
+            PermutationObjective* obj,
+            const SimulatedAnnealingParameters& p);
+    RandomGenerator* rnd;
     /// remember initial cost of optimization
     double init_cost;
     // main entry point. Perform the optimization loop, starting from
     // and modifying permutation in-place
-    double optimize (int *perm);
+    double optimize(int* perm);
     // run the optimization and return the best result in best_perm
-    double run_optimization (int * best_perm);
+    double run_optimization(int* best_perm);
-    virtual ~SimulatedAnnealingOptimizer ();
+    virtual ~SimulatedAnnealingOptimizer();
 };
 /// optimizes the order of indices in a ProductQuantizer
-struct PolysemousTraining: SimulatedAnnealingParameters {
+struct PolysemousTraining : SimulatedAnnealingParameters {
     enum Optimization_type_t {
         OT_None,
-        OT_ReproduceDistances_affine,  ///< default
-        OT_Ranking_weighted_diff       ///< same as _2, but use rank of y+ - rank of y-
+        OT_ReproduceDistances_affine, ///< default
+        OT_Ranking_weighted_diff ///< same as _2, but use rank of y+ - rank of
+                                 ///< y-
     };
     Optimization_type_t optimization_type;
@@ -133,26 +127,29 @@ struct PolysemousTraining: SimulatedAnnealingParameters {
     int ntrain_permutation;
     double dis_weight_factor; ///< decay of exp that weights distance loss
+    /// refuse to train if it would require more than that amount of RAM
+    size_t max_memory;
     // filename pattern for the logging of iterations
     std::string log_pattern;
     // sets default values
-    PolysemousTraining ();
+    PolysemousTraining();
     /// reorder the centroids so that the Hamming distance becomes a
     /// good approximation of the SDC distance (called by train)
-    void optimize_pq_for_hamming (ProductQuantizer & pq,
-                                  size_t n, const float *x) const;
+    void optimize_pq_for_hamming(ProductQuantizer& pq, size_t n, const float* x)
+            const;
     /// called by optimize_pq_for_hamming
-    void optimize_ranking (ProductQuantizer &pq, size_t n, const float *x) const;
+    void optimize_ranking(ProductQuantizer& pq, size_t n, const float* x) const;
     /// called by optimize_pq_for_hamming
-    void optimize_reproduce_distances (ProductQuantizer &pq) const;
+    void optimize_reproduce_distances(ProductQuantizer& pq) const;
+    /// make sure we don't blow up the memory
+    size_t memory_usage_per_thread(const ProductQuantizer& pq) const;
 };
 } // namespace faiss
 #endif

data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h CHANGED Viewed

@@ -7,20 +7,18 @@
 namespace faiss {
-inline
-PQEncoderGeneric::PQEncoderGeneric(uint8_t *code, int nbits,
-                                                     uint8_t offset)
-    : code(code), offset(offset), nbits(nbits), reg(0)
-{
+inline PQEncoderGeneric::PQEncoderGeneric(
+        uint8_t* code,
+        int nbits,
+        uint8_t offset)
+        : code(code), offset(offset), nbits(nbits), reg(0) {
     assert(nbits <= 64);
     if (offset > 0) {
         reg = (*code & ((1 << offset) - 1));
     }
 }
-inline
-void PQEncoderGeneric::encode(uint64_t x)
-{
+inline void PQEncoderGeneric::encode(uint64_t x) {
     reg |= (uint8_t)(x << offset);
     x >>= (8 - offset);
     if (offset + nbits >= 8) {
@@ -39,51 +37,39 @@ void PQEncoderGeneric::encode(uint64_t x)
     }
 }
-inline
-PQEncoderGeneric::~PQEncoderGeneric()
-{
+inline PQEncoderGeneric::~PQEncoderGeneric() {
     if (offset > 0) {
         *code = reg;
     }
 }
-inline
-PQEncoder8::PQEncoder8(uint8_t *code, int nbits)
-    : code(code) {
+inline PQEncoder8::PQEncoder8(uint8_t* code, int nbits) : code(code) {
     assert(8 == nbits);
 }
-inline
-void PQEncoder8::encode(uint64_t x) {
+inline void PQEncoder8::encode(uint64_t x) {
     *code++ = (uint8_t)x;
 }
-inline
-PQEncoder16::PQEncoder16(uint8_t *code, int nbits)
-    : code((uint16_t *)code) {
+inline PQEncoder16::PQEncoder16(uint8_t* code, int nbits)
+        : code((uint16_t*)code) {
     assert(16 == nbits);
 }
-inline
-void PQEncoder16::encode(uint64_t x) {
+inline void PQEncoder16::encode(uint64_t x) {
     *code++ = (uint16_t)x;
 }
-inline
-PQDecoderGeneric::PQDecoderGeneric(const uint8_t *code,
-                                                     int nbits)
-    : code(code),
-      offset(0),
-      nbits(nbits),
-      mask((1ull << nbits) - 1),
-      reg(0) {
+inline PQDecoderGeneric::PQDecoderGeneric(const uint8_t* code, int nbits)
+        : code(code),
+          offset(0),
+          nbits(nbits),
+          mask((1ull << nbits) - 1),
+          reg(0) {
     assert(nbits <= 64);
 }
-inline
-uint64_t PQDecoderGeneric::decode() {
+inline uint64_t PQDecoderGeneric::decode() {
     if (offset == 0) {
         reg = *code;
     }
@@ -110,27 +96,20 @@ uint64_t PQDecoderGeneric::decode() {
     return c & mask;
 }
-inline
-PQDecoder8::PQDecoder8(const uint8_t *code, int nbits)
-    : code(code) {
-    assert(8 == nbits);
+inline PQDecoder8::PQDecoder8(const uint8_t* code, int nbits_in) : code(code) {
+    assert(8 == nbits_in);
 }
-inline
-uint64_t PQDecoder8::decode() {
+inline uint64_t PQDecoder8::decode() {
     return (uint64_t)(*code++);
 }
-inline
-PQDecoder16::PQDecoder16(const uint8_t *code, int nbits)
-    : code((uint16_t *)code) {
-     assert(16 == nbits);
+inline PQDecoder16::PQDecoder16(const uint8_t* code, int nbits_in)
+        : code((uint16_t*)code) {
+    assert(16 == nbits_in);
 }
-inline
-uint64_t PQDecoder16::decode() {
+inline uint64_t PQDecoder16::decode() {
     return (uint64_t)(*code++);
 }

data/vendor/faiss/faiss/impl/ProductQuantizer.cpp CHANGED Viewed

@@ -9,113 +9,118 @@
 #include <faiss/impl/ProductQuantizer.h>
 #include <cstddef>
-#include <cstring>
 #include <cstdio>
+#include <cstring>
 #include <memory>
 #include <algorithm>
-#include <faiss/impl/FaissAssert.h>
-#include <faiss/VectorTransform.h>
 #include <faiss/IndexFlat.h>
+#include <faiss/VectorTransform.h>
+#include <faiss/impl/FaissAssert.h>
 #include <faiss/utils/distances.h>
 extern "C" {
 /* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
-int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER *
-            n, FINTEGER *k, const float *alpha, const float *a,
-            FINTEGER *lda, const float *b, FINTEGER *
-            ldb, float *beta, float *c, FINTEGER *ldc);
+int sgemm_(
+        const char* transa,
+        const char* transb,
+        FINTEGER* m,
+        FINTEGER* n,
+        FINTEGER* k,
+        const float* alpha,
+        const float* a,
+        FINTEGER* lda,
+        const float* b,
+        FINTEGER* ldb,
+        float* beta,
+        float* c,
+        FINTEGER* ldc);
 }
 namespace faiss {
 /* compute an estimator using look-up tables for typical values of M */
 template <typename CT, class C>
-void pq_estimators_from_tables_Mmul4 (int M, const CT * codes,
-                                      size_t ncodes,
-                                      const float * __restrict dis_table,
-                                      size_t ksub,
-                                      size_t k,
-                                      float * heap_dis,
-                                      int64_t * heap_ids)
-{
+void pq_estimators_from_tables_Mmul4(
+        int M,
+        const CT* codes,
+        size_t ncodes,
+        const float* __restrict dis_table,
+        size_t ksub,
+        size_t k,
+        float* heap_dis,
+        int64_t* heap_ids) {
     for (size_t j = 0; j < ncodes; j++) {
         float dis = 0;
-        const float *dt = dis_table;
+        const float* dt = dis_table;
-        for (size_t m = 0; m < M; m+=4) {
+        for (size_t m = 0; m < M; m += 4) {
             float dism = 0;
-            dism  = dt[*codes++]; dt += ksub;
-            dism += dt[*codes++]; dt += ksub;
-            dism += dt[*codes++]; dt += ksub;
-            dism += dt[*codes++]; dt += ksub;
+            dism = dt[*codes++];
+            dt += ksub;
+            dism += dt[*codes++];
+            dt += ksub;
+            dism += dt[*codes++];
+            dt += ksub;
+            dism += dt[*codes++];
+            dt += ksub;
             dis += dism;
         }
-        if (C::cmp (heap_dis[0], dis)) {
-            heap_replace_top<C> (k, heap_dis, heap_ids, dis, j);
+        if (C::cmp(heap_dis[0], dis)) {
+            heap_replace_top<C>(k, heap_dis, heap_ids, dis, j);
         }
     }
 }
 template <typename CT, class C>
-void pq_estimators_from_tables_M4 (const CT * codes,
-                                   size_t ncodes,
-                                   const float * __restrict dis_table,
-                                   size_t ksub,
-                                   size_t k,
-                                   float * heap_dis,
-                                   int64_t * heap_ids)
-{
+void pq_estimators_from_tables_M4(
+        const CT* codes,
+        size_t ncodes,
+        const float* __restrict dis_table,
+        size_t ksub,
+        size_t k,
+        float* heap_dis,
+        int64_t* heap_ids) {
     for (size_t j = 0; j < ncodes; j++) {
         float dis = 0;
-        const float *dt = dis_table;
-        dis  = dt[*codes++]; dt += ksub;
-        dis += dt[*codes++]; dt += ksub;
-        dis += dt[*codes++]; dt += ksub;
+        const float* dt = dis_table;
+        dis = dt[*codes++];
+        dt += ksub;
+        dis += dt[*codes++];
+        dt += ksub;
+        dis += dt[*codes++];
+        dt += ksub;
         dis += dt[*codes++];
-        if (C::cmp (heap_dis[0], dis)) {
-            heap_replace_top<C> (k, heap_dis, heap_ids, dis, j);
+        if (C::cmp(heap_dis[0], dis)) {
+            heap_replace_top<C>(k, heap_dis, heap_ids, dis, j);
         }
     }
 }
 template <typename CT, class C>
-static inline void pq_estimators_from_tables (const ProductQuantizer& pq,
-                                              const CT * codes,
-                                              size_t ncodes,
-                                              const float * dis_table,
-                                              size_t k,
-                                              float * heap_dis,
-                                              int64_t * heap_ids)
-{
-    if (pq.M == 4)  {
-        pq_estimators_from_tables_M4<CT, C> (codes, ncodes,
-                                             dis_table, pq.ksub, k,
-                                             heap_dis, heap_ids);
+static inline void pq_estimators_from_tables(
+        const ProductQuantizer& pq,
+        const CT* codes,
+        size_t ncodes,
+        const float* dis_table,
+        size_t k,
+        float* heap_dis,
+        int64_t* heap_ids) {
+    if (pq.M == 4) {
+        pq_estimators_from_tables_M4<CT, C>(
+                codes, ncodes, dis_table, pq.ksub, k, heap_dis, heap_ids);
         return;
     }
     if (pq.M % 4 == 0) {
-        pq_estimators_from_tables_Mmul4<CT, C> (pq.M, codes, ncodes,
-                                                dis_table, pq.ksub, k,
-                                                heap_dis, heap_ids);
+        pq_estimators_from_tables_Mmul4<CT, C>(
+                pq.M, codes, ncodes, dis_table, pq.ksub, k, heap_dis, heap_ids);
         return;
     }
@@ -124,132 +129,124 @@ static inline void pq_estimators_from_tables (const ProductQuantizer& pq,
     const size_t ksub = pq.ksub;
     for (size_t j = 0; j < ncodes; j++) {
         float dis = 0;
-        const float * __restrict dt = dis_table;
+        const float* __restrict dt = dis_table;
         for (int m = 0; m < M; m++) {
             dis += dt[*codes++];
             dt += ksub;
         }
-        if (C::cmp (heap_dis[0], dis)) {
-            heap_replace_top<C> (k, heap_dis, heap_ids, dis, j);
+        if (C::cmp(heap_dis[0], dis)) {
+            heap_replace_top<C>(k, heap_dis, heap_ids, dis, j);
         }
     }
 }
 template <class C>
-static inline void pq_estimators_from_tables_generic(const ProductQuantizer& pq,
-                                                     size_t nbits,
-                                                     const uint8_t *codes,
-                                                     size_t ncodes,
-                                                     const float *dis_table,
-                                                     size_t k,
-                                                     float *heap_dis,
-                                                     int64_t *heap_ids)
-{
-  const size_t M = pq.M;
-  const size_t ksub = pq.ksub;
-  for (size_t j = 0; j < ncodes; ++j) {
-    PQDecoderGeneric decoder(
-      codes + j * pq.code_size, nbits
-    );
-    float dis = 0;
-    const float * __restrict dt = dis_table;
-    for (size_t m = 0; m < M; m++) {
-      uint64_t c = decoder.decode();
-      dis += dt[c];
-      dt += ksub;
-    }
+static inline void pq_estimators_from_tables_generic(
+        const ProductQuantizer& pq,
+        size_t nbits,
+        const uint8_t* codes,
+        size_t ncodes,
+        const float* dis_table,
+        size_t k,
+        float* heap_dis,
+        int64_t* heap_ids) {
+    const size_t M = pq.M;
+    const size_t ksub = pq.ksub;
+    for (size_t j = 0; j < ncodes; ++j) {
+        PQDecoderGeneric decoder(codes + j * pq.code_size, nbits);
+        float dis = 0;
+        const float* __restrict dt = dis_table;
+        for (size_t m = 0; m < M; m++) {
+            uint64_t c = decoder.decode();
+            dis += dt[c];
+            dt += ksub;
+        }
-    if (C::cmp(heap_dis[0], dis)) {
-      heap_replace_top<C>(k, heap_dis, heap_ids, dis, j);
+        if (C::cmp(heap_dis[0], dis)) {
+            heap_replace_top<C>(k, heap_dis, heap_ids, dis, j);
+        }
     }
-  }
 }
 /*********************************************
  * PQ implementation
  *********************************************/
-ProductQuantizer::ProductQuantizer (size_t d, size_t M, size_t nbits):
-    d(d), M(M), nbits(nbits), assign_index(nullptr)
-{
-    set_derived_values ();
+ProductQuantizer::ProductQuantizer(size_t d, size_t M, size_t nbits)
+        : d(d), M(M), nbits(nbits), assign_index(nullptr) {
+    set_derived_values();
 }
-ProductQuantizer::ProductQuantizer ()
-    : ProductQuantizer(0, 1, 0) {}
+ProductQuantizer::ProductQuantizer() : ProductQuantizer(0, 1, 0) {}
-void ProductQuantizer::set_derived_values () {
+void ProductQuantizer::set_derived_values() {
     // quite a few derived values
-    FAISS_THROW_IF_NOT_MSG (d % M == 0, "The dimension of the vector (d) should be a multiple of the number of subquantizers (M)");
+    FAISS_THROW_IF_NOT_MSG(
+            d % M == 0,
+            "The dimension of the vector (d) should be a multiple of the number of subquantizers (M)");
     dsub = d / M;
     code_size = (nbits * M + 7) / 8;
     ksub = 1 << nbits;
-    centroids.resize (d * ksub);
+    centroids.resize(d * ksub);
     verbose = false;
     train_type = Train_default;
 }
-void ProductQuantizer::set_params (const float * centroids_, int m)
-{
-  memcpy (get_centroids(m, 0), centroids_,
-            ksub * dsub * sizeof (centroids_[0]));
+void ProductQuantizer::set_params(const float* centroids_, int m) {
+    memcpy(get_centroids(m, 0),
+           centroids_,
+           ksub * dsub * sizeof(centroids_[0]));
 }
-static void init_hypercube (int d, int nbits,
-                            int n, const float * x,
-                            float *centroids)
-{
-    std::vector<float> mean (d);
+static void init_hypercube(
+        int d,
+        int nbits,
+        int n,
+        const float* x,
+        float* centroids) {
+    std::vector<float> mean(d);
     for (int i = 0; i < n; i++)
         for (int j = 0; j < d; j++)
-            mean [j] += x[i * d + j];
+            mean[j] += x[i * d + j];
     float maxm = 0;
     for (int j = 0; j < d; j++) {
-        mean [j] /= n;
-        if (fabs(mean[j]) > maxm) maxm = fabs(mean[j]);
+        mean[j] /= n;
+        if (fabs(mean[j]) > maxm)
+            maxm = fabs(mean[j]);
     }
     for (int i = 0; i < (1 << nbits); i++) {
-        float * cent = centroids + i * d;
+        float* cent = centroids + i * d;
         for (int j = 0; j < nbits; j++)
-            cent[j] = mean [j] + (((i >> j) & 1) ? 1 : -1) * maxm;
+            cent[j] = mean[j] + (((i >> j) & 1) ? 1 : -1) * maxm;
         for (int j = nbits; j < d; j++)
-            cent[j] = mean [j];
+            cent[j] = mean[j];
     }
 }
-static void init_hypercube_pca (int d, int nbits,
-                                int n, const float * x,
-                                float *centroids)
-{
-    PCAMatrix pca (d, nbits);
-    pca.train (n, x);
+static void init_hypercube_pca(
+        int d,
+        int nbits,
+        int n,
+        const float* x,
+        float* centroids) {
+    PCAMatrix pca(d, nbits);
+    pca.train(n, x);
     for (int i = 0; i < (1 << nbits); i++) {
-        float * cent = centroids + i * d;
+        float* cent = centroids + i * d;
         for (int j = 0; j < d; j++) {
             cent[j] = pca.mean[j];
             float f = 1.0;
             for (int k = 0; k < nbits; k++)
-                cent[j] += f *
-                    sqrt (pca.eigenvalues [k]) *
-                    (((i >> k) & 1) ? 1 : -1) *
-                    pca.PCAMat [j + k * d];
+                cent[j] += f * sqrt(pca.eigenvalues[k]) *
+                        (((i >> k) & 1) ? 1 : -1) * pca.PCAMat[j + k * d];
         }
     }
 }
-void ProductQuantizer::train (int n, const float * x)
-{
+void ProductQuantizer::train(int n, const float* x) {
     if (train_type != Train_shared) {
         train_type_t final_train_type;
         final_train_type = train_type;
@@ -257,234 +254,229 @@ void ProductQuantizer::train (int n, const float * x)
             train_type == Train_hypercube_pca) {
             if (dsub < nbits) {
                 final_train_type = Train_default;
-                printf ("cannot train hypercube: nbits=%zd > log2(d=%zd)\n",
-                        nbits, dsub);
+                printf("cannot train hypercube: nbits=%zd > log2(d=%zd)\n",
+                       nbits,
+                       dsub);
             }
         }
-        float * xslice = new float[n * dsub];
-        ScopeDeleter<float> del (xslice);
+        float* xslice = new float[n * dsub];
+        ScopeDeleter<float> del(xslice);
         for (int m = 0; m < M; m++) {
             for (int j = 0; j < n; j++)
-                memcpy (xslice + j * dsub,
-                        x + j * d + m * dsub,
-                        dsub * sizeof(float));
+                memcpy(xslice + j * dsub,
+                       x + j * d + m * dsub,
+                       dsub * sizeof(float));
-            Clustering clus (dsub, ksub, cp);
+            Clustering clus(dsub, ksub, cp);
             // we have some initialization for the centroids
             if (final_train_type != Train_default) {
-                clus.centroids.resize (dsub * ksub);
+                clus.centroids.resize(dsub * ksub);
             }
             switch (final_train_type) {
-            case Train_hypercube:
-                init_hypercube (dsub, nbits, n, xslice,
-                                clus.centroids.data ());
-                break;
-            case  Train_hypercube_pca:
-                init_hypercube_pca (dsub, nbits, n, xslice,
-                                    clus.centroids.data ());
-                break;
-            case  Train_hot_start:
-                memcpy (clus.centroids.data(),
-                        get_centroids (m, 0),
-                        dsub * ksub * sizeof (float));
-                break;
-            default: ;
+                case Train_hypercube:
+                    init_hypercube(
+                            dsub, nbits, n, xslice, clus.centroids.data());
+                    break;
+                case Train_hypercube_pca:
+                    init_hypercube_pca(
+                            dsub, nbits, n, xslice, clus.centroids.data());
+                    break;
+                case Train_hot_start:
+                    memcpy(clus.centroids.data(),
+                           get_centroids(m, 0),
+                           dsub * ksub * sizeof(float));
+                    break;
+                default:;
             }
-            if(verbose) {
+            if (verbose) {
                 clus.verbose = true;
-                printf ("Training PQ slice %d/%zd\n", m, M);
+                printf("Training PQ slice %d/%zd\n", m, M);
             }
-            IndexFlatL2 index (dsub);
-            clus.train (n, xslice, assign_index ? *assign_index : index);
-            set_params (clus.centroids.data(), m);
+            IndexFlatL2 index(dsub);
+            clus.train(n, xslice, assign_index ? *assign_index : index);
+            set_params(clus.centroids.data(), m);
         }
     } else {
+        Clustering clus(dsub, ksub, cp);
-        Clustering clus (dsub, ksub, cp);
-        if(verbose) {
+        if (verbose) {
             clus.verbose = true;
-            printf ("Training all PQ slices at once\n");
+            printf("Training all PQ slices at once\n");
         }
-        IndexFlatL2 index (dsub);
+        IndexFlatL2 index(dsub);
-        clus.train (n * M, x, assign_index ? *assign_index : index);
+        clus.train(n * M, x, assign_index ? *assign_index : index);
         for (int m = 0; m < M; m++) {
-            set_params (clus.centroids.data(), m);
+            set_params(clus.centroids.data(), m);
         }
     }
 }
-template<class PQEncoder>
-void compute_code(const ProductQuantizer& pq, const float *x, uint8_t *code) {
-  std::vector<float> distances(pq.ksub);
-  PQEncoder encoder(code, pq.nbits);
-  for (size_t m = 0; m < pq.M; m++) {
-    float mindis = 1e20;
-    uint64_t idxm = 0;
-    const float * xsub = x + m * pq.dsub;
-    fvec_L2sqr_ny(distances.data(), xsub, pq.get_centroids(m, 0), pq.dsub, pq.ksub);
-    /* Find best centroid */
-    for (size_t i = 0; i < pq.ksub; i++) {
-      float dis = distances[i];
-      if (dis < mindis) {
-        mindis = dis;
-        idxm = i;
-      }
-    }
+template <class PQEncoder>
+void compute_code(const ProductQuantizer& pq, const float* x, uint8_t* code) {
+    std::vector<float> distances(pq.ksub);
+    PQEncoder encoder(code, pq.nbits);
+    for (size_t m = 0; m < pq.M; m++) {
+        float mindis = 1e20;
+        uint64_t idxm = 0;
+        const float* xsub = x + m * pq.dsub;
+        fvec_L2sqr_ny(
+                distances.data(),
+                xsub,
+                pq.get_centroids(m, 0),
+                pq.dsub,
+                pq.ksub);
+        /* Find best centroid */
+        for (size_t i = 0; i < pq.ksub; i++) {
+            float dis = distances[i];
+            if (dis < mindis) {
+                mindis = dis;
+                idxm = i;
+            }
+        }
-    encoder.encode(idxm);
-  }
+        encoder.encode(idxm);
+    }
 }
-void ProductQuantizer::compute_code(const float * x, uint8_t * code) const {
-  switch (nbits) {
-    case 8:
-      faiss::compute_code<PQEncoder8>(*this, x, code);
-      break;
+void ProductQuantizer::compute_code(const float* x, uint8_t* code) const {
+    switch (nbits) {
+        case 8:
+            faiss::compute_code<PQEncoder8>(*this, x, code);
+            break;
-    case 16:
-      faiss::compute_code<PQEncoder16>(*this, x, code);
-      break;
+        case 16:
+            faiss::compute_code<PQEncoder16>(*this, x, code);
+            break;
-    default:
-      faiss::compute_code<PQEncoderGeneric>(*this, x, code);
-      break;
-  }
+        default:
+            faiss::compute_code<PQEncoderGeneric>(*this, x, code);
+            break;
+    }
 }
-template<class PQDecoder>
-void decode(const ProductQuantizer& pq, const uint8_t *code, float *x)
-{
-  PQDecoder decoder(code, pq.nbits);
-  for (size_t m = 0; m < pq.M; m++) {
-    uint64_t c = decoder.decode();
-    memcpy(x + m * pq.dsub, pq.get_centroids(m, c), sizeof(float) * pq.dsub);
-  }
+template <class PQDecoder>
+void decode(const ProductQuantizer& pq, const uint8_t* code, float* x) {
+    PQDecoder decoder(code, pq.nbits);
+    for (size_t m = 0; m < pq.M; m++) {
+        uint64_t c = decoder.decode();
+        memcpy(x + m * pq.dsub,
+               pq.get_centroids(m, c),
+               sizeof(float) * pq.dsub);
+    }
 }
-void ProductQuantizer::decode (const uint8_t *code, float *x) const
-{
-  switch (nbits) {
-    case 8:
-      faiss::decode<PQDecoder8>(*this, code, x);
-      break;
-    case 16:
-      faiss::decode<PQDecoder16>(*this, code, x);
-      break;
-    default:
-      faiss::decode<PQDecoderGeneric>(*this, code, x);
-      break;
-  }
-}
+void ProductQuantizer::decode(const uint8_t* code, float* x) const {
+    switch (nbits) {
+        case 8:
+            faiss::decode<PQDecoder8>(*this, code, x);
+            break;
+        case 16:
+            faiss::decode<PQDecoder16>(*this, code, x);
+            break;
+        default:
+            faiss::decode<PQDecoderGeneric>(*this, code, x);
+            break;
+    }
+}
-void ProductQuantizer::decode (const uint8_t *code, float *x, size_t n) const
-{
+void ProductQuantizer::decode(const uint8_t* code, float* x, size_t n) const {
     for (size_t i = 0; i < n; i++) {
-        this->decode (code + code_size * i, x + d * i);
+        this->decode(code + code_size * i, x + d * i);
     }
 }
+void ProductQuantizer::compute_code_from_distance_table(
+        const float* tab,
+        uint8_t* code) const {
+    PQEncoderGeneric encoder(code, nbits);
+    for (size_t m = 0; m < M; m++) {
+        float mindis = 1e20;
+        uint64_t idxm = 0;
+        /* Find best centroid */
+        for (size_t j = 0; j < ksub; j++) {
+            float dis = *tab++;
+            if (dis < mindis) {
+                mindis = dis;
+                idxm = j;
+            }
+        }
-void ProductQuantizer::compute_code_from_distance_table (const float *tab,
-                                                         uint8_t *code) const
-{
-  PQEncoderGeneric encoder(code, nbits);
-  for (size_t m = 0; m < M; m++) {
-    float mindis = 1e20;
-    uint64_t idxm = 0;
-    /* Find best centroid */
-    for (size_t j = 0; j < ksub; j++) {
-      float dis = *tab++;
-      if (dis < mindis) {
-        mindis = dis;
-        idxm = j;
-      }
+        encoder.encode(idxm);
     }
-    encoder.encode(idxm);
-  }
 }
-void ProductQuantizer::compute_codes_with_assign_index (
-                const float * x,
-                uint8_t * codes,
-                size_t n)
-{
-    FAISS_THROW_IF_NOT (assign_index && assign_index->d == dsub);
+void ProductQuantizer::compute_codes_with_assign_index(
+        const float* x,
+        uint8_t* codes,
+        size_t n) {
+    FAISS_THROW_IF_NOT(assign_index && assign_index->d == dsub);
     for (size_t m = 0; m < M; m++) {
-        assign_index->reset ();
-        assign_index->add (ksub, get_centroids (m, 0));
+        assign_index->reset();
+        assign_index->add(ksub, get_centroids(m, 0));
         size_t bs = 65536;
-        float * xslice = new float[bs * dsub];
-        ScopeDeleter<float> del (xslice);
-        idx_t *assign = new idx_t[bs];
-        ScopeDeleter<idx_t> del2 (assign);
+        float* xslice = new float[bs * dsub];
+        ScopeDeleter<float> del(xslice);
+        idx_t* assign = new idx_t[bs];
+        ScopeDeleter<idx_t> del2(assign);
         for (size_t i0 = 0; i0 < n; i0 += bs) {
             size_t i1 = std::min(i0 + bs, n);
             for (size_t i = i0; i < i1; i++) {
-                memcpy (xslice + (i - i0) * dsub,
-                        x + i * d + m * dsub,
-                        dsub * sizeof(float));
+                memcpy(xslice + (i - i0) * dsub,
+                       x + i * d + m * dsub,
+                       dsub * sizeof(float));
             }
-            assign_index->assign (i1 - i0, xslice, assign);
+            assign_index->assign(i1 - i0, xslice, assign);
             if (nbits == 8) {
-              uint8_t *c = codes + code_size * i0 + m;
-              for (size_t i = i0; i < i1; i++) {
-                *c = assign[i - i0];
-                c += M;
-              }
+                uint8_t* c = codes + code_size * i0 + m;
+                for (size_t i = i0; i < i1; i++) {
+                    *c = assign[i - i0];
+                    c += M;
+                }
             } else if (nbits == 16) {
-              uint16_t *c = (uint16_t*)(codes + code_size * i0 + m * 2);
-              for (size_t i = i0; i < i1; i++) {
-                *c = assign[i - i0];
-                c += M;
-              }
+                uint16_t* c = (uint16_t*)(codes + code_size * i0 + m * 2);
+                for (size_t i = i0; i < i1; i++) {
+                    *c = assign[i - i0];
+                    c += M;
+                }
             } else {
-              for (size_t i = i0; i < i1; ++i) {
-                uint8_t *c = codes + code_size * i + ((m * nbits) / 8);
-                uint8_t offset = (m * nbits) % 8;
-                uint64_t ass = assign[i - i0];
-                PQEncoderGeneric encoder(c, nbits, offset);
-                encoder.encode(ass);
-              }
+                for (size_t i = i0; i < i1; ++i) {
+                    uint8_t* c = codes + code_size * i + ((m * nbits) / 8);
+                    uint8_t offset = (m * nbits) % 8;
+                    uint64_t ass = assign[i - i0];
+                    PQEncoderGeneric encoder(c, nbits, offset);
+                    encoder.encode(ass);
+                }
             }
         }
     }
 }
-void ProductQuantizer::compute_codes (const float * x,
-                                      uint8_t * codes,
-                                      size_t n)  const
-{
-  // process by blocks to avoid using too much RAM
+void ProductQuantizer::compute_codes(const float* x, uint8_t* codes, size_t n)
+        const {
+    // process by blocks to avoid using too much RAM
     size_t bs = 256 * 1024;
     if (n > bs) {
         for (size_t i0 = 0; i0 < n; i0 += bs) {
             size_t i1 = std::min(i0 + bs, n);
-            compute_codes (x + d * i0, codes + code_size * i0, i1 - i0);
+            compute_codes(x + d * i0, codes + code_size * i0, i1 - i0);
         }
         return;
     }
@@ -493,282 +485,300 @@ void ProductQuantizer::compute_codes (const float * x,
 #pragma omp parallel for
         for (int64_t i = 0; i < n; i++)
-            compute_code (x + i * d, codes + i * code_size);
+            compute_code(x + i * d, codes + i * code_size);
     } else { // worthwile to use BLAS
-        float *dis_tables = new float [n * ksub * M];
-        ScopeDeleter<float> del (dis_tables);
-        compute_distance_tables (n, x, dis_tables);
+        float* dis_tables = new float[n * ksub * M];
+        ScopeDeleter<float> del(dis_tables);
+        compute_distance_tables(n, x, dis_tables);
 #pragma omp parallel for
         for (int64_t i = 0; i < n; i++) {
-            uint8_t * code = codes + i * code_size;
-            const float * tab = dis_tables + i * ksub * M;
-            compute_code_from_distance_table (tab, code);
+            uint8_t* code = codes + i * code_size;
+            const float* tab = dis_tables + i * ksub * M;
+            compute_code_from_distance_table(tab, code);
         }
     }
 }
-void ProductQuantizer::compute_distance_table (const float * x,
-                                               float * dis_table) const
-{
+void ProductQuantizer::compute_distance_table(const float* x, float* dis_table)
+        const {
     size_t m;
     for (m = 0; m < M; m++) {
-        fvec_L2sqr_ny (dis_table + m * ksub,
-                       x + m * dsub,
-                       get_centroids(m, 0),
-                       dsub,
-                       ksub);
+        fvec_L2sqr_ny(
+                dis_table + m * ksub,
+                x + m * dsub,
+                get_centroids(m, 0),
+                dsub,
+                ksub);
     }
 }
-void ProductQuantizer::compute_inner_prod_table (const float * x,
-                                                 float * dis_table) const
-{
+void ProductQuantizer::compute_inner_prod_table(
+        const float* x,
+        float* dis_table) const {
     size_t m;
     for (m = 0; m < M; m++) {
-        fvec_inner_products_ny (dis_table + m * ksub,
-                                x + m * dsub,
-                                get_centroids(m, 0),
-                                dsub,
-                                ksub);
+        fvec_inner_products_ny(
+                dis_table + m * ksub,
+                x + m * dsub,
+                get_centroids(m, 0),
+                dsub,
+                ksub);
     }
 }
-void ProductQuantizer::compute_distance_tables (
-           size_t nx,
-           const float * x,
-           float * dis_tables) const
-{
-#ifdef __AVX2__
+void ProductQuantizer::compute_distance_tables(
+        size_t nx,
+        const float* x,
+        float* dis_tables) const {
+#if defined(__AVX2__) || defined(__aarch64__)
     if (dsub == 2 && nbits < 8) { // interesting for a narrow range of settings
         compute_PQ_dis_tables_dsub2(
-            d, ksub, centroids.data(),
-            nx, x, false, dis_tables
-        );
+                d, ksub, centroids.data(), nx, x, false, dis_tables);
     } else
 #endif
-    if (dsub < 16) {
+            if (dsub < 16) {
 #pragma omp parallel for
         for (int64_t i = 0; i < nx; i++) {
-            compute_distance_table (x + i * d, dis_tables + i * ksub * M);
+            compute_distance_table(x + i * d, dis_tables + i * ksub * M);
         }
     } else { // use BLAS
         for (int m = 0; m < M; m++) {
-            pairwise_L2sqr (dsub,
-                            nx, x + dsub * m,
-                            ksub, centroids.data() + m * dsub * ksub,
-                            dis_tables + ksub * m,
-                            d, dsub, ksub * M);
+            pairwise_L2sqr(
+                    dsub,
+                    nx,
+                    x + dsub * m,
+                    ksub,
+                    centroids.data() + m * dsub * ksub,
+                    dis_tables + ksub * m,
+                    d,
+                    dsub,
+                    ksub * M);
         }
     }
 }
-void ProductQuantizer::compute_inner_prod_tables (
-           size_t nx,
-           const float * x,
-           float * dis_tables) const
-{
-#ifdef __AVX2__
+void ProductQuantizer::compute_inner_prod_tables(
+        size_t nx,
+        const float* x,
+        float* dis_tables) const {
+#if defined(__AVX2__) || defined(__aarch64__)
     if (dsub == 2 && nbits < 8) {
         compute_PQ_dis_tables_dsub2(
-            d, ksub, centroids.data(),
-            nx, x, true, dis_tables
-        );
+                d, ksub, centroids.data(), nx, x, true, dis_tables);
     } else
 #endif
-    if (dsub < 16) {
+            if (dsub < 16) {
 #pragma omp parallel for
         for (int64_t i = 0; i < nx; i++) {
-            compute_inner_prod_table (x + i * d, dis_tables + i * ksub * M);
+            compute_inner_prod_table(x + i * d, dis_tables + i * ksub * M);
         }
     } else { // use BLAS
         // compute distance tables
         for (int m = 0; m < M; m++) {
-            FINTEGER ldc = ksub * M, nxi = nx, ksubi = ksub,
-                dsubi = dsub, di = d;
+            FINTEGER ldc = ksub * M, nxi = nx, ksubi = ksub, dsubi = dsub,
+                     di = d;
             float one = 1.0, zero = 0;
-            sgemm_ ("Transposed", "Not transposed",
-                    &ksubi, &nxi, &dsubi,
-                    &one, &centroids [m * dsub * ksub], &dsubi,
-                    x + dsub * m, &di,
-                    &zero, dis_tables + ksub * m, &ldc);
+            sgemm_("Transposed",
+                   "Not transposed",
+                   &ksubi,
+                   &nxi,
+                   &dsubi,
+                   &one,
+                   &centroids[m * dsub * ksub],
+                   &dsubi,
+                   x + dsub * m,
+                   &di,
+                   &zero,
+                   dis_tables + ksub * m,
+                   &ldc);
         }
     }
 }
 template <class C>
-static void pq_knn_search_with_tables (
-      const ProductQuantizer& pq,
-      size_t nbits,
-      const float *dis_tables,
-      const uint8_t * codes,
-      const size_t ncodes,
-      HeapArray<C> * res,
-      bool init_finalize_heap)
-{
+static void pq_knn_search_with_tables(
+        const ProductQuantizer& pq,
+        size_t nbits,
+        const float* dis_tables,
+        const uint8_t* codes,
+        const size_t ncodes,
+        HeapArray<C>* res,
+        bool init_finalize_heap) {
     size_t k = res->k, nx = res->nh;
     size_t ksub = pq.ksub, M = pq.M;
 #pragma omp parallel for
     for (int64_t i = 0; i < nx; i++) {
         /* query preparation for asymmetric search: compute look-up tables */
         const float* dis_table = dis_tables + i * ksub * M;
         /* Compute distances and keep smallest values */
-        int64_t * __restrict heap_ids = res->ids + i * k;
-        float * __restrict heap_dis = res->val + i * k;
+        int64_t* __restrict heap_ids = res->ids + i * k;
+        float* __restrict heap_dis = res->val + i * k;
         if (init_finalize_heap) {
-            heap_heapify<C> (k, heap_dis, heap_ids);
+            heap_heapify<C>(k, heap_dis, heap_ids);
         }
         switch (nbits) {
-          case 8:
-              pq_estimators_from_tables<uint8_t, C> (pq,
-                                                     codes, ncodes,
-                                                     dis_table,
-                                                     k, heap_dis, heap_ids);
-              break;
-          case 16:
-              pq_estimators_from_tables<uint16_t, C> (pq,
-                                                      (uint16_t*)codes, ncodes,
-                                                      dis_table,
-                                                      k, heap_dis, heap_ids);
-              break;
-          default:
-              pq_estimators_from_tables_generic<C> (pq,
-                                                    nbits,
-                                                    codes, ncodes,
-                                                    dis_table,
-                                                    k, heap_dis, heap_ids);
-              break;
+            case 8:
+                pq_estimators_from_tables<uint8_t, C>(
+                        pq, codes, ncodes, dis_table, k, heap_dis, heap_ids);
+                break;
+            case 16:
+                pq_estimators_from_tables<uint16_t, C>(
+                        pq,
+                        (uint16_t*)codes,
+                        ncodes,
+                        dis_table,
+                        k,
+                        heap_dis,
+                        heap_ids);
+                break;
+            default:
+                pq_estimators_from_tables_generic<C>(
+                        pq,
+                        nbits,
+                        codes,
+                        ncodes,
+                        dis_table,
+                        k,
+                        heap_dis,
+                        heap_ids);
+                break;
         }
         if (init_finalize_heap) {
-            heap_reorder<C> (k, heap_dis, heap_ids);
+            heap_reorder<C>(k, heap_dis, heap_ids);
         }
     }
 }
-void ProductQuantizer::search (const float * __restrict x,
-                               size_t nx,
-                               const uint8_t * codes,
-                               const size_t ncodes,
-                               float_maxheap_array_t * res,
-                               bool init_finalize_heap) const
-{
-    FAISS_THROW_IF_NOT (nx == res->nh);
-    std::unique_ptr<float[]> dis_tables(new float [nx * ksub * M]);
-    compute_distance_tables (nx, x, dis_tables.get());
-    pq_knn_search_with_tables<CMax<float, int64_t>> (
-      *this, nbits, dis_tables.get(), codes, ncodes, res, init_finalize_heap);
+void ProductQuantizer::search(
+        const float* __restrict x,
+        size_t nx,
+        const uint8_t* codes,
+        const size_t ncodes,
+        float_maxheap_array_t* res,
+        bool init_finalize_heap) const {
+    FAISS_THROW_IF_NOT(nx == res->nh);
+    std::unique_ptr<float[]> dis_tables(new float[nx * ksub * M]);
+    compute_distance_tables(nx, x, dis_tables.get());
+    pq_knn_search_with_tables<CMax<float, int64_t>>(
+            *this,
+            nbits,
+            dis_tables.get(),
+            codes,
+            ncodes,
+            res,
+            init_finalize_heap);
 }
-void ProductQuantizer::search_ip (const float * __restrict x,
-                               size_t nx,
-                               const uint8_t * codes,
-                               const size_t ncodes,
-                               float_minheap_array_t * res,
-                               bool init_finalize_heap) const
-{
-    FAISS_THROW_IF_NOT (nx == res->nh);
-    std::unique_ptr<float[]> dis_tables(new float [nx * ksub * M]);
-    compute_inner_prod_tables (nx, x, dis_tables.get());
-    pq_knn_search_with_tables<CMin<float, int64_t> > (
-      *this, nbits, dis_tables.get(), codes, ncodes, res, init_finalize_heap);
+void ProductQuantizer::search_ip(
+        const float* __restrict x,
+        size_t nx,
+        const uint8_t* codes,
+        const size_t ncodes,
+        float_minheap_array_t* res,
+        bool init_finalize_heap) const {
+    FAISS_THROW_IF_NOT(nx == res->nh);
+    std::unique_ptr<float[]> dis_tables(new float[nx * ksub * M]);
+    compute_inner_prod_tables(nx, x, dis_tables.get());
+    pq_knn_search_with_tables<CMin<float, int64_t>>(
+            *this,
+            nbits,
+            dis_tables.get(),
+            codes,
+            ncodes,
+            res,
+            init_finalize_heap);
 }
-static float sqr (float x) {
+static float sqr(float x) {
     return x * x;
 }
-void ProductQuantizer::compute_sdc_table ()
-{
-    sdc_table.resize (M * ksub * ksub);
-    for (int m = 0; m < M; m++) {
+void ProductQuantizer::compute_sdc_table() {
+    sdc_table.resize(M * ksub * ksub);
-        const float *cents = centroids.data() + m * ksub * dsub;
-        float * dis_tab = sdc_table.data() + m * ksub * ksub;
-        // TODO optimize with BLAS
-        for (int i = 0; i < ksub; i++) {
-            const float *centi = cents + i * dsub;
-            for (int j = 0; j < ksub; j++) {
-                float accu = 0;
-                const float *centj = cents + j * dsub;
-                for (int k = 0; k < dsub; k++)
-                    accu += sqr (centi[k] - centj[k]);
-                dis_tab [i + j * ksub] = accu;
-            }
+    if (dsub < 4) {
+#pragma omp parallel for
+        for (int mk = 0; mk < M * ksub; mk++) {
+            // allow omp to schedule in a more fine-grained way
+            // `collapse` is not supported in OpenMP 2.x
+            int m = mk / ksub;
+            int k = mk % ksub;
+            const float* cents = centroids.data() + m * ksub * dsub;
+            const float* centi = cents + k * dsub;
+            float* dis_tab = sdc_table.data() + m * ksub * ksub;
+            fvec_L2sqr_ny(dis_tab + k * ksub, centi, cents, dsub, ksub);
+        }
+    } else {
+        // NOTE: it would disable the omp loop in pairwise_L2sqr
+        // but still accelerate especially when M >= 4
+#pragma omp parallel for
+        for (int m = 0; m < M; m++) {
+            const float* cents = centroids.data() + m * ksub * dsub;
+            float* dis_tab = sdc_table.data() + m * ksub * ksub;
+            pairwise_L2sqr(
+                    dsub, ksub, cents, ksub, cents, dis_tab, dsub, dsub, ksub);
         }
     }
 }
-void ProductQuantizer::search_sdc (const uint8_t * qcodes,
-                     size_t nq,
-                     const uint8_t * bcodes,
-                     const size_t nb,
-                     float_maxheap_array_t * res,
-                     bool init_finalize_heap) const
-{
-    FAISS_THROW_IF_NOT (sdc_table.size() == M * ksub * ksub);
-    FAISS_THROW_IF_NOT (nbits == 8);
+void ProductQuantizer::search_sdc(
+        const uint8_t* qcodes,
+        size_t nq,
+        const uint8_t* bcodes,
+        const size_t nb,
+        float_maxheap_array_t* res,
+        bool init_finalize_heap) const {
+    FAISS_THROW_IF_NOT(sdc_table.size() == M * ksub * ksub);
+    FAISS_THROW_IF_NOT(nbits == 8);
     size_t k = res->k;
 #pragma omp parallel for
     for (int64_t i = 0; i < nq; i++) {
         /* Compute distances and keep smallest values */
-        idx_t * heap_ids = res->ids + i * k;
-        float *  heap_dis = res->val + i * k;
-        const uint8_t * qcode = qcodes + i * code_size;
+        idx_t* heap_ids = res->ids + i * k;
+        float* heap_dis = res->val + i * k;
+        const uint8_t* qcode = qcodes + i * code_size;
         if (init_finalize_heap)
-            maxheap_heapify (k, heap_dis, heap_ids);
+            maxheap_heapify(k, heap_dis, heap_ids);
-        const uint8_t * bcode = bcodes;
+        const uint8_t* bcode = bcodes;
         for (size_t j = 0; j < nb; j++) {
             float dis = 0;
-            const float * tab = sdc_table.data();
+            const float* tab = sdc_table.data();
             for (int m = 0; m < M; m++) {
                 dis += tab[bcode[m] + qcode[m] * ksub];
                 tab += ksub * ksub;
             }
             if (dis < heap_dis[0]) {
-                maxheap_replace_top (k, heap_dis, heap_ids, dis, j);
+                maxheap_replace_top(k, heap_dis, heap_ids, dis, j);
             }
             bcode += code_size;
         }
         if (init_finalize_heap)
-            maxheap_reorder (k, heap_dis, heap_ids);
+            maxheap_reorder(k, heap_dis, heap_ids);
     }
 }
-}  // namespace faiss
+} // namespace faiss