RubyGems - faiss - Versions diffs - 0.2.0 → 0.2.4 - Mend

faiss 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (215) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +16 -0
data/LICENSE.txt +1 -1
data/README.md +7 -7
data/ext/faiss/extconf.rb +6 -3
data/ext/faiss/numo.hpp +4 -4
data/ext/faiss/utils.cpp +1 -1
data/ext/faiss/utils.h +1 -1
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +292 -291
data/vendor/faiss/faiss/AutoTune.h +55 -56
data/vendor/faiss/faiss/Clustering.cpp +365 -194
data/vendor/faiss/faiss/Clustering.h +102 -35
data/vendor/faiss/faiss/IVFlib.cpp +171 -195
data/vendor/faiss/faiss/IVFlib.h +48 -51
data/vendor/faiss/faiss/Index.cpp +85 -103
data/vendor/faiss/faiss/Index.h +54 -48
data/vendor/faiss/faiss/Index2Layer.cpp +126 -224
data/vendor/faiss/faiss/Index2Layer.h +22 -36
data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +407 -0
data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +195 -0
data/vendor/faiss/faiss/IndexBinary.cpp +45 -37
data/vendor/faiss/faiss/IndexBinary.h +140 -132
data/vendor/faiss/faiss/IndexBinaryFlat.cpp +73 -53
data/vendor/faiss/faiss/IndexBinaryFlat.h +29 -24
data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +46 -43
data/vendor/faiss/faiss/IndexBinaryFromFloat.h +16 -15
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +215 -232
data/vendor/faiss/faiss/IndexBinaryHNSW.h +25 -24
data/vendor/faiss/faiss/IndexBinaryHash.cpp +182 -177
data/vendor/faiss/faiss/IndexBinaryHash.h +41 -34
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +489 -461
data/vendor/faiss/faiss/IndexBinaryIVF.h +97 -68
data/vendor/faiss/faiss/IndexFlat.cpp +115 -176
data/vendor/faiss/faiss/IndexFlat.h +42 -59
data/vendor/faiss/faiss/IndexFlatCodes.cpp +67 -0
data/vendor/faiss/faiss/IndexFlatCodes.h +47 -0
data/vendor/faiss/faiss/IndexHNSW.cpp +372 -348
data/vendor/faiss/faiss/IndexHNSW.h +57 -41
data/vendor/faiss/faiss/IndexIVF.cpp +545 -453
data/vendor/faiss/faiss/IndexIVF.h +169 -118
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +316 -0
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +121 -0
data/vendor/faiss/faiss/IndexIVFFlat.cpp +247 -252
data/vendor/faiss/faiss/IndexIVFFlat.h +48 -51
data/vendor/faiss/faiss/IndexIVFPQ.cpp +459 -517
data/vendor/faiss/faiss/IndexIVFPQ.h +75 -67
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +406 -372
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +82 -57
data/vendor/faiss/faiss/IndexIVFPQR.cpp +104 -102
data/vendor/faiss/faiss/IndexIVFPQR.h +33 -28
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +163 -150
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +38 -25
data/vendor/faiss/faiss/IndexLSH.cpp +66 -113
data/vendor/faiss/faiss/IndexLSH.h +20 -38
data/vendor/faiss/faiss/IndexLattice.cpp +42 -56
data/vendor/faiss/faiss/IndexLattice.h +11 -16
data/vendor/faiss/faiss/IndexNNDescent.cpp +229 -0
data/vendor/faiss/faiss/IndexNNDescent.h +72 -0
data/vendor/faiss/faiss/IndexNSG.cpp +301 -0
data/vendor/faiss/faiss/IndexNSG.h +85 -0
data/vendor/faiss/faiss/IndexPQ.cpp +387 -495
data/vendor/faiss/faiss/IndexPQ.h +64 -82
data/vendor/faiss/faiss/IndexPQFastScan.cpp +143 -170
data/vendor/faiss/faiss/IndexPQFastScan.h +46 -32
data/vendor/faiss/faiss/IndexPreTransform.cpp +120 -150
data/vendor/faiss/faiss/IndexPreTransform.h +33 -36
data/vendor/faiss/faiss/IndexRefine.cpp +139 -127
data/vendor/faiss/faiss/IndexRefine.h +32 -23
data/vendor/faiss/faiss/IndexReplicas.cpp +147 -153
data/vendor/faiss/faiss/IndexReplicas.h +62 -56
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +111 -172
data/vendor/faiss/faiss/IndexScalarQuantizer.h +41 -59
data/vendor/faiss/faiss/IndexShards.cpp +256 -240
data/vendor/faiss/faiss/IndexShards.h +85 -73
data/vendor/faiss/faiss/MatrixStats.cpp +112 -97
data/vendor/faiss/faiss/MatrixStats.h +7 -10
data/vendor/faiss/faiss/MetaIndexes.cpp +135 -157
data/vendor/faiss/faiss/MetaIndexes.h +40 -34
data/vendor/faiss/faiss/MetricType.h +7 -7
data/vendor/faiss/faiss/VectorTransform.cpp +654 -475
data/vendor/faiss/faiss/VectorTransform.h +64 -89
data/vendor/faiss/faiss/clone_index.cpp +78 -73
data/vendor/faiss/faiss/clone_index.h +4 -9
data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +33 -38
data/vendor/faiss/faiss/gpu/GpuAutoTune.h +11 -9
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +198 -171
data/vendor/faiss/faiss/gpu/GpuCloner.h +53 -35
data/vendor/faiss/faiss/gpu/GpuClonerOptions.cpp +12 -14
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +27 -25
data/vendor/faiss/faiss/gpu/GpuDistance.h +116 -112
data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -2
data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +60 -0
data/vendor/faiss/faiss/gpu/GpuIndex.h +134 -137
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +76 -73
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +173 -162
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +67 -64
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +89 -86
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +150 -141
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +101 -103
data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +17 -16
data/vendor/faiss/faiss/gpu/GpuResources.cpp +116 -128
data/vendor/faiss/faiss/gpu/GpuResources.h +182 -186
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +433 -422
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +131 -130
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +468 -456
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +25 -19
data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +22 -20
data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +9 -8
data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +39 -44
data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +16 -14
data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +77 -71
data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +109 -88
data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +75 -64
data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +230 -215
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +80 -86
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +284 -277
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +416 -416
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +611 -517
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +166 -164
data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +61 -53
data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +274 -238
data/vendor/faiss/faiss/gpu/test/TestUtils.h +73 -57
data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +47 -50
data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +79 -72
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +140 -146
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +69 -71
data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +21 -16
data/vendor/faiss/faiss/gpu/utils/Timer.cpp +25 -29
data/vendor/faiss/faiss/gpu/utils/Timer.h +30 -29
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +503 -0
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +175 -0
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +90 -120
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +81 -65
data/vendor/faiss/faiss/impl/FaissAssert.h +73 -58
data/vendor/faiss/faiss/impl/FaissException.cpp +56 -48
data/vendor/faiss/faiss/impl/FaissException.h +41 -29
data/vendor/faiss/faiss/impl/HNSW.cpp +606 -617
data/vendor/faiss/faiss/impl/HNSW.h +179 -200
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +855 -0
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +244 -0
data/vendor/faiss/faiss/impl/NNDescent.cpp +487 -0
data/vendor/faiss/faiss/impl/NNDescent.h +154 -0
data/vendor/faiss/faiss/impl/NSG.cpp +679 -0
data/vendor/faiss/faiss/impl/NSG.h +199 -0
data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +484 -454
data/vendor/faiss/faiss/impl/PolysemousTraining.h +52 -55
data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +26 -47
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +469 -459
data/vendor/faiss/faiss/impl/ProductQuantizer.h +76 -87
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +758 -0
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +188 -0
data/vendor/faiss/faiss/impl/ResultHandler.h +96 -132
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +647 -707
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +48 -46
data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +129 -131
data/vendor/faiss/faiss/impl/ThreadedIndex.h +61 -55
data/vendor/faiss/faiss/impl/index_read.cpp +631 -480
data/vendor/faiss/faiss/impl/index_write.cpp +547 -407
data/vendor/faiss/faiss/impl/io.cpp +76 -95
data/vendor/faiss/faiss/impl/io.h +31 -41
data/vendor/faiss/faiss/impl/io_macros.h +60 -29
data/vendor/faiss/faiss/impl/kmeans1d.cpp +301 -0
data/vendor/faiss/faiss/impl/kmeans1d.h +48 -0
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +137 -186
data/vendor/faiss/faiss/impl/lattice_Zn.h +40 -51
data/vendor/faiss/faiss/impl/platform_macros.h +29 -8
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +77 -124
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +39 -48
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +41 -52
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +80 -117
data/vendor/faiss/faiss/impl/simd_result_handlers.h +109 -137
data/vendor/faiss/faiss/index_factory.cpp +619 -397
data/vendor/faiss/faiss/index_factory.h +8 -6
data/vendor/faiss/faiss/index_io.h +23 -26
data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +67 -75
data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +22 -24
data/vendor/faiss/faiss/invlists/DirectMap.cpp +96 -112
data/vendor/faiss/faiss/invlists/DirectMap.h +29 -33
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +307 -364
data/vendor/faiss/faiss/invlists/InvertedLists.h +151 -151
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +29 -34
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +17 -18
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +257 -293
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +50 -45
data/vendor/faiss/faiss/python/python_callbacks.cpp +23 -26
data/vendor/faiss/faiss/python/python_callbacks.h +9 -16
data/vendor/faiss/faiss/utils/AlignedTable.h +79 -44
data/vendor/faiss/faiss/utils/Heap.cpp +40 -48
data/vendor/faiss/faiss/utils/Heap.h +186 -209
data/vendor/faiss/faiss/utils/WorkerThread.cpp +67 -76
data/vendor/faiss/faiss/utils/WorkerThread.h +32 -33
data/vendor/faiss/faiss/utils/distances.cpp +305 -312
data/vendor/faiss/faiss/utils/distances.h +170 -122
data/vendor/faiss/faiss/utils/distances_simd.cpp +498 -508
data/vendor/faiss/faiss/utils/extra_distances-inl.h +117 -0
data/vendor/faiss/faiss/utils/extra_distances.cpp +113 -232
data/vendor/faiss/faiss/utils/extra_distances.h +30 -29
data/vendor/faiss/faiss/utils/hamming-inl.h +260 -209
data/vendor/faiss/faiss/utils/hamming.cpp +375 -469
data/vendor/faiss/faiss/utils/hamming.h +62 -85
data/vendor/faiss/faiss/utils/ordered_key_value.h +16 -18
data/vendor/faiss/faiss/utils/partitioning.cpp +393 -318
data/vendor/faiss/faiss/utils/partitioning.h +26 -21
data/vendor/faiss/faiss/utils/quantize_lut.cpp +78 -66
data/vendor/faiss/faiss/utils/quantize_lut.h +22 -20
data/vendor/faiss/faiss/utils/random.cpp +39 -63
data/vendor/faiss/faiss/utils/random.h +13 -16
data/vendor/faiss/faiss/utils/simdlib.h +4 -2
data/vendor/faiss/faiss/utils/simdlib_avx2.h +88 -85
data/vendor/faiss/faiss/utils/simdlib_emulated.h +226 -165
data/vendor/faiss/faiss/utils/simdlib_neon.h +832 -0
data/vendor/faiss/faiss/utils/utils.cpp +304 -287
data/vendor/faiss/faiss/utils/utils.h +54 -49
metadata +29 -4

data/vendor/faiss/faiss/utils/partitioning.cpp CHANGED Viewed

@@ -7,8 +7,8 @@
 #include <faiss/utils/partitioning.h>
-#include <cmath>
 #include <cassert>
+#include <cmath>
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/utils/AlignedTable.h>
@@ -19,15 +19,13 @@
 namespace faiss {
 /******************************************************************
  * Internal routines
  ******************************************************************/
 namespace partitioning {
-template<typename T>
+template <typename T>
 T median3(T a, T b, T c) {
     if (a > b) {
         std::swap(a, b);
@@ -41,12 +39,12 @@ T median3(T a, T b, T c) {
     return a;
 }
-template<class C>
+template <class C>
 typename C::T sample_threshold_median3(
-    const typename C::T * vals, int n,
-    typename C::T thresh_inf, typename C::T thresh_sup
-) {
+        const typename C::T* vals,
+        int n,
+        typename C::T thresh_inf,
+        typename C::T thresh_sup) {
     using T = typename C::T;
     size_t big_prime = 6700417;
     T val3[3];
@@ -73,31 +71,34 @@ typename C::T sample_threshold_median3(
     }
 }
-template<class C>
+template <class C>
 void count_lt_and_eq(
-    const typename C::T * vals, size_t n, typename C::T thresh,
-    size_t & n_lt, size_t & n_eq
-) {
+        const typename C::T* vals,
+        size_t n,
+        typename C::T thresh,
+        size_t& n_lt,
+        size_t& n_eq) {
     n_lt = n_eq = 0;
-    for(size_t i = 0; i < n; i++) {
+    for (size_t i = 0; i < n; i++) {
         typename C::T v = *vals++;
-        if(C::cmp(thresh, v)) {
+        if (C::cmp(thresh, v)) {
             n_lt++;
-        } else if(v == thresh) {
+        } else if (v == thresh) {
             n_eq++;
         }
     }
 }
-template<class C>
+template <class C>
 size_t compress_array(
-    typename C::T *vals, typename C::TI * ids,
-    size_t n, typename C::T thresh, size_t n_eq
-) {
+        typename C::T* vals,
+        typename C::TI* ids,
+        size_t n,
+        typename C::T thresh,
+        size_t n_eq) {
     size_t wp = 0;
-    for(size_t i = 0; i < n; i++) {
+    for (size_t i = 0; i < n; i++) {
         if (C::cmp(thresh, vals[i])) {
             vals[wp] = vals[i];
             ids[wp] = ids[i];
@@ -113,15 +114,16 @@ size_t compress_array(
     return wp;
 }
+#define IFV if (false)
-#define IFV if(false)
-template<class C>
+template <class C>
 typename C::T partition_fuzzy_median3(
-    typename C::T *vals, typename C::TI * ids, size_t n,
-    size_t q_min, size_t q_max, size_t * q_out)
-{
+        typename C::T* vals,
+        typename C::TI* ids,
+        size_t n,
+        size_t q_min,
+        size_t q_max,
+        size_t* q_out) {
     if (q_min == 0) {
         if (q_out) {
             *q_out = C::Crev::neutral();
@@ -150,12 +152,19 @@ typename C::T partition_fuzzy_median3(
     size_t n_eq = 0, n_lt = 0;
     size_t q = 0;
-    for(int it = 0; it < 200; it++) {
+    for (int it = 0; it < 200; it++) {
         count_lt_and_eq<C>(vals, n, thresh, n_lt, n_eq);
-        IFV  printf("   thresh=%g [%g %g] n_lt=%ld n_eq=%ld, q=%ld:%ld/%ld\n",
-            float(thresh), float(thresh_inf), float(thresh_sup),
-            long(n_lt), long(n_eq), long(q_min), long(q_max), long(n));
+        IFV printf(
+                "   thresh=%g [%g %g] n_lt=%ld n_eq=%ld, q=%ld:%ld/%ld\n",
+                float(thresh),
+                float(thresh_inf),
+                float(thresh_sup),
+                long(n_lt),
+                long(n_eq),
+                long(q_min),
+                long(q_max),
+                long(n));
         if (n_lt <= q_min) {
             if (n_lt + n_eq >= q_min) {
@@ -172,8 +181,12 @@ typename C::T partition_fuzzy_median3(
         }
         // FIXME avoid a second pass over the array to sample the threshold
-        IFV  printf("     sample thresh in [%g %g]\n", float(thresh_inf), float(thresh_sup));
-        T new_thresh = sample_threshold_median3<C>(vals, n, thresh_inf, thresh_sup);
+        IFV printf(
+                "     sample thresh in [%g %g]\n",
+                float(thresh_inf),
+                float(thresh_sup));
+        T new_thresh =
+                sample_threshold_median3<C>(vals, n, thresh_inf, thresh_sup);
         if (new_thresh == thresh_inf) {
             // then there is nothing between thresh_inf and thresh_sup
             break;
@@ -203,25 +216,19 @@ typename C::T partition_fuzzy_median3(
     return thresh;
 }
 } // namespace partitioning
 /******************************************************************
  * SIMD routines when vals is an aligned array of uint16_t
  ******************************************************************/
 namespace simd_partitioning {
 void find_minimax(
-        const uint16_t * vals, size_t n,
-        uint16_t & smin, uint16_t & smax
-) {
+        const uint16_t* vals,
+        size_t n,
+        uint16_t& smin,
+        uint16_t& smax) {
     simd16uint16 vmin(0xffff), vmax(0);
     for (size_t i = 0; i + 15 < n; i += 16) {
         simd16uint16 v(vals + i);
@@ -235,22 +242,20 @@ void find_minimax(
     smin = tab32[0], smax = tab32[16];
-    for(int i = 1; i < 16; i++) {
+    for (int i = 1; i < 16; i++) {
         smin = std::min(smin, tab32[i]);
         smax = std::max(smax, tab32[i + 16]);
     }
     // missing values
-    for(size_t i = (n & ~15); i < n; i++) {
+    for (size_t i = (n & ~15); i < n; i++) {
         smin = std::min(smin, vals[i]);
         smax = std::max(smax, vals[i]);
     }
 }
 // max func differentiates between CMin and CMax (keep lowest or largest)
-template<class C>
+template <class C>
 simd16uint16 max_func(simd16uint16 v, simd16uint16 thr16) {
     constexpr bool is_max = C::is_max;
     if (is_max) {
@@ -260,11 +265,13 @@ simd16uint16 max_func(simd16uint16 v, simd16uint16 thr16) {
     }
 }
-template<class C>
+template <class C>
 void count_lt_and_eq(
-    const uint16_t * vals, int n, uint16_t thresh,
-    size_t & n_lt, size_t & n_eq
-) {
+        const uint16_t* vals,
+        int n,
+        uint16_t thresh,
+        size_t& n_lt,
+        size_t& n_eq) {
     n_lt = n_eq = 0;
     simd16uint16 thr16(thresh);
@@ -283,24 +290,25 @@ void count_lt_and_eq(
         n_lt += 16 - i_ge;
     }
-    for(size_t i = n1 * 16; i < n; i++) {
+    for (size_t i = n1 * 16; i < n; i++) {
         uint16_t v = *vals++;
-        if(C::cmp(thresh, v)) {
+        if (C::cmp(thresh, v)) {
             n_lt++;
-        } else if(v == thresh) {
+        } else if (v == thresh) {
             n_eq++;
         }
     }
 }
 /* compress separated values and ids table, keeping all values < thresh and at
  * most n_eq equal values */
-template<class C>
+template <class C>
 int simd_compress_array(
-    uint16_t *vals, typename C::TI * ids, size_t n, uint16_t thresh, int n_eq
-) {
+        uint16_t* vals,
+        typename C::TI* ids,
+        size_t n,
+        uint16_t thresh,
+        int n_eq) {
     simd16uint16 thr16(thresh);
     simd16uint16 mixmask(0xff00);
@@ -313,13 +321,15 @@ int simd_compress_array(
         simd16uint16 max2 = max_func<C>(v, thr16);
         simd16uint16 gemask = (v == max2);
         simd16uint16 eqmask = (v == thr16);
-        uint32_t bits = get_MSBs(blendv(
-            simd32uint8(eqmask), simd32uint8(gemask), simd32uint8(mixmask)));
+        uint32_t bits = get_MSBs(
+                blendv(simd32uint8(eqmask),
+                       simd32uint8(gemask),
+                       simd32uint8(mixmask)));
         bits ^= 0xAAAAAAAA;
         // bit 2*i     : eq
         // bit 2*i + 1 : lt
-        while(bits) {
+        while (bits) {
             int j = __builtin_ctz(bits) & (~1);
             bool is_eq = (bits >> j) & 1;
             bool is_lt = (bits >> j) & 2;
@@ -330,7 +340,7 @@ int simd_compress_array(
                 vals[wp] = vals[i0 + j];
                 ids[wp] = ids[i0 + j];
                 wp++;
-            } else if(is_eq && n_eq > 0) {
+            } else if (is_eq && n_eq > 0) {
                 vals[wp] = vals[i0 + j];
                 ids[wp] = ids[i0 + j];
                 wp++;
@@ -346,7 +356,7 @@ int simd_compress_array(
         simd16uint16 gemask = (v == max2);
         uint32_t bits = ~get_MSBs(simd32uint8(gemask));
-        while(bits) {
+        while (bits) {
             int j = __builtin_ctz(bits);
             bits &= ~(3 << j);
             j >>= 1;
@@ -358,7 +368,7 @@ int simd_compress_array(
     }
     // end with scalar
-    for(int i = (n & ~15); i < n; i++) {
+    for (int i = (n & ~15); i < n; i++) {
         if (C::cmp(thresh, vals[i])) {
             vals[wp] = vals[i];
             ids[wp] = ids[i];
@@ -376,29 +386,28 @@ int simd_compress_array(
 // #define MICRO_BENCHMARK
-static uint64_t get_cy () {
-#ifdef  MICRO_BENCHMARK
+static uint64_t get_cy() {
+#ifdef MICRO_BENCHMARK
     uint32_t high, low;
-    asm volatile("rdtsc \n\t"
-                 : "=a" (low),
-                   "=d" (high));
+    asm volatile("rdtsc \n\t" : "=a"(low), "=d"(high));
     return ((uint64_t)high << 32) | (low);
 #else
     return 0;
 #endif
 }
+#define IFV if (false)
-#define IFV if(false)
-template<class C>
+template <class C>
 uint16_t simd_partition_fuzzy_with_bounds(
-    uint16_t *vals, typename C::TI * ids, size_t n,
-    size_t q_min, size_t q_max, size_t * q_out,
-    uint16_t s0i, uint16_t s1i)
-{
+        uint16_t* vals,
+        typename C::TI* ids,
+        size_t n,
+        size_t q_min,
+        size_t q_max,
+        size_t* q_out,
+        uint16_t s0i,
+        uint16_t s1i) {
     if (q_min == 0) {
         if (q_out) {
             *q_out = 0;
@@ -428,13 +437,21 @@ uint16_t simd_partition_fuzzy_with_bounds(
     size_t n_eq = 0, n_lt = 0;
     size_t q = 0;
-    for(int it = 0; it < 200; it++) {
+    for (int it = 0; it < 200; it++) {
         // while(s0 + 1 < s1) {
         thresh = (s0 + s1) / 2;
         count_lt_and_eq<C>(vals, n, thresh, n_lt, n_eq);
-        IFV  printf("   [%ld %ld] thresh=%d n_lt=%ld n_eq=%ld, q=%ld:%ld/%ld\n",
-            s0, s1, thresh, n_lt, n_eq, q_min, q_max, n);
+        IFV printf(
+                "   [%ld %ld] thresh=%d n_lt=%ld n_eq=%ld, q=%ld:%ld/%ld\n",
+                s0,
+                s1,
+                thresh,
+                n_lt,
+                n_eq,
+                q_min,
+                q_max,
+                n);
         if (n_lt <= q_min) {
             if (n_lt + n_eq >= q_min) {
                 q = q_min;
@@ -456,7 +473,6 @@ uint16_t simd_partition_fuzzy_with_bounds(
                 s0 = thresh;
             }
         }
     }
     uint64_t t1 = get_cy();
@@ -495,14 +511,16 @@ uint16_t simd_partition_fuzzy_with_bounds(
     return thresh;
 }
-template<class C>
+template <class C>
 uint16_t simd_partition_fuzzy_with_bounds_histogram(
-    uint16_t *vals, typename C::TI * ids, size_t n,
-    size_t q_min, size_t q_max, size_t * q_out,
-    uint16_t s0i, uint16_t s1i)
-{
+        uint16_t* vals,
+        typename C::TI* ids,
+        size_t n,
+        size_t q_min,
+        size_t q_max,
+        size_t* q_out,
+        uint16_t s0i,
+        uint16_t s1i) {
     if (q_min == 0) {
         if (q_out) {
             *q_out = 0;
@@ -522,11 +540,17 @@ uint16_t simd_partition_fuzzy_with_bounds_histogram(
         return s0i;
     }
-    IFV printf("partition fuzzy, q=%ld:%ld / %ld, bounds=%d %d\n",
-        q_min, q_max, n, s0i, s1i);
+    IFV printf(
+            "partition fuzzy, q=%ld:%ld / %ld, bounds=%d %d\n",
+            q_min,
+            q_max,
+            n,
+            s0i,
+            s1i);
     if (!C::is_max) {
-        IFV printf("revert due to CMin, q_min:q_max -> %ld:%ld\n", q_min, q_max);
+        IFV printf(
+                "revert due to CMin, q_min:q_max -> %ld:%ld\n", q_min, q_max);
         q_min = n - q_min;
         q_max = n - q_max;
     }
@@ -537,31 +561,39 @@ uint16_t simd_partition_fuzzy_with_bounds_histogram(
     size_t n_lt = 0, n_gt = 0;
     // output of loop:
-    int thresh; // final threshold
-    uint64_t tot_eq = 0;   // total nb of equal values
-    uint64_t n_eq = 0;     // nb of equal values to keep
-    size_t q;  // final quantile
+    int thresh;          // final threshold
+    uint64_t tot_eq = 0; // total nb of equal values
+    uint64_t n_eq = 0;   // nb of equal values to keep
+    size_t q;            // final quantile
     // buffer for the histograms
     int hist[16];
-    for(int it = 0; it < 20; it++) {
+    for (int it = 0; it < 20; it++) {
         // otherwise we would be done already
         int shift = 0;
-        IFV printf("  it %d bounds: %d %d n_lt=%ld n_gt=%ld\n",
-                it, s0, s1, n_lt, n_gt);
+        IFV printf(
+                "  it %d bounds: %d %d n_lt=%ld n_gt=%ld\n",
+                it,
+                s0,
+                s1,
+                n_lt,
+                n_gt);
         int maxval = s1 - s0;
-        while(maxval > 15) {
+        while (maxval > 15) {
             shift++;
             maxval >>= 1;
         }
-        IFV printf("    histogram shift %d maxval %d ?= %d\n",
-                shift, maxval, int((s1 - s0) >> shift));
+        IFV printf(
+                "    histogram shift %d maxval %d ?= %d\n",
+                shift,
+                maxval,
+                int((s1 - s0) >> shift));
         if (maxval > 7) {
             simd_histogram_16(vals, n, s0, shift, hist);
@@ -571,7 +603,7 @@ uint16_t simd_partition_fuzzy_with_bounds_histogram(
         IFV {
             int sum = n_lt + n_gt;
             printf("    n_lt=%ld hist=[", n_lt);
-            for(int i = 0; i <= maxval; i++) {
+            for (int i = 0; i <= maxval; i++) {
                 printf("%d ", hist[i]);
                 sum += hist[i];
             }
@@ -597,7 +629,12 @@ uint16_t simd_partition_fuzzy_with_bounds_histogram(
             assert(!"not implemented");
         }
-        IFV printf("    new bin: s0=%d s1=%d n_lt=%ld n_gt=%ld\n", s0, s1, n_lt, n_gt);
+        IFV printf(
+                "    new bin: s0=%d s1=%d n_lt=%ld n_gt=%ld\n",
+                s0,
+                s1,
+                n_lt,
+                n_gt);
         if (s1 > s0) {
             if (n_lt >= q_min && q_max >= n_lt) {
@@ -628,7 +665,7 @@ uint16_t simd_partition_fuzzy_with_bounds_histogram(
     if (!C::is_max) {
         if (n_eq == 0) {
-            thresh --;
+            thresh--;
         } else {
             // thresh unchanged
             n_eq = tot_eq - n_eq;
@@ -647,14 +684,14 @@ uint16_t simd_partition_fuzzy_with_bounds_histogram(
     return thresh;
 }
-template<class C>
+template <class C>
 uint16_t simd_partition_fuzzy(
-    uint16_t *vals, typename C::TI * ids, size_t n,
-    size_t q_min, size_t q_max, size_t * q_out
-) {
+        uint16_t* vals,
+        typename C::TI* ids,
+        size_t n,
+        size_t q_min,
+        size_t q_max,
+        size_t* q_out) {
     assert(is_aligned_pointer(vals));
     uint16_t s0i, s1i;
@@ -662,14 +699,15 @@ uint16_t simd_partition_fuzzy(
     // QSelect_stats.t0 += get_cy() - t0;
     return simd_partition_fuzzy_with_bounds<C>(
-        vals, ids, n, q_min, q_max, q_out, s0i, s1i);
+            vals, ids, n, q_min, q_max, q_out, s0i, s1i);
 }
-template<class C>
-uint16_t simd_partition(uint16_t *vals, typename C::TI * ids, size_t n, size_t q) {
+template <class C>
+uint16_t simd_partition(
+        uint16_t* vals,
+        typename C::TI* ids,
+        size_t n,
+        size_t q) {
     assert(is_aligned_pointer(vals));
     if (q == 0) {
@@ -683,72 +721,97 @@ uint16_t simd_partition(uint16_t *vals, typename C::TI * ids, size_t n, size_t q
     find_minimax(vals, n, s0i, s1i);
     return simd_partition_fuzzy_with_bounds<C>(
-        vals, ids, n, q, q, nullptr, s0i, s1i);
+            vals, ids, n, q, q, nullptr, s0i, s1i);
 }
-template<class C>
+template <class C>
 uint16_t simd_partition_with_bounds(
-    uint16_t *vals, typename C::TI * ids, size_t n, size_t q,
-    uint16_t s0i, uint16_t s1i)
-{
+        uint16_t* vals,
+        typename C::TI* ids,
+        size_t n,
+        size_t q,
+        uint16_t s0i,
+        uint16_t s1i) {
     return simd_partition_fuzzy_with_bounds<C>(
-        vals, ids, n, q, q, nullptr, s0i, s1i);
+            vals, ids, n, q, q, nullptr, s0i, s1i);
 }
 } // namespace simd_partitioning
 /******************************************************************
  * Driver routine
  ******************************************************************/
-template<class C>
+template <class C>
 typename C::T partition_fuzzy(
-    typename C::T *vals, typename C::TI * ids, size_t n,
-    size_t q_min, size_t q_max, size_t * q_out)
-{
+        typename C::T* vals,
+        typename C::TI* ids,
+        size_t n,
+        size_t q_min,
+        size_t q_max,
+        size_t* q_out) {
     // the code below compiles and runs without AVX2 but it's slower than
     // the scalar implementation
 #ifdef __AVX2__
     constexpr bool is_uint16 = std::is_same<typename C::T, uint16_t>::value;
     if (is_uint16 && is_aligned_pointer(vals)) {
         return simd_partitioning::simd_partition_fuzzy<C>(
-            (uint16_t*)vals, ids, n, q_min, q_max, q_out);
+                (uint16_t*)vals, ids, n, q_min, q_max, q_out);
     }
 #endif
     return partitioning::partition_fuzzy_median3<C>(
-        vals, ids, n, q_min, q_max, q_out);
+            vals, ids, n, q_min, q_max, q_out);
 }
 // explicit template instantiations
-template float partition_fuzzy<CMin<float, int64_t>> (
-    float *vals, int64_t * ids, size_t n,
-    size_t q_min, size_t q_max, size_t * q_out);
-template float partition_fuzzy<CMax<float, int64_t>> (
-    float *vals, int64_t * ids, size_t n,
-    size_t q_min, size_t q_max, size_t * q_out);
-template uint16_t partition_fuzzy<CMin<uint16_t, int64_t>> (
-    uint16_t *vals, int64_t * ids, size_t n,
-    size_t q_min, size_t q_max, size_t * q_out);
-template uint16_t partition_fuzzy<CMax<uint16_t, int64_t>> (
-    uint16_t *vals, int64_t * ids, size_t n,
-    size_t q_min, size_t q_max, size_t * q_out);
-template uint16_t partition_fuzzy<CMin<uint16_t, int>> (
-    uint16_t *vals, int * ids, size_t n,
-    size_t q_min, size_t q_max, size_t * q_out);
-template uint16_t partition_fuzzy<CMax<uint16_t, int>> (
-    uint16_t *vals, int * ids, size_t n,
-    size_t q_min, size_t q_max, size_t * q_out);
+template float partition_fuzzy<CMin<float, int64_t>>(
+        float* vals,
+        int64_t* ids,
+        size_t n,
+        size_t q_min,
+        size_t q_max,
+        size_t* q_out);
+template float partition_fuzzy<CMax<float, int64_t>>(
+        float* vals,
+        int64_t* ids,
+        size_t n,
+        size_t q_min,
+        size_t q_max,
+        size_t* q_out);
+template uint16_t partition_fuzzy<CMin<uint16_t, int64_t>>(
+        uint16_t* vals,
+        int64_t* ids,
+        size_t n,
+        size_t q_min,
+        size_t q_max,
+        size_t* q_out);
+template uint16_t partition_fuzzy<CMax<uint16_t, int64_t>>(
+        uint16_t* vals,
+        int64_t* ids,
+        size_t n,
+        size_t q_min,
+        size_t q_max,
+        size_t* q_out);
+template uint16_t partition_fuzzy<CMin<uint16_t, int>>(
+        uint16_t* vals,
+        int* ids,
+        size_t n,
+        size_t q_min,
+        size_t q_max,
+        size_t* q_out);
+template uint16_t partition_fuzzy<CMax<uint16_t, int>>(
+        uint16_t* vals,
+        int* ids,
+        size_t n,
+        size_t q_min,
+        size_t q_max,
+        size_t* q_out);
 /******************************************************************
  * Histogram subroutines
@@ -758,7 +821,7 @@ template uint16_t partition_fuzzy<CMax<uint16_t, int>> (
 /// FIXME when MSB of uint16 is set
 // this code does not compile properly with GCC 7.4.0
-namespace  {
+namespace {
 /************************************************************
  * 8 bins
@@ -773,7 +836,6 @@ simd32uint8 accu4to8(simd16uint16 a4) {
     return simd32uint8(_mm256_hadd_epi16(a8_0.i, a8_1.i));
 }
 simd16uint16 accu8to16(simd32uint8 a8) {
     simd16uint16 mask8(0x00ff);
@@ -783,27 +845,53 @@ simd16uint16 accu8to16(simd32uint8 a8) {
     return simd16uint16(_mm256_hadd_epi16(a8_0.i, a8_1.i));
 }
 static const simd32uint8 shifts(_mm256_setr_epi8(
-    1, 16, 0, 0,  4, 64, 0, 0,
-    0, 0, 1, 16,  0, 0, 4, 64,
-    1, 16, 0, 0,  4, 64, 0, 0,
-    0, 0, 1, 16,  0, 0, 4, 64
-));
+        1,
+        16,
+        0,
+        0,
+        4,
+        64,
+        0,
+        0,
+        0,
+        0,
+        1,
+        16,
+        0,
+        0,
+        4,
+        64,
+        1,
+        16,
+        0,
+        0,
+        4,
+        64,
+        0,
+        0,
+        0,
+        0,
+        1,
+        16,
+        0,
+        0,
+        4,
+        64));
 // 2-bit accumulator: we can add only up to 3 elements
 // on output we return 2*4-bit results
 // preproc returns either an index in 0..7 or 0xffff
 // that yields a 0 when used in the table look-up
-template<int N, class Preproc>
+template <int N, class Preproc>
 void compute_accu2(
-        const uint16_t * & data,
-        Preproc & pp,
-        simd16uint16 & a4lo, simd16uint16 & a4hi
-) {
+        const uint16_t*& data,
+        Preproc& pp,
+        simd16uint16& a4lo,
+        simd16uint16& a4hi) {
     simd16uint16 mask2(0x3333);
     simd16uint16 a2((uint16_t)0); // 2-bit accu
-    for (int j = 0; j < N; j ++) {
+    for (int j = 0; j < N; j++) {
         simd16uint16 v(data);
         data += 16;
         v = pp(v);
@@ -815,34 +903,30 @@ void compute_accu2(
     a4hi += (a2 >> 2) & mask2;
 }
-template<class Preproc>
-simd16uint16 histogram_8(
-        const uint16_t * data, Preproc pp,
-        size_t n_in) {
-    assert (n_in % 16 == 0);
+template <class Preproc>
+simd16uint16 histogram_8(const uint16_t* data, Preproc pp, size_t n_in) {
+    assert(n_in % 16 == 0);
     int n = n_in / 16;
     simd32uint8 a8lo(0);
     simd32uint8 a8hi(0);
-    for(int i0 = 0; i0 < n; i0 += 15) {
-        simd16uint16 a4lo(0);  // 4-bit accus
+    for (int i0 = 0; i0 < n; i0 += 15) {
+        simd16uint16 a4lo(0); // 4-bit accus
         simd16uint16 a4hi(0);
         int i1 = std::min(i0 + 15, n);
         int i;
-        for(i = i0; i + 2 < i1; i += 3) {
+        for (i = i0; i + 2 < i1; i += 3) {
             compute_accu2<3>(data, pp, a4lo, a4hi); // adds 3 max
         }
         switch (i1 - i) {
-        case 2:
-            compute_accu2<2>(data, pp, a4lo, a4hi);
-            break;
-        case 1:
-            compute_accu2<1>(data, pp, a4lo, a4hi);
-            break;
+            case 2:
+                compute_accu2<2>(data, pp, a4lo, a4hi);
+                break;
+            case 1:
+                compute_accu2<1>(data, pp, a4lo, a4hi);
+                break;
         }
         a8lo += accu4to8(a4lo);
@@ -859,50 +943,72 @@ simd16uint16 histogram_8(
     return a16;
 }
 /************************************************************
  * 16 bins
  ************************************************************/
 static const simd32uint8 shifts2(_mm256_setr_epi8(
-    1, 2, 4, 8, 16, 32, 64, (char)128,
-    1, 2, 4, 8, 16, 32, 64, (char)128,
-    1, 2, 4, 8, 16, 32, 64, (char)128,
-    1, 2, 4, 8, 16, 32, 64, (char)128
-));
-simd32uint8 shiftr_16(simd32uint8 x, int n)
-{
+        1,
+        2,
+        4,
+        8,
+        16,
+        32,
+        64,
+        (char)128,
+        1,
+        2,
+        4,
+        8,
+        16,
+        32,
+        64,
+        (char)128,
+        1,
+        2,
+        4,
+        8,
+        16,
+        32,
+        64,
+        (char)128,
+        1,
+        2,
+        4,
+        8,
+        16,
+        32,
+        64,
+        (char)128));
+simd32uint8 shiftr_16(simd32uint8 x, int n) {
     return simd32uint8(simd16uint16(x) >> n);
 }
 inline simd32uint8 combine_2x2(simd32uint8 a, simd32uint8 b) {
     __m256i a1b0 = _mm256_permute2f128_si256(a.i, b.i, 0x21);
     __m256i a0b1 = _mm256_blend_epi32(a.i, b.i, 0xF0);
     return simd32uint8(a1b0) + simd32uint8(a0b1);
 }
 // 2-bit accumulator: we can add only up to 3 elements
 // on output we return 2*4-bit results
-template<int N, class Preproc>
+template <int N, class Preproc>
 void compute_accu2_16(
-        const uint16_t * & data, Preproc pp,
-        simd32uint8 & a4_0, simd32uint8 & a4_1,
-        simd32uint8 & a4_2, simd32uint8 & a4_3
-) {
+        const uint16_t*& data,
+        Preproc pp,
+        simd32uint8& a4_0,
+        simd32uint8& a4_1,
+        simd32uint8& a4_2,
+        simd32uint8& a4_3) {
     simd32uint8 mask1(0x55);
     simd32uint8 a2_0; // 2-bit accu
     simd32uint8 a2_1; // 2-bit accu
-    a2_0.clear(); a2_1.clear();
+    a2_0.clear();
+    a2_1.clear();
-    for (int j = 0; j < N; j ++) {
+    for (int j = 0; j < N; j++) {
         simd16uint16 v(data);
         data += 16;
         v = pp(v);
@@ -925,38 +1031,27 @@ void compute_accu2_16(
     a4_1 += a2_1 & mask2;
     a4_2 += shiftr_16(a2_0, 2) & mask2;
     a4_3 += shiftr_16(a2_1, 2) & mask2;
 }
 simd32uint8 accu4to8_2(simd32uint8 a4_0, simd32uint8 a4_1) {
     simd32uint8 mask4(0x0f);
-    simd32uint8 a8_0 = combine_2x2(
-        a4_0 & mask4,
-        shiftr_16(a4_0, 4) & mask4
-    );
+    simd32uint8 a8_0 = combine_2x2(a4_0 & mask4, shiftr_16(a4_0, 4) & mask4);
-    simd32uint8 a8_1 = combine_2x2(
-        a4_1 & mask4,
-        shiftr_16(a4_1, 4) & mask4
-    );
+    simd32uint8 a8_1 = combine_2x2(a4_1 & mask4, shiftr_16(a4_1, 4) & mask4);
     return simd32uint8(_mm256_hadd_epi16(a8_0.i, a8_1.i));
 }
-template<class Preproc>
-simd16uint16 histogram_16(const uint16_t * data, Preproc pp, size_t n_in) {
-    assert (n_in % 16 == 0);
+template <class Preproc>
+simd16uint16 histogram_16(const uint16_t* data, Preproc pp, size_t n_in) {
+    assert(n_in % 16 == 0);
     int n = n_in / 16;
     simd32uint8 a8lo((uint8_t)0);
     simd32uint8 a8hi((uint8_t)0);
-    for(int i0 = 0; i0 < n; i0 += 7) {
+    for (int i0 = 0; i0 < n; i0 += 7) {
         simd32uint8 a4_0(0); // 0, 4, 8, 12
         simd32uint8 a4_1(0); // 1, 5, 9, 13
         simd32uint8 a4_2(0); // 2, 6, 10, 14
@@ -964,16 +1059,16 @@ simd16uint16 histogram_16(const uint16_t * data, Preproc pp, size_t n_in) {
         int i1 = std::min(i0 + 7, n);
         int i;
-        for(i = i0; i + 2 < i1; i += 3) {
+        for (i = i0; i + 2 < i1; i += 3) {
             compute_accu2_16<3>(data, pp, a4_0, a4_1, a4_2, a4_3);
         }
         switch (i1 - i) {
-        case 2:
-            compute_accu2_16<2>(data, pp, a4_0, a4_1, a4_2, a4_3);
-            break;
-        case 1:
-            compute_accu2_16<1>(data, pp, a4_0, a4_1, a4_2, a4_3);
-            break;
+            case 2:
+                compute_accu2_16<2>(data, pp, a4_0, a4_1, a4_2, a4_3);
+                break;
+            case 1:
+                compute_accu2_16<1>(data, pp, a4_0, a4_1, a4_2, a4_3);
+                break;
         }
         a8lo += accu4to8_2(a4_0, a4_1);
@@ -986,23 +1081,19 @@ simd16uint16 histogram_16(const uint16_t * data, Preproc pp, size_t n_in) {
     simd16uint16 a16 = simd16uint16(_mm256_hadd_epi16(a16lo.i, a16hi.i));
-    __m256i perm32 = _mm256_setr_epi32(
-        0, 2, 4, 6, 1, 3, 5, 7
-    );
+    __m256i perm32 = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
     a16.i = _mm256_permutevar8x32_epi32(a16.i, perm32);
     return a16;
 }
 struct PreprocNOP {
-    simd16uint16 operator () (simd16uint16 x)  {
+    simd16uint16 operator()(simd16uint16 x) {
         return x;
     }
 };
-template<int shift, int nbin>
+template <int shift, int nbin>
 struct PreprocMinShift {
     simd16uint16 min16;
     simd16uint16 max16;
@@ -1014,59 +1105,46 @@ struct PreprocMinShift {
         max16.set1(vmax); // vmax inclusive
     }
-    simd16uint16 operator () (simd16uint16 x)  {
+    simd16uint16 operator()(simd16uint16 x) {
         x = x - min16;
         simd16uint16 mask = (x == max(x, max16)) - (x == max16);
         return (x >> shift) | mask;
     }
 };
 /* unbounded versions of the functions */
-void simd_histogram_8_unbounded(
-    const uint16_t *data, int n,
-    int *hist)
-{
+void simd_histogram_8_unbounded(const uint16_t* data, int n, int* hist) {
     PreprocNOP pp;
     simd16uint16 a16 = histogram_8(data, pp, (n & ~15));
     ALIGNED(32) uint16_t a16_tab[16];
     a16.store(a16_tab);
-    for(int i = 0; i < 8; i++) {
+    for (int i = 0; i < 8; i++) {
         hist[i] = a16_tab[i] + a16_tab[i + 8];
     }
-    for(int i = (n & ~15); i < n; i++) {
+    for (int i = (n & ~15); i < n; i++) {
         hist[data[i]]++;
     }
 }
-void simd_histogram_16_unbounded(
-    const uint16_t *data, int n,
-    int *hist)
-{
+void simd_histogram_16_unbounded(const uint16_t* data, int n, int* hist) {
     simd16uint16 a16 = histogram_16(data, PreprocNOP(), (n & ~15));
     ALIGNED(32) uint16_t a16_tab[16];
     a16.store(a16_tab);
-    for(int i = 0; i < 16; i++) {
+    for (int i = 0; i < 16; i++) {
         hist[i] = a16_tab[i];
     }
-    for(int i = (n & ~15); i < n; i++) {
+    for (int i = (n & ~15); i < n; i++) {
         hist[data[i]]++;
     }
 }
 } // anonymous namespace
 /************************************************************
@@ -1074,10 +1152,11 @@ void simd_histogram_16_unbounded(
  ************************************************************/
 void simd_histogram_8(
-    const uint16_t *data, int n,
-    uint16_t min, int shift,
-    int *hist)
-{
+        const uint16_t* data,
+        int n,
+        uint16_t min,
+        int shift,
+        int* hist) {
     if (shift < 0) {
         simd_histogram_8_unbounded(data, n, hist);
         return;
@@ -1085,12 +1164,12 @@ void simd_histogram_8(
     simd16uint16 a16;
-#define DISPATCH(s)  \
-     case s: \
+#define DISPATCH(s)                                                     \
+    case s:                                                             \
         a16 = histogram_8(data, PreprocMinShift<s, 8>(min), (n & ~15)); \
         break
-    switch(shift) {
+    switch (shift) {
         DISPATCH(0);
         DISPATCH(1);
         DISPATCH(2);
@@ -1105,35 +1184,35 @@ void simd_histogram_8(
         DISPATCH(11);
         DISPATCH(12);
         DISPATCH(13);
-    default:
-        FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift);
+        default:
+            FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift);
     }
 #undef DISPATCH
     ALIGNED(32) uint16_t a16_tab[16];
     a16.store(a16_tab);
-    for(int i = 0; i < 8; i++) {
+    for (int i = 0; i < 8; i++) {
         hist[i] = a16_tab[i] + a16_tab[i + 8];
     }
     // complete with the remaining tail elements (those past the last full group of 16)
-    for(int i = (n & ~15); i < n; i++) {
-        if (data[i] < min) continue;
+    for (int i = (n & ~15); i < n; i++) {
+        if (data[i] < min)
+            continue;
         uint16_t v = data[i] - min;
         v >>= shift;
-        if (v < 8) hist[v]++;
+        if (v < 8)
+            hist[v]++;
     }
 }
 void simd_histogram_16(
-    const uint16_t *data, int n,
-    uint16_t min, int shift,
-    int *hist)
-{
+        const uint16_t* data,
+        int n,
+        uint16_t min,
+        int shift,
+        int* hist) {
     if (shift < 0) {
         simd_histogram_16_unbounded(data, n, hist);
         return;
@@ -1141,12 +1220,12 @@ void simd_histogram_16(
     simd16uint16 a16;
-#define DISPATCH(s)  \
-     case s: \
+#define DISPATCH(s)                                                       \
+    case s:                                                               \
         a16 = histogram_16(data, PreprocMinShift<s, 16>(min), (n & ~15)); \
         break
-    switch(shift) {
+    switch (shift) {
         DISPATCH(0);
         DISPATCH(1);
         DISPATCH(2);
@@ -1160,48 +1239,47 @@ void simd_histogram_16(
         DISPATCH(10);
         DISPATCH(11);
         DISPATCH(12);
-    default:
-        FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift);
+        default:
+            FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift);
     }
 #undef DISPATCH
     ALIGNED(32) uint16_t a16_tab[16];
     a16.store(a16_tab);
-    for(int i = 0; i < 16; i++) {
+    for (int i = 0; i < 16; i++) {
         hist[i] = a16_tab[i];
     }
-    for(int i = (n & ~15); i < n; i++) {
-        if (data[i] < min) continue;
+    for (int i = (n & ~15); i < n; i++) {
+        if (data[i] < min)
+            continue;
         uint16_t v = data[i] - min;
         v >>= shift;
-        if (v < 16) hist[v]++;
+        if (v < 16)
+            hist[v]++;
     }
 }
 // no AVX2
 #else
 void simd_histogram_16(
-    const uint16_t *data, int n,
-    uint16_t min, int shift,
-    int *hist)
-{
+        const uint16_t* data,
+        int n,
+        uint16_t min,
+        int shift,
+        int* hist) {
     memset(hist, 0, sizeof(*hist) * 16);
     if (shift < 0) {
-        for(size_t i = 0; i < n; i++) {
+        for (size_t i = 0; i < n; i++) {
             hist[data[i]]++;
         }
     } else {
         int vmax0 = std::min((16 << shift) + min, 65536);
         uint16_t vmax = uint16_t(vmax0 - 1 - min);
-        for(size_t i = 0; i < n; i++) {
+        for (size_t i = 0; i < n; i++) {
             uint16_t v = data[i];
             v -= min;
             if (!(v <= vmax))
@@ -1217,40 +1295,37 @@ void simd_histogram_16(
             */
         }
     }
 }
 void simd_histogram_8(
-    const uint16_t *data, int n,
-    uint16_t min, int shift,
-    int *hist)
-{
+        const uint16_t* data,
+        int n,
+        uint16_t min,
+        int shift,
+        int* hist) {
     memset(hist, 0, sizeof(*hist) * 8);
     if (shift < 0) {
-        for(size_t i = 0; i < n; i++) {
+        for (size_t i = 0; i < n; i++) {
             hist[data[i]]++;
         }
     } else {
-        for(size_t i = 0; i < n; i++) {
-            if (data[i] < min) continue;
+        for (size_t i = 0; i < n; i++) {
+            if (data[i] < min)
+                continue;
             uint16_t v = data[i] - min;
             v >>= shift;
-            if (v < 8) hist[v]++;
+            if (v < 8)
+                hist[v]++;
         }
     }
 }
 #endif
 void PartitionStats::reset() {
     memset(this, 0, sizeof(*this));
 }
 PartitionStats partition_stats;
 } // namespace faiss