faiss 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (216)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +9 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +1 -1
  5. data/ext/faiss/extconf.rb +9 -2
  6. data/ext/faiss/index.cpp +1 -1
  7. data/ext/faiss/index_binary.cpp +2 -2
  8. data/ext/faiss/product_quantizer.cpp +1 -1
  9. data/lib/faiss/version.rb +1 -1
  10. data/vendor/faiss/faiss/AutoTune.cpp +7 -7
  11. data/vendor/faiss/faiss/AutoTune.h +1 -2
  12. data/vendor/faiss/faiss/Clustering.cpp +39 -22
  13. data/vendor/faiss/faiss/Clustering.h +40 -21
  14. data/vendor/faiss/faiss/IVFlib.cpp +26 -12
  15. data/vendor/faiss/faiss/Index.cpp +1 -1
  16. data/vendor/faiss/faiss/Index.h +40 -10
  17. data/vendor/faiss/faiss/Index2Layer.cpp +7 -7
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +176 -166
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +15 -15
  20. data/vendor/faiss/faiss/IndexBinary.cpp +9 -4
  21. data/vendor/faiss/faiss/IndexBinary.h +8 -19
  22. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -1
  23. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +24 -31
  24. data/vendor/faiss/faiss/IndexBinaryHNSW.h +1 -1
  25. data/vendor/faiss/faiss/IndexBinaryHash.cpp +25 -50
  26. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +107 -188
  27. data/vendor/faiss/faiss/IndexFastScan.cpp +95 -146
  28. data/vendor/faiss/faiss/IndexFastScan.h +9 -8
  29. data/vendor/faiss/faiss/IndexFlat.cpp +206 -10
  30. data/vendor/faiss/faiss/IndexFlat.h +20 -1
  31. data/vendor/faiss/faiss/IndexFlatCodes.cpp +170 -5
  32. data/vendor/faiss/faiss/IndexFlatCodes.h +23 -4
  33. data/vendor/faiss/faiss/IndexHNSW.cpp +231 -382
  34. data/vendor/faiss/faiss/IndexHNSW.h +62 -49
  35. data/vendor/faiss/faiss/IndexIDMap.cpp +69 -28
  36. data/vendor/faiss/faiss/IndexIDMap.h +24 -2
  37. data/vendor/faiss/faiss/IndexIVF.cpp +162 -56
  38. data/vendor/faiss/faiss/IndexIVF.h +46 -6
  39. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +33 -26
  40. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +6 -2
  41. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +19 -46
  42. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -3
  43. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +502 -401
  44. data/vendor/faiss/faiss/IndexIVFFastScan.h +63 -26
  45. data/vendor/faiss/faiss/IndexIVFFlat.cpp +15 -5
  46. data/vendor/faiss/faiss/IndexIVFFlat.h +3 -2
  47. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +172 -0
  48. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +56 -0
  49. data/vendor/faiss/faiss/IndexIVFPQ.cpp +79 -125
  50. data/vendor/faiss/faiss/IndexIVFPQ.h +6 -7
  51. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +39 -52
  52. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +4 -3
  53. data/vendor/faiss/faiss/IndexIVFPQR.cpp +45 -29
  54. data/vendor/faiss/faiss/IndexIVFPQR.h +5 -2
  55. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +25 -27
  56. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +6 -6
  57. data/vendor/faiss/faiss/IndexLSH.cpp +14 -16
  58. data/vendor/faiss/faiss/IndexLattice.cpp +1 -19
  59. data/vendor/faiss/faiss/IndexLattice.h +3 -22
  60. data/vendor/faiss/faiss/IndexNNDescent.cpp +3 -33
  61. data/vendor/faiss/faiss/IndexNNDescent.h +1 -1
  62. data/vendor/faiss/faiss/IndexNSG.cpp +11 -27
  63. data/vendor/faiss/faiss/IndexNSG.h +11 -11
  64. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +56 -0
  65. data/vendor/faiss/faiss/IndexNeuralNetCodec.h +49 -0
  66. data/vendor/faiss/faiss/IndexPQ.cpp +72 -88
  67. data/vendor/faiss/faiss/IndexPQ.h +1 -4
  68. data/vendor/faiss/faiss/IndexPQFastScan.cpp +1 -1
  69. data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -31
  70. data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
  71. data/vendor/faiss/faiss/IndexRefine.cpp +54 -24
  72. data/vendor/faiss/faiss/IndexRefine.h +7 -0
  73. data/vendor/faiss/faiss/IndexReplicas.cpp +23 -26
  74. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +25 -17
  75. data/vendor/faiss/faiss/IndexScalarQuantizer.h +6 -4
  76. data/vendor/faiss/faiss/IndexShards.cpp +21 -29
  77. data/vendor/faiss/faiss/IndexShardsIVF.cpp +1 -2
  78. data/vendor/faiss/faiss/MatrixStats.cpp +17 -32
  79. data/vendor/faiss/faiss/MatrixStats.h +21 -9
  80. data/vendor/faiss/faiss/MetaIndexes.cpp +35 -35
  81. data/vendor/faiss/faiss/MetricType.h +7 -2
  82. data/vendor/faiss/faiss/VectorTransform.cpp +13 -26
  83. data/vendor/faiss/faiss/VectorTransform.h +7 -7
  84. data/vendor/faiss/faiss/clone_index.cpp +15 -10
  85. data/vendor/faiss/faiss/clone_index.h +3 -0
  86. data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +95 -17
  87. data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +152 -0
  88. data/vendor/faiss/faiss/cppcontrib/factory_tools.h +24 -0
  89. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +83 -30
  90. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +123 -8
  91. data/vendor/faiss/faiss/gpu/GpuCloner.h +22 -0
  92. data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +13 -0
  93. data/vendor/faiss/faiss/gpu/GpuDistance.h +46 -38
  94. data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +1 -1
  95. data/vendor/faiss/faiss/gpu/GpuIndex.h +30 -12
  96. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +282 -0
  97. data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +4 -4
  98. data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +14 -9
  99. data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +20 -3
  100. data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +22 -11
  101. data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +1 -3
  102. data/vendor/faiss/faiss/gpu/GpuResources.cpp +24 -3
  103. data/vendor/faiss/faiss/gpu/GpuResources.h +39 -11
  104. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +142 -17
  105. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +57 -3
  106. data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +26 -21
  107. data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +7 -1
  108. data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +8 -5
  109. data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +25 -0
  110. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +129 -9
  111. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +332 -40
  112. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +299 -208
  113. data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +1 -0
  114. data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +1 -1
  115. data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +6 -0
  116. data/vendor/faiss/faiss/gpu/utils/RaftUtils.h +75 -0
  117. data/vendor/faiss/faiss/gpu/utils/Timer.cpp +4 -1
  118. data/vendor/faiss/faiss/gpu/utils/Timer.h +1 -1
  119. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +3 -1
  120. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +5 -5
  121. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +26 -1
  122. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +10 -3
  123. data/vendor/faiss/faiss/impl/DistanceComputer.h +70 -1
  124. data/vendor/faiss/faiss/impl/FaissAssert.h +4 -2
  125. data/vendor/faiss/faiss/impl/FaissException.h +13 -34
  126. data/vendor/faiss/faiss/impl/HNSW.cpp +605 -186
  127. data/vendor/faiss/faiss/impl/HNSW.h +52 -30
  128. data/vendor/faiss/faiss/impl/IDSelector.h +4 -4
  129. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +11 -9
  130. data/vendor/faiss/faiss/impl/LookupTableScaler.h +34 -0
  131. data/vendor/faiss/faiss/impl/NNDescent.cpp +42 -27
  132. data/vendor/faiss/faiss/impl/NSG.cpp +0 -29
  133. data/vendor/faiss/faiss/impl/NSG.h +1 -1
  134. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +14 -12
  135. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +1 -1
  136. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -22
  137. data/vendor/faiss/faiss/impl/ProductQuantizer.h +6 -2
  138. data/vendor/faiss/faiss/impl/Quantizer.h +1 -1
  139. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +27 -1015
  140. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +5 -63
  141. data/vendor/faiss/faiss/impl/ResultHandler.h +347 -172
  142. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +1104 -147
  143. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +3 -8
  144. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +285 -42
  145. data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +248 -0
  146. data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +21 -14
  147. data/vendor/faiss/faiss/impl/code_distance/code_distance.h +22 -12
  148. data/vendor/faiss/faiss/impl/index_read.cpp +74 -34
  149. data/vendor/faiss/faiss/impl/index_read_utils.h +37 -0
  150. data/vendor/faiss/faiss/impl/index_write.cpp +88 -51
  151. data/vendor/faiss/faiss/impl/io.cpp +23 -15
  152. data/vendor/faiss/faiss/impl/io.h +4 -4
  153. data/vendor/faiss/faiss/impl/io_macros.h +6 -0
  154. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +1 -1
  155. data/vendor/faiss/faiss/impl/platform_macros.h +40 -1
  156. data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +14 -0
  157. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +7 -6
  158. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +52 -38
  159. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +487 -49
  160. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +960 -0
  161. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +176 -0
  162. data/vendor/faiss/faiss/impl/simd_result_handlers.h +481 -225
  163. data/vendor/faiss/faiss/index_factory.cpp +41 -20
  164. data/vendor/faiss/faiss/index_io.h +12 -5
  165. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +28 -8
  166. data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +3 -0
  167. data/vendor/faiss/faiss/invlists/DirectMap.cpp +10 -2
  168. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +73 -17
  169. data/vendor/faiss/faiss/invlists/InvertedLists.h +26 -8
  170. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +24 -9
  171. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -1
  172. data/vendor/faiss/faiss/python/python_callbacks.cpp +4 -4
  173. data/vendor/faiss/faiss/utils/Heap.cpp +3 -1
  174. data/vendor/faiss/faiss/utils/Heap.h +105 -0
  175. data/vendor/faiss/faiss/utils/NeuralNet.cpp +342 -0
  176. data/vendor/faiss/faiss/utils/NeuralNet.h +147 -0
  177. data/vendor/faiss/faiss/utils/WorkerThread.h +1 -0
  178. data/vendor/faiss/faiss/utils/bf16.h +36 -0
  179. data/vendor/faiss/faiss/utils/distances.cpp +147 -123
  180. data/vendor/faiss/faiss/utils/distances.h +86 -9
  181. data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +5 -5
  182. data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
  183. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
  184. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +1 -1
  185. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +5 -5
  186. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +1 -1
  187. data/vendor/faiss/faiss/utils/distances_simd.cpp +1589 -243
  188. data/vendor/faiss/faiss/utils/extra_distances-inl.h +70 -0
  189. data/vendor/faiss/faiss/utils/extra_distances.cpp +85 -137
  190. data/vendor/faiss/faiss/utils/extra_distances.h +3 -2
  191. data/vendor/faiss/faiss/utils/fp16-arm.h +29 -0
  192. data/vendor/faiss/faiss/utils/fp16.h +2 -0
  193. data/vendor/faiss/faiss/utils/hamming.cpp +163 -111
  194. data/vendor/faiss/faiss/utils/hamming.h +58 -0
  195. data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +16 -89
  196. data/vendor/faiss/faiss/utils/hamming_distance/common.h +1 -0
  197. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +19 -88
  198. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +58 -0
  199. data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +14 -104
  200. data/vendor/faiss/faiss/utils/partitioning.cpp +3 -4
  201. data/vendor/faiss/faiss/utils/prefetch.h +77 -0
  202. data/vendor/faiss/faiss/utils/quantize_lut.cpp +0 -14
  203. data/vendor/faiss/faiss/utils/random.cpp +43 -0
  204. data/vendor/faiss/faiss/utils/random.h +25 -0
  205. data/vendor/faiss/faiss/utils/simdlib.h +10 -1
  206. data/vendor/faiss/faiss/utils/simdlib_avx2.h +0 -6
  207. data/vendor/faiss/faiss/utils/simdlib_avx512.h +296 -0
  208. data/vendor/faiss/faiss/utils/simdlib_neon.h +77 -79
  209. data/vendor/faiss/faiss/utils/simdlib_ppc64.h +1084 -0
  210. data/vendor/faiss/faiss/utils/sorting.cpp +140 -5
  211. data/vendor/faiss/faiss/utils/sorting.h +27 -0
  212. data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +176 -0
  213. data/vendor/faiss/faiss/utils/utils.cpp +120 -7
  214. data/vendor/faiss/faiss/utils/utils.h +60 -20
  215. metadata +23 -4
  216. data/vendor/faiss/faiss/impl/code_distance/code_distance_avx512.h +0 -102
@@ -0,0 +1,1084 @@
1
+ /**
2
+ * Copyright (c) Facebook, Inc. and its affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #pragma once
9
+
10
+ #include <algorithm>
11
+ #include <cstdint>
12
+ #include <cstring>
13
+ #include <string>
14
+
15
+ namespace faiss {
16
+
17
+ struct simd256bit {
18
+ union {
19
+ uint8_t u8[32];
20
+ uint16_t u16[16];
21
+ uint32_t u32[8];
22
+ float f32[8];
23
+ };
24
+
25
+ simd256bit() {}
26
+
27
+ explicit simd256bit(const void* x) {
28
+ memcpy(u8, x, 32);
29
+ }
30
+
31
+ void clear() {
32
+ memset(u8, 0, 32);
33
+ }
34
+
35
+ void storeu(void* ptr) const {
36
+ memcpy(ptr, u8, 32);
37
+ }
38
+
39
+ void loadu(const void* ptr) {
40
+ memcpy(u8, ptr, 32);
41
+ }
42
+
43
+ void store(void* ptr) const {
44
+ storeu(ptr);
45
+ }
46
+
47
+ void bin(char bits[257]) const {
48
+ const char* bytes = (char*)this->u8;
49
+ for (int i = 0; i < 256; i++) {
50
+ bits[i] = '0' + ((bytes[i / 8] >> (i % 8)) & 1);
51
+ }
52
+ bits[256] = 0;
53
+ }
54
+
55
+ std::string bin() const {
56
+ char bits[257];
57
+ bin(bits);
58
+ return std::string(bits);
59
+ }
60
+
61
+ // Checks whether the other holds exactly the same bytes.
62
+ bool is_same_as(simd256bit other) const {
63
+ for (size_t i = 0; i < 8; i++) {
64
+ if (u32[i] != other.u32[i]) {
65
+ return false;
66
+ }
67
+ }
68
+
69
+ return true;
70
+ }
71
+ };
72
+
73
+ /// vector of 16 elements in uint16
74
+ struct simd16uint16 : simd256bit {
75
+ simd16uint16() {}
76
+
77
+ explicit simd16uint16(int x) {
78
+ set1(x);
79
+ }
80
+
81
+ explicit simd16uint16(uint16_t x) {
82
+ set1(x);
83
+ }
84
+
85
+ explicit simd16uint16(const simd256bit& x) : simd256bit(x) {}
86
+
87
+ explicit simd16uint16(const uint16_t* x) : simd256bit((const void*)x) {}
88
+
89
+ explicit simd16uint16(
90
+ uint16_t u0,
91
+ uint16_t u1,
92
+ uint16_t u2,
93
+ uint16_t u3,
94
+ uint16_t u4,
95
+ uint16_t u5,
96
+ uint16_t u6,
97
+ uint16_t u7,
98
+ uint16_t u8,
99
+ uint16_t u9,
100
+ uint16_t u10,
101
+ uint16_t u11,
102
+ uint16_t u12,
103
+ uint16_t u13,
104
+ uint16_t u14,
105
+ uint16_t u15) {
106
+ this->u16[0] = u0;
107
+ this->u16[1] = u1;
108
+ this->u16[2] = u2;
109
+ this->u16[3] = u3;
110
+ this->u16[4] = u4;
111
+ this->u16[5] = u5;
112
+ this->u16[6] = u6;
113
+ this->u16[7] = u7;
114
+ this->u16[8] = u8;
115
+ this->u16[9] = u9;
116
+ this->u16[10] = u10;
117
+ this->u16[11] = u11;
118
+ this->u16[12] = u12;
119
+ this->u16[13] = u13;
120
+ this->u16[14] = u14;
121
+ this->u16[15] = u15;
122
+ }
123
+
124
+ std::string elements_to_string(const char* fmt) const {
125
+ char res[1000], *ptr = res;
126
+ for (int i = 0; i < 16; i++) {
127
+ ptr += sprintf(ptr, fmt, u16[i]);
128
+ }
129
+ // strip last ,
130
+ ptr[-1] = 0;
131
+ return std::string(res);
132
+ }
133
+
134
+ std::string hex() const {
135
+ return elements_to_string("%02x,");
136
+ }
137
+
138
+ std::string dec() const {
139
+ return elements_to_string("%3d,");
140
+ }
141
+
142
+ template <typename F>
143
+ static simd16uint16 unary_func(const simd16uint16& a, F&& f) {
144
+ simd16uint16 c;
145
+ for (int j = 0; j < 16; j++) {
146
+ c.u16[j] = f(a.u16[j]);
147
+ }
148
+ return c;
149
+ }
150
+
151
+ template <typename F>
152
+ static simd16uint16 binary_func(
153
+ const simd16uint16& a,
154
+ const simd16uint16& b,
155
+ F&& f) {
156
+ simd16uint16 c;
157
+ for (int j = 0; j < 16; j++) {
158
+ c.u16[j] = f(a.u16[j], b.u16[j]);
159
+ }
160
+ return c;
161
+ }
162
+
163
+ void set1(uint16_t x) {
164
+ for (int i = 0; i < 16; i++) {
165
+ u16[i] = x;
166
+ }
167
+ }
168
+
169
+ simd16uint16 operator*(const simd16uint16& other) const {
170
+ return binary_func(
171
+ *this, other, [](uint16_t a, uint16_t b) { return a * b; });
172
+ }
173
+
174
+ // shift must be known at compile time
175
+ simd16uint16 operator>>(const int shift) const {
176
+ return unary_func(*this, [shift](uint16_t a) { return a >> shift; });
177
+ }
178
+
179
+ // shift must be known at compile time
180
+ simd16uint16 operator<<(const int shift) const {
181
+ return unary_func(*this, [shift](uint16_t a) { return a << shift; });
182
+ }
183
+
184
+ simd16uint16 operator+=(const simd16uint16& other) {
185
+ *this = *this + other;
186
+ return *this;
187
+ }
188
+
189
+ simd16uint16 operator-=(const simd16uint16& other) {
190
+ *this = *this - other;
191
+ return *this;
192
+ }
193
+
194
+ simd16uint16 operator+(const simd16uint16& other) const {
195
+ return binary_func(
196
+ *this, other, [](uint16_t a, uint16_t b) { return a + b; });
197
+ }
198
+
199
+ simd16uint16 operator-(const simd16uint16& other) const {
200
+ return binary_func(
201
+ *this, other, [](uint16_t a, uint16_t b) { return a - b; });
202
+ }
203
+
204
+ simd16uint16 operator&(const simd256bit& other) const {
205
+ return binary_func(
206
+ *this, simd16uint16(other), [](uint16_t a, uint16_t b) {
207
+ return a & b;
208
+ });
209
+ }
210
+
211
+ simd16uint16 operator|(const simd256bit& other) const {
212
+ return binary_func(
213
+ *this, simd16uint16(other), [](uint16_t a, uint16_t b) {
214
+ return a | b;
215
+ });
216
+ }
217
+
218
+ simd16uint16 operator^(const simd256bit& other) const {
219
+ return binary_func(
220
+ *this, simd16uint16(other), [](uint16_t a, uint16_t b) {
221
+ return a ^ b;
222
+ });
223
+ }
224
+
225
+ // returns binary masks
226
+ simd16uint16 operator==(const simd16uint16& other) const {
227
+ return binary_func(*this, other, [](uint16_t a, uint16_t b) {
228
+ return a == b ? 0xffff : 0;
229
+ });
230
+ }
231
+
232
+ simd16uint16 operator~() const {
233
+ return unary_func(*this, [](uint16_t a) { return ~a; });
234
+ }
235
+
236
+ // get scalar at index 0
237
+ uint16_t get_scalar_0() const {
238
+ return u16[0];
239
+ }
240
+
241
+ // mask of elements where this >= thresh
242
+ // 2 bit per component: 16 * 2 = 32 bit
243
+ uint32_t ge_mask(const simd16uint16& thresh) const {
244
+ uint32_t gem = 0;
245
+ for (int j = 0; j < 16; j++) {
246
+ if (u16[j] >= thresh.u16[j]) {
247
+ gem |= 3 << (j * 2);
248
+ }
249
+ }
250
+ return gem;
251
+ }
252
+
253
+ uint32_t le_mask(const simd16uint16& thresh) const {
254
+ return thresh.ge_mask(*this);
255
+ }
256
+
257
+ uint32_t gt_mask(const simd16uint16& thresh) const {
258
+ return ~le_mask(thresh);
259
+ }
260
+
261
+ bool all_gt(const simd16uint16& thresh) const {
262
+ return le_mask(thresh) == 0;
263
+ }
264
+
265
+ // for debugging only
266
+ uint16_t operator[](int i) const {
267
+ return u16[i];
268
+ }
269
+
270
+ void accu_min(const simd16uint16& incoming) {
271
+ for (int j = 0; j < 16; j++) {
272
+ if (incoming.u16[j] < u16[j]) {
273
+ u16[j] = incoming.u16[j];
274
+ }
275
+ }
276
+ }
277
+
278
+ void accu_max(const simd16uint16& incoming) {
279
+ for (int j = 0; j < 16; j++) {
280
+ if (incoming.u16[j] > u16[j]) {
281
+ u16[j] = incoming.u16[j];
282
+ }
283
+ }
284
+ }
285
+ };
286
+
287
+ // not really a std::min because it returns an elementwise min
288
+ inline simd16uint16 min(const simd16uint16& av, const simd16uint16& bv) {
289
+ return simd16uint16::binary_func(
290
+ av, bv, [](uint16_t a, uint16_t b) { return std::min(a, b); });
291
+ }
292
+
293
+ inline simd16uint16 max(const simd16uint16& av, const simd16uint16& bv) {
294
+ return simd16uint16::binary_func(
295
+ av, bv, [](uint16_t a, uint16_t b) { return std::max(a, b); });
296
+ }
297
+
298
+ // decompose in 128-lanes: a = (a0, a1), b = (b0, b1)
299
+ // return (a0 + a1, b0 + b1)
300
+ // TODO find a better name
301
+ inline simd16uint16 combine2x2(const simd16uint16& a, const simd16uint16& b) {
302
+ simd16uint16 c;
303
+ for (int j = 0; j < 8; j++) {
304
+ c.u16[j] = a.u16[j] + a.u16[j + 8];
305
+ c.u16[j + 8] = b.u16[j] + b.u16[j + 8];
306
+ }
307
+ return c;
308
+ }
309
+
310
+ // compare d0 and d1 to thr, return 32 bits corresponding to the concatenation
311
+ // of d0 and d1 with thr
312
+ inline uint32_t cmp_ge32(
313
+ const simd16uint16& d0,
314
+ const simd16uint16& d1,
315
+ const simd16uint16& thr) {
316
+ uint32_t gem = 0;
317
+ for (int j = 0; j < 16; j++) {
318
+ if (d0.u16[j] >= thr.u16[j]) {
319
+ gem |= 1 << j;
320
+ }
321
+ if (d1.u16[j] >= thr.u16[j]) {
322
+ gem |= 1 << (j + 16);
323
+ }
324
+ }
325
+ return gem;
326
+ }
327
+
328
+ inline uint32_t cmp_le32(
329
+ const simd16uint16& d0,
330
+ const simd16uint16& d1,
331
+ const simd16uint16& thr) {
332
+ uint32_t gem = 0;
333
+ for (int j = 0; j < 16; j++) {
334
+ if (d0.u16[j] <= thr.u16[j]) {
335
+ gem |= 1 << j;
336
+ }
337
+ if (d1.u16[j] <= thr.u16[j]) {
338
+ gem |= 1 << (j + 16);
339
+ }
340
+ }
341
+ return gem;
342
+ }
343
+
344
+ // hadd does not cross lanes
345
+ inline simd16uint16 hadd(const simd16uint16& a, const simd16uint16& b) {
346
+ simd16uint16 c;
347
+ c.u16[0] = a.u16[0] + a.u16[1];
348
+ c.u16[1] = a.u16[2] + a.u16[3];
349
+ c.u16[2] = a.u16[4] + a.u16[5];
350
+ c.u16[3] = a.u16[6] + a.u16[7];
351
+ c.u16[4] = b.u16[0] + b.u16[1];
352
+ c.u16[5] = b.u16[2] + b.u16[3];
353
+ c.u16[6] = b.u16[4] + b.u16[5];
354
+ c.u16[7] = b.u16[6] + b.u16[7];
355
+
356
+ c.u16[8] = a.u16[8] + a.u16[9];
357
+ c.u16[9] = a.u16[10] + a.u16[11];
358
+ c.u16[10] = a.u16[12] + a.u16[13];
359
+ c.u16[11] = a.u16[14] + a.u16[15];
360
+ c.u16[12] = b.u16[8] + b.u16[9];
361
+ c.u16[13] = b.u16[10] + b.u16[11];
362
+ c.u16[14] = b.u16[12] + b.u16[13];
363
+ c.u16[15] = b.u16[14] + b.u16[15];
364
+
365
+ return c;
366
+ }
367
+
368
+ // Vectorized version of the following code:
369
+ // for (size_t i = 0; i < n; i++) {
370
+ // bool flag = (candidateValues[i] < currentValues[i]);
371
+ // minValues[i] = flag ? candidateValues[i] : currentValues[i];
372
+ // minIndices[i] = flag ? candidateIndices[i] : currentIndices[i];
373
+ // maxValues[i] = !flag ? candidateValues[i] : currentValues[i];
374
+ // maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i];
375
+ // }
376
+ // Max indices evaluation is inaccurate in case of equal values (the index of
377
+ // the last equal value is saved instead of the first one), but this behavior
378
+ // saves instructions.
379
+ inline void cmplt_min_max_fast(
380
+ const simd16uint16 candidateValues,
381
+ const simd16uint16 candidateIndices,
382
+ const simd16uint16 currentValues,
383
+ const simd16uint16 currentIndices,
384
+ simd16uint16& minValues,
385
+ simd16uint16& minIndices,
386
+ simd16uint16& maxValues,
387
+ simd16uint16& maxIndices) {
388
+ for (size_t i = 0; i < 16; i++) {
389
+ bool flag = (candidateValues.u16[i] < currentValues.u16[i]);
390
+ minValues.u16[i] = flag ? candidateValues.u16[i] : currentValues.u16[i];
391
+ minIndices.u16[i] =
392
+ flag ? candidateIndices.u16[i] : currentIndices.u16[i];
393
+ maxValues.u16[i] =
394
+ !flag ? candidateValues.u16[i] : currentValues.u16[i];
395
+ maxIndices.u16[i] =
396
+ !flag ? candidateIndices.u16[i] : currentIndices.u16[i];
397
+ }
398
+ }
399
+
400
+ // vector of 32 unsigned 8-bit integers
401
+ struct simd32uint8 : simd256bit {
402
+ simd32uint8() {}
403
+
404
+ explicit simd32uint8(int x) {
405
+ set1(x);
406
+ }
407
+
408
+ explicit simd32uint8(uint8_t x) {
409
+ set1(x);
410
+ }
411
+ template <
412
+ uint8_t _0,
413
+ uint8_t _1,
414
+ uint8_t _2,
415
+ uint8_t _3,
416
+ uint8_t _4,
417
+ uint8_t _5,
418
+ uint8_t _6,
419
+ uint8_t _7,
420
+ uint8_t _8,
421
+ uint8_t _9,
422
+ uint8_t _10,
423
+ uint8_t _11,
424
+ uint8_t _12,
425
+ uint8_t _13,
426
+ uint8_t _14,
427
+ uint8_t _15,
428
+ uint8_t _16,
429
+ uint8_t _17,
430
+ uint8_t _18,
431
+ uint8_t _19,
432
+ uint8_t _20,
433
+ uint8_t _21,
434
+ uint8_t _22,
435
+ uint8_t _23,
436
+ uint8_t _24,
437
+ uint8_t _25,
438
+ uint8_t _26,
439
+ uint8_t _27,
440
+ uint8_t _28,
441
+ uint8_t _29,
442
+ uint8_t _30,
443
+ uint8_t _31>
444
+ static simd32uint8 create() {
445
+ simd32uint8 ret;
446
+ ret.u8[0] = _0;
447
+ ret.u8[1] = _1;
448
+ ret.u8[2] = _2;
449
+ ret.u8[3] = _3;
450
+ ret.u8[4] = _4;
451
+ ret.u8[5] = _5;
452
+ ret.u8[6] = _6;
453
+ ret.u8[7] = _7;
454
+ ret.u8[8] = _8;
455
+ ret.u8[9] = _9;
456
+ ret.u8[10] = _10;
457
+ ret.u8[11] = _11;
458
+ ret.u8[12] = _12;
459
+ ret.u8[13] = _13;
460
+ ret.u8[14] = _14;
461
+ ret.u8[15] = _15;
462
+ ret.u8[16] = _16;
463
+ ret.u8[17] = _17;
464
+ ret.u8[18] = _18;
465
+ ret.u8[19] = _19;
466
+ ret.u8[20] = _20;
467
+ ret.u8[21] = _21;
468
+ ret.u8[22] = _22;
469
+ ret.u8[23] = _23;
470
+ ret.u8[24] = _24;
471
+ ret.u8[25] = _25;
472
+ ret.u8[26] = _26;
473
+ ret.u8[27] = _27;
474
+ ret.u8[28] = _28;
475
+ ret.u8[29] = _29;
476
+ ret.u8[30] = _30;
477
+ ret.u8[31] = _31;
478
+ return ret;
479
+ }
480
+
481
+ explicit simd32uint8(const simd256bit& x) : simd256bit(x) {}
482
+
483
+ explicit simd32uint8(const uint8_t* x) : simd256bit((const void*)x) {}
484
+
485
+ std::string elements_to_string(const char* fmt) const {
486
+ char res[1000], *ptr = res;
487
+ for (int i = 0; i < 32; i++) {
488
+ ptr += sprintf(ptr, fmt, u8[i]);
489
+ }
490
+ // strip last ,
491
+ ptr[-1] = 0;
492
+ return std::string(res);
493
+ }
494
+
495
+ std::string hex() const {
496
+ return elements_to_string("%02x,");
497
+ }
498
+
499
+ std::string dec() const {
500
+ return elements_to_string("%3d,");
501
+ }
502
+
503
+ void set1(uint8_t x) {
504
+ for (int j = 0; j < 32; j++) {
505
+ u8[j] = x;
506
+ }
507
+ }
508
+
509
+ template <typename F>
510
+ static simd32uint8 binary_func(
511
+ const simd32uint8& a,
512
+ const simd32uint8& b,
513
+ F&& f) {
514
+ simd32uint8 c;
515
+ for (int j = 0; j < 32; j++) {
516
+ c.u8[j] = f(a.u8[j], b.u8[j]);
517
+ }
518
+ return c;
519
+ }
520
+
521
+ simd32uint8 operator&(const simd256bit& other) const {
522
+ return binary_func(*this, simd32uint8(other), [](uint8_t a, uint8_t b) {
523
+ return a & b;
524
+ });
525
+ }
526
+
527
+ simd32uint8 operator+(const simd32uint8& other) const {
528
+ return binary_func(
529
+ *this, other, [](uint8_t a, uint8_t b) { return a + b; });
530
+ }
531
+
532
+ // The very important operation that everything relies on
533
+ simd32uint8 lookup_2_lanes(const simd32uint8& idx) const {
534
+ simd32uint8 c;
535
+ // The original for loop:
536
+ // for (int j = 0; j < 32; j++) {
537
+ // if (idx.u8[j] & 0x80) {
538
+ // c.u8[j] = 0;
539
+ // } else {
540
+ // uint8_t i = idx.u8[j] & 15;
541
+ // if (j < 16) {
542
+ // c.u8[j] = u8[i];
543
+ // } else {
544
+ // c.u8[j] = u8[16 + i];
545
+ // }
546
+ // }
547
+
548
+ // The following function was re-written for Power 10
549
+ // The loop was unrolled to remove the if (j < 16) statement by doing
550
+ // the j and j + 16 iterations in parallel. The additional unrolling
551
+ // for j + 1 and j + 17, reduces the execution time on Power 10 by
552
+ // about 50% as the instruction scheduling allows on average 2X more
553
+ // instructions to be issued per cycle.
554
+
555
+ for (int j = 0; j < 16; j = j + 2) {
556
+ // j < 16, unrolled to depth of 2
557
+ if (idx.u8[j] & 0x80) {
558
+ c.u8[j] = 0;
559
+ } else {
560
+ uint8_t i = idx.u8[j] & 15;
561
+ c.u8[j] = u8[i];
562
+ }
563
+
564
+ if (idx.u8[j + 1] & 0x80) {
565
+ c.u8[j + 1] = 0;
566
+ } else {
567
+ uint8_t i = idx.u8[j + 1] & 15;
568
+ c.u8[j + 1] = u8[i];
569
+ }
570
+
571
+ // j >= 16, unrolled to depth of 2
572
+ if (idx.u8[j + 16] & 0x80) {
573
+ c.u8[j + 16] = 0;
574
+ } else {
575
+ uint8_t i = idx.u8[j + 16] & 15;
576
+ c.u8[j + 16] = u8[i + 16];
577
+ }
578
+
579
+ if (idx.u8[j + 17] & 0x80) {
580
+ c.u8[j + 17] = 0;
581
+ } else {
582
+ uint8_t i = idx.u8[j + 17] & 15;
583
+ c.u8[j + 17] = u8[i + 16];
584
+ }
585
+ }
586
+ return c;
587
+ }
588
+
589
+ // extract + 0-extend lane
590
+ // this operation is slow (3 cycles)
591
+
592
+ simd32uint8 operator+=(const simd32uint8& other) {
593
+ *this = *this + other;
594
+ return *this;
595
+ }
596
+
597
+ // for debugging only
598
+ uint8_t operator[](int i) const {
599
+ return u8[i];
600
+ }
601
+ };
602
+
603
+ // convert with saturation
604
+ // careful: this does not cross lanes, so the order is weird
605
+ inline simd32uint8 uint16_to_uint8_saturate(
606
+ const simd16uint16& a,
607
+ const simd16uint16& b) {
608
+ simd32uint8 c;
609
+
610
+ auto saturate_16_to_8 = [](uint16_t x) { return x >= 256 ? 0xff : x; };
611
+
612
+ for (int i = 0; i < 8; i++) {
613
+ c.u8[i] = saturate_16_to_8(a.u16[i]);
614
+ c.u8[8 + i] = saturate_16_to_8(b.u16[i]);
615
+ c.u8[16 + i] = saturate_16_to_8(a.u16[8 + i]);
616
+ c.u8[24 + i] = saturate_16_to_8(b.u16[8 + i]);
617
+ }
618
+ return c;
619
+ }
620
+
621
+ /// get most significant bit of each byte
622
+ inline uint32_t get_MSBs(const simd32uint8& a) {
623
+ uint32_t res = 0;
624
+ for (int i = 0; i < 32; i++) {
625
+ if (a.u8[i] & 0x80) {
626
+ res |= 1 << i;
627
+ }
628
+ }
629
+ return res;
630
+ }
631
+
632
+ /// use MSB of each byte of mask to select a byte between a and b
633
+ inline simd32uint8 blendv(
634
+ const simd32uint8& a,
635
+ const simd32uint8& b,
636
+ const simd32uint8& mask) {
637
+ simd32uint8 c;
638
+ for (int i = 0; i < 32; i++) {
639
+ if (mask.u8[i] & 0x80) {
640
+ c.u8[i] = b.u8[i];
641
+ } else {
642
+ c.u8[i] = a.u8[i];
643
+ }
644
+ }
645
+ return c;
646
+ }
647
+
648
+ /// vector of 8 unsigned 32-bit integers
649
+ struct simd8uint32 : simd256bit {
650
+ simd8uint32() {}
651
+
652
+ explicit simd8uint32(uint32_t x) {
653
+ set1(x);
654
+ }
655
+
656
+ explicit simd8uint32(const simd256bit& x) : simd256bit(x) {}
657
+
658
+ explicit simd8uint32(const uint32_t* x) : simd256bit((const void*)x) {}
659
+
660
+ explicit simd8uint32(
661
+ uint32_t u0,
662
+ uint32_t u1,
663
+ uint32_t u2,
664
+ uint32_t u3,
665
+ uint32_t u4,
666
+ uint32_t u5,
667
+ uint32_t u6,
668
+ uint32_t u7) {
669
+ u32[0] = u0;
670
+ u32[1] = u1;
671
+ u32[2] = u2;
672
+ u32[3] = u3;
673
+ u32[4] = u4;
674
+ u32[5] = u5;
675
+ u32[6] = u6;
676
+ u32[7] = u7;
677
+ }
678
+
679
+ simd8uint32 operator+(simd8uint32 other) const {
680
+ simd8uint32 result;
681
+ for (int i = 0; i < 8; i++) {
682
+ result.u32[i] = u32[i] + other.u32[i];
683
+ }
684
+ return result;
685
+ }
686
+
687
+ simd8uint32 operator-(simd8uint32 other) const {
688
+ simd8uint32 result;
689
+ for (int i = 0; i < 8; i++) {
690
+ result.u32[i] = u32[i] - other.u32[i];
691
+ }
692
+ return result;
693
+ }
694
+
695
+ simd8uint32& operator+=(const simd8uint32& other) {
696
+ for (int i = 0; i < 8; i++) {
697
+ u32[i] += other.u32[i];
698
+ }
699
+ return *this;
700
+ }
701
+
702
+ bool operator==(simd8uint32 other) const {
703
+ for (size_t i = 0; i < 8; i++) {
704
+ if (u32[i] != other.u32[i]) {
705
+ return false;
706
+ }
707
+ }
708
+
709
+ return true;
710
+ }
711
+
712
+ bool operator!=(simd8uint32 other) const {
713
+ return !(*this == other);
714
+ }
715
+
716
+ std::string elements_to_string(const char* fmt) const {
717
+ char res[1000], *ptr = res;
718
+ for (int i = 0; i < 8; i++) {
719
+ ptr += sprintf(ptr, fmt, u32[i]);
720
+ }
721
+ // strip last ,
722
+ ptr[-1] = 0;
723
+ return std::string(res);
724
+ }
725
+
726
+ std::string hex() const {
727
+ return elements_to_string("%08x,");
728
+ }
729
+
730
+ std::string dec() const {
731
+ return elements_to_string("%10d,");
732
+ }
733
+
734
+ void set1(uint32_t x) {
735
+ for (int i = 0; i < 8; i++) {
736
+ u32[i] = x;
737
+ }
738
+ }
739
+
740
+ simd8uint32 unzip() const {
741
+ const uint32_t ret[] = {
742
+ u32[0], u32[2], u32[4], u32[6], u32[1], u32[3], u32[5], u32[7]};
743
+ return simd8uint32{ret};
744
+ }
745
+ };
746
+
747
+ // Vectorized version of the following code:
748
+ // for (size_t i = 0; i < n; i++) {
749
+ // bool flag = (candidateValues[i] < currentValues[i]);
750
+ // minValues[i] = flag ? candidateValues[i] : currentValues[i];
751
+ // minIndices[i] = flag ? candidateIndices[i] : currentIndices[i];
752
+ // maxValues[i] = !flag ? candidateValues[i] : currentValues[i];
753
+ // maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i];
754
+ // }
755
+ // Max indices evaluation is inaccurate in case of equal values (the index of
756
+ // the last equal value is saved instead of the first one), but this behavior
757
+ // saves instructions.
758
+ inline void cmplt_min_max_fast(
759
+ const simd8uint32 candidateValues,
760
+ const simd8uint32 candidateIndices,
761
+ const simd8uint32 currentValues,
762
+ const simd8uint32 currentIndices,
763
+ simd8uint32& minValues,
764
+ simd8uint32& minIndices,
765
+ simd8uint32& maxValues,
766
+ simd8uint32& maxIndices) {
767
+ for (size_t i = 0; i < 8; i++) {
768
+ bool flag = (candidateValues.u32[i] < currentValues.u32[i]);
769
+ minValues.u32[i] = flag ? candidateValues.u32[i] : currentValues.u32[i];
770
+ minIndices.u32[i] =
771
+ flag ? candidateIndices.u32[i] : currentIndices.u32[i];
772
+ maxValues.u32[i] =
773
+ !flag ? candidateValues.u32[i] : currentValues.u32[i];
774
+ maxIndices.u32[i] =
775
+ !flag ? candidateIndices.u32[i] : currentIndices.u32[i];
776
+ }
777
+ }
778
+
779
+ struct simd8float32 : simd256bit {
780
+ simd8float32() {}
781
+
782
+ explicit simd8float32(const simd256bit& x) : simd256bit(x) {}
783
+
784
+ explicit simd8float32(float x) {
785
+ set1(x);
786
+ }
787
+
788
+ explicit simd8float32(const float* x) {
789
+ loadu((void*)x);
790
+ }
791
+
792
+ void set1(float x) {
793
+ for (int i = 0; i < 8; i++) {
794
+ f32[i] = x;
795
+ }
796
+ }
797
+
798
+ explicit simd8float32(
799
+ float f0,
800
+ float f1,
801
+ float f2,
802
+ float f3,
803
+ float f4,
804
+ float f5,
805
+ float f6,
806
+ float f7) {
807
+ f32[0] = f0;
808
+ f32[1] = f1;
809
+ f32[2] = f2;
810
+ f32[3] = f3;
811
+ f32[4] = f4;
812
+ f32[5] = f5;
813
+ f32[6] = f6;
814
+ f32[7] = f7;
815
+ }
816
+
817
+ template <typename F>
818
+ static simd8float32 binary_func(
819
+ const simd8float32& a,
820
+ const simd8float32& b,
821
+ F&& f) {
822
+ simd8float32 c;
823
+ for (int j = 0; j < 8; j++) {
824
+ c.f32[j] = f(a.f32[j], b.f32[j]);
825
+ }
826
+ return c;
827
+ }
828
+
829
+ simd8float32 operator*(const simd8float32& other) const {
830
+ return binary_func(
831
+ *this, other, [](float a, float b) { return a * b; });
832
+ }
833
+
834
+ simd8float32 operator+(const simd8float32& other) const {
835
+ return binary_func(
836
+ *this, other, [](float a, float b) { return a + b; });
837
+ }
838
+
839
+ simd8float32 operator-(const simd8float32& other) const {
840
+ return binary_func(
841
+ *this, other, [](float a, float b) { return a - b; });
842
+ }
843
+
844
+ simd8float32& operator+=(const simd8float32& other) {
845
+ for (size_t i = 0; i < 8; i++) {
846
+ f32[i] += other.f32[i];
847
+ }
848
+
849
+ return *this;
850
+ }
851
+
852
+ bool operator==(simd8float32 other) const {
853
+ for (size_t i = 0; i < 8; i++) {
854
+ if (f32[i] != other.f32[i]) {
855
+ return false;
856
+ }
857
+ }
858
+
859
+ return true;
860
+ }
861
+
862
+ bool operator!=(simd8float32 other) const {
863
+ return !(*this == other);
864
+ }
865
+
866
+ std::string tostring() const {
867
+ char res[1000], *ptr = res;
868
+ for (int i = 0; i < 8; i++) {
869
+ ptr += sprintf(ptr, "%g,", f32[i]);
870
+ }
871
+ // strip last ,
872
+ ptr[-1] = 0;
873
+ return std::string(res);
874
+ }
875
+ };
876
+
877
+ // hadd does not cross lanes
878
+ inline simd8float32 hadd(const simd8float32& a, const simd8float32& b) {
879
+ simd8float32 c;
880
+ c.f32[0] = a.f32[0] + a.f32[1];
881
+ c.f32[1] = a.f32[2] + a.f32[3];
882
+ c.f32[2] = b.f32[0] + b.f32[1];
883
+ c.f32[3] = b.f32[2] + b.f32[3];
884
+
885
+ c.f32[4] = a.f32[4] + a.f32[5];
886
+ c.f32[5] = a.f32[6] + a.f32[7];
887
+ c.f32[6] = b.f32[4] + b.f32[5];
888
+ c.f32[7] = b.f32[6] + b.f32[7];
889
+
890
+ return c;
891
+ }
892
+
893
+ inline simd8float32 unpacklo(const simd8float32& a, const simd8float32& b) {
894
+ simd8float32 c;
895
+ c.f32[0] = a.f32[0];
896
+ c.f32[1] = b.f32[0];
897
+ c.f32[2] = a.f32[1];
898
+ c.f32[3] = b.f32[1];
899
+
900
+ c.f32[4] = a.f32[4];
901
+ c.f32[5] = b.f32[4];
902
+ c.f32[6] = a.f32[5];
903
+ c.f32[7] = b.f32[5];
904
+
905
+ return c;
906
+ }
907
+
908
+ inline simd8float32 unpackhi(const simd8float32& a, const simd8float32& b) {
909
+ simd8float32 c;
910
+ c.f32[0] = a.f32[2];
911
+ c.f32[1] = b.f32[2];
912
+ c.f32[2] = a.f32[3];
913
+ c.f32[3] = b.f32[3];
914
+
915
+ c.f32[4] = a.f32[6];
916
+ c.f32[5] = b.f32[6];
917
+ c.f32[6] = a.f32[7];
918
+ c.f32[7] = b.f32[7];
919
+
920
+ return c;
921
+ }
922
+
923
+ // compute a * b + c
924
+ inline simd8float32 fmadd(
925
+ const simd8float32& a,
926
+ const simd8float32& b,
927
+ const simd8float32& c) {
928
+ simd8float32 res;
929
+ for (int i = 0; i < 8; i++) {
930
+ res.f32[i] = a.f32[i] * b.f32[i] + c.f32[i];
931
+ }
932
+ return res;
933
+ }
934
+
935
+ namespace {
936
+
937
+ // get even float32's of a and b, interleaved
938
+ simd8float32 geteven(const simd8float32& a, const simd8float32& b) {
939
+ simd8float32 c;
940
+
941
+ c.f32[0] = a.f32[0];
942
+ c.f32[1] = a.f32[2];
943
+ c.f32[2] = b.f32[0];
944
+ c.f32[3] = b.f32[2];
945
+
946
+ c.f32[4] = a.f32[4];
947
+ c.f32[5] = a.f32[6];
948
+ c.f32[6] = b.f32[4];
949
+ c.f32[7] = b.f32[6];
950
+
951
+ return c;
952
+ }
953
+
954
+ // get odd float32's of a and b, interleaved
955
+ simd8float32 getodd(const simd8float32& a, const simd8float32& b) {
956
+ simd8float32 c;
957
+
958
+ c.f32[0] = a.f32[1];
959
+ c.f32[1] = a.f32[3];
960
+ c.f32[2] = b.f32[1];
961
+ c.f32[3] = b.f32[3];
962
+
963
+ c.f32[4] = a.f32[5];
964
+ c.f32[5] = a.f32[7];
965
+ c.f32[6] = b.f32[5];
966
+ c.f32[7] = b.f32[7];
967
+
968
+ return c;
969
+ }
970
+
971
+ // 3 cycles
972
+ // if the lanes are a = [a0 a1] and b = [b0 b1], return [a0 b0]
973
+ simd8float32 getlow128(const simd8float32& a, const simd8float32& b) {
974
+ simd8float32 c;
975
+
976
+ c.f32[0] = a.f32[0];
977
+ c.f32[1] = a.f32[1];
978
+ c.f32[2] = a.f32[2];
979
+ c.f32[3] = a.f32[3];
980
+
981
+ c.f32[4] = b.f32[0];
982
+ c.f32[5] = b.f32[1];
983
+ c.f32[6] = b.f32[2];
984
+ c.f32[7] = b.f32[3];
985
+
986
+ return c;
987
+ }
988
+
989
+ simd8float32 gethigh128(const simd8float32& a, const simd8float32& b) {
990
+ simd8float32 c;
991
+
992
+ c.f32[0] = a.f32[4];
993
+ c.f32[1] = a.f32[5];
994
+ c.f32[2] = a.f32[6];
995
+ c.f32[3] = a.f32[7];
996
+
997
+ c.f32[4] = b.f32[4];
998
+ c.f32[5] = b.f32[5];
999
+ c.f32[6] = b.f32[6];
1000
+ c.f32[7] = b.f32[7];
1001
+
1002
+ return c;
1003
+ }
1004
+
1005
+ // The following primitive is a vectorized version of the following code
1006
+ // snippet:
1007
+ // float lowestValue = HUGE_VAL;
1008
+ // uint lowestIndex = 0;
1009
+ // for (size_t i = 0; i < n; i++) {
1010
+ // if (values[i] < lowestValue) {
1011
+ // lowestValue = values[i];
1012
+ // lowestIndex = i;
1013
+ // }
1014
+ // }
1015
+ // Vectorized version can be implemented via two operations: cmp and blend
1016
+ // with something like this:
1017
+ // lowestValues = [HUGE_VAL; 8];
1018
+ // lowestIndices = {0, 1, 2, 3, 4, 5, 6, 7};
1019
+ // for (size_t i = 0; i < n; i += 8) {
1020
+ // auto comparison = cmp(values + i, lowestValues);
1021
+ // lowestValues = blend(
1022
+ // comparison,
1023
+ // values + i,
1024
+ // lowestValues);
1025
+ // lowestIndices = blend(
1026
+ // comparison,
1027
+ // i + {0, 1, 2, 3, 4, 5, 6, 7},
1028
+ // lowestIndices);
1029
+ // lowestIndices += {8, 8, 8, 8, 8, 8, 8, 8};
1030
+ // }
1031
+ // The problem is that blend primitive needs very different instruction
1032
+ // order for AVX and ARM.
1033
+ // So, let's introduce a combination of these two in order to avoid
1034
+ // confusion for ppl who write in low-level SIMD instructions. Additionally,
1035
+ // these two ops (cmp and blend) are very often used together.
1036
+ inline void cmplt_and_blend_inplace(
1037
+ const simd8float32 candidateValues,
1038
+ const simd8uint32 candidateIndices,
1039
+ simd8float32& lowestValues,
1040
+ simd8uint32& lowestIndices) {
1041
+ for (size_t j = 0; j < 8; j++) {
1042
+ bool comparison = (candidateValues.f32[j] < lowestValues.f32[j]);
1043
+ if (comparison) {
1044
+ lowestValues.f32[j] = candidateValues.f32[j];
1045
+ lowestIndices.u32[j] = candidateIndices.u32[j];
1046
+ }
1047
+ }
1048
+ }
1049
+
1050
+ // Vectorized version of the following code:
1051
+ // for (size_t i = 0; i < n; i++) {
1052
+ // bool flag = (candidateValues[i] < currentValues[i]);
1053
+ // minValues[i] = flag ? candidateValues[i] : currentValues[i];
1054
+ // minIndices[i] = flag ? candidateIndices[i] : currentIndices[i];
1055
+ // maxValues[i] = !flag ? candidateValues[i] : currentValues[i];
1056
+ // maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i];
1057
+ // }
1058
+ // Max indices evaluation is inaccurate in case of equal values (the index of
1059
+ // the last equal value is saved instead of the first one), but this behavior
1060
+ // saves instructions.
1061
+ inline void cmplt_min_max_fast(
1062
+ const simd8float32 candidateValues,
1063
+ const simd8uint32 candidateIndices,
1064
+ const simd8float32 currentValues,
1065
+ const simd8uint32 currentIndices,
1066
+ simd8float32& minValues,
1067
+ simd8uint32& minIndices,
1068
+ simd8float32& maxValues,
1069
+ simd8uint32& maxIndices) {
1070
+ for (size_t i = 0; i < 8; i++) {
1071
+ bool flag = (candidateValues.f32[i] < currentValues.f32[i]);
1072
+ minValues.f32[i] = flag ? candidateValues.f32[i] : currentValues.f32[i];
1073
+ minIndices.u32[i] =
1074
+ flag ? candidateIndices.u32[i] : currentIndices.u32[i];
1075
+ maxValues.f32[i] =
1076
+ !flag ? candidateValues.f32[i] : currentValues.f32[i];
1077
+ maxIndices.u32[i] =
1078
+ !flag ? candidateIndices.u32[i] : currentIndices.u32[i];
1079
+ }
1080
+ }
1081
+
1082
+ } // namespace
1083
+
1084
+ } // namespace faiss