RubyGems - faiss - Versions diffs - 0.6.1 → 0.6.2 - Mend

faiss 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (93)

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/Index.h +1 -1
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +6 -7
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +3 -3
data/vendor/faiss/faiss/IndexHNSW.cpp +173 -143
data/vendor/faiss/faiss/IndexIVF.cpp +2 -2
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -1
data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +3 -3
data/vendor/faiss/faiss/IndexIVFPQ.cpp +2 -3
data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -3
data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +4 -13
data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -1
data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +68 -6
data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
data/vendor/faiss/faiss/factory_tools.cpp +4 -0
data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +11 -12
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +3 -3
data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +7 -0
data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +48 -3
data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
data/vendor/faiss/faiss/impl/HNSW.cpp +556 -199
data/vendor/faiss/faiss/impl/HNSW.h +51 -13
data/vendor/faiss/faiss/impl/NSG.cpp +15 -11
data/vendor/faiss/faiss/impl/Panorama.h +11 -0
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -2
data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +1 -1
data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +7 -1
data/vendor/faiss/faiss/impl/ResultHandler.h +1 -0
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +271 -8
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +50 -0
data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
data/vendor/faiss/faiss/impl/VisitedTable.h +69 -34
data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +3 -1
data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +35 -43
data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -15
data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +86 -40
data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +81 -50
data/vendor/faiss/faiss/impl/index_read.cpp +100 -39
data/vendor/faiss/faiss/impl/index_write.cpp +1 -0
data/vendor/faiss/faiss/impl/io_macros.h +25 -0
data/vendor/faiss/faiss/impl/platform_macros.h +12 -8
data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +2 -0
data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +2 -0
data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +2 -0
data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +20 -0
data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +36 -0
data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -0
data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +2 -0
data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +6 -0
data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +327 -18
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +264 -27
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +199 -27
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +366 -3
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +144 -19
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +26 -0
data/vendor/faiss/faiss/impl/simd_dispatch.h +65 -8
data/vendor/faiss/faiss/index_factory.cpp +5 -1
data/vendor/faiss/faiss/index_io.h +16 -0
data/vendor/faiss/faiss/invlists/DirectMap.cpp +4 -1
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +13 -13
data/vendor/faiss/faiss/invlists/InvertedLists.h +2 -2
data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +119 -22
data/vendor/faiss/faiss/svs/IndexSVSVamana.h +15 -5
data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +65 -24
data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +3 -2
data/vendor/faiss/faiss/utils/bf16.h +34 -0
data/vendor/faiss/faiss/utils/distances_simd.cpp +0 -1
data/vendor/faiss/faiss/utils/hamming.cpp +8 -8
data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +2 -1
data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +6 -30
data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
data/vendor/faiss/faiss/utils/partitioning.cpp +0 -2
data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +14 -68
data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
data/vendor/faiss/faiss/utils/simd_levels.cpp +12 -2
metadata +12 -2

data/vendor/faiss/faiss/impl/ScalarQuantizer.h CHANGED

@@ -7,6 +7,8 @@
 #pragma once
+#include <cstring>
 #include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/impl/DistanceComputer.h>
 #include <faiss/impl/Quantizer.h>
@@ -39,6 +41,10 @@ struct ScalarQuantizer : Quantizer {
         QT_3bit_tqmse, ///< TurboQuant MSE-optimized, 3 bits per component
         QT_4bit_tqmse, ///< TurboQuant MSE-optimized, 4 bits per component
         QT_8bit_tqmse, ///< TurboQuant MSE-optimized, 8 bits per component
+        QT_2bit_tq,    ///< Full TurboQuant (1-bit MSE + 1-bit QJL + factors)
+        QT_3bit_tq,    ///< Full TurboQuant (2-bit MSE + 1-bit QJL + factors)
+        QT_4bit_tq,    ///< Full TurboQuant (3-bit MSE + 1-bit QJL + factors)
+        QT_5bit_tq,    ///< Full TurboQuant (4-bit MSE + 1-bit QJL + factors)
         QT_count
     };
@@ -131,6 +137,50 @@ struct ScalarQuantizer : Quantizer {
         }
     };
+    /// TurboQuant full (QT_*_tq) refinement state, isolated from the
+    /// main ScalarQuantizer to avoid polluting it with TQ-specific data.
+    struct TurboQuantRefine {
+        static bool is_turboq_full(QuantizerType qt) {
+            return qt >= QT_2bit_tq && qt <= QT_5bit_tq;
+        }
+        static void pack_seed(uint64_t seed, float out[2]) {
+            static_assert(sizeof(uint64_t) == 2 * sizeof(float));
+            std::memcpy(out, &seed, sizeof(uint64_t));
+        }
+        static uint64_t unpack_seed(float lo, float hi) {
+            float tmp[2] = {lo, hi};
+            uint64_t s;
+            static_assert(sizeof(uint64_t) == 2 * sizeof(float));
+            std::memcpy(&s, tmp, sizeof(uint64_t));
+            return s;
+        }
+        uint8_t qjl_type = 0;
+        uint64_t seed = 42;
+        size_t padded_d = 0;
+        std::vector<float> fwht_signs;
+        std::vector<float> rr_matrix;
+        size_t nb_bits_lo = 0;
+        size_t n_hi_dims = 0;
+        void init_projection(size_t d);
+        bool use_fwht() const {
+            return qjl_type == 0;
+        }
+        struct DistanceComputer : SQDistanceComputer {
+            virtual void configure(uint8_t qb, bool int_qjl) = 0;
+            virtual void set_prescreen_threshold(
+                    const float* t,
+                    bool minimize) = 0;
+            virtual void clear_prescreen_threshold() = 0;
+        };
+    };
+    TurboQuantRefine turboq_refine;
     SQDistanceComputer* get_distance_computer(
             MetricType metric = METRIC_L2) const;

data/vendor/faiss/faiss/impl/VisitedTable.cpp CHANGED

@@ -18,19 +18,19 @@ namespace faiss {
 // A size of ~1M seems to be the threshold where the hash set wins.
 size_t visited_table_hashset_threshold = 500000;
-VisitedTable::VisitedTable(size_t size, std::optional<bool> use_hashset)
-        : visno(use_hashset.value_or(size >= visited_table_hashset_threshold)
-                        ? 0
-                        : 1) {
-    if (visno != 0) {
-        visited.resize(size, 0);
+std::unique_ptr<VisitedTable> VisitedTable::create(
+        size_t size,
+        std::optional<bool> use_hashset) {
+    bool use_set =
+            use_hashset.value_or(size >= visited_table_hashset_threshold);
+    if (use_set) {
+        return std::make_unique<VisitedTableSet>();
     }
+    return std::make_unique<VisitedTableVector>(size);
 }
-void VisitedTable::advance() {
-    if (visno == 0) {
-        visited_set.clear();
-    } else if (visno < 254) {
+void VisitedTableVector::advance() {
+    if (visno < 254) {
         // 254 rather than 255 because sometimes we use visno and visno+1
         ++visno;
     } else {

data/vendor/faiss/faiss/impl/VisitedTable.h CHANGED

@@ -10,6 +10,7 @@
 #include <stdint.h>
+#include <memory>
 #include <optional>
 #include <unordered_set>
 #include <vector>
@@ -21,54 +22,88 @@ namespace faiss {
 FAISS_API extern size_t visited_table_hashset_threshold;
-/// A fast, reusable Visited Set for graph search algorithms.
+/// Abstract base class for a fast, reusable Visited Set for graph search
+/// algorithms.
 struct VisitedTable {
-    std::vector<uint8_t> visited;
-    std::unordered_set<size_t> visited_set;
-    uint8_t visno; // 0 if using visited_set, 1..250 if using vector.
+    virtual ~VisitedTable() = default;
+    /// set flag #no to true, return whether this changed it.
+    virtual bool set(size_t no) = 0;
+    /// get flag #no
+    virtual bool get(size_t no) const = 0;
+    /// prefetch flag #no
+    virtual void prefetch(size_t no) const = 0;
+    /// pre-allocate bucket space to avoid rehashing during repeated set() calls
+    virtual void reserve(size_t /*n*/) {}
-    // If use_hashset is nullopt, the use of a hashset will be determined by
-    // size >= visited_table_hashset_threshold.
-    explicit VisitedTable(
+    /// reset all flags to false
+    virtual void advance() = 0;
+    /// Factory method to create appropriate implementation.
+    /// If use_hashset is nullopt, the use of a hashset will be determined by
+    /// size >= visited_table_hashset_threshold.
+    static std::unique_ptr<VisitedTable> create(
             size_t size,
             std::optional<bool> use_hashset = std::nullopt);
+};
-    /// set flag #no to true, return whether this changed it.
-    bool set(size_t no) {
-        if (visno == 0) {
-            return visited_set.insert(no).second;
-        } else if (visited[no] == visno) {
-            return false;
-        } else {
-            visited[no] = visno;
-            return true;
-        }
+/// Set-based implementation using unordered_set.
+/// O(1) to construct and O(visits) to advance.
+struct VisitedTableSet FAISS_FINAL : VisitedTable {
+    std::unordered_set<size_t> visited_set;
+    VisitedTableSet() = default;
+    bool set(size_t no) final {
+        return visited_set.insert(no).second;
     }
-    /// pre-allocate bucket space to avoid rehashing during repeated set() calls
-    void reserve(size_t n) {
-        if (visno == 0) {
-            visited_set.reserve(n);
-        }
+    bool get(size_t no) const final {
+        return visited_set.count(no) != 0;
     }
-    /// get flag #no
-    bool get(size_t no) const {
-        if (visno == 0) {
-            return visited_set.count(no) != 0;
-        } else {
-            return visited[no] == visno;
-        }
+    void prefetch(size_t /*no*/) const final {
+        // No-op for set-based implementation
     }
-    void prefetch(size_t no) const {
-        if (visno != 0) {
-            prefetch_L2(&visited[no]);
+    void reserve(size_t n) final {
+        visited_set.reserve(n);
+    }
+    void advance() final {
+        visited_set.clear();
+    }
+};
+/// Vector-based implementation using a versioned byte array.
+/// Faster for get()/set(), but O(size) to initialize.
+/// advance() is O(1) except every 250 calls, which are O(size).
+struct VisitedTableVector FAISS_FINAL : VisitedTable {
+    std::vector<uint8_t> visited;
+    uint8_t visno{1}; // Version number, 1..254
+    explicit VisitedTableVector(size_t size) : visited(size, 0) {}
+    bool set(size_t no) final {
+        if (visited[no] == visno) {
+            return false;
         }
+        visited[no] = visno;
+        return true;
     }
-    /// reset all flags to false
-    void advance();
+    bool get(size_t no) const final {
+        return visited[no] == visno;
+    }
+    void prefetch(size_t no) const final {
+        prefetch_L2(&visited[no]);
+    }
+    void advance() final;
 };
 } // namespace faiss

data/vendor/faiss/faiss/impl/fast_scan/dispatching.h CHANGED

@@ -48,7 +48,9 @@ using namespace simd_result_handlers;
  * so callers don't need to know the handler type.
  ***************************************************************/
-template <class Handler>
+// SIMDLevel SL = THE_LEVEL_TO_DISPATCH added to make the mangled
+// symbol name unique per translation unit.
+template <class Handler, SIMDLevel SL = THE_LEVEL_TO_DISPATCH>
 struct ScannerMixIn : FastScanCodeScanner {
     Handler handler_;

data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp CHANGED

@@ -5,39 +5,32 @@
  * LICENSE file in the root directory of this source tree.
  */
-#include <cmath>
 #include <faiss/impl/hnsw/MinimaxHeap.h>
-#include <cassert>
 #include <faiss/impl/simd_dispatch.h>
 namespace faiss {
-void MinimaxHeap::push(storage_idx_t i, float v) {
-    // Treat NaN distances as infinitely far away so heap ordering is preserved.
-    if (std::isnan(v)) {
-        v = HC::neutral();
-    }
-    if (k == n) {
-        if (v >= dis[0]) {
-            return;
-        }
-        if (ids[0] != -1) {
-            --nvalid;
-        }
-        faiss::heap_pop<HC>(k--, dis.data(), ids.data());
-    }
-    faiss::heap_push<HC>(++k, dis.data(), ids.data(), v, i);
-    ++nvalid;
+// Runtime-dispatched pop_min (NONE + AVX2 + AVX512 only).
+constexpr int MINIMAX_HEAP_SIMD_LEVELS = (1 << int(SIMDLevel::NONE)) |
+        (1 << int(SIMDLevel::AVX2)) | (1 << int(SIMDLevel::AVX512));
+template <class HC_>
+int MinimaxHeapT<HC_>::pop_min(float* vmin_out) {
+    return with_selected_simd_levels<MINIMAX_HEAP_SIMD_LEVELS>(
+            [&]<SIMDLevel SL>() {
+                return pop_min_tpl<HC_, SL>(this, vmin_out);
+            });
 }
-// Scalar (NONE) specialization of pop_min_tpl
-template <>
-int MinimaxHeap::pop_min_tpl<SIMDLevel::NONE>(float* vmin_out) {
+// Primary-template scalar implementation. Used directly when SL==NONE
+template <class HC>
+int pop_min_simd_none(MinimaxHeapT<HC>* heap, float* vmin_out) {
+    int k = heap->k;
+    int* ids = heap->ids.data();
+    float* dis = heap->dis.data();
     assert(k > 0);
-    // returns min. This is an O(n) operation
+    // Returns the "best" entry. This is an O(n) operation.
     int i = k - 1;
     while (i >= 0) {
         if (ids[i] != -1) {
@@ -52,7 +45,8 @@ int MinimaxHeap::pop_min_tpl<SIMDLevel::NONE>(float* vmin_out) {
     float vmin = dis[i];
     i--;
     while (i >= 0) {
-        if (ids[i] != -1 && dis[i] < vmin) {
+        // HC::cmp(vmin, dis[i]) → "dis[i] is better than vmin".
+        if (ids[i] != -1 && HC::cmp(vmin, dis[i])) {
             vmin = dis[i];
             imin = i;
         }
@@ -63,29 +57,27 @@ int MinimaxHeap::pop_min_tpl<SIMDLevel::NONE>(float* vmin_out) {
     }
     int ret = ids[imin];
     ids[imin] = -1;
-    --nvalid;
+    --heap->nvalid;
     return ret;
 }
-// Runtime-dispatched pop_min (NONE + AVX2 + AVX512 only)
-constexpr int MINIMAX_HEAP_SIMD_LEVELS = (1 << int(SIMDLevel::NONE)) |
-        (1 << int(SIMDLevel::AVX2)) | (1 << int(SIMDLevel::AVX512));
-int MinimaxHeap::pop_min(float* vmin_out) {
-    return with_selected_simd_levels<MINIMAX_HEAP_SIMD_LEVELS>(
-            [&]<SIMDLevel SL>() { return pop_min_tpl<SL>(vmin_out); });
+// declare for min and max heap at simd level NONE
+template <>
+int pop_min_tpl<CMin<float, int32_t>, SIMDLevel::NONE>(
+        MinimaxHeapT<CMin<float, int32_t>>* heap,
+        float* vmin_out) {
+    return pop_min_simd_none(heap, vmin_out);
 }
-int MinimaxHeap::count_below(float thresh) {
-    int n_below = 0;
-    for (int i = 0; i < k; i++) {
-        if (dis[i] < thresh) {
-            n_below++;
-        }
-    }
-    return n_below;
+template <>
+int pop_min_tpl<CMax<float, int32_t>, SIMDLevel::NONE>(
+        MinimaxHeapT<CMax<float, int32_t>>* heap,
+        float* vmin_out) {
+    return pop_min_simd_none(heap, vmin_out);
 }
+// Explicit instantiations of pop_min for the two HC variants
+template int MinimaxHeapT<CMax<float, int32_t>>::pop_min(float*);
+template int MinimaxHeapT<CMin<float, int32_t>>::pop_min(float*);
 } // namespace faiss

data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h CHANGED

@@ -7,21 +7,30 @@
 #pragma once
+#include <cassert>
+#include <cmath>
 #include <cstdint>
 #include <vector>
 #include <faiss/utils/Heap.h>
+#include <faiss/utils/ordered_key_value.h>
 #include <faiss/utils/simd_levels.h>
 namespace faiss {
 /** Heap structure that allows fast access and updates.
  *
- * Supports both max-heap operations (via the underlying CMax heap)
- * and efficient min extraction via linear scan (with optional SIMD
- * acceleration).
+ * Templated on the comparator HC_ so that the same data structure can
+ * service both distance-style searches (HC_ = CMax<float, int32_t>, smaller
+ * is better) and similarity-style searches (HC_ = CMin<float, int32_t>,
+ * larger is better). For the distance variant the underlying heap is a
+ * max-heap and "pop_min" returns the closest element; for similarity the
+ * underlying heap is a min-heap and "pop_min" returns the most similar
+ * element.
  */
-struct MinimaxHeap {
+template <class HC_ = CMax<float, int32_t>>
+struct MinimaxHeapT {
+    using HC = HC_;
     using storage_idx_t = int32_t;
     int n;
@@ -30,12 +39,34 @@ struct MinimaxHeap {
     std::vector<storage_idx_t> ids;
     std::vector<float> dis;
-    using HC = faiss::CMax<float, storage_idx_t>;
-    explicit MinimaxHeap(int n_in)
+    explicit MinimaxHeapT(int n_in)
             : n(n_in), k(0), nvalid(0), ids(n_in), dis(n_in) {}
-    void push(storage_idx_t i, float v);
+    void push(storage_idx_t i, float v) {
+        // Treat NaN distances as the "worst" value so heap ordering is
+        // preserved (insertion is then guaranteed to fall through the
+        // not-better-than-top early-reject branch when the heap is full).
+        if (std::isnan(v)) {
+            v = HC::neutral();
+        }
+        if (k == n) {
+            // top of the heap is the "worst" entry under HC. If the new
+            // value is not strictly better than the worst, drop it.
+            // HC::cmp(top, v) means "v is better than top" for both CMax
+            // (cmp = a > b → top > v → v < top) and CMin (cmp = a < b →
+            // top < v → v > top).
+            if (!HC::cmp(dis[0], v)) {
+                return;
+            }
+            if (ids[0] != -1) {
+                --nvalid;
+            }
+            faiss::heap_pop<HC>(k--, dis.data(), ids.data());
+        }
+        faiss::heap_push<HC>(++k, dis.data(), ids.data(), v, i);
+        ++nvalid;
+    }
     float max() const {
         return dis[0];
@@ -49,16 +80,34 @@ struct MinimaxHeap {
         nvalid = k = 0;
     }
-    /// SIMD-templated pop_min implementation.
-    /// Specializations exist for NONE, AVX2, and AVX512.
-    template <SIMDLevel SL>
-    int pop_min_tpl(float* vmin_out = nullptr);
-    /// Runtime-dispatched pop_min (calls pop_min_tpl with best available
-    /// SIMD level).
+    /// Runtime-dispatched best-element extraction (NONE + AVX2 + AVX512).
     int pop_min(float* vmin_out = nullptr);
-    int count_below(float thresh);
+    int count_below(float thresh) {
+        int n_below = 0;
+        for (int i = 0; i < k; i++) {
+            // Count entries that are strictly "better than" thresh.
+            // HC::cmp(thresh, dis[i]) → for CMax: thresh > dis[i]
+            // (i.e., dis[i] < thresh, the historical L2 semantics);
+            // for CMin: thresh < dis[i] (similarity above threshold).
+            if (HC::cmp(thresh, dis[i])) {
+                n_below++;
+            }
+        }
+        return n_below;
+    }
 };
+// Default `MinimaxHeap` keeps the historical max-heap semantics (smaller
+// distance is better). The CMin instantiation is used when the owning
+// HNSW has `is_similarity = true`. The alias itself is declared once,
+// alongside the forward declaration in HNSW.h, to avoid duplicate
+// `using` declarations that SWIG treats as redundant.
+// Forward declarations of the SIMD specializations. The actual bodies live
+// in the SIMD-specific translation units (avx2.cpp, avx512.cpp) and are
+// resolved at link time.
+template <class HC_, SIMDLevel SL>
+int pop_min_tpl(MinimaxHeapT<HC_>* heap, float* vmin_out);
 } // namespace faiss

data/vendor/faiss/faiss/impl/hnsw/avx2.cpp CHANGED

@@ -16,89 +16,135 @@
 namespace faiss {
-template <>
-int MinimaxHeap::pop_min_tpl<SIMDLevel::AVX2>(float* vmin_out) {
-    assert(k > 0);
+namespace {
+/// Templated AVX2 implementation of "pop best" for both CMax (returns
+/// the smallest distance) and CMin (returns the largest similarity).
+/// The only differences between the two flavors are: (1) the initial
+/// "worst possible" value, (2) the running-best update comparison
+/// (`_CMP_LT_OS` vs `_CMP_GT_OS`), and (3) the tiebreaker direction.
+template <class HC>
+int pop_best_avx2(MinimaxHeapT<HC>& heap, float* vmin_out) {
+    using storage_idx_t = typename MinimaxHeapT<HC>::storage_idx_t;
     static_assert(
             std::is_same<storage_idx_t, int32_t>::value,
             "This code expects storage_idx_t to be int32_t");
+    assert(heap.k > 0);
+    // For CMax (distance) the "best" candidate is the smallest value, so
+    // we initialize the running best to +inf. For CMin (similarity) the
+    // best is the largest value, so we initialize to -inf.
+    constexpr float worst_v = HC::is_max
+            ? std::numeric_limits<float>::infinity()
+            : -std::numeric_limits<float>::infinity();
-    int32_t min_idx = -1;
-    float min_dis = std::numeric_limits<float>::infinity();
+    int32_t best_idx = -1;
+    float best_dis = worst_v;
     size_t iii = 0;
-    __m256i min_indices = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, -1);
-    __m256 min_distances =
-            _mm256_set1_ps(std::numeric_limits<float>::infinity());
+    __m256i best_indices = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, -1);
+    __m256 best_distances = _mm256_set1_ps(worst_v);
     __m256i current_indices = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
     __m256i offset = _mm256_set1_epi32(8);
-    // The baseline version is available in the NONE specialization.
-    // The following loop tracks the rightmost index with the min distance.
-    // -1 index values are ignored.
-    const size_t k8 = (k / 8) * 8;
+    // Track the rightmost index whose distance equals the running best.
+    // -1 index values are filtered out via m1mask.
+    const size_t k8 = (heap.k / 8) * 8;
     for (; iii < k8; iii += 8) {
         __m256i indices =
-                _mm256_loadu_si256((const __m256i*)(ids.data() + iii));
-        __m256 distances = _mm256_loadu_ps(dis.data() + iii);
+                _mm256_loadu_si256((const __m256i*)(heap.ids.data() + iii));
+        __m256 distances = _mm256_loadu_ps(heap.dis.data() + iii);
-        // This mask filters out -1 values among indices.
+        // Mask out -1 indices (invalid entries).
         __m256i m1mask = _mm256_cmpgt_epi32(_mm256_setzero_si256(), indices);
-        __m256i dmask = _mm256_castps_si256(
-                _mm256_cmp_ps(min_distances, distances, _CMP_LT_OS));
+        // dmask is "true where best is already (strictly) better than the
+        // candidate" — entries the candidate should NOT update. For CMax,
+        // best < candidate means we keep best (we want the smallest);
+        // for CMin we keep best when best > candidate (we want the largest).
+        __m256i dmask;
+        if constexpr (HC::is_max) {
+            dmask = _mm256_castps_si256(
+                    _mm256_cmp_ps(best_distances, distances, _CMP_LT_OS));
+        } else {
+            dmask = _mm256_castps_si256(
+                    _mm256_cmp_ps(best_distances, distances, _CMP_GT_OS));
+        }
         __m256 finalmask = _mm256_castsi256_ps(_mm256_or_si256(m1mask, dmask));
-        const __m256i min_indices_new = _mm256_castps_si256(_mm256_blendv_ps(
+        const __m256i best_indices_new = _mm256_castps_si256(_mm256_blendv_ps(
                 _mm256_castsi256_ps(current_indices),
-                _mm256_castsi256_ps(min_indices),
+                _mm256_castsi256_ps(best_indices),
                 finalmask));
-        const __m256 min_distances_new =
-                _mm256_blendv_ps(distances, min_distances, finalmask);
+        const __m256 best_distances_new =
+                _mm256_blendv_ps(distances, best_distances, finalmask);
-        min_indices = min_indices_new;
-        min_distances = min_distances_new;
+        best_indices = best_indices_new;
+        best_distances = best_distances_new;
         current_indices = _mm256_add_epi32(current_indices, offset);
     }
-    // Vectorizing is doable, but is not practical
+    // Vectorizing the horizontal reduction is doable but not practical.
     int32_t vidx8[8];
     float vdis8[8];
-    _mm256_storeu_ps(vdis8, min_distances);
-    _mm256_storeu_si256((__m256i*)vidx8, min_indices);
+    _mm256_storeu_ps(vdis8, best_distances);
+    _mm256_storeu_si256((__m256i*)vidx8, best_indices);
     for (size_t j = 0; j < 8; j++) {
-        if (min_dis > vdis8[j] || (min_dis == vdis8[j] && min_idx < vidx8[j])) {
-            min_idx = vidx8[j];
-            min_dis = vdis8[j];
+        const bool strictly_better =
+                HC::is_max ? (best_dis > vdis8[j]) : (best_dis < vdis8[j]);
+        if (strictly_better || (best_dis == vdis8[j] && best_idx < vidx8[j])) {
+            best_idx = vidx8[j];
+            best_dis = vdis8[j];
         }
     }
-    // process last values. Vectorizing is doable, but is not practical
-    for (; iii < static_cast<size_t>(k); iii++) {
-        if (ids[iii] != -1 && dis[iii] <= min_dis) {
-            min_dis = dis[iii];
-            min_idx = iii;
+    // Tail (under 8 entries). Vectorizing is doable but not practical.
+    for (; iii < static_cast<size_t>(heap.k); iii++) {
+        if (heap.ids[iii] == -1) {
+            continue;
+        }
+        const bool weakly_better = HC::is_max ? (best_dis >= heap.dis[iii])
+                                              : (best_dis <= heap.dis[iii]);
+        if (weakly_better) {
+            best_dis = heap.dis[iii];
+            best_idx = iii;
         }
     }
-    if (min_idx == -1) {
+    if (best_idx == -1) {
         return -1;
     }
     if (vmin_out) {
-        *vmin_out = min_dis;
+        *vmin_out = best_dis;
     }
-    int ret = ids[min_idx];
-    ids[min_idx] = -1;
-    --nvalid;
+    int ret = heap.ids[best_idx];
+    heap.ids[best_idx] = -1;
+    --heap.nvalid;
     return ret;
 }
+} // namespace
+// Explicit specializations for AVX2
+template <>
+int pop_min_tpl<CMax<float, int32_t>, SIMDLevel::AVX2>(
+        MinimaxHeapT<CMax<float, int32_t>>* heap,
+        float* vmin_out) {
+    return pop_best_avx2<CMax<float, int32_t>>(*heap, vmin_out);
+}
+template <>
+int pop_min_tpl<CMin<float, int32_t>, SIMDLevel::AVX2>(
+        MinimaxHeapT<CMin<float, int32_t>>* heap,
+        float* vmin_out) {
+    return pop_best_avx2<CMin<float, int32_t>>(*heap, vmin_out);
+}
 } // namespace faiss
 #endif // COMPILE_SIMD_AVX2