RubyGems - faiss - Versions diffs - 0.6.1 → 0.6.2 - Mend

faiss 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (93) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/Index.h +1 -1
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +6 -7
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +3 -3
data/vendor/faiss/faiss/IndexHNSW.cpp +173 -143
data/vendor/faiss/faiss/IndexIVF.cpp +2 -2
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -1
data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +3 -3
data/vendor/faiss/faiss/IndexIVFPQ.cpp +2 -3
data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -3
data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +4 -13
data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -1
data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +68 -6
data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
data/vendor/faiss/faiss/factory_tools.cpp +4 -0
data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +11 -12
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +3 -3
data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +7 -0
data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +48 -3
data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
data/vendor/faiss/faiss/impl/HNSW.cpp +556 -199
data/vendor/faiss/faiss/impl/HNSW.h +51 -13
data/vendor/faiss/faiss/impl/NSG.cpp +15 -11
data/vendor/faiss/faiss/impl/Panorama.h +11 -0
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -2
data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +1 -1
data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +7 -1
data/vendor/faiss/faiss/impl/ResultHandler.h +1 -0
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +271 -8
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +50 -0
data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
data/vendor/faiss/faiss/impl/VisitedTable.h +69 -34
data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +3 -1
data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +35 -43
data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -15
data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +86 -40
data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +81 -50
data/vendor/faiss/faiss/impl/index_read.cpp +100 -39
data/vendor/faiss/faiss/impl/index_write.cpp +1 -0
data/vendor/faiss/faiss/impl/io_macros.h +25 -0
data/vendor/faiss/faiss/impl/platform_macros.h +12 -8
data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +2 -0
data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +2 -0
data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +2 -0
data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +20 -0
data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +36 -0
data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -0
data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +2 -0
data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +6 -0
data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +327 -18
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +264 -27
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +199 -27
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +366 -3
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +144 -19
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +26 -0
data/vendor/faiss/faiss/impl/simd_dispatch.h +65 -8
data/vendor/faiss/faiss/index_factory.cpp +5 -1
data/vendor/faiss/faiss/index_io.h +16 -0
data/vendor/faiss/faiss/invlists/DirectMap.cpp +4 -1
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +13 -13
data/vendor/faiss/faiss/invlists/InvertedLists.h +2 -2
data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +119 -22
data/vendor/faiss/faiss/svs/IndexSVSVamana.h +15 -5
data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +65 -24
data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +3 -2
data/vendor/faiss/faiss/utils/bf16.h +34 -0
data/vendor/faiss/faiss/utils/distances_simd.cpp +0 -1
data/vendor/faiss/faiss/utils/hamming.cpp +8 -8
data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +2 -1
data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +6 -30
data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
data/vendor/faiss/faiss/utils/partitioning.cpp +0 -2
data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +14 -68
data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
data/vendor/faiss/faiss/utils/simd_levels.cpp +12 -2
metadata +12 -2

data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp ADDED Viewed

@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/**
+ * @file rabitq_avx512_spr.cpp
+ *
+ * RaBitQ SIMD kernels specialized for SIMDLevel::AVX512_SPR.
+ *
+ * Sapphire Rapids (SPR) and later Intel microarchitectures expose
+ * AVX-512 VPOPCNTDQ (vpopcntq), which performs a per-lane 64-bit
+ * popcount in a single instruction. This is used here to replace the
+ * multi-step shuffle/pshufb-based popcount used by the generic AVX-512
+ * specialization in rabitq_avx512.cpp. The popcount-heavy kernels
+ * (bitwise_and_dot_product, bitwise_xor_dot_product, popcount) become
+ * substantially shorter and faster on SPR+ as a result.
+ *
+ * Build / dispatch behavior:
+ *   - faiss_avx512 (AVX-512 only, no SPR features): NOT compiled.
+ *     The existing AVX512 specialization in rabitq_avx512.cpp is used.
+ *   - faiss_avx512_spr (statically built for SPR+): compiled. The
+ *     SINGLE_SIMD_LEVEL is AVX512_SPR, so this specialization is
+ *     selected by static dispatch.
+ *   - faiss with FAISS_OPT_LEVEL=dd (dynamic dispatch): compiled with
+ *     -mavx512vpopcntdq as a per-file flag. Selected at runtime when
+ *     SIMDConfig::level == SIMDLevel::AVX512_SPR.
+ *
+ * The floating-point multi-bit inner-product kernel does not benefit
+ * from VPOPCNTDQ, so this TU forwards compute_inner_product<SPR> to
+ * the AVX512 implementation to avoid duplicating that code path.
+ */
+#ifdef COMPILE_SIMD_AVX512_SPR
+#include <faiss/utils/popcount.h>
+#include <faiss/utils/rabitq_simd.h>
+#include <immintrin.h>
+#include <cstdint>
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+namespace faiss::rabitq {
+// Forward declarations for the AVX512 specializations defined in
+// rabitq_avx512.cpp (same TU group on SPR builds). Declaring rather
+// than redefining avoids ODR risk and keeps a single source of truth.
+// NOTE(review): none of these three declarations is called in the
+// visible part of this file; confirm they are still needed.
+template <>
+uint64_t bitwise_and_dot_product<SIMDLevel::AVX512>( // declaration only, no body here
+        const uint8_t* query,
+        const uint8_t* data,
+        size_t size,
+        size_t qb);
+template <>
+uint64_t bitwise_xor_dot_product<SIMDLevel::AVX512>( // declaration only, no body here
+        const uint8_t* query,
+        const uint8_t* data,
+        size_t size,
+        size_t qb);
+template <>
+uint64_t popcount<SIMDLevel::AVX512>(const uint8_t* data, size_t size); // declaration only
+namespace { // file-local helpers: per-lane popcount wrappers and horizontal sums
+// 512-bit popcount using AVX-512 VPOPCNTDQ (vpopcntq).
+// Returns the popcount of each of the 8 uint64 lanes, lane by lane.
+inline __m512i popcount_512_vpopcntdq(__m512i v) {
+    return _mm512_popcnt_epi64(v);
+}
+// 256-bit popcount using AVX-512VL VPOPCNTDQ.
+// AVX512VL is part of the SPR feature set, so vpopcntq is available
+// on 256-bit registers via _mm256_popcnt_epi64 (4 uint64 lanes).
+inline __m256i popcount_256_vpopcntdq(__m256i v) {
+    return _mm256_popcnt_epi64(v);
+}
+// 128-bit popcount using AVX-512VL VPOPCNTDQ (2 uint64 lanes).
+inline __m128i popcount_128_vpopcntdq(__m128i v) {
+    return _mm_popcnt_epi64(v);
+}
+inline uint64_t reduce_add_256(__m256i v) { // horizontal sum of the four 64-bit lanes of v
+    alignas(32) uint64_t lanes[4]; // 32-byte alignment is required by the aligned store below
+    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes), v); // spill the register to the stack buffer
+    return lanes[0] + lanes[1] + lanes[2] + lanes[3];
+}
+inline uint64_t reduce_add_128(__m128i v) { // horizontal sum of the two 64-bit lanes of v
+    alignas(16) uint64_t lanes[2]; // 16-byte alignment is required by the aligned store below
+    _mm_store_si128(reinterpret_cast<__m128i*>(lanes), v); // spill the register to the stack buffer
+    return lanes[0] + lanes[1];
+}
+} // namespace
+template <> // AVX512_SPR specialization: returns sum over j < qb of popcount(plane_j & data) << j
+uint64_t bitwise_and_dot_product<SIMDLevel::AVX512_SPR>(
+        const uint8_t* query, // read as qb consecutive planes; plane j starts at query + j * size
+        const uint8_t* data, // `size` bytes, ANDed against every query plane
+        size_t size, // number of bytes in data and in each query plane
+        size_t qb) { // number of query planes; assumes qb < 64 so `<< j` is defined - TODO confirm
+    uint64_t sum = 0; // scalar total; each width-specific stage adds its partial result
+    size_t offset = 0; // byte position shared by all stages, so each stage resumes where the last stopped
+    // 512-bit main loop: vpopcntq replaces the shuffle-based popcount,
+    // halving the instruction count per iteration relative to AVX512.
+    if (size_t step = 512 / 8; offset + step <= size) { // skip setup/reduction when fewer than 64 bytes
+        __m512i sum_512 = _mm512_setzero_si512();
+        for (; offset + step <= size; offset += step) {
+            __m512i v_x = _mm512_loadu_si512( // unaligned load: data chunk is loaded once per offset
+                    reinterpret_cast<const __m512i*>(data + offset));
+            for (size_t j = 0; j < qb; j++) {
+                __m512i v_q = _mm512_loadu_si512(
+                        reinterpret_cast<const __m512i*>(
+                                query + j * size + offset)); // same offset within plane j
+                __m512i v_and = _mm512_and_si512(v_q, v_x);
+                __m512i v_popcnt = popcount_512_vpopcntdq(v_and);
+                __m512i v_shifted = _mm512_slli_epi64(v_popcnt, j); // weight plane j by 2^j
+                sum_512 = _mm512_add_epi64(sum_512, v_shifted);
+            }
+        }
+        sum += _mm512_reduce_add_epi64(sum_512); // fold the 8 lane accumulators into the scalar total
+    }
+    // 256-bit tail: fewer than 64 bytes remain here, so at most one iteration.
+    if (size_t step = 256 / 8; offset + step <= size) {
+        __m256i sum_256 = _mm256_setzero_si256();
+        for (; offset + step <= size; offset += step) {
+            __m256i v_x = _mm256_loadu_si256(
+                    reinterpret_cast<const __m256i*>(data + offset));
+            for (size_t j = 0; j < qb; j++) {
+                __m256i v_q = _mm256_loadu_si256(
+                        reinterpret_cast<const __m256i*>(
+                                query + j * size + offset));
+                __m256i v_and = _mm256_and_si256(v_q, v_x);
+                __m256i v_popcnt = popcount_256_vpopcntdq(v_and);
+                __m256i v_shifted = _mm256_slli_epi64(v_popcnt, j); // weight plane j by 2^j
+                sum_256 = _mm256_add_epi64(sum_256, v_shifted);
+            }
+        }
+        sum += reduce_add_256(sum_256);
+    }
+    // 128-bit tail: not guarded by an if, so the reduction below always runs.
+    __m128i sum_128 = _mm_setzero_si128();
+    for (size_t step = 128 / 8; offset + step <= size; offset += step) {
+        __m128i v_x = _mm_loadu_si128(
+                reinterpret_cast<const __m128i*>(data + offset));
+        for (size_t j = 0; j < qb; j++) {
+            __m128i v_q = _mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(
+                            query + j * size + offset));
+            __m128i v_and = _mm_and_si128(v_q, v_x);
+            __m128i v_popcnt = popcount_128_vpopcntdq(v_and);
+            __m128i v_shifted = _mm_slli_epi64(v_popcnt, j); // weight plane j by 2^j
+            sum_128 = _mm_add_epi64(sum_128, v_shifted);
+        }
+    }
+    sum += reduce_add_128(sum_128); // adds 0 when the 128-bit loop did not execute
+    // 64-bit scalar tail. NOTE(review): the loads below go through a uint64_t pointer cast on a byte pointer.
+    for (size_t step = 64 / 8; offset + step <= size; offset += step) {
+        const auto yv = *reinterpret_cast<const uint64_t*>(data + offset); // presumably relies on unaligned loads being OK - confirm
+        for (size_t j = 0; j < qb; j++) {
+            const auto qv = *reinterpret_cast<const uint64_t*>(
+                    query + j * size + offset);
+            sum += static_cast<uint64_t>(popcount64(qv & yv)) << j; // widen before shifting so the weight is applied in 64 bits
+        }
+    }
+    // Byte tail: handles the last size % 8 bytes one at a time.
+    for (; offset < size; ++offset) {
+        const auto yv = *(data + offset);
+        for (size_t j = 0; j < qb; j++) {
+            const auto qv = *(query + j * size + offset);
+            sum += static_cast<uint64_t>(popcount32(qv & yv)) << j;
+        }
+    }
+    return sum;
+}
+template <> // AVX512_SPR specialization: returns sum over j < qb of popcount(plane_j ^ data) << j
+uint64_t bitwise_xor_dot_product<SIMDLevel::AVX512_SPR>(
+        const uint8_t* query, // read as qb consecutive planes; plane j starts at query + j * size
+        const uint8_t* data, // `size` bytes, XORed against every query plane
+        size_t size, // number of bytes in data and in each query plane
+        size_t qb) { // number of query planes; assumes qb < 64 so `<< j` is defined - TODO confirm
+    uint64_t sum = 0; // scalar total; each width-specific stage adds its partial result
+    size_t offset = 0; // byte position shared by all stages
+    if (size_t step = 512 / 8; offset + step <= size) { // 512-bit main loop, 64 bytes per iteration
+        __m512i sum_512 = _mm512_setzero_si512();
+        for (; offset + step <= size; offset += step) {
+            __m512i v_x = _mm512_loadu_si512( // data chunk is loaded once per offset and reused for all planes
+                    reinterpret_cast<const __m512i*>(data + offset));
+            for (size_t j = 0; j < qb; j++) {
+                __m512i v_q = _mm512_loadu_si512(
+                        reinterpret_cast<const __m512i*>(
+                                query + j * size + offset)); // same offset within plane j
+                __m512i v_xor = _mm512_xor_si512(v_q, v_x);
+                __m512i v_popcnt = popcount_512_vpopcntdq(v_xor);
+                __m512i v_shifted = _mm512_slli_epi64(v_popcnt, j); // weight plane j by 2^j
+                sum_512 = _mm512_add_epi64(sum_512, v_shifted);
+            }
+        }
+        sum += _mm512_reduce_add_epi64(sum_512); // fold the 8 lane accumulators into the scalar total
+    }
+    if (size_t step = 256 / 8; offset + step <= size) { // 256-bit tail: fewer than 64 bytes remain, at most one iteration
+        __m256i sum_256 = _mm256_setzero_si256();
+        for (; offset + step <= size; offset += step) {
+            __m256i v_x = _mm256_loadu_si256(
+                    reinterpret_cast<const __m256i*>(data + offset));
+            for (size_t j = 0; j < qb; j++) {
+                __m256i v_q = _mm256_loadu_si256(
+                        reinterpret_cast<const __m256i*>(
+                                query + j * size + offset));
+                __m256i v_xor = _mm256_xor_si256(v_q, v_x);
+                __m256i v_popcnt = popcount_256_vpopcntdq(v_xor);
+                __m256i v_shifted = _mm256_slli_epi64(v_popcnt, j); // weight plane j by 2^j
+                sum_256 = _mm256_add_epi64(sum_256, v_shifted);
+            }
+        }
+        sum += reduce_add_256(sum_256);
+    }
+    __m128i sum_128 = _mm_setzero_si128(); // 128-bit tail: not guarded by an if, reduction below always runs
+    for (size_t step = 128 / 8; offset + step <= size; offset += step) {
+        __m128i v_x = _mm_loadu_si128(
+                reinterpret_cast<const __m128i*>(data + offset));
+        for (size_t j = 0; j < qb; j++) {
+            __m128i v_q = _mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(
+                            query + j * size + offset));
+            __m128i v_xor = _mm_xor_si128(v_q, v_x);
+            __m128i v_popcnt = popcount_128_vpopcntdq(v_xor);
+            __m128i v_shifted = _mm_slli_epi64(v_popcnt, j); // weight plane j by 2^j
+            sum_128 = _mm_add_epi64(sum_128, v_shifted);
+        }
+    }
+    sum += reduce_add_128(sum_128); // adds 0 when the 128-bit loop did not execute
+    for (size_t step = 64 / 8; offset + step <= size; offset += step) { // 64-bit scalar tail
+        const auto yv = *reinterpret_cast<const uint64_t*>(data + offset); // NOTE(review): uint64_t load via pointer cast on a byte pointer - confirm alignment/aliasing is acceptable
+        for (size_t j = 0; j < qb; j++) {
+            const auto qv = *reinterpret_cast<const uint64_t*>(
+                    query + j * size + offset);
+            sum += static_cast<uint64_t>(popcount64(qv ^ yv)) << j; // widen before shifting so the weight is applied in 64 bits
+        }
+    }
+    for (; offset < size; ++offset) { // byte tail: last size % 8 bytes, one at a time
+        const auto yv = *(data + offset);
+        for (size_t j = 0; j < qb; j++) {
+            const auto qv = *(query + j * size + offset);
+            sum += static_cast<uint64_t>(popcount32(qv ^ yv)) << j;
+        }
+    }
+    return sum;
+}
+template <> // AVX512_SPR specialization: returns the number of set bits in data[0 .. size)
+uint64_t popcount<SIMDLevel::AVX512_SPR>(const uint8_t* data, size_t size) {
+    uint64_t sum = 0; // scalar total; each width-specific stage adds its partial result
+    size_t offset = 0; // byte position shared by all stages
+    if (offset + 512 / 8 <= size) { // 512-bit main loop; skipped entirely when fewer than 64 bytes
+        __m512i sum_512 = _mm512_setzero_si512();
+        for (size_t end; (end = offset + 512 / 8) <= size; offset = end) { // `end` is assigned in the condition, then becomes the next offset
+            __m512i v_x = _mm512_loadu_si512(
+                    reinterpret_cast<const __m512i*>(data + offset));
+            __m512i v_popcnt = popcount_512_vpopcntdq(v_x);
+            sum_512 = _mm512_add_epi64(sum_512, v_popcnt); // per-lane running totals
+        }
+        sum += _mm512_reduce_add_epi64(sum_512); // fold the 8 lane accumulators into the scalar total
+    }
+    if (offset + 256 / 8 <= size) { // 256-bit tail: fewer than 64 bytes remain, at most one iteration
+        __m256i sum_256 = _mm256_setzero_si256();
+        for (size_t end; (end = offset + 256 / 8) <= size; offset = end) {
+            __m256i v_x = _mm256_loadu_si256(
+                    reinterpret_cast<const __m256i*>(data + offset));
+            __m256i v_popcnt = popcount_256_vpopcntdq(v_x);
+            sum_256 = _mm256_add_epi64(sum_256, v_popcnt);
+        }
+        sum += reduce_add_256(sum_256);
+    }
+    __m128i sum_128 = _mm_setzero_si128(); // 128-bit tail: not guarded by an if, reduction below always runs
+    for (size_t step = 128 / 8; offset + step <= size; offset += step) {
+        __m128i v_x = _mm_loadu_si128(
+                reinterpret_cast<const __m128i*>(data + offset));
+        sum_128 = _mm_add_epi64(sum_128, popcount_128_vpopcntdq(v_x));
+    }
+    sum += reduce_add_128(sum_128); // adds 0 when the 128-bit loop did not execute
+    for (size_t step = 64 / 8; offset + step <= size; offset += step) { // 64-bit scalar tail
+        const auto yv = *reinterpret_cast<const uint64_t*>(data + offset); // NOTE(review): uint64_t load via pointer cast on a byte pointer - confirm alignment/aliasing is acceptable
+        sum += popcount64(yv);
+    }
+    for (; offset < size; ++offset) { // byte tail: last size % 8 bytes, one at a time
+        const auto yv = *(data + offset);
+        sum += popcount32(yv);
+    }
+    return sum;
+}
+} // namespace faiss::rabitq
+namespace faiss::rabitq::multibit {
+// Forward-declare the AVX512 floating-point inner-product kernel.
+// VPOPCNTDQ does not help this kernel (it operates on FP32), so the
+// SPR specialization below forwards to it instead of duplicating it.
+template <>
+float compute_inner_product<SIMDLevel::AVX512>( // declaration only; the definition is not in this file
+        const uint8_t* __restrict sign_bits,
+        const uint8_t* __restrict ex_code,
+        const float* __restrict rotated_q,
+        size_t d,
+        size_t ex_bits,
+        float cb);
+template <>
+float compute_inner_product<SIMDLevel::AVX512_SPR>( // SPR specialization: thin forwarder, no SPR-specific code
+        const uint8_t* __restrict sign_bits,
+        const uint8_t* __restrict ex_code,
+        const float* __restrict rotated_q,
+        size_t d,
+        size_t ex_bits,
+        float cb) {
+    return compute_inner_product<SIMDLevel::AVX512>( // all six arguments are passed through unchanged
+            sign_bits, ex_code, rotated_q, d, ex_bits, cb);
+}
+} // namespace faiss::rabitq::multibit
+#endif // COMPILE_SIMD_AVX512_SPR

data/vendor/faiss/faiss/utils/simd_levels.cpp CHANGED Viewed

@@ -129,6 +129,9 @@ SIMDLevel SIMDConfig::auto_detect_simd_level() {
         asm volatile("cpuid"
                      : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                      : "a"(eax), "c"(ecx));
+        // Save EDX before xgetbv clobbers it — needed for
+        // AVX512_FP16 check (bit 23) in the SPR detection below.
+        unsigned int cpuid7_edx = edx;
         unsigned int xcr0;
         asm volatile("xgetbv" : "=a"(xcr0), "=d"(edx) : "c"(0));
@@ -155,8 +158,15 @@ SIMDLevel SIMDConfig::auto_detect_simd_level() {
                         (1 << static_cast<int>(SIMDLevel::AVX512));
 #if defined(COMPILE_SIMD_AVX512_SPR)
-                // Check for Sapphire Rapids features (AVX512_BF16)
+                // Check for Sapphire Rapids features.
+                // The SPR code path is compiled with -mavx512fp16, so we
+                // must verify both AVX512_BF16 and AVX512_FP16 before
+                // dispatching to it. AMD Zen 4 (bergamo) has BF16 but
+                // not FP16 — using SPR code there causes SIGILL.
                 // CPUID EAX=7, ECX=1: EAX bit 5 = AVX512_BF16
+                // CPUID EAX=7, ECX=0: EDX bit 23 = AVX512_FP16
+                // (Linux: X86_FEATURE_AVX512_FP16 = 18*32+23)
+                bool has_avx512_fp16 = (cpuid7_edx & (1 << 23)) != 0;
                 unsigned int eax1, ebx1, ecx1, edx1;
                 eax1 = 7;
                 ecx1 = 1;
@@ -164,7 +174,7 @@ SIMDLevel SIMDConfig::auto_detect_simd_level() {
                              : "=a"(eax1), "=b"(ebx1), "=c"(ecx1), "=d"(edx1)
                              : "a"(eax1), "c"(ecx1));
                 bool has_avx512_bf16 = (eax1 & (1 << 5)) != 0;
-                if (has_avx512_bf16) {
+                if (has_avx512_bf16 && has_avx512_fp16) {
                     detected_level = SIMDLevel::AVX512_SPR;
                     supported_simd_levels |=
                             (1 << static_cast<int>(SIMDLevel::AVX512_SPR));

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: faiss
 version: !ruby/object:Gem::Version
-  version: 0.6.1
+  version: 0.6.2
 platform: ruby
 authors:
 - Andrew Kane
@@ -233,12 +233,16 @@ files:
 - vendor/faiss/faiss/gpu/utils/Timer.h
 - vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h
 - vendor/faiss/faiss/gpu_metal/MetalCloner.h
+- vendor/faiss/faiss/gpu_metal/MetalDistance.h
 - vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h
 - vendor/faiss/faiss/gpu_metal/MetalIndex.h
 - vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h
+- vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h
 - vendor/faiss/faiss/gpu_metal/MetalKernels.h
+- vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h
 - vendor/faiss/faiss/gpu_metal/MetalResources.h
 - vendor/faiss/faiss/gpu_metal/StandardMetalResources.h
+- vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h
 - vendor/faiss/faiss/impl/AdSampling.cpp
 - vendor/faiss/faiss/impl/AdSampling.h
 - vendor/faiss/faiss/impl/AdditiveQuantizer.cpp
@@ -365,6 +369,7 @@ files:
 - vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h
 - vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h
 - vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp
+- vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h
 - vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp
 - vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp
 - vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h
@@ -374,6 +379,8 @@ files:
 - vendor/faiss/faiss/impl/scalar_quantizer/scanners.h
 - vendor/faiss/faiss/impl/scalar_quantizer/similarities.h
 - vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp
+- vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h
+- vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp
 - vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp
 - vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h
 - vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp
@@ -453,8 +460,10 @@ files:
 - vendor/faiss/faiss/utils/hamming_distance/common.h
 - vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp
 - vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp
+- vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp
 - vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h
 - vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h
+- vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h
 - vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h
 - vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h
 - vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h
@@ -489,6 +498,7 @@ files:
 - vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h
 - vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp
 - vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp
+- vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp
 - vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp
 - vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp
 - vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h
@@ -521,7 +531,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 4.0.10
+rubygems_version: 4.0.14
 specification_version: 4
 summary: Efficient similarity search and clustering for Ruby
 test_files: []