RubyGems - faiss - Versions diffs - 0.6.1 → 0.6.2 - Mend

faiss 0.6.1 → 0.6.2

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.

Files changed (93)

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/Index.h +1 -1
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +6 -7
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +3 -3
data/vendor/faiss/faiss/IndexHNSW.cpp +173 -143
data/vendor/faiss/faiss/IndexIVF.cpp +2 -2
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +2 -2
data/vendor/faiss/faiss/IndexIVFFlat.cpp +3 -1
data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +3 -3
data/vendor/faiss/faiss/IndexIVFPQ.cpp +2 -3
data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -3
data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +4 -13
data/vendor/faiss/faiss/IndexNNDescent.cpp +1 -1
data/vendor/faiss/faiss/IndexNSG.cpp +1 -2
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +68 -6
data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
data/vendor/faiss/faiss/factory_tools.cpp +4 -0
data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +11 -12
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +3 -3
data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +7 -0
data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +48 -3
data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
data/vendor/faiss/faiss/impl/HNSW.cpp +556 -199
data/vendor/faiss/faiss/impl/HNSW.h +51 -13
data/vendor/faiss/faiss/impl/NSG.cpp +15 -11
data/vendor/faiss/faiss/impl/Panorama.h +11 -0
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +25 -2
data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +1 -1
data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +7 -1
data/vendor/faiss/faiss/impl/ResultHandler.h +1 -0
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +271 -8
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +50 -0
data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
data/vendor/faiss/faiss/impl/VisitedTable.h +69 -34
data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +3 -1
data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +35 -43
data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -15
data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +86 -40
data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +81 -50
data/vendor/faiss/faiss/impl/index_read.cpp +100 -39
data/vendor/faiss/faiss/impl/index_write.cpp +1 -0
data/vendor/faiss/faiss/impl/io_macros.h +25 -0
data/vendor/faiss/faiss/impl/platform_macros.h +12 -8
data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +2 -0
data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +2 -0
data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +2 -0
data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +20 -0
data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +36 -0
data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -0
data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +2 -0
data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +6 -0
data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +327 -18
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +264 -27
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +199 -27
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +366 -3
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +144 -19
data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +26 -0
data/vendor/faiss/faiss/impl/simd_dispatch.h +65 -8
data/vendor/faiss/faiss/index_factory.cpp +5 -1
data/vendor/faiss/faiss/index_io.h +16 -0
data/vendor/faiss/faiss/invlists/DirectMap.cpp +4 -1
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +13 -13
data/vendor/faiss/faiss/invlists/InvertedLists.h +2 -2
data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +119 -22
data/vendor/faiss/faiss/svs/IndexSVSVamana.h +15 -5
data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +65 -24
data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +3 -2
data/vendor/faiss/faiss/utils/bf16.h +34 -0
data/vendor/faiss/faiss/utils/distances_simd.cpp +0 -1
data/vendor/faiss/faiss/utils/hamming.cpp +8 -8
data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +2 -1
data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +6 -30
data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
data/vendor/faiss/faiss/utils/partitioning.cpp +0 -2
data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +14 -68
data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
data/vendor/faiss/faiss/utils/simd_levels.cpp +12 -2
metadata +12 -2

data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h CHANGED Viewed

@@ -35,7 +35,8 @@ struct IndexSVSVamanaLeanVec : IndexSVSVamana {
             size_t degree,
             MetricType metric = METRIC_L2,
             size_t leanvec_dims = 0,
-            SVSStorageKind storage = SVSStorageKind::SVS_LeanVec4x4);
+            SVSStorageKind storage = SVSStorageKind::SVS_LeanVec4x4,
+            bool is_static = false);
     ~IndexSVSVamanaLeanVec() override;
@@ -66,7 +67,7 @@ struct IndexSVSVamanaLeanVec : IndexSVSVamana {
     svs_runtime::LeanVecTrainingData* training_data{nullptr};
    protected:
-    void create_impl() override;
+    void create_impl(idx_t n, const float* x) override;
 };
 } // namespace faiss

data/vendor/faiss/faiss/utils/bf16.h CHANGED Viewed

@@ -7,8 +7,13 @@
 #pragma once
+#include <cstddef>
 #include <cstdint>
+#if defined(__AVX512F__) || defined(__AVX512BF16__)
+#include <immintrin.h>
+#endif
 namespace faiss {
 namespace {
@@ -33,4 +38,33 @@ inline float decode_bf16(const uint16_t v) {
     return fp.as_f32;
 }
+inline void encode_bf16_simd(const float* src, uint16_t* dst, size_t n) {
+    size_t i = 0;
+#ifdef __AVX512BF16__
+    for (; i + 16 <= n; i += 16) {
+        __m512 v = _mm512_loadu_ps(src + i);
+        __m256bh encoded = _mm512_cvtneps_pbh(v);
+        _mm256_storeu_epi16(dst + i, (__m256i)encoded);
+    }
+#endif
+    for (; i < n; i++) {
+        dst[i] = encode_bf16(src[i]);
+    }
+}
+inline void decode_bf16_simd(const uint16_t* src, float* dst, size_t n) {
+    size_t i = 0;
+#if defined(__AVX512F__)
+    for (; i + 16 <= n; i += 16) {
+        __m256i v = _mm256_loadu_si256((const __m256i*)(src + i));
+        __m512i w = _mm512_cvtepu16_epi32(v);
+        w = _mm512_slli_epi32(w, 16);
+        _mm512_storeu_ps(dst + i, _mm512_castsi512_ps(w));
+    }
+#endif
+    for (; i < n; i++) {
+        dst[i] = decode_bf16(src[i]);
+    }
+}
 } // namespace faiss

data/vendor/faiss/faiss/utils/distances_simd.cpp CHANGED Viewed

@@ -9,7 +9,6 @@
 #include <faiss/utils/distances.h>
-#include <algorithm>
 #include <cmath>
 #include <cstdio>
 #include <cstring>

data/vendor/faiss/faiss/utils/hamming.cpp CHANGED Viewed

@@ -146,7 +146,7 @@ void hammings(
         size_t nb,
         size_t ncodes,
         hamdis_t* __restrict dis) {
-    with_simd_level([&]<SIMDLevel SL>() {
+    with_simd_level_a0_spr([&]<SIMDLevel SL>() {
         hammings_fixSL<SL>(a, b, na, nb, ncodes, dis);
     });
 }
@@ -170,7 +170,7 @@ void hammings_knn_hc(
         int order,
         ApproxTopK_mode_t approx_topk_mode,
         const faiss::IDSelector* sel) {
-    with_simd_level([&]<SIMDLevel SL>() {
+    with_simd_level_a0_spr([&]<SIMDLevel SL>() {
         hammings_knn_hc_fixSL<SL>(
                 ha, a, b, nb, ncodes, order, approx_topk_mode, sel);
     });
@@ -186,7 +186,7 @@ void hammings_knn_mc(
         int32_t* __restrict distances,
         int64_t* __restrict labels,
         const faiss::IDSelector* sel) {
-    with_simd_level([&]<SIMDLevel SL>() {
+    with_simd_level_a0_spr([&]<SIMDLevel SL>() {
         hammings_knn_mc_fixSL<SL>(
                 a, b, na, nb, k, ncodes, distances, labels, sel);
     });
@@ -201,7 +201,7 @@ void hamming_range_search(
         size_t code_size,
         RangeSearchResult* result,
         const faiss::IDSelector* sel) {
-    with_simd_level([&]<SIMDLevel SL>() {
+    with_simd_level_a0_spr([&]<SIMDLevel SL>() {
         hamming_range_search_fixSL<SL>(
                 a, b, na, nb, radius, code_size, result, sel);
     });
@@ -215,7 +215,7 @@ void hamming_count_thres(
         hamdis_t ht,
         size_t ncodes,
         size_t* nptr) {
-    with_simd_level([&]<SIMDLevel SL>() {
+    with_simd_level_a0_spr([&]<SIMDLevel SL>() {
         hamming_count_thres_fixSL<SL>(bs1, bs2, n1, n2, ht, ncodes, nptr);
     });
 }
@@ -226,7 +226,7 @@ void crosshamming_count_thres(
         hamdis_t ht,
         size_t ncodes,
         size_t* nptr) {
-    with_simd_level([&]<SIMDLevel SL>() {
+    with_simd_level_a0_spr([&]<SIMDLevel SL>() {
         crosshamming_count_thres_fixSL<SL>(dbs, n, ht, ncodes, nptr);
     });
 }
@@ -240,7 +240,7 @@ size_t match_hamming_thres(
         size_t ncodes,
         int64_t* idx,
         hamdis_t* dis) {
-    return with_simd_level([&]<SIMDLevel SL>() -> size_t {
+    return with_simd_level_a0_spr([&]<SIMDLevel SL>() -> size_t {
         return match_hamming_thres_fixSL<SL>(
                 bs1, bs2, n1, n2, ht, ncodes, idx, dis);
     });
@@ -253,7 +253,7 @@ void generalized_hammings_knn_hc(
         size_t nb,
         size_t code_size,
         int ordered) {
-    with_simd_level([&]<SIMDLevel SL>() {
+    with_simd_level_a0_spr([&]<SIMDLevel SL>() {
         generalized_hammings_knn_hc_fixSL<SL>(ha, a, b, nb, code_size, ordered);
     });
 }

data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp CHANGED Viewed

@@ -8,8 +8,9 @@
 #ifdef COMPILE_SIMD_AVX2
 #define THE_SIMD_LEVEL SIMDLevel::AVX2
-// NOLINTNEXTLINE(facebook-hte-InlineHeader)
+// NOLINTBEGIN(facebook-hte-InlineHeader,facebook-unused-include-check)
 #include <faiss/utils/hamming_distance/hamming_computer-avx2.h>
 #include <faiss/utils/hamming_distance/hamming_impl.h>
+// NOLINTEND(facebook-hte-InlineHeader,facebook-unused-include-check)
 #endif // COMPILE_SIMD_AVX2

data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp ADDED Viewed

@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#ifdef COMPILE_SIMD_AVX512_SPR
+#define THE_SIMD_LEVEL SIMDLevel::AVX512_SPR
+// NOLINTNEXTLINE(facebook-hte-InlineHeader)
+#include <faiss/utils/hamming_distance/hamming_computer-avx512_spr.h>
+#include <faiss/utils/hamming_distance/hamming_impl.h>
+#endif // COMPILE_SIMD_AVX512_SPR

data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h CHANGED Viewed

@@ -10,10 +10,10 @@
 // AVX512 HammingComputer and GenHammingComputer specializations.
 // Types without custom AVX512 code inherit from the NONE specializations
-// in hamming_computer-generic.h. Custom specializations for
-// HammingComputer64 and HammingComputerDefault use _mm512_popcnt_epi64
-// when __AVX512VPOPCNTDQ__ is available. GenHammingComputer classes
-// leverage SSE/AVX2 intrinsics.
+// in hamming_computer-generic.h. HammingComputer64 and
+// HammingComputerDefault use scalar popcount here; the VPOPCNTDQ fast
+// path lives in hamming_computer-avx512_spr.h (AVX512_SPR level).
+// GenHammingComputer classes leverage SSE/AVX2 intrinsics.
 #include <cassert>
 #include <cstdint>
@@ -74,18 +74,10 @@ struct HammingComputer64_tpl<SIMDLevel::AVX512> {
     inline int hamming(const uint8_t* b8) const {
         const uint64_t* b = reinterpret_cast<const uint64_t*>(b8);
-#ifdef __AVX512VPOPCNTDQ__
-        __m512i vxor =
-                _mm512_xor_si512(_mm512_loadu_si512(a), _mm512_loadu_si512(b));
-        __m512i vpcnt = _mm512_popcnt_epi64(vxor);
-        // reduce performs better than adding the lower and higher parts
-        return _mm512_reduce_add_epi32(vpcnt);
-#else
         return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
                 popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) +
                 popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) +
                 popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7);
-#endif
     }
     inline static constexpr int get_code_size() {
@@ -112,27 +104,11 @@ struct HammingComputerDefault_tpl<SIMDLevel::AVX512> {
     }
     int hamming(const uint8_t* b8) const {
-        int accu = 0;
         const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a8);
         const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b8);
-        int i = 0;
-#ifdef __AVX512VPOPCNTDQ__
-        int quotient64 = quotient8 / 8;
-        for (; i < quotient64; ++i) {
-            __m512i vxor = _mm512_xor_si512(
-                    _mm512_loadu_si512(&a64[i * 8]),
-                    _mm512_loadu_si512(&b64[i * 8]));
-            __m512i vpcnt = _mm512_popcnt_epi64(vxor);
-            // reduce performs better than adding the lower and higher parts
-            accu += _mm512_reduce_add_epi32(vpcnt);
-        }
-        i *= 8;
-#endif
-        accu += hamming_popcount_tail(
-                a64, b64, i, quotient8, a8, b8, remainder8);
-        return accu;
+        return hamming_popcount_tail(
+                a64, b64, 0, quotient8, a8, b8, remainder8);
     }
     inline int get_code_size() const {

data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h ADDED Viewed

@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#ifndef HAMMING_COMPUTER_AVX512_SPR_H
+#define HAMMING_COMPUTER_AVX512_SPR_H
+// AVX512_SPR HammingComputer specializations using VPOPCNTDQ.
+// On Sapphire Rapids+, _mm512_popcnt_epi64 (and _mm256_popcnt_epi64 with VL)
+// are unconditionally available. This gives a faster path than the scalar
+// popcount fallback used in the base AVX512 specializations when compiled
+// without -mavx512vpopcntdq.
+#include <cassert>
+#include <cstdint>
+#include <faiss/impl/platform_macros.h>
+#include <faiss/utils/hamming_distance/hamming_computer-avx512.h>
+#include <immintrin.h>
+namespace faiss {
+/***************************************************************************
+ * AVX512_SPR inheriting specializations for types without custom SPR code.
+ ***************************************************************************/
+#define FAISS_INHERIT_HAMMING_SPR(Class)                                   \
+    template <>                                                            \
+    struct Class##                                                         \
+            _tpl<SIMDLevel::AVX512_SPR> : Class##_tpl<SIMDLevel::AVX512> { \
+        using Class##_tpl<SIMDLevel::AVX512>::Class##_tpl;                 \
+    }
+FAISS_INHERIT_HAMMING_SPR(HammingComputer16);
+FAISS_INHERIT_HAMMING_SPR(HammingComputer20);
+FAISS_INHERIT_HAMMING_SPR(GenHammingComputer8);
+FAISS_INHERIT_HAMMING_SPR(GenHammingComputer16);
+FAISS_INHERIT_HAMMING_SPR(GenHammingComputer32);
+FAISS_INHERIT_HAMMING_SPR(GenHammingComputerM8);
+#undef FAISS_INHERIT_HAMMING_SPR
+/***************************************************************************
+ * Custom AVX512_SPR specializations using VPOPCNTDQ.
+ ***************************************************************************/
+template <>
+struct HammingComputer32_tpl<SIMDLevel::AVX512_SPR> {
+    const uint8_t* a8;
+    HammingComputer32_tpl() {}
+    HammingComputer32_tpl(const uint8_t* a8_in, int code_size) {
+        set(a8_in, code_size);
+    }
+    void set(const uint8_t* a8_in, FAISS_MAYBE_UNUSED int code_size) {
+        assert(code_size == 32);
+        a8 = a8_in;
+    }
+    inline int hamming(const uint8_t* b8) const {
+        __m256i va = _mm256_loadu_si256((const __m256i*)a8);
+        __m256i vb = _mm256_loadu_si256((const __m256i*)b8);
+        __m256i vxor = _mm256_xor_si256(va, vb);
+        __m256i vpcnt = _mm256_popcnt_epi64(vxor);
+        __m128i lo = _mm256_castsi256_si128(vpcnt);
+        __m128i hi = _mm256_extracti128_si256(vpcnt, 1);
+        __m128i sum = _mm_add_epi64(lo, hi);
+        return static_cast<int>(
+                _mm_extract_epi64(sum, 0) + _mm_extract_epi64(sum, 1));
+    }
+    inline static constexpr int get_code_size() {
+        return 32;
+    }
+};
+template <>
+struct HammingComputer64_tpl<SIMDLevel::AVX512_SPR> {
+    const uint8_t* a8;
+    HammingComputer64_tpl() {}
+    HammingComputer64_tpl(const uint8_t* a8_in, int code_size) {
+        set(a8_in, code_size);
+    }
+    void set(const uint8_t* a8_in, FAISS_MAYBE_UNUSED int code_size) {
+        assert(code_size == 64);
+        a8 = a8_in;
+    }
+    inline int hamming(const uint8_t* b8) const {
+        __m512i vxor = _mm512_xor_si512(
+                _mm512_loadu_si512(a8), _mm512_loadu_si512(b8));
+        __m512i vpcnt = _mm512_popcnt_epi64(vxor);
+        return _mm512_reduce_add_epi32(vpcnt);
+    }
+    inline static constexpr int get_code_size() {
+        return 64;
+    }
+};
+template <>
+struct HammingComputerDefault_tpl<SIMDLevel::AVX512_SPR> {
+    const uint8_t* a8;
+    int quotient8;
+    int remainder8;
+    HammingComputerDefault_tpl() {}
+    HammingComputerDefault_tpl(const uint8_t* a8_in, int code_size) {
+        set(a8_in, code_size);
+    }
+    void set(const uint8_t* a8_2, int code_size) {
+        this->a8 = a8_2;
+        quotient8 = code_size / 8;
+        remainder8 = code_size % 8;
+    }
+    int hamming(const uint8_t* b8) const {
+        int accu = 0;
+        const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a8);
+        const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b8);
+        int i = 0;
+        int quotient64 = quotient8 / 8;
+        for (; i < quotient64; ++i) {
+            __m512i vxor = _mm512_xor_si512(
+                    _mm512_loadu_si512(&a64[i * 8]),
+                    _mm512_loadu_si512(&b64[i * 8]));
+            __m512i vpcnt = _mm512_popcnt_epi64(vxor);
+            accu += _mm512_reduce_add_epi32(vpcnt);
+        }
+        i *= 8;
+        // Handle 4-word (256-bit) remainder with VPOPCNTDQ VL
+        if (i + 4 <= quotient8) {
+            __m256i vxor = _mm256_xor_si256(
+                    _mm256_loadu_si256((const __m256i*)&a64[i]),
+                    _mm256_loadu_si256((const __m256i*)&b64[i]));
+            __m256i vpcnt = _mm256_popcnt_epi64(vxor);
+            __m128i lo = _mm256_castsi256_si128(vpcnt);
+            __m128i hi = _mm256_extracti128_si256(vpcnt, 1);
+            __m128i sum = _mm_add_epi64(lo, hi);
+            accu += static_cast<int>(
+                    _mm_extract_epi64(sum, 0) + _mm_extract_epi64(sum, 1));
+            i += 4;
+        }
+        accu += hamming_popcount_tail(
+                a64, b64, i, quotient8, a8, b8, remainder8);
+        return accu;
+    }
+    inline int get_code_size() const {
+        return quotient8 * 8 + remainder8;
+    }
+};
+} // namespace faiss
+#endif

data/vendor/faiss/faiss/utils/partitioning.cpp CHANGED Viewed

@@ -18,8 +18,6 @@
 #include <faiss/utils/AlignedTable.h>
 #include <faiss/utils/ordered_key_value.h>
-#include <faiss/impl/platform_macros.h>
 namespace faiss {
 /******************************************************************

data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h CHANGED Viewed

@@ -592,39 +592,12 @@ simd16uint16 accu8to16(simd32uint8 a8) {
     return hadd(a8_0, a8_1);
 }
-static const simd32uint8 shifts = simd32uint8::create<
-        1,
-        16,
-        0,
-        0,
-        4,
-        64,
-        0,
-        0,
-        0,
-        0,
-        1,
-        16,
-        0,
-        0,
-        4,
-        64,
-        1,
-        16,
-        0,
-        0,
-        4,
-        64,
-        0,
-        0,
-        0,
-        0,
-        1,
-        16,
-        0,
-        0,
-        4,
-        64>();
+// Lookup table held as a plain byte array in .rodata. Storing it as a
+// `simd32uint8` global would emit an AVX2 initializer into `.init_array` that
+// runs at dlopen, before runtime SIMD dispatch, and SIGILLs on non-AVX2 CPUs
+alignas(32) static const uint8_t shifts[32] = {
+        1, 16, 0, 0, 4, 64, 0, 0, 0, 0, 1, 16, 0, 0, 4, 64,
+        1, 16, 0, 0, 4, 64, 0, 0, 0, 0, 1, 16, 0, 0, 4, 64};
 // 2-bit accumulator: we can add only up to 3 elements
 // on output we return 2*4-bit results
@@ -644,7 +617,8 @@ void compute_accu2(
         v = pp(v);
         // 0x800 -> force second half of table
         simd16uint16 idx = v | (v << 8) | simd16uint16(0x800);
-        a2 += simd16uint16(shifts.lookup_2_lanes(simd32uint8(idx)));
+        a2 += simd16uint16(
+                simd32uint8(shifts).lookup_2_lanes(simd32uint8(idx)));
     }
     a4lo += a2 & mask2;
     a4hi += (a2 >> 2) & mask2;
@@ -694,39 +668,11 @@ simd16uint16 histogram_8(const uint16_t* data, Preproc pp, size_t n_in) {
  * 16 bins
  ************************************************************/
-static const simd32uint8 shifts2 = simd32uint8::create<
-        1,
-        2,
-        4,
-        8,
-        16,
-        32,
-        64,
-        128,
-        1,
-        2,
-        4,
-        8,
-        16,
-        32,
-        64,
-        128,
-        1,
-        2,
-        4,
-        8,
-        16,
-        32,
-        64,
-        128,
-        1,
-        2,
-        4,
-        8,
-        16,
-        32,
-        64,
-        128>();
+// See the note on `shifts` above: kept as a .rodata byte array so its
+// initializer does not emit AVX2 into `.init_array`
+alignas(32) static const uint8_t shifts2[32] = {
+        1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
+        1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
 simd32uint8 shiftr_16(simd32uint8 x, int n) {
     return simd32uint8(simd16uint16(x) >> n);
@@ -754,7 +700,7 @@ void compute_accu2_16(
         v = pp(v);
         simd16uint16 idx = v | (v << 8);
-        simd32uint8 a1 = shifts2.lookup_2_lanes(simd32uint8(idx));
+        simd32uint8 a1 = simd32uint8(shifts2).lookup_2_lanes(simd32uint8(idx));
         // contains 0s for out-of-bounds elements
         simd16uint16 lt8 = (v >> 3) == simd16uint16(0);