RubyGems - faiss - Versions diffs - 0.3.2 → 0.3.3 - Mend

faiss 0.3.2 → 0.3.3

Files changed (292) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/LICENSE.txt +1 -1
data/lib/faiss/version.rb +1 -1
data/vendor/faiss/faiss/AutoTune.cpp +2 -2
data/vendor/faiss/faiss/AutoTune.h +2 -2
data/vendor/faiss/faiss/Clustering.cpp +2 -2
data/vendor/faiss/faiss/Clustering.h +2 -2
data/vendor/faiss/faiss/IVFlib.cpp +2 -2
data/vendor/faiss/faiss/IVFlib.h +2 -2
data/vendor/faiss/faiss/Index.cpp +6 -2
data/vendor/faiss/faiss/Index.h +10 -3
data/vendor/faiss/faiss/Index2Layer.cpp +2 -2
data/vendor/faiss/faiss/Index2Layer.h +2 -2
data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +7 -7
data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +2 -2
data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +14 -16
data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +2 -2
data/vendor/faiss/faiss/IndexBinary.cpp +13 -2
data/vendor/faiss/faiss/IndexBinary.h +8 -2
data/vendor/faiss/faiss/IndexBinaryFlat.cpp +2 -3
data/vendor/faiss/faiss/IndexBinaryFlat.h +2 -2
data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -2
data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -2
data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +2 -7
data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -2
data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -3
data/vendor/faiss/faiss/IndexBinaryHash.h +2 -2
data/vendor/faiss/faiss/IndexBinaryIVF.cpp +2 -2
data/vendor/faiss/faiss/IndexBinaryIVF.h +2 -2
data/vendor/faiss/faiss/IndexFastScan.cpp +10 -14
data/vendor/faiss/faiss/IndexFastScan.h +11 -2
data/vendor/faiss/faiss/IndexFlat.cpp +2 -3
data/vendor/faiss/faiss/IndexFlat.h +2 -2
data/vendor/faiss/faiss/IndexFlatCodes.cpp +11 -2
data/vendor/faiss/faiss/IndexFlatCodes.h +5 -2
data/vendor/faiss/faiss/IndexHNSW.cpp +13 -6
data/vendor/faiss/faiss/IndexHNSW.h +2 -2
data/vendor/faiss/faiss/IndexIDMap.cpp +19 -3
data/vendor/faiss/faiss/IndexIDMap.h +5 -2
data/vendor/faiss/faiss/IndexIVF.cpp +2 -3
data/vendor/faiss/faiss/IndexIVF.h +5 -4
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +6 -7
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +2 -2
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +3 -14
data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +2 -4
data/vendor/faiss/faiss/IndexIVFFastScan.cpp +71 -34
data/vendor/faiss/faiss/IndexIVFFastScan.h +19 -2
data/vendor/faiss/faiss/IndexIVFFlat.cpp +2 -2
data/vendor/faiss/faiss/IndexIVFFlat.h +2 -2
data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +2 -2
data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +2 -2
data/vendor/faiss/faiss/IndexIVFPQ.cpp +2 -3
data/vendor/faiss/faiss/IndexIVFPQ.h +2 -2
data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +7 -33
data/vendor/faiss/faiss/IndexIVFPQFastScan.h +2 -4
data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -2
data/vendor/faiss/faiss/IndexIVFPQR.h +2 -2
data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +2 -3
data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -2
data/vendor/faiss/faiss/IndexLSH.cpp +2 -3
data/vendor/faiss/faiss/IndexLSH.h +2 -2
data/vendor/faiss/faiss/IndexLattice.cpp +2 -2
data/vendor/faiss/faiss/IndexLattice.h +2 -2
data/vendor/faiss/faiss/IndexNNDescent.cpp +2 -2
data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
data/vendor/faiss/faiss/IndexNSG.cpp +2 -5
data/vendor/faiss/faiss/IndexNSG.h +2 -2
data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +2 -2
data/vendor/faiss/faiss/IndexNeuralNetCodec.h +2 -2
data/vendor/faiss/faiss/IndexPQ.cpp +26 -26
data/vendor/faiss/faiss/IndexPQ.h +2 -2
data/vendor/faiss/faiss/IndexPQFastScan.cpp +2 -5
data/vendor/faiss/faiss/IndexPQFastScan.h +2 -11
data/vendor/faiss/faiss/IndexPreTransform.cpp +2 -2
data/vendor/faiss/faiss/IndexPreTransform.h +2 -2
data/vendor/faiss/faiss/IndexRefine.cpp +41 -4
data/vendor/faiss/faiss/IndexRefine.h +9 -2
data/vendor/faiss/faiss/IndexReplicas.cpp +2 -2
data/vendor/faiss/faiss/IndexReplicas.h +2 -2
data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +2 -2
data/vendor/faiss/faiss/IndexRowwiseMinMax.h +2 -2
data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -3
data/vendor/faiss/faiss/IndexScalarQuantizer.h +2 -2
data/vendor/faiss/faiss/IndexShards.cpp +2 -2
data/vendor/faiss/faiss/IndexShards.h +2 -2
data/vendor/faiss/faiss/IndexShardsIVF.cpp +2 -2
data/vendor/faiss/faiss/IndexShardsIVF.h +2 -2
data/vendor/faiss/faiss/MatrixStats.cpp +2 -2
data/vendor/faiss/faiss/MatrixStats.h +2 -2
data/vendor/faiss/faiss/MetaIndexes.cpp +2 -3
data/vendor/faiss/faiss/MetaIndexes.h +2 -2
data/vendor/faiss/faiss/MetricType.h +2 -2
data/vendor/faiss/faiss/VectorTransform.cpp +2 -2
data/vendor/faiss/faiss/VectorTransform.h +2 -2
data/vendor/faiss/faiss/clone_index.cpp +2 -2
data/vendor/faiss/faiss/clone_index.h +2 -2
data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +2 -2
data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +2 -2
data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +2 -2
data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +44 -4
data/vendor/faiss/faiss/cppcontrib/factory_tools.h +7 -2
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2 -2
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +2 -2
data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2 -2
data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +2 -2
data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +2 -2
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +2 -2
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +2 -2
data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +2 -2
data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +2 -5
data/vendor/faiss/faiss/gpu/GpuAutoTune.h +2 -2
data/vendor/faiss/faiss/gpu/GpuCloner.cpp +13 -13
data/vendor/faiss/faiss/gpu/GpuCloner.h +2 -2
data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +6 -6
data/vendor/faiss/faiss/gpu/GpuDistance.h +11 -7
data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +2 -2
data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +2 -2
data/vendor/faiss/faiss/gpu/GpuIndex.h +8 -7
data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -2
data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -3
data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +2 -2
data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +2 -2
data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +2 -2
data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +3 -3
data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +2 -2
data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +2 -2
data/vendor/faiss/faiss/gpu/GpuResources.cpp +7 -2
data/vendor/faiss/faiss/gpu/GpuResources.h +11 -4
data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +51 -21
data/vendor/faiss/faiss/gpu/StandardGpuResources.h +15 -5
data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +2 -2
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +2 -2
data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +2 -2
data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +2 -2
data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +2 -2
data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +2 -2
data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +2 -2
data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +2 -2
data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +2 -3
data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +2 -2
data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +2 -2
data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +2 -2
data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +54 -54
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +80 -78
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +51 -51
data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +2 -2
data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +3 -3
data/vendor/faiss/faiss/gpu/test/TestGpuResidualQuantizer.cpp +70 -0
data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +74 -4
data/vendor/faiss/faiss/gpu/test/TestUtils.h +2 -2
data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
data/vendor/faiss/faiss/gpu/utils/{RaftUtils.h → CuvsUtils.h} +12 -11
data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +2 -2
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +2 -2
data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +2 -2
data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +2 -2
data/vendor/faiss/faiss/gpu/utils/Timer.cpp +2 -2
data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +79 -11
data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +17 -5
data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +2 -2
data/vendor/faiss/faiss/impl/AuxIndexStructures.h +2 -2
data/vendor/faiss/faiss/impl/CodePacker.cpp +2 -2
data/vendor/faiss/faiss/impl/CodePacker.h +2 -2
data/vendor/faiss/faiss/impl/DistanceComputer.h +2 -2
data/vendor/faiss/faiss/impl/FaissAssert.h +2 -2
data/vendor/faiss/faiss/impl/FaissException.cpp +2 -2
data/vendor/faiss/faiss/impl/FaissException.h +2 -3
data/vendor/faiss/faiss/impl/HNSW.cpp +24 -19
data/vendor/faiss/faiss/impl/HNSW.h +12 -2
data/vendor/faiss/faiss/impl/IDSelector.cpp +2 -2
data/vendor/faiss/faiss/impl/IDSelector.h +2 -2
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +2 -2
data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +2 -2
data/vendor/faiss/faiss/impl/LookupTableScaler.h +2 -2
data/vendor/faiss/faiss/impl/NNDescent.cpp +2 -2
data/vendor/faiss/faiss/impl/NNDescent.h +2 -2
data/vendor/faiss/faiss/impl/NSG.cpp +27 -21
data/vendor/faiss/faiss/impl/NSG.h +20 -8
data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +2 -2
data/vendor/faiss/faiss/impl/PolysemousTraining.h +2 -2
data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +2 -4
data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +2 -2
data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +2 -2
data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +2 -2
data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -2
data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +2 -36
data/vendor/faiss/faiss/impl/ResidualQuantizer.h +3 -13
data/vendor/faiss/faiss/impl/ResultHandler.h +2 -2
data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +2 -2
data/vendor/faiss/faiss/impl/ScalarQuantizer.h +2 -2
data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +2 -2
data/vendor/faiss/faiss/impl/ThreadedIndex.h +2 -2
data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +2 -2
data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +2 -2
data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +2 -2
data/vendor/faiss/faiss/impl/code_distance/code_distance-sve.h +440 -0
data/vendor/faiss/faiss/impl/code_distance/code_distance.h +55 -2
data/vendor/faiss/faiss/impl/index_read.cpp +2 -5
data/vendor/faiss/faiss/impl/index_read_utils.h +2 -2
data/vendor/faiss/faiss/impl/index_write.cpp +2 -6
data/vendor/faiss/faiss/impl/io.cpp +2 -2
data/vendor/faiss/faiss/impl/io.h +2 -2
data/vendor/faiss/faiss/impl/io_macros.h +2 -9
data/vendor/faiss/faiss/impl/kmeans1d.cpp +2 -3
data/vendor/faiss/faiss/impl/kmeans1d.h +2 -2
data/vendor/faiss/faiss/impl/lattice_Zn.cpp +2 -3
data/vendor/faiss/faiss/impl/lattice_Zn.h +2 -2
data/vendor/faiss/faiss/impl/platform_macros.h +12 -2
data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +2 -2
data/vendor/faiss/faiss/impl/pq4_fast_scan.h +20 -2
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +2 -2
data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +2 -2
data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +3 -3
data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +2 -2
data/vendor/faiss/faiss/impl/simd_result_handlers.h +18 -18
data/vendor/faiss/faiss/index_factory.cpp +20 -21
data/vendor/faiss/faiss/index_factory.h +2 -2
data/vendor/faiss/faiss/index_io.h +2 -2
data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +2 -2
data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +2 -2
data/vendor/faiss/faiss/invlists/DirectMap.cpp +2 -2
data/vendor/faiss/faiss/invlists/DirectMap.h +2 -2
data/vendor/faiss/faiss/invlists/InvertedLists.cpp +2 -2
data/vendor/faiss/faiss/invlists/InvertedLists.h +2 -2
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +2 -2
data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +2 -2
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -3
data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -2
data/vendor/faiss/faiss/python/python_callbacks.cpp +2 -2
data/vendor/faiss/faiss/python/python_callbacks.h +2 -2
data/vendor/faiss/faiss/utils/AlignedTable.h +5 -3
data/vendor/faiss/faiss/utils/Heap.cpp +2 -2
data/vendor/faiss/faiss/utils/Heap.h +2 -2
data/vendor/faiss/faiss/utils/NeuralNet.cpp +11 -7
data/vendor/faiss/faiss/utils/NeuralNet.h +2 -2
data/vendor/faiss/faiss/utils/WorkerThread.cpp +2 -2
data/vendor/faiss/faiss/utils/WorkerThread.h +2 -2
data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +2 -2
data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +2 -2
data/vendor/faiss/faiss/utils/approx_topk/generic.h +2 -2
data/vendor/faiss/faiss/utils/approx_topk/mode.h +2 -2
data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +2 -2
data/vendor/faiss/faiss/utils/bf16.h +2 -2
data/vendor/faiss/faiss/utils/distances.cpp +191 -2
data/vendor/faiss/faiss/utils/distances.h +3 -3
data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +2 -2
data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +2 -2
data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +2 -2
data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +2 -2
data/vendor/faiss/faiss/utils/distances_simd.cpp +502 -3
data/vendor/faiss/faiss/utils/extra_distances-inl.h +2 -2
data/vendor/faiss/faiss/utils/extra_distances.cpp +2 -3
data/vendor/faiss/faiss/utils/extra_distances.h +2 -2
data/vendor/faiss/faiss/utils/fp16-arm.h +2 -2
data/vendor/faiss/faiss/utils/fp16-fp16c.h +2 -2
data/vendor/faiss/faiss/utils/fp16-inl.h +2 -2
data/vendor/faiss/faiss/utils/fp16.h +2 -2
data/vendor/faiss/faiss/utils/hamming-inl.h +2 -2
data/vendor/faiss/faiss/utils/hamming.cpp +2 -3
data/vendor/faiss/faiss/utils/hamming.h +2 -2
data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +2 -2
data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +490 -0
data/vendor/faiss/faiss/utils/hamming_distance/common.h +2 -2
data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +2 -2
data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +5 -2
data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +5 -5
data/vendor/faiss/faiss/utils/ordered_key_value.h +2 -2
data/vendor/faiss/faiss/utils/partitioning.cpp +2 -2
data/vendor/faiss/faiss/utils/partitioning.h +2 -2
data/vendor/faiss/faiss/utils/prefetch.h +2 -2
data/vendor/faiss/faiss/utils/quantize_lut.cpp +2 -2
data/vendor/faiss/faiss/utils/quantize_lut.h +2 -2
data/vendor/faiss/faiss/utils/random.cpp +2 -2
data/vendor/faiss/faiss/utils/random.h +2 -2
data/vendor/faiss/faiss/utils/simdlib.h +2 -2
data/vendor/faiss/faiss/utils/simdlib_avx2.h +2 -2
data/vendor/faiss/faiss/utils/simdlib_avx512.h +2 -2
data/vendor/faiss/faiss/utils/simdlib_emulated.h +2 -2
data/vendor/faiss/faiss/utils/simdlib_neon.h +2 -2
data/vendor/faiss/faiss/utils/simdlib_ppc64.h +2 -2
data/vendor/faiss/faiss/utils/sorting.cpp +2 -2
data/vendor/faiss/faiss/utils/sorting.h +2 -2
data/vendor/faiss/faiss/utils/transpose/transpose-avx2-inl.h +2 -2
data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +2 -2
data/vendor/faiss/faiss/utils/utils.cpp +7 -7
data/vendor/faiss/faiss/utils/utils.h +4 -3
metadata +9 -10

data/vendor/faiss/faiss/utils/distances_simd.cpp CHANGED Viewed

@@ -1,5 +1,5 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.
@@ -29,6 +29,10 @@
 #include <faiss/utils/transpose/transpose-avx2-inl.h>
 #endif
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
 #ifdef __aarch64__
 #include <arm_neon.h>
 #endif
@@ -2585,6 +2589,7 @@ size_t fvec_L2sqr_ny_nearest_y_transposed(
 float fvec_L1(const float* x, const float* y, size_t d) {
     __m256 msum1 = _mm256_setzero_ps();
+    // signmask used for absolute value
     __m256 signmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffffUL));
     while (d >= 8) {
@@ -2592,7 +2597,9 @@ float fvec_L1(const float* x, const float* y, size_t d) {
         x += 8;
         __m256 my = _mm256_loadu_ps(y);
         y += 8;
+        // subtract
         const __m256 a_m_b = _mm256_sub_ps(mx, my);
+        // find sum of absolute value of distances (manhattan distance)
         msum1 = _mm256_add_ps(msum1, _mm256_and_ps(signmask, a_m_b));
         d -= 8;
     }
@@ -2625,6 +2632,7 @@ float fvec_L1(const float* x, const float* y, size_t d) {
 float fvec_Linf(const float* x, const float* y, size_t d) {
     __m256 msum1 = _mm256_setzero_ps();
+    // signmask used for absolute value
     __m256 signmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffffUL));
     while (d >= 8) {
@@ -2632,7 +2640,9 @@ float fvec_Linf(const float* x, const float* y, size_t d) {
         x += 8;
         __m256 my = _mm256_loadu_ps(y);
         y += 8;
+        // subtract
         const __m256 a_m_b = _mm256_sub_ps(mx, my);
+        // find max of absolute value of distances (chebyshev distance)
         msum1 = _mm256_max_ps(msum1, _mm256_and_ps(signmask, a_m_b));
         d -= 8;
     }
@@ -2673,6 +2683,441 @@ float fvec_Linf(const float* x, const float* y, size_t d) {
     return fvec_Linf_ref(x, y, d);
 }
+#elif defined(__ARM_FEATURE_SVE)
+struct ElementOpIP { // inner-product element operation plugged into the fvec_op_ny_sve_* kernels
+    static svfloat32_t op(svbool_t pg, svfloat32_t x, svfloat32_t y) {
+        return svmul_f32_x(pg, x, y); // lane-wise x * y on the lanes selected by pg
+    }
+    static svfloat32_t merge(
+            svbool_t pg,
+            svfloat32_t z,
+            svfloat32_t x,
+            svfloat32_t y) {
+        return svmla_f32_x(pg, z, x, y); // lane-wise multiply-accumulate: z + x * y
+    }
+};
+template <typename ElementOp>
+void fvec_op_ny_sve_d1(float* dis, const float* x, const float* y, size_t ny) { // d == 1 kernel: dis[j] = ElementOp::op(x[0], y[j]) for j in [0, ny)
+    const size_t lanes = svcntw(); // number of 32-bit elements in one SVE vector
+    const size_t lanes2 = lanes * 2;
+    const size_t lanes3 = lanes * 3;
+    const size_t lanes4 = lanes * 4;
+    const svbool_t pg = svptrue_b32(); // all-true predicate for the full-vector main loop
+    const svfloat32_t x0 = svdup_n_f32(x[0]); // broadcast the single query component
+    size_t i = 0;
+    for (; i + lanes4 < ny; i += lanes4) { // four full vectors per iteration; strict '<' leaves the last (at most lanes4) elements to the predicated tail
+        svfloat32_t y0 = svld1_f32(pg, y);
+        svfloat32_t y1 = svld1_f32(pg, y + lanes);
+        svfloat32_t y2 = svld1_f32(pg, y + lanes2);
+        svfloat32_t y3 = svld1_f32(pg, y + lanes3);
+        y0 = ElementOp::op(pg, x0, y0);
+        y1 = ElementOp::op(pg, x0, y1);
+        y2 = ElementOp::op(pg, x0, y2);
+        y3 = ElementOp::op(pg, x0, y3);
+        svst1_f32(pg, dis, y0);
+        svst1_f32(pg, dis + lanes, y1);
+        svst1_f32(pg, dis + lanes2, y2);
+        svst1_f32(pg, dis + lanes3, y3);
+        y += lanes4;
+        dis += lanes4;
+    }
+    const svbool_t pg0 = svwhilelt_b32_u64(i, ny); // tail predicates: a lane is active only while its element index is below ny
+    const svbool_t pg1 = svwhilelt_b32_u64(i + lanes, ny);
+    const svbool_t pg2 = svwhilelt_b32_u64(i + lanes2, ny);
+    const svbool_t pg3 = svwhilelt_b32_u64(i + lanes3, ny);
+    svfloat32_t y0 = svld1_f32(pg0, y); // predicated loads: inactive lanes are not read from memory
+    svfloat32_t y1 = svld1_f32(pg1, y + lanes);
+    svfloat32_t y2 = svld1_f32(pg2, y + lanes2);
+    svfloat32_t y3 = svld1_f32(pg3, y + lanes3);
+    y0 = ElementOp::op(pg0, x0, y0);
+    y1 = ElementOp::op(pg1, x0, y1);
+    y2 = ElementOp::op(pg2, x0, y2);
+    y3 = ElementOp::op(pg3, x0, y3);
+    svst1_f32(pg0, dis, y0); // predicated stores: only the remaining ny - i results are written
+    svst1_f32(pg1, dis + lanes, y1);
+    svst1_f32(pg2, dis + lanes2, y2);
+    svst1_f32(pg3, dis + lanes3, y3);
+}
+template <typename ElementOp>
+void fvec_op_ny_sve_d2(float* dis, const float* x, const float* y, size_t ny) { // d == 2 kernel: dis[j] = merge(op(x[0], y[2j]), x[1], y[2j + 1]) for j in [0, ny)
+    const size_t lanes = svcntw(); // number of 32-bit elements in one SVE vector
+    const size_t lanes2 = lanes * 2;
+    const size_t lanes4 = lanes * 4;
+    const svbool_t pg = svptrue_b32();
+    const svfloat32_t x0 = svdup_n_f32(x[0]); // broadcast each query component
+    const svfloat32_t x1 = svdup_n_f32(x[1]);
+    size_t i = 0;
+    for (; i + lanes2 < ny; i += lanes2) { // 2 * lanes database vectors per iteration; the remainder goes to the predicated tail
+        const svfloat32x2_t y0 = svld2_f32(pg, y); // de-interleaving load: part 0 = component 0, part 1 = component 1 of `lanes` consecutive vectors
+        const svfloat32x2_t y1 = svld2_f32(pg, y + lanes2);
+        svfloat32_t y00 = svget2_f32(y0, 0);
+        const svfloat32_t y01 = svget2_f32(y0, 1);
+        svfloat32_t y10 = svget2_f32(y1, 0);
+        const svfloat32_t y11 = svget2_f32(y1, 1);
+        y00 = ElementOp::op(pg, x0, y00);
+        y10 = ElementOp::op(pg, x0, y10);
+        y00 = ElementOp::merge(pg, y00, x1, y01);
+        y10 = ElementOp::merge(pg, y10, x1, y11);
+        svst1_f32(pg, dis, y00);
+        svst1_f32(pg, dis + lanes, y10);
+        y += lanes4; // the two svld2 loads consumed 2 * lanes2 floats
+        dis += lanes2; // and produced lanes2 results
+    }
+    const svbool_t pg0 = svwhilelt_b32_u64(i, ny); // tail predicates, counted in database vectors (one result lane per vector)
+    const svbool_t pg1 = svwhilelt_b32_u64(i + lanes, ny);
+    const svfloat32x2_t y0 = svld2_f32(pg0, y);
+    const svfloat32x2_t y1 = svld2_f32(pg1, y + lanes2);
+    svfloat32_t y00 = svget2_f32(y0, 0);
+    const svfloat32_t y01 = svget2_f32(y0, 1);
+    svfloat32_t y10 = svget2_f32(y1, 0);
+    const svfloat32_t y11 = svget2_f32(y1, 1);
+    y00 = ElementOp::op(pg0, x0, y00);
+    y10 = ElementOp::op(pg1, x0, y10);
+    y00 = ElementOp::merge(pg0, y00, x1, y01);
+    y10 = ElementOp::merge(pg1, y10, x1, y11);
+    svst1_f32(pg0, dis, y00); // only the remaining ny - i results are written
+    svst1_f32(pg1, dis + lanes, y10);
+}
+template <typename ElementOp>
+void fvec_op_ny_sve_d4(float* dis, const float* x, const float* y, size_t ny) { // d == 4 kernel: one result per 4-float database vector, `lanes` vectors at a time
+    const size_t lanes = svcntw(); // number of 32-bit elements in one SVE vector
+    const size_t lanes4 = lanes * 4;
+    const svbool_t pg = svptrue_b32();
+    const svfloat32_t x0 = svdup_n_f32(x[0]); // broadcast each query component
+    const svfloat32_t x1 = svdup_n_f32(x[1]);
+    const svfloat32_t x2 = svdup_n_f32(x[2]);
+    const svfloat32_t x3 = svdup_n_f32(x[3]);
+    size_t i = 0;
+    for (; i + lanes < ny; i += lanes) { // full groups of `lanes` vectors; the last (at most `lanes`) vectors go to the predicated tail
+        const svfloat32x4_t y0 = svld4_f32(pg, y); // de-interleaving load: part p holds component p of `lanes` consecutive vectors
+        svfloat32_t y00 = svget4_f32(y0, 0);
+        const svfloat32_t y01 = svget4_f32(y0, 1);
+        svfloat32_t y02 = svget4_f32(y0, 2);
+        const svfloat32_t y03 = svget4_f32(y0, 3);
+        y00 = ElementOp::op(pg, x0, y00); // two independent accumulator chains (components 0-1 and 2-3)
+        y02 = ElementOp::op(pg, x2, y02);
+        y00 = ElementOp::merge(pg, y00, x1, y01);
+        y02 = ElementOp::merge(pg, y02, x3, y03);
+        y00 = svadd_f32_x(pg, y00, y02); // combine the two chains
+        svst1_f32(pg, dis, y00);
+        y += lanes4;
+        dis += lanes;
+    }
+    const svbool_t pg0 = svwhilelt_b32_u64(i, ny); // tail predicate, counted in database vectors
+    const svfloat32x4_t y0 = svld4_f32(pg0, y);
+    svfloat32_t y00 = svget4_f32(y0, 0);
+    const svfloat32_t y01 = svget4_f32(y0, 1);
+    svfloat32_t y02 = svget4_f32(y0, 2);
+    const svfloat32_t y03 = svget4_f32(y0, 3);
+    y00 = ElementOp::op(pg0, x0, y00);
+    y02 = ElementOp::op(pg0, x2, y02);
+    y00 = ElementOp::merge(pg0, y00, x1, y01);
+    y02 = ElementOp::merge(pg0, y02, x3, y03);
+    y00 = svadd_f32_x(pg0, y00, y02);
+    svst1_f32(pg0, dis, y00); // only the remaining ny - i results are written
+}
+template <typename ElementOp>
+void fvec_op_ny_sve_d8(float* dis, const float* x, const float* y, size_t ny) { // d == 8 kernel: one result per 8-float database vector, `lanes` vectors at a time
+    const size_t lanes = svcntw(); // number of 32-bit elements in one SVE vector
+    const size_t lanes4 = lanes * 4;
+    const size_t lanes8 = lanes * 8;
+    const svbool_t pg = svptrue_b32();
+    const svfloat32_t x0 = svdup_n_f32(x[0]); // broadcast each of the eight query components
+    const svfloat32_t x1 = svdup_n_f32(x[1]);
+    const svfloat32_t x2 = svdup_n_f32(x[2]);
+    const svfloat32_t x3 = svdup_n_f32(x[3]);
+    const svfloat32_t x4 = svdup_n_f32(x[4]);
+    const svfloat32_t x5 = svdup_n_f32(x[5]);
+    const svfloat32_t x6 = svdup_n_f32(x[6]);
+    const svfloat32_t x7 = svdup_n_f32(x[7]);
+    size_t i = 0;
+    for (; i + lanes < ny; i += lanes) { // full groups of `lanes` vectors; the last (at most `lanes`) vectors go to the predicated tail
+        const svfloat32x4_t ya = svld4_f32(pg, y); // stride-4 de-interleave of the first lanes / 2 vectors: part p alternates component p (even lanes) and p + 4 (odd lanes)
+        const svfloat32x4_t yb = svld4_f32(pg, y + lanes4); // same for the second lanes / 2 vectors
+        const svfloat32_t ya0 = svget4_f32(ya, 0);
+        const svfloat32_t ya1 = svget4_f32(ya, 1);
+        const svfloat32_t ya2 = svget4_f32(ya, 2);
+        const svfloat32_t ya3 = svget4_f32(ya, 3);
+        const svfloat32_t yb0 = svget4_f32(yb, 0);
+        const svfloat32_t yb1 = svget4_f32(yb, 1);
+        const svfloat32_t yb2 = svget4_f32(yb, 2);
+        const svfloat32_t yb3 = svget4_f32(yb, 3);
+        svfloat32_t y0 = svuzp1(ya0, yb0); // svuzp1 keeps the even lanes of both inputs: components 0..3 of all `lanes` vectors
+        const svfloat32_t y1 = svuzp1(ya1, yb1);
+        svfloat32_t y2 = svuzp1(ya2, yb2);
+        const svfloat32_t y3 = svuzp1(ya3, yb3);
+        svfloat32_t y4 = svuzp2(ya0, yb0); // svuzp2 keeps the odd lanes: components 4..7
+        const svfloat32_t y5 = svuzp2(ya1, yb1);
+        svfloat32_t y6 = svuzp2(ya2, yb2);
+        const svfloat32_t y7 = svuzp2(ya3, yb3);
+        y0 = ElementOp::op(pg, x0, y0); // four independent accumulator chains (component pairs 0-1, 2-3, 4-5, 6-7)
+        y2 = ElementOp::op(pg, x2, y2);
+        y4 = ElementOp::op(pg, x4, y4);
+        y6 = ElementOp::op(pg, x6, y6);
+        y0 = ElementOp::merge(pg, y0, x1, y1);
+        y2 = ElementOp::merge(pg, y2, x3, y3);
+        y4 = ElementOp::merge(pg, y4, x5, y5);
+        y6 = ElementOp::merge(pg, y6, x7, y7);
+        y0 = svadd_f32_x(pg, y0, y2); // pairwise reduction of the four chains
+        y4 = svadd_f32_x(pg, y4, y6);
+        y0 = svadd_f32_x(pg, y0, y4);
+        svst1_f32(pg, dis, y0);
+        y += lanes8;
+        dis += lanes;
+    }
+    const svbool_t pg0 = svwhilelt_b32_u64(i, ny); // result predicate, counted in database vectors
+    const svbool_t pga = svwhilelt_b32_u64(i * 2, ny * 2); // load predicates, counted in 4-float half-vectors: 2 * (ny - i) of them remain
+    const svbool_t pgb = svwhilelt_b32_u64(i * 2 + lanes, ny * 2); // second svld4 starts `lanes` half-vectors later
+    const svfloat32x4_t ya = svld4_f32(pga, y);
+    const svfloat32x4_t yb = svld4_f32(pgb, y + lanes4);
+    const svfloat32_t ya0 = svget4_f32(ya, 0);
+    const svfloat32_t ya1 = svget4_f32(ya, 1);
+    const svfloat32_t ya2 = svget4_f32(ya, 2);
+    const svfloat32_t ya3 = svget4_f32(ya, 3);
+    const svfloat32_t yb0 = svget4_f32(yb, 0);
+    const svfloat32_t yb1 = svget4_f32(yb, 1);
+    const svfloat32_t yb2 = svget4_f32(yb, 2);
+    const svfloat32_t yb3 = svget4_f32(yb, 3);
+    svfloat32_t y0 = svuzp1(ya0, yb0); // same even/odd regrouping as in the main loop
+    const svfloat32_t y1 = svuzp1(ya1, yb1);
+    svfloat32_t y2 = svuzp1(ya2, yb2);
+    const svfloat32_t y3 = svuzp1(ya3, yb3);
+    svfloat32_t y4 = svuzp2(ya0, yb0);
+    const svfloat32_t y5 = svuzp2(ya1, yb1);
+    svfloat32_t y6 = svuzp2(ya2, yb2);
+    const svfloat32_t y7 = svuzp2(ya3, yb3);
+    y0 = ElementOp::op(pg0, x0, y0);
+    y2 = ElementOp::op(pg0, x2, y2);
+    y4 = ElementOp::op(pg0, x4, y4);
+    y6 = ElementOp::op(pg0, x6, y6);
+    y0 = ElementOp::merge(pg0, y0, x1, y1);
+    y2 = ElementOp::merge(pg0, y2, x3, y3);
+    y4 = ElementOp::merge(pg0, y4, x5, y5);
+    y6 = ElementOp::merge(pg0, y6, x7, y7);
+    y0 = svadd_f32_x(pg0, y0, y2);
+    y4 = svadd_f32_x(pg0, y4, y6);
+    y0 = svadd_f32_x(pg0, y0, y4);
+    svst1_f32(pg0, dis, y0); // only the remaining ny - i results are written
+    y += lanes8; // NOTE(review): y and dis are not read again before the function returns
+    dis += lanes;
+}
+template <typename ElementOp>
+void fvec_op_ny_sve_lanes1( // kernel for d == svcntw(): every database vector fills exactly one SVE register
+        float* dis,
+        const float* x,
+        const float* y,
+        size_t ny) {
+    const size_t lanes = svcntw(); // number of 32-bit elements in one SVE vector
+    const size_t lanes2 = lanes * 2;
+    const size_t lanes3 = lanes * 3;
+    const size_t lanes4 = lanes * 4;
+    const svbool_t pg = svptrue_b32(); // all loads are full-width, so no partial predicate is needed
+    const svfloat32_t x0 = svld1_f32(pg, x); // the whole query vector
+    size_t i = 0;
+    for (; i + 3 < ny; i += 4) { // four database vectors per iteration
+        svfloat32_t y0 = svld1_f32(pg, y);
+        svfloat32_t y1 = svld1_f32(pg, y + lanes);
+        svfloat32_t y2 = svld1_f32(pg, y + lanes2);
+        svfloat32_t y3 = svld1_f32(pg, y + lanes3);
+        y += lanes4;
+        y0 = ElementOp::op(pg, x0, y0);
+        y1 = ElementOp::op(pg, x0, y1);
+        y2 = ElementOp::op(pg, x0, y2);
+        y3 = ElementOp::op(pg, x0, y3);
+        dis[i] = svaddv_f32(pg, y0); // horizontal sum across the lanes gives one scalar per database vector
+        dis[i + 1] = svaddv_f32(pg, y1);
+        dis[i + 2] = svaddv_f32(pg, y2);
+        dis[i + 3] = svaddv_f32(pg, y3);
+    }
+    for (; i < ny; ++i) { // up to three leftover database vectors, one at a time
+        svfloat32_t y0 = svld1_f32(pg, y);
+        y += lanes;
+        y0 = ElementOp::op(pg, x0, y0);
+        dis[i] = svaddv_f32(pg, y0);
+    }
+}
+template <typename ElementOp>
+void fvec_op_ny_sve_lanes2( // kernel for d == 2 * svcntw(): every database vector spans two SVE registers
+        float* dis,
+        const float* x,
+        const float* y,
+        size_t ny) {
+    const size_t lanes = svcntw(); // number of 32-bit elements in one SVE vector
+    const size_t lanes2 = lanes * 2;
+    const size_t lanes3 = lanes * 3;
+    const size_t lanes4 = lanes * 4;
+    const svbool_t pg = svptrue_b32();
+    const svfloat32_t x0 = svld1_f32(pg, x); // first half of the query vector
+    const svfloat32_t x1 = svld1_f32(pg, x + lanes); // second half
+    size_t i = 0;
+    for (; i + 1 < ny; i += 2) { // two database vectors per iteration
+        svfloat32_t y00 = svld1_f32(pg, y);
+        const svfloat32_t y01 = svld1_f32(pg, y + lanes);
+        svfloat32_t y10 = svld1_f32(pg, y + lanes2);
+        const svfloat32_t y11 = svld1_f32(pg, y + lanes3);
+        y += lanes4;
+        y00 = ElementOp::op(pg, x0, y00);
+        y10 = ElementOp::op(pg, x0, y10);
+        y00 = ElementOp::merge(pg, y00, x1, y01); // fold the second half into the same accumulator
+        y10 = ElementOp::merge(pg, y10, x1, y11);
+        dis[i] = svaddv_f32(pg, y00); // horizontal sum across the lanes
+        dis[i + 1] = svaddv_f32(pg, y10);
+    }
+    if (i < ny) { // odd ny: one database vector left
+        svfloat32_t y0 = svld1_f32(pg, y);
+        const svfloat32_t y1 = svld1_f32(pg, y + lanes);
+        y0 = ElementOp::op(pg, x0, y0);
+        y0 = ElementOp::merge(pg, y0, x1, y1);
+        dis[i] = svaddv_f32(pg, y0);
+    }
+}
+template <typename ElementOp>
+void fvec_op_ny_sve_lanes3( // kernel for d == 3 * svcntw(): every database vector spans three SVE registers
+        float* dis,
+        const float* x,
+        const float* y,
+        size_t ny) {
+    const size_t lanes = svcntw(); // number of 32-bit elements in one SVE vector
+    const size_t lanes2 = lanes * 2;
+    const size_t lanes3 = lanes * 3;
+    const svbool_t pg = svptrue_b32();
+    const svfloat32_t x0 = svld1_f32(pg, x); // the query vector, one register per third
+    const svfloat32_t x1 = svld1_f32(pg, x + lanes);
+    const svfloat32_t x2 = svld1_f32(pg, x + lanes2);
+    for (size_t i = 0; i < ny; ++i) { // one database vector per iteration, no unrolling
+        svfloat32_t y0 = svld1_f32(pg, y);
+        const svfloat32_t y1 = svld1_f32(pg, y + lanes);
+        svfloat32_t y2 = svld1_f32(pg, y + lanes2);
+        y += lanes3;
+        y0 = ElementOp::op(pg, x0, y0);
+        y0 = ElementOp::merge(pg, y0, x1, y1); // single accumulator chain over the three registers
+        y0 = ElementOp::merge(pg, y0, x2, y2);
+        dis[i] = svaddv_f32(pg, y0); // horizontal sum across the lanes
+    }
+}
+template <typename ElementOp>
+void fvec_op_ny_sve_lanes4( // kernel for d == 4 * svcntw(): every database vector spans four SVE registers
+        float* dis,
+        const float* x,
+        const float* y,
+        size_t ny) {
+    const size_t lanes = svcntw(); // number of 32-bit elements in one SVE vector
+    const size_t lanes2 = lanes * 2;
+    const size_t lanes3 = lanes * 3;
+    const size_t lanes4 = lanes * 4;
+    const svbool_t pg = svptrue_b32();
+    const svfloat32_t x0 = svld1_f32(pg, x); // the query vector, one register per quarter
+    const svfloat32_t x1 = svld1_f32(pg, x + lanes);
+    const svfloat32_t x2 = svld1_f32(pg, x + lanes2);
+    const svfloat32_t x3 = svld1_f32(pg, x + lanes3);
+    for (size_t i = 0; i < ny; ++i) { // one database vector per iteration
+        svfloat32_t y0 = svld1_f32(pg, y);
+        const svfloat32_t y1 = svld1_f32(pg, y + lanes);
+        svfloat32_t y2 = svld1_f32(pg, y + lanes2);
+        const svfloat32_t y3 = svld1_f32(pg, y + lanes3);
+        y += lanes4;
+        y0 = ElementOp::op(pg, x0, y0); // two independent accumulator chains (registers 0-1 and 2-3)
+        y2 = ElementOp::op(pg, x2, y2);
+        y0 = ElementOp::merge(pg, y0, x1, y1);
+        y2 = ElementOp::merge(pg, y2, x3, y3);
+        y0 = svadd_f32_x(pg, y0, y2); // combine the two chains
+        dis[i] = svaddv_f32(pg, y0); // horizontal sum across the lanes
+    }
+}
+void fvec_L2sqr_ny( // SVE build: no dedicated kernel in this branch
+        float* dis,
+        const float* x,
+        const float* y,
+        size_t d,
+        size_t ny) {
+    fvec_L2sqr_ny_ref(dis, x, y, d, ny); // forwards all arguments unchanged to fvec_L2sqr_ny_ref
+}
+void fvec_L2sqr_ny_transposed( // SVE build: no dedicated kernel in this branch
+        float* dis,
+        const float* x,
+        const float* y,
+        const float* y_sqlen,
+        size_t d,
+        size_t d_offset,
+        size_t ny) {
+    return fvec_L2sqr_ny_y_transposed_ref(dis, x, y, y_sqlen, d, d_offset, ny); // forwards all arguments unchanged to fvec_L2sqr_ny_y_transposed_ref
+}
+size_t fvec_L2sqr_ny_nearest( // SVE build: no dedicated kernel in this branch
+        float* distances_tmp_buffer,
+        const float* x,
+        const float* y,
+        size_t d,
+        size_t ny) {
+    return fvec_L2sqr_ny_nearest_ref(distances_tmp_buffer, x, y, d, ny); // returns whatever fvec_L2sqr_ny_nearest_ref returns for the same arguments
+}
+size_t fvec_L2sqr_ny_nearest_y_transposed( // SVE build: no dedicated kernel in this branch
+        float* distances_tmp_buffer,
+        const float* x,
+        const float* y,
+        const float* y_sqlen,
+        size_t d,
+        size_t d_offset,
+        size_t ny) {
+    return fvec_L2sqr_ny_nearest_y_transposed_ref( // returns the result of the _ref function for the same arguments
+            distances_tmp_buffer, x, y, y_sqlen, d, d_offset, ny);
+}
+float fvec_L1(const float* x, const float* y, size_t d) { // SVE build: no dedicated kernel in this branch
+    return fvec_L1_ref(x, y, d); // forwards to fvec_L1_ref
+}
+float fvec_Linf(const float* x, const float* y, size_t d) { // SVE build: no dedicated kernel in this branch
+    return fvec_Linf_ref(x, y, d); // forwards to fvec_Linf_ref
+}
+void fvec_inner_products_ny( // SVE dispatcher: picks a kernel from the dimension d, each instantiated with ElementOpIP
+        float* dis,
+        const float* x,
+        const float* y,
+        size_t d,
+        size_t ny) {
+    const size_t lanes = svcntw(); // runtime SVE vector length in 32-bit elements
+    switch (d) { // small fixed dimensions first: these kernels de-interleave several database vectors per register
+        case 1:
+            fvec_op_ny_sve_d1<ElementOpIP>(dis, x, y, ny);
+            break;
+        case 2:
+            fvec_op_ny_sve_d2<ElementOpIP>(dis, x, y, ny);
+            break;
+        case 4:
+            fvec_op_ny_sve_d4<ElementOpIP>(dis, x, y, ny);
+            break;
+        case 8:
+            fvec_op_ny_sve_d8<ElementOpIP>(dis, x, y, ny);
+            break;
+        default: // reached only for d not in {1, 2, 4, 8}, so those case labels take priority over the lanes checks below
+            if (d == lanes) // d is exactly 1, 2, 3 or 4 full SVE registers wide
+                fvec_op_ny_sve_lanes1<ElementOpIP>(dis, x, y, ny);
+            else if (d == lanes * 2)
+                fvec_op_ny_sve_lanes2<ElementOpIP>(dis, x, y, ny);
+            else if (d == lanes * 3)
+                fvec_op_ny_sve_lanes3<ElementOpIP>(dis, x, y, ny);
+            else if (d == lanes * 4)
+                fvec_op_ny_sve_lanes4<ElementOpIP>(dis, x, y, ny);
+            else
+                fvec_inner_products_ny_ref(dis, x, y, d, ny); // any other dimension is handed to fvec_inner_products_ny_ref
+            break;
+    }
+}
 #elif defined(__aarch64__)
 // not optimized for ARM
@@ -2934,6 +3379,60 @@ void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c) {
 #endif
 }
+#elif defined(__ARM_FEATURE_SVE)
+void fvec_madd(
+        const size_t n,
+        const float* __restrict a,
+        const float bf,
+        const float* __restrict b,
+        float* __restrict c) {
+    const size_t lanes = static_cast<size_t>(svcntw());
+    const size_t lanes2 = lanes * 2;
+    const size_t lanes3 = lanes * 3;
+    const size_t lanes4 = lanes * 4;
+    size_t i = 0;
+    for (; i + lanes4 < n; i += lanes4) {
+        const auto mask = svptrue_b32();
+        const auto ai0 = svld1_f32(mask, a + i);
+        const auto ai1 = svld1_f32(mask, a + i + lanes);
+        const auto ai2 = svld1_f32(mask, a + i + lanes2);
+        const auto ai3 = svld1_f32(mask, a + i + lanes3);
+        const auto bi0 = svld1_f32(mask, b + i);
+        const auto bi1 = svld1_f32(mask, b + i + lanes);
+        const auto bi2 = svld1_f32(mask, b + i + lanes2);
+        const auto bi3 = svld1_f32(mask, b + i + lanes3);
+        const auto ci0 = svmla_n_f32_x(mask, ai0, bi0, bf);
+        const auto ci1 = svmla_n_f32_x(mask, ai1, bi1, bf);
+        const auto ci2 = svmla_n_f32_x(mask, ai2, bi2, bf);
+        const auto ci3 = svmla_n_f32_x(mask, ai3, bi3, bf);
+        svst1_f32(mask, c + i, ci0);
+        svst1_f32(mask, c + i + lanes, ci1);
+        svst1_f32(mask, c + i + lanes2, ci2);
+        svst1_f32(mask, c + i + lanes3, ci3);
+    }
+    const auto mask0 = svwhilelt_b32_u64(i, n);
+    const auto mask1 = svwhilelt_b32_u64(i + lanes, n);
+    const auto mask2 = svwhilelt_b32_u64(i + lanes2, n);
+    const auto mask3 = svwhilelt_b32_u64(i + lanes3, n);
+    const auto ai0 = svld1_f32(mask0, a + i);
+    const auto ai1 = svld1_f32(mask1, a + i + lanes);
+    const auto ai2 = svld1_f32(mask2, a + i + lanes2);
+    const auto ai3 = svld1_f32(mask3, a + i + lanes3);
+    const auto bi0 = svld1_f32(mask0, b + i);
+    const auto bi1 = svld1_f32(mask1, b + i + lanes);
+    const auto bi2 = svld1_f32(mask2, b + i + lanes2);
+    const auto bi3 = svld1_f32(mask3, b + i + lanes3);
+    const auto ci0 = svmla_n_f32_x(mask0, ai0, bi0, bf);
+    const auto ci1 = svmla_n_f32_x(mask1, ai1, bi1, bf);
+    const auto ci2 = svmla_n_f32_x(mask2, ai2, bi2, bf);
+    const auto ci3 = svmla_n_f32_x(mask3, ai3, bi3, bf);
+    svst1_f32(mask0, c + i, ci0);
+    svst1_f32(mask1, c + i + lanes, ci1);
+    svst1_f32(mask2, c + i + lanes2, ci2);
+    svst1_f32(mask3, c + i + lanes3, ci3);
+}
 #elif defined(__aarch64__)
 void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c) {
@@ -3266,7 +3765,7 @@ void fvec_add(size_t d, const float* a, float b, float* c) {
     size_t i;
     simd8float32 bv(b);
     for (i = 0; i + 7 < d; i += 8) {
-        simd8float32 ci, ai, bi;
+        simd8float32 ci, ai;
         ai.loadu(a + i);
         ci = ai + bv;
         ci.storeu(c + i);

data/vendor/faiss/faiss/utils/extra_distances-inl.h CHANGED Viewed

@@ -1,5 +1,5 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.

data/vendor/faiss/faiss/utils/extra_distances.cpp CHANGED Viewed

@@ -1,5 +1,5 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.
@@ -15,7 +15,6 @@
 #include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/impl/DistanceComputer.h>
-#include <faiss/impl/FaissAssert.h>
 #include <faiss/utils/utils.h>
 namespace faiss {

data/vendor/faiss/faiss/utils/extra_distances.h CHANGED Viewed

@@ -1,5 +1,5 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.

data/vendor/faiss/faiss/utils/fp16-arm.h CHANGED Viewed

@@ -1,5 +1,5 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.

data/vendor/faiss/faiss/utils/fp16-fp16c.h CHANGED Viewed

@@ -1,5 +1,5 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.

data/vendor/faiss/faiss/utils/fp16-inl.h CHANGED Viewed

@@ -1,5 +1,5 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.

data/vendor/faiss/faiss/utils/fp16.h CHANGED Viewed

@@ -1,5 +1,5 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.

data/vendor/faiss/faiss/utils/hamming-inl.h CHANGED Viewed

@@ -1,5 +1,5 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.

data/vendor/faiss/faiss/utils/hamming.cpp CHANGED Viewed

@@ -1,5 +1,5 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.
@@ -24,7 +24,6 @@
 #include <faiss/utils/hamming.h>
 #include <algorithm>
-#include <cmath>
 #include <cstdio>
 #include <memory>
 #include <vector>

data/vendor/faiss/faiss/utils/hamming.h CHANGED Viewed

@@ -1,5 +1,5 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.

data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h CHANGED Viewed

@@ -1,5 +1,5 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.