faiss 0.3.2 → 0.3.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +2 -2
- data/vendor/faiss/faiss/AutoTune.h +2 -2
- data/vendor/faiss/faiss/Clustering.cpp +2 -2
- data/vendor/faiss/faiss/Clustering.h +2 -2
- data/vendor/faiss/faiss/IVFlib.cpp +2 -2
- data/vendor/faiss/faiss/IVFlib.h +2 -2
- data/vendor/faiss/faiss/Index.cpp +6 -2
- data/vendor/faiss/faiss/Index.h +10 -3
- data/vendor/faiss/faiss/Index2Layer.cpp +2 -2
- data/vendor/faiss/faiss/Index2Layer.h +2 -2
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +7 -7
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.h +2 -2
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +14 -16
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +2 -2
- data/vendor/faiss/faiss/IndexBinary.cpp +13 -2
- data/vendor/faiss/faiss/IndexBinary.h +8 -2
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +2 -3
- data/vendor/faiss/faiss/IndexBinaryFlat.h +2 -2
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +2 -2
- data/vendor/faiss/faiss/IndexBinaryFromFloat.h +2 -2
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +2 -7
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +2 -2
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +2 -3
- data/vendor/faiss/faiss/IndexBinaryHash.h +2 -2
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +2 -2
- data/vendor/faiss/faiss/IndexBinaryIVF.h +2 -2
- data/vendor/faiss/faiss/IndexFastScan.cpp +10 -14
- data/vendor/faiss/faiss/IndexFastScan.h +11 -2
- data/vendor/faiss/faiss/IndexFlat.cpp +2 -3
- data/vendor/faiss/faiss/IndexFlat.h +2 -2
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +11 -2
- data/vendor/faiss/faiss/IndexFlatCodes.h +5 -2
- data/vendor/faiss/faiss/IndexHNSW.cpp +13 -6
- data/vendor/faiss/faiss/IndexHNSW.h +2 -2
- data/vendor/faiss/faiss/IndexIDMap.cpp +19 -3
- data/vendor/faiss/faiss/IndexIDMap.h +5 -2
- data/vendor/faiss/faiss/IndexIVF.cpp +2 -3
- data/vendor/faiss/faiss/IndexIVF.h +5 -4
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +6 -7
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.h +2 -2
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +3 -14
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +2 -4
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +71 -34
- data/vendor/faiss/faiss/IndexIVFFastScan.h +19 -2
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +2 -2
- data/vendor/faiss/faiss/IndexIVFFlat.h +2 -2
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.h +2 -2
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +2 -3
- data/vendor/faiss/faiss/IndexIVFPQ.h +2 -2
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +7 -33
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +2 -4
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +2 -2
- data/vendor/faiss/faiss/IndexIVFPQR.h +2 -2
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +2 -3
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +2 -2
- data/vendor/faiss/faiss/IndexLSH.cpp +2 -3
- data/vendor/faiss/faiss/IndexLSH.h +2 -2
- data/vendor/faiss/faiss/IndexLattice.cpp +2 -2
- data/vendor/faiss/faiss/IndexLattice.h +2 -2
- data/vendor/faiss/faiss/IndexNNDescent.cpp +2 -2
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
- data/vendor/faiss/faiss/IndexNSG.cpp +2 -5
- data/vendor/faiss/faiss/IndexNSG.h +2 -2
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +2 -2
- data/vendor/faiss/faiss/IndexNeuralNetCodec.h +2 -2
- data/vendor/faiss/faiss/IndexPQ.cpp +26 -26
- data/vendor/faiss/faiss/IndexPQ.h +2 -2
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +2 -5
- data/vendor/faiss/faiss/IndexPQFastScan.h +2 -11
- data/vendor/faiss/faiss/IndexPreTransform.cpp +2 -2
- data/vendor/faiss/faiss/IndexPreTransform.h +2 -2
- data/vendor/faiss/faiss/IndexRefine.cpp +41 -4
- data/vendor/faiss/faiss/IndexRefine.h +9 -2
- data/vendor/faiss/faiss/IndexReplicas.cpp +2 -2
- data/vendor/faiss/faiss/IndexReplicas.h +2 -2
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +2 -2
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +2 -2
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +2 -3
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +2 -2
- data/vendor/faiss/faiss/IndexShards.cpp +2 -2
- data/vendor/faiss/faiss/IndexShards.h +2 -2
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +2 -2
- data/vendor/faiss/faiss/IndexShardsIVF.h +2 -2
- data/vendor/faiss/faiss/MatrixStats.cpp +2 -2
- data/vendor/faiss/faiss/MatrixStats.h +2 -2
- data/vendor/faiss/faiss/MetaIndexes.cpp +2 -3
- data/vendor/faiss/faiss/MetaIndexes.h +2 -2
- data/vendor/faiss/faiss/MetricType.h +2 -2
- data/vendor/faiss/faiss/VectorTransform.cpp +2 -2
- data/vendor/faiss/faiss/VectorTransform.h +2 -2
- data/vendor/faiss/faiss/clone_index.cpp +2 -2
- data/vendor/faiss/faiss/clone_index.h +2 -2
- data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +2 -2
- data/vendor/faiss/faiss/cppcontrib/detail/CoarseBitType.h +2 -2
- data/vendor/faiss/faiss/cppcontrib/detail/UintReader.h +2 -2
- data/vendor/faiss/faiss/cppcontrib/factory_tools.cpp +44 -4
- data/vendor/faiss/faiss/cppcontrib/factory_tools.h +7 -2
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +2 -2
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +2 -2
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +2 -2
- data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h +2 -2
- data/vendor/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h +2 -2
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +2 -2
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +2 -2
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +2 -2
- data/vendor/faiss/faiss/gpu/GpuAutoTune.cpp +2 -5
- data/vendor/faiss/faiss/gpu/GpuAutoTune.h +2 -2
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +13 -13
- data/vendor/faiss/faiss/gpu/GpuCloner.h +2 -2
- data/vendor/faiss/faiss/gpu/GpuClonerOptions.h +6 -6
- data/vendor/faiss/faiss/gpu/GpuDistance.h +11 -7
- data/vendor/faiss/faiss/gpu/GpuFaissAssert.h +2 -2
- data/vendor/faiss/faiss/gpu/GpuIcmEncoder.h +2 -2
- data/vendor/faiss/faiss/gpu/GpuIndex.h +8 -7
- data/vendor/faiss/faiss/gpu/GpuIndexBinaryFlat.h +2 -2
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -3
- data/vendor/faiss/faiss/gpu/GpuIndexFlat.h +2 -2
- data/vendor/faiss/faiss/gpu/GpuIndexIVF.h +2 -2
- data/vendor/faiss/faiss/gpu/GpuIndexIVFFlat.h +2 -2
- data/vendor/faiss/faiss/gpu/GpuIndexIVFPQ.h +3 -3
- data/vendor/faiss/faiss/gpu/GpuIndexIVFScalarQuantizer.h +2 -2
- data/vendor/faiss/faiss/gpu/GpuIndicesOptions.h +2 -2
- data/vendor/faiss/faiss/gpu/GpuResources.cpp +7 -2
- data/vendor/faiss/faiss/gpu/GpuResources.h +11 -4
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +51 -21
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +15 -5
- data/vendor/faiss/faiss/gpu/impl/IndexUtils.h +2 -2
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.cpp +2 -2
- data/vendor/faiss/faiss/gpu/impl/InterleavedCodes.h +2 -2
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.cpp +2 -2
- data/vendor/faiss/faiss/gpu/impl/RemapIndices.h +2 -2
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper-inl.h +2 -2
- data/vendor/faiss/faiss/gpu/perf/IndexWrapper.h +2 -2
- data/vendor/faiss/faiss/gpu/perf/PerfClustering.cpp +2 -2
- data/vendor/faiss/faiss/gpu/perf/PerfIVFPQAdd.cpp +2 -3
- data/vendor/faiss/faiss/gpu/perf/WriteIndex.cpp +2 -2
- data/vendor/faiss/faiss/gpu/test/TestCodePacking.cpp +2 -2
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp +2 -2
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +54 -54
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +80 -78
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +51 -51
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/gpu/test/TestGpuMemoryException.cpp +3 -3
- data/vendor/faiss/faiss/gpu/test/TestGpuResidualQuantizer.cpp +70 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +74 -4
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +2 -2
- data/vendor/faiss/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp +2 -2
- data/vendor/faiss/faiss/gpu/utils/{RaftUtils.h → CuvsUtils.h} +12 -11
- data/vendor/faiss/faiss/gpu/utils/DeviceUtils.h +2 -2
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.cpp +2 -2
- data/vendor/faiss/faiss/gpu/utils/StackDeviceMemory.h +2 -2
- data/vendor/faiss/faiss/gpu/utils/StaticUtils.h +2 -2
- data/vendor/faiss/faiss/gpu/utils/Timer.cpp +2 -2
- data/vendor/faiss/faiss/gpu/utils/Timer.h +2 -2
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +79 -11
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +17 -5
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +2 -2
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +2 -2
- data/vendor/faiss/faiss/impl/CodePacker.cpp +2 -2
- data/vendor/faiss/faiss/impl/CodePacker.h +2 -2
- data/vendor/faiss/faiss/impl/DistanceComputer.h +2 -2
- data/vendor/faiss/faiss/impl/FaissAssert.h +2 -2
- data/vendor/faiss/faiss/impl/FaissException.cpp +2 -2
- data/vendor/faiss/faiss/impl/FaissException.h +2 -3
- data/vendor/faiss/faiss/impl/HNSW.cpp +24 -19
- data/vendor/faiss/faiss/impl/HNSW.h +12 -2
- data/vendor/faiss/faiss/impl/IDSelector.cpp +2 -2
- data/vendor/faiss/faiss/impl/IDSelector.h +2 -2
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.h +2 -2
- data/vendor/faiss/faiss/impl/LookupTableScaler.h +2 -2
- data/vendor/faiss/faiss/impl/NNDescent.cpp +2 -2
- data/vendor/faiss/faiss/impl/NNDescent.h +2 -2
- data/vendor/faiss/faiss/impl/NSG.cpp +27 -21
- data/vendor/faiss/faiss/impl/NSG.h +20 -8
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +2 -2
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +2 -2
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +2 -4
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.h +2 -2
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +2 -2
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/impl/ProductQuantizer.h +2 -2
- data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +2 -36
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +3 -13
- data/vendor/faiss/faiss/impl/ResultHandler.h +2 -2
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +2 -2
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +2 -2
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +2 -2
- data/vendor/faiss/faiss/impl/ThreadedIndex.h +2 -2
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx2.h +2 -2
- data/vendor/faiss/faiss/impl/code_distance/code_distance-avx512.h +2 -2
- data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +2 -2
- data/vendor/faiss/faiss/impl/code_distance/code_distance-sve.h +440 -0
- data/vendor/faiss/faiss/impl/code_distance/code_distance.h +55 -2
- data/vendor/faiss/faiss/impl/index_read.cpp +2 -5
- data/vendor/faiss/faiss/impl/index_read_utils.h +2 -2
- data/vendor/faiss/faiss/impl/index_write.cpp +2 -6
- data/vendor/faiss/faiss/impl/io.cpp +2 -2
- data/vendor/faiss/faiss/impl/io.h +2 -2
- data/vendor/faiss/faiss/impl/io_macros.h +2 -9
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +2 -3
- data/vendor/faiss/faiss/impl/kmeans1d.h +2 -2
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +2 -3
- data/vendor/faiss/faiss/impl/lattice_Zn.h +2 -2
- data/vendor/faiss/faiss/impl/platform_macros.h +12 -2
- data/vendor/faiss/faiss/impl/pq4_fast_scan.cpp +2 -2
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +20 -2
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +2 -2
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +2 -2
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +3 -3
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +2 -2
- data/vendor/faiss/faiss/impl/simd_result_handlers.h +18 -18
- data/vendor/faiss/faiss/index_factory.cpp +20 -21
- data/vendor/faiss/faiss/index_factory.h +2 -2
- data/vendor/faiss/faiss/index_io.h +2 -2
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +2 -2
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.h +2 -2
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +2 -2
- data/vendor/faiss/faiss/invlists/DirectMap.h +2 -2
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +2 -2
- data/vendor/faiss/faiss/invlists/InvertedLists.h +2 -2
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +2 -2
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.h +2 -2
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +2 -3
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +2 -2
- data/vendor/faiss/faiss/python/python_callbacks.cpp +2 -2
- data/vendor/faiss/faiss/python/python_callbacks.h +2 -2
- data/vendor/faiss/faiss/utils/AlignedTable.h +5 -3
- data/vendor/faiss/faiss/utils/Heap.cpp +2 -2
- data/vendor/faiss/faiss/utils/Heap.h +2 -2
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +11 -7
- data/vendor/faiss/faiss/utils/NeuralNet.h +2 -2
- data/vendor/faiss/faiss/utils/WorkerThread.cpp +2 -2
- data/vendor/faiss/faiss/utils/WorkerThread.h +2 -2
- data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +2 -2
- data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +2 -2
- data/vendor/faiss/faiss/utils/approx_topk/generic.h +2 -2
- data/vendor/faiss/faiss/utils/approx_topk/mode.h +2 -2
- data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +2 -2
- data/vendor/faiss/faiss/utils/bf16.h +2 -2
- data/vendor/faiss/faiss/utils/distances.cpp +191 -2
- data/vendor/faiss/faiss/utils/distances.h +3 -3
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +2 -2
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.h +2 -2
- data/vendor/faiss/faiss/utils/distances_simd.cpp +502 -3
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +2 -2
- data/vendor/faiss/faiss/utils/extra_distances.cpp +2 -3
- data/vendor/faiss/faiss/utils/extra_distances.h +2 -2
- data/vendor/faiss/faiss/utils/fp16-arm.h +2 -2
- data/vendor/faiss/faiss/utils/fp16-fp16c.h +2 -2
- data/vendor/faiss/faiss/utils/fp16-inl.h +2 -2
- data/vendor/faiss/faiss/utils/fp16.h +2 -2
- data/vendor/faiss/faiss/utils/hamming-inl.h +2 -2
- data/vendor/faiss/faiss/utils/hamming.cpp +2 -3
- data/vendor/faiss/faiss/utils/hamming.h +2 -2
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +2 -2
- data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +490 -0
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +2 -2
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +2 -2
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +5 -2
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +5 -5
- data/vendor/faiss/faiss/utils/ordered_key_value.h +2 -2
- data/vendor/faiss/faiss/utils/partitioning.cpp +2 -2
- data/vendor/faiss/faiss/utils/partitioning.h +2 -2
- data/vendor/faiss/faiss/utils/prefetch.h +2 -2
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +2 -2
- data/vendor/faiss/faiss/utils/quantize_lut.h +2 -2
- data/vendor/faiss/faiss/utils/random.cpp +2 -2
- data/vendor/faiss/faiss/utils/random.h +2 -2
- data/vendor/faiss/faiss/utils/simdlib.h +2 -2
- data/vendor/faiss/faiss/utils/simdlib_avx2.h +2 -2
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +2 -2
- data/vendor/faiss/faiss/utils/simdlib_emulated.h +2 -2
- data/vendor/faiss/faiss/utils/simdlib_neon.h +2 -2
- data/vendor/faiss/faiss/utils/simdlib_ppc64.h +2 -2
- data/vendor/faiss/faiss/utils/sorting.cpp +2 -2
- data/vendor/faiss/faiss/utils/sorting.h +2 -2
- data/vendor/faiss/faiss/utils/transpose/transpose-avx2-inl.h +2 -2
- data/vendor/faiss/faiss/utils/transpose/transpose-avx512-inl.h +2 -2
- data/vendor/faiss/faiss/utils/utils.cpp +7 -7
- data/vendor/faiss/faiss/utils/utils.h +4 -3
- metadata +9 -10
@@ -1,5 +1,5 @@
|
|
1
|
-
|
2
|
-
* Copyright (c)
|
1
|
+
/*
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
3
3
|
*
|
4
4
|
* This source code is licensed under the MIT license found in the
|
5
5
|
* LICENSE file in the root directory of this source tree.
|
@@ -29,6 +29,10 @@
|
|
29
29
|
#include <faiss/utils/transpose/transpose-avx2-inl.h>
|
30
30
|
#endif
|
31
31
|
|
32
|
+
#ifdef __ARM_FEATURE_SVE
|
33
|
+
#include <arm_sve.h>
|
34
|
+
#endif
|
35
|
+
|
32
36
|
#ifdef __aarch64__
|
33
37
|
#include <arm_neon.h>
|
34
38
|
#endif
|
@@ -2585,6 +2589,7 @@ size_t fvec_L2sqr_ny_nearest_y_transposed(
|
|
2585
2589
|
|
2586
2590
|
float fvec_L1(const float* x, const float* y, size_t d) {
|
2587
2591
|
__m256 msum1 = _mm256_setzero_ps();
|
2592
|
+
// signmask used for absolute value
|
2588
2593
|
__m256 signmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffffUL));
|
2589
2594
|
|
2590
2595
|
while (d >= 8) {
|
@@ -2592,7 +2597,9 @@ float fvec_L1(const float* x, const float* y, size_t d) {
|
|
2592
2597
|
x += 8;
|
2593
2598
|
__m256 my = _mm256_loadu_ps(y);
|
2594
2599
|
y += 8;
|
2600
|
+
// subtract
|
2595
2601
|
const __m256 a_m_b = _mm256_sub_ps(mx, my);
|
2602
|
+
// find sum of absolute value of distances (manhattan distance)
|
2596
2603
|
msum1 = _mm256_add_ps(msum1, _mm256_and_ps(signmask, a_m_b));
|
2597
2604
|
d -= 8;
|
2598
2605
|
}
|
@@ -2625,6 +2632,7 @@ float fvec_L1(const float* x, const float* y, size_t d) {
|
|
2625
2632
|
|
2626
2633
|
float fvec_Linf(const float* x, const float* y, size_t d) {
|
2627
2634
|
__m256 msum1 = _mm256_setzero_ps();
|
2635
|
+
// signmask used for absolute value
|
2628
2636
|
__m256 signmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffffUL));
|
2629
2637
|
|
2630
2638
|
while (d >= 8) {
|
@@ -2632,7 +2640,9 @@ float fvec_Linf(const float* x, const float* y, size_t d) {
|
|
2632
2640
|
x += 8;
|
2633
2641
|
__m256 my = _mm256_loadu_ps(y);
|
2634
2642
|
y += 8;
|
2643
|
+
// subtract
|
2635
2644
|
const __m256 a_m_b = _mm256_sub_ps(mx, my);
|
2645
|
+
// find max of absolute value of distances (chebyshev distance)
|
2636
2646
|
msum1 = _mm256_max_ps(msum1, _mm256_and_ps(signmask, a_m_b));
|
2637
2647
|
d -= 8;
|
2638
2648
|
}
|
@@ -2673,6 +2683,441 @@ float fvec_Linf(const float* x, const float* y, size_t d) {
|
|
2673
2683
|
return fvec_Linf_ref(x, y, d);
|
2674
2684
|
}
|
2675
2685
|
|
2686
|
+
#elif defined(__ARM_FEATURE_SVE)
|
2687
|
+
|
2688
|
+
struct ElementOpIP {
|
2689
|
+
static svfloat32_t op(svbool_t pg, svfloat32_t x, svfloat32_t y) {
|
2690
|
+
return svmul_f32_x(pg, x, y);
|
2691
|
+
}
|
2692
|
+
static svfloat32_t merge(
|
2693
|
+
svbool_t pg,
|
2694
|
+
svfloat32_t z,
|
2695
|
+
svfloat32_t x,
|
2696
|
+
svfloat32_t y) {
|
2697
|
+
return svmla_f32_x(pg, z, x, y);
|
2698
|
+
}
|
2699
|
+
};
|
2700
|
+
|
2701
|
+
template <typename ElementOp>
|
2702
|
+
void fvec_op_ny_sve_d1(float* dis, const float* x, const float* y, size_t ny) {
|
2703
|
+
const size_t lanes = svcntw();
|
2704
|
+
const size_t lanes2 = lanes * 2;
|
2705
|
+
const size_t lanes3 = lanes * 3;
|
2706
|
+
const size_t lanes4 = lanes * 4;
|
2707
|
+
const svbool_t pg = svptrue_b32();
|
2708
|
+
const svfloat32_t x0 = svdup_n_f32(x[0]);
|
2709
|
+
size_t i = 0;
|
2710
|
+
for (; i + lanes4 < ny; i += lanes4) {
|
2711
|
+
svfloat32_t y0 = svld1_f32(pg, y);
|
2712
|
+
svfloat32_t y1 = svld1_f32(pg, y + lanes);
|
2713
|
+
svfloat32_t y2 = svld1_f32(pg, y + lanes2);
|
2714
|
+
svfloat32_t y3 = svld1_f32(pg, y + lanes3);
|
2715
|
+
y0 = ElementOp::op(pg, x0, y0);
|
2716
|
+
y1 = ElementOp::op(pg, x0, y1);
|
2717
|
+
y2 = ElementOp::op(pg, x0, y2);
|
2718
|
+
y3 = ElementOp::op(pg, x0, y3);
|
2719
|
+
svst1_f32(pg, dis, y0);
|
2720
|
+
svst1_f32(pg, dis + lanes, y1);
|
2721
|
+
svst1_f32(pg, dis + lanes2, y2);
|
2722
|
+
svst1_f32(pg, dis + lanes3, y3);
|
2723
|
+
y += lanes4;
|
2724
|
+
dis += lanes4;
|
2725
|
+
}
|
2726
|
+
const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
|
2727
|
+
const svbool_t pg1 = svwhilelt_b32_u64(i + lanes, ny);
|
2728
|
+
const svbool_t pg2 = svwhilelt_b32_u64(i + lanes2, ny);
|
2729
|
+
const svbool_t pg3 = svwhilelt_b32_u64(i + lanes3, ny);
|
2730
|
+
svfloat32_t y0 = svld1_f32(pg0, y);
|
2731
|
+
svfloat32_t y1 = svld1_f32(pg1, y + lanes);
|
2732
|
+
svfloat32_t y2 = svld1_f32(pg2, y + lanes2);
|
2733
|
+
svfloat32_t y3 = svld1_f32(pg3, y + lanes3);
|
2734
|
+
y0 = ElementOp::op(pg0, x0, y0);
|
2735
|
+
y1 = ElementOp::op(pg1, x0, y1);
|
2736
|
+
y2 = ElementOp::op(pg2, x0, y2);
|
2737
|
+
y3 = ElementOp::op(pg3, x0, y3);
|
2738
|
+
svst1_f32(pg0, dis, y0);
|
2739
|
+
svst1_f32(pg1, dis + lanes, y1);
|
2740
|
+
svst1_f32(pg2, dis + lanes2, y2);
|
2741
|
+
svst1_f32(pg3, dis + lanes3, y3);
|
2742
|
+
}
|
2743
|
+
|
2744
|
+
template <typename ElementOp>
|
2745
|
+
void fvec_op_ny_sve_d2(float* dis, const float* x, const float* y, size_t ny) {
|
2746
|
+
const size_t lanes = svcntw();
|
2747
|
+
const size_t lanes2 = lanes * 2;
|
2748
|
+
const size_t lanes4 = lanes * 4;
|
2749
|
+
const svbool_t pg = svptrue_b32();
|
2750
|
+
const svfloat32_t x0 = svdup_n_f32(x[0]);
|
2751
|
+
const svfloat32_t x1 = svdup_n_f32(x[1]);
|
2752
|
+
size_t i = 0;
|
2753
|
+
for (; i + lanes2 < ny; i += lanes2) {
|
2754
|
+
const svfloat32x2_t y0 = svld2_f32(pg, y);
|
2755
|
+
const svfloat32x2_t y1 = svld2_f32(pg, y + lanes2);
|
2756
|
+
svfloat32_t y00 = svget2_f32(y0, 0);
|
2757
|
+
const svfloat32_t y01 = svget2_f32(y0, 1);
|
2758
|
+
svfloat32_t y10 = svget2_f32(y1, 0);
|
2759
|
+
const svfloat32_t y11 = svget2_f32(y1, 1);
|
2760
|
+
y00 = ElementOp::op(pg, x0, y00);
|
2761
|
+
y10 = ElementOp::op(pg, x0, y10);
|
2762
|
+
y00 = ElementOp::merge(pg, y00, x1, y01);
|
2763
|
+
y10 = ElementOp::merge(pg, y10, x1, y11);
|
2764
|
+
svst1_f32(pg, dis, y00);
|
2765
|
+
svst1_f32(pg, dis + lanes, y10);
|
2766
|
+
y += lanes4;
|
2767
|
+
dis += lanes2;
|
2768
|
+
}
|
2769
|
+
const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
|
2770
|
+
const svbool_t pg1 = svwhilelt_b32_u64(i + lanes, ny);
|
2771
|
+
const svfloat32x2_t y0 = svld2_f32(pg0, y);
|
2772
|
+
const svfloat32x2_t y1 = svld2_f32(pg1, y + lanes2);
|
2773
|
+
svfloat32_t y00 = svget2_f32(y0, 0);
|
2774
|
+
const svfloat32_t y01 = svget2_f32(y0, 1);
|
2775
|
+
svfloat32_t y10 = svget2_f32(y1, 0);
|
2776
|
+
const svfloat32_t y11 = svget2_f32(y1, 1);
|
2777
|
+
y00 = ElementOp::op(pg0, x0, y00);
|
2778
|
+
y10 = ElementOp::op(pg1, x0, y10);
|
2779
|
+
y00 = ElementOp::merge(pg0, y00, x1, y01);
|
2780
|
+
y10 = ElementOp::merge(pg1, y10, x1, y11);
|
2781
|
+
svst1_f32(pg0, dis, y00);
|
2782
|
+
svst1_f32(pg1, dis + lanes, y10);
|
2783
|
+
}
|
2784
|
+
|
2785
|
+
template <typename ElementOp>
|
2786
|
+
void fvec_op_ny_sve_d4(float* dis, const float* x, const float* y, size_t ny) {
|
2787
|
+
const size_t lanes = svcntw();
|
2788
|
+
const size_t lanes4 = lanes * 4;
|
2789
|
+
const svbool_t pg = svptrue_b32();
|
2790
|
+
const svfloat32_t x0 = svdup_n_f32(x[0]);
|
2791
|
+
const svfloat32_t x1 = svdup_n_f32(x[1]);
|
2792
|
+
const svfloat32_t x2 = svdup_n_f32(x[2]);
|
2793
|
+
const svfloat32_t x3 = svdup_n_f32(x[3]);
|
2794
|
+
size_t i = 0;
|
2795
|
+
for (; i + lanes < ny; i += lanes) {
|
2796
|
+
const svfloat32x4_t y0 = svld4_f32(pg, y);
|
2797
|
+
svfloat32_t y00 = svget4_f32(y0, 0);
|
2798
|
+
const svfloat32_t y01 = svget4_f32(y0, 1);
|
2799
|
+
svfloat32_t y02 = svget4_f32(y0, 2);
|
2800
|
+
const svfloat32_t y03 = svget4_f32(y0, 3);
|
2801
|
+
y00 = ElementOp::op(pg, x0, y00);
|
2802
|
+
y02 = ElementOp::op(pg, x2, y02);
|
2803
|
+
y00 = ElementOp::merge(pg, y00, x1, y01);
|
2804
|
+
y02 = ElementOp::merge(pg, y02, x3, y03);
|
2805
|
+
y00 = svadd_f32_x(pg, y00, y02);
|
2806
|
+
svst1_f32(pg, dis, y00);
|
2807
|
+
y += lanes4;
|
2808
|
+
dis += lanes;
|
2809
|
+
}
|
2810
|
+
const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
|
2811
|
+
const svfloat32x4_t y0 = svld4_f32(pg0, y);
|
2812
|
+
svfloat32_t y00 = svget4_f32(y0, 0);
|
2813
|
+
const svfloat32_t y01 = svget4_f32(y0, 1);
|
2814
|
+
svfloat32_t y02 = svget4_f32(y0, 2);
|
2815
|
+
const svfloat32_t y03 = svget4_f32(y0, 3);
|
2816
|
+
y00 = ElementOp::op(pg0, x0, y00);
|
2817
|
+
y02 = ElementOp::op(pg0, x2, y02);
|
2818
|
+
y00 = ElementOp::merge(pg0, y00, x1, y01);
|
2819
|
+
y02 = ElementOp::merge(pg0, y02, x3, y03);
|
2820
|
+
y00 = svadd_f32_x(pg0, y00, y02);
|
2821
|
+
svst1_f32(pg0, dis, y00);
|
2822
|
+
}
|
2823
|
+
|
2824
|
+
template <typename ElementOp>
|
2825
|
+
void fvec_op_ny_sve_d8(float* dis, const float* x, const float* y, size_t ny) {
|
2826
|
+
const size_t lanes = svcntw();
|
2827
|
+
const size_t lanes4 = lanes * 4;
|
2828
|
+
const size_t lanes8 = lanes * 8;
|
2829
|
+
const svbool_t pg = svptrue_b32();
|
2830
|
+
const svfloat32_t x0 = svdup_n_f32(x[0]);
|
2831
|
+
const svfloat32_t x1 = svdup_n_f32(x[1]);
|
2832
|
+
const svfloat32_t x2 = svdup_n_f32(x[2]);
|
2833
|
+
const svfloat32_t x3 = svdup_n_f32(x[3]);
|
2834
|
+
const svfloat32_t x4 = svdup_n_f32(x[4]);
|
2835
|
+
const svfloat32_t x5 = svdup_n_f32(x[5]);
|
2836
|
+
const svfloat32_t x6 = svdup_n_f32(x[6]);
|
2837
|
+
const svfloat32_t x7 = svdup_n_f32(x[7]);
|
2838
|
+
size_t i = 0;
|
2839
|
+
for (; i + lanes < ny; i += lanes) {
|
2840
|
+
const svfloat32x4_t ya = svld4_f32(pg, y);
|
2841
|
+
const svfloat32x4_t yb = svld4_f32(pg, y + lanes4);
|
2842
|
+
const svfloat32_t ya0 = svget4_f32(ya, 0);
|
2843
|
+
const svfloat32_t ya1 = svget4_f32(ya, 1);
|
2844
|
+
const svfloat32_t ya2 = svget4_f32(ya, 2);
|
2845
|
+
const svfloat32_t ya3 = svget4_f32(ya, 3);
|
2846
|
+
const svfloat32_t yb0 = svget4_f32(yb, 0);
|
2847
|
+
const svfloat32_t yb1 = svget4_f32(yb, 1);
|
2848
|
+
const svfloat32_t yb2 = svget4_f32(yb, 2);
|
2849
|
+
const svfloat32_t yb3 = svget4_f32(yb, 3);
|
2850
|
+
svfloat32_t y0 = svuzp1(ya0, yb0);
|
2851
|
+
const svfloat32_t y1 = svuzp1(ya1, yb1);
|
2852
|
+
svfloat32_t y2 = svuzp1(ya2, yb2);
|
2853
|
+
const svfloat32_t y3 = svuzp1(ya3, yb3);
|
2854
|
+
svfloat32_t y4 = svuzp2(ya0, yb0);
|
2855
|
+
const svfloat32_t y5 = svuzp2(ya1, yb1);
|
2856
|
+
svfloat32_t y6 = svuzp2(ya2, yb2);
|
2857
|
+
const svfloat32_t y7 = svuzp2(ya3, yb3);
|
2858
|
+
y0 = ElementOp::op(pg, x0, y0);
|
2859
|
+
y2 = ElementOp::op(pg, x2, y2);
|
2860
|
+
y4 = ElementOp::op(pg, x4, y4);
|
2861
|
+
y6 = ElementOp::op(pg, x6, y6);
|
2862
|
+
y0 = ElementOp::merge(pg, y0, x1, y1);
|
2863
|
+
y2 = ElementOp::merge(pg, y2, x3, y3);
|
2864
|
+
y4 = ElementOp::merge(pg, y4, x5, y5);
|
2865
|
+
y6 = ElementOp::merge(pg, y6, x7, y7);
|
2866
|
+
y0 = svadd_f32_x(pg, y0, y2);
|
2867
|
+
y4 = svadd_f32_x(pg, y4, y6);
|
2868
|
+
y0 = svadd_f32_x(pg, y0, y4);
|
2869
|
+
svst1_f32(pg, dis, y0);
|
2870
|
+
y += lanes8;
|
2871
|
+
dis += lanes;
|
2872
|
+
}
|
2873
|
+
const svbool_t pg0 = svwhilelt_b32_u64(i, ny);
|
2874
|
+
const svbool_t pga = svwhilelt_b32_u64(i * 2, ny * 2);
|
2875
|
+
const svbool_t pgb = svwhilelt_b32_u64(i * 2 + lanes, ny * 2);
|
2876
|
+
const svfloat32x4_t ya = svld4_f32(pga, y);
|
2877
|
+
const svfloat32x4_t yb = svld4_f32(pgb, y + lanes4);
|
2878
|
+
const svfloat32_t ya0 = svget4_f32(ya, 0);
|
2879
|
+
const svfloat32_t ya1 = svget4_f32(ya, 1);
|
2880
|
+
const svfloat32_t ya2 = svget4_f32(ya, 2);
|
2881
|
+
const svfloat32_t ya3 = svget4_f32(ya, 3);
|
2882
|
+
const svfloat32_t yb0 = svget4_f32(yb, 0);
|
2883
|
+
const svfloat32_t yb1 = svget4_f32(yb, 1);
|
2884
|
+
const svfloat32_t yb2 = svget4_f32(yb, 2);
|
2885
|
+
const svfloat32_t yb3 = svget4_f32(yb, 3);
|
2886
|
+
svfloat32_t y0 = svuzp1(ya0, yb0);
|
2887
|
+
const svfloat32_t y1 = svuzp1(ya1, yb1);
|
2888
|
+
svfloat32_t y2 = svuzp1(ya2, yb2);
|
2889
|
+
const svfloat32_t y3 = svuzp1(ya3, yb3);
|
2890
|
+
svfloat32_t y4 = svuzp2(ya0, yb0);
|
2891
|
+
const svfloat32_t y5 = svuzp2(ya1, yb1);
|
2892
|
+
svfloat32_t y6 = svuzp2(ya2, yb2);
|
2893
|
+
const svfloat32_t y7 = svuzp2(ya3, yb3);
|
2894
|
+
y0 = ElementOp::op(pg0, x0, y0);
|
2895
|
+
y2 = ElementOp::op(pg0, x2, y2);
|
2896
|
+
y4 = ElementOp::op(pg0, x4, y4);
|
2897
|
+
y6 = ElementOp::op(pg0, x6, y6);
|
2898
|
+
y0 = ElementOp::merge(pg0, y0, x1, y1);
|
2899
|
+
y2 = ElementOp::merge(pg0, y2, x3, y3);
|
2900
|
+
y4 = ElementOp::merge(pg0, y4, x5, y5);
|
2901
|
+
y6 = ElementOp::merge(pg0, y6, x7, y7);
|
2902
|
+
y0 = svadd_f32_x(pg0, y0, y2);
|
2903
|
+
y4 = svadd_f32_x(pg0, y4, y6);
|
2904
|
+
y0 = svadd_f32_x(pg0, y0, y4);
|
2905
|
+
svst1_f32(pg0, dis, y0);
|
2906
|
+
y += lanes8;
|
2907
|
+
dis += lanes;
|
2908
|
+
}
|
2909
|
+
|
2910
|
+
template <typename ElementOp>
|
2911
|
+
void fvec_op_ny_sve_lanes1(
|
2912
|
+
float* dis,
|
2913
|
+
const float* x,
|
2914
|
+
const float* y,
|
2915
|
+
size_t ny) {
|
2916
|
+
const size_t lanes = svcntw();
|
2917
|
+
const size_t lanes2 = lanes * 2;
|
2918
|
+
const size_t lanes3 = lanes * 3;
|
2919
|
+
const size_t lanes4 = lanes * 4;
|
2920
|
+
const svbool_t pg = svptrue_b32();
|
2921
|
+
const svfloat32_t x0 = svld1_f32(pg, x);
|
2922
|
+
size_t i = 0;
|
2923
|
+
for (; i + 3 < ny; i += 4) {
|
2924
|
+
svfloat32_t y0 = svld1_f32(pg, y);
|
2925
|
+
svfloat32_t y1 = svld1_f32(pg, y + lanes);
|
2926
|
+
svfloat32_t y2 = svld1_f32(pg, y + lanes2);
|
2927
|
+
svfloat32_t y3 = svld1_f32(pg, y + lanes3);
|
2928
|
+
y += lanes4;
|
2929
|
+
y0 = ElementOp::op(pg, x0, y0);
|
2930
|
+
y1 = ElementOp::op(pg, x0, y1);
|
2931
|
+
y2 = ElementOp::op(pg, x0, y2);
|
2932
|
+
y3 = ElementOp::op(pg, x0, y3);
|
2933
|
+
dis[i] = svaddv_f32(pg, y0);
|
2934
|
+
dis[i + 1] = svaddv_f32(pg, y1);
|
2935
|
+
dis[i + 2] = svaddv_f32(pg, y2);
|
2936
|
+
dis[i + 3] = svaddv_f32(pg, y3);
|
2937
|
+
}
|
2938
|
+
for (; i < ny; ++i) {
|
2939
|
+
svfloat32_t y0 = svld1_f32(pg, y);
|
2940
|
+
y += lanes;
|
2941
|
+
y0 = ElementOp::op(pg, x0, y0);
|
2942
|
+
dis[i] = svaddv_f32(pg, y0);
|
2943
|
+
}
|
2944
|
+
}
|
2945
|
+
|
2946
|
+
template <typename ElementOp>
|
2947
|
+
void fvec_op_ny_sve_lanes2(
|
2948
|
+
float* dis,
|
2949
|
+
const float* x,
|
2950
|
+
const float* y,
|
2951
|
+
size_t ny) {
|
2952
|
+
const size_t lanes = svcntw();
|
2953
|
+
const size_t lanes2 = lanes * 2;
|
2954
|
+
const size_t lanes3 = lanes * 3;
|
2955
|
+
const size_t lanes4 = lanes * 4;
|
2956
|
+
const svbool_t pg = svptrue_b32();
|
2957
|
+
const svfloat32_t x0 = svld1_f32(pg, x);
|
2958
|
+
const svfloat32_t x1 = svld1_f32(pg, x + lanes);
|
2959
|
+
size_t i = 0;
|
2960
|
+
for (; i + 1 < ny; i += 2) {
|
2961
|
+
svfloat32_t y00 = svld1_f32(pg, y);
|
2962
|
+
const svfloat32_t y01 = svld1_f32(pg, y + lanes);
|
2963
|
+
svfloat32_t y10 = svld1_f32(pg, y + lanes2);
|
2964
|
+
const svfloat32_t y11 = svld1_f32(pg, y + lanes3);
|
2965
|
+
y += lanes4;
|
2966
|
+
y00 = ElementOp::op(pg, x0, y00);
|
2967
|
+
y10 = ElementOp::op(pg, x0, y10);
|
2968
|
+
y00 = ElementOp::merge(pg, y00, x1, y01);
|
2969
|
+
y10 = ElementOp::merge(pg, y10, x1, y11);
|
2970
|
+
dis[i] = svaddv_f32(pg, y00);
|
2971
|
+
dis[i + 1] = svaddv_f32(pg, y10);
|
2972
|
+
}
|
2973
|
+
if (i < ny) {
|
2974
|
+
svfloat32_t y0 = svld1_f32(pg, y);
|
2975
|
+
const svfloat32_t y1 = svld1_f32(pg, y + lanes);
|
2976
|
+
y0 = ElementOp::op(pg, x0, y0);
|
2977
|
+
y0 = ElementOp::merge(pg, y0, x1, y1);
|
2978
|
+
dis[i] = svaddv_f32(pg, y0);
|
2979
|
+
}
|
2980
|
+
}
|
2981
|
+
|
2982
|
+
template <typename ElementOp>
|
2983
|
+
void fvec_op_ny_sve_lanes3(
|
2984
|
+
float* dis,
|
2985
|
+
const float* x,
|
2986
|
+
const float* y,
|
2987
|
+
size_t ny) {
|
2988
|
+
const size_t lanes = svcntw();
|
2989
|
+
const size_t lanes2 = lanes * 2;
|
2990
|
+
const size_t lanes3 = lanes * 3;
|
2991
|
+
const svbool_t pg = svptrue_b32();
|
2992
|
+
const svfloat32_t x0 = svld1_f32(pg, x);
|
2993
|
+
const svfloat32_t x1 = svld1_f32(pg, x + lanes);
|
2994
|
+
const svfloat32_t x2 = svld1_f32(pg, x + lanes2);
|
2995
|
+
for (size_t i = 0; i < ny; ++i) {
|
2996
|
+
svfloat32_t y0 = svld1_f32(pg, y);
|
2997
|
+
const svfloat32_t y1 = svld1_f32(pg, y + lanes);
|
2998
|
+
svfloat32_t y2 = svld1_f32(pg, y + lanes2);
|
2999
|
+
y += lanes3;
|
3000
|
+
y0 = ElementOp::op(pg, x0, y0);
|
3001
|
+
y0 = ElementOp::merge(pg, y0, x1, y1);
|
3002
|
+
y0 = ElementOp::merge(pg, y0, x2, y2);
|
3003
|
+
dis[i] = svaddv_f32(pg, y0);
|
3004
|
+
}
|
3005
|
+
}
|
3006
|
+
|
3007
|
+
template <typename ElementOp>
|
3008
|
+
void fvec_op_ny_sve_lanes4(
|
3009
|
+
float* dis,
|
3010
|
+
const float* x,
|
3011
|
+
const float* y,
|
3012
|
+
size_t ny) {
|
3013
|
+
const size_t lanes = svcntw();
|
3014
|
+
const size_t lanes2 = lanes * 2;
|
3015
|
+
const size_t lanes3 = lanes * 3;
|
3016
|
+
const size_t lanes4 = lanes * 4;
|
3017
|
+
const svbool_t pg = svptrue_b32();
|
3018
|
+
const svfloat32_t x0 = svld1_f32(pg, x);
|
3019
|
+
const svfloat32_t x1 = svld1_f32(pg, x + lanes);
|
3020
|
+
const svfloat32_t x2 = svld1_f32(pg, x + lanes2);
|
3021
|
+
const svfloat32_t x3 = svld1_f32(pg, x + lanes3);
|
3022
|
+
for (size_t i = 0; i < ny; ++i) {
|
3023
|
+
svfloat32_t y0 = svld1_f32(pg, y);
|
3024
|
+
const svfloat32_t y1 = svld1_f32(pg, y + lanes);
|
3025
|
+
svfloat32_t y2 = svld1_f32(pg, y + lanes2);
|
3026
|
+
const svfloat32_t y3 = svld1_f32(pg, y + lanes3);
|
3027
|
+
y += lanes4;
|
3028
|
+
y0 = ElementOp::op(pg, x0, y0);
|
3029
|
+
y2 = ElementOp::op(pg, x2, y2);
|
3030
|
+
y0 = ElementOp::merge(pg, y0, x1, y1);
|
3031
|
+
y2 = ElementOp::merge(pg, y2, x3, y3);
|
3032
|
+
y0 = svadd_f32_x(pg, y0, y2);
|
3033
|
+
dis[i] = svaddv_f32(pg, y0);
|
3034
|
+
}
|
3035
|
+
}
|
3036
|
+
|
3037
|
+
void fvec_L2sqr_ny(
|
3038
|
+
float* dis,
|
3039
|
+
const float* x,
|
3040
|
+
const float* y,
|
3041
|
+
size_t d,
|
3042
|
+
size_t ny) {
|
3043
|
+
fvec_L2sqr_ny_ref(dis, x, y, d, ny);
|
3044
|
+
}
|
3045
|
+
|
3046
|
+
void fvec_L2sqr_ny_transposed(
|
3047
|
+
float* dis,
|
3048
|
+
const float* x,
|
3049
|
+
const float* y,
|
3050
|
+
const float* y_sqlen,
|
3051
|
+
size_t d,
|
3052
|
+
size_t d_offset,
|
3053
|
+
size_t ny) {
|
3054
|
+
return fvec_L2sqr_ny_y_transposed_ref(dis, x, y, y_sqlen, d, d_offset, ny);
|
3055
|
+
}
|
3056
|
+
|
3057
|
+
size_t fvec_L2sqr_ny_nearest(
|
3058
|
+
float* distances_tmp_buffer,
|
3059
|
+
const float* x,
|
3060
|
+
const float* y,
|
3061
|
+
size_t d,
|
3062
|
+
size_t ny) {
|
3063
|
+
return fvec_L2sqr_ny_nearest_ref(distances_tmp_buffer, x, y, d, ny);
|
3064
|
+
}
|
3065
|
+
|
3066
|
+
size_t fvec_L2sqr_ny_nearest_y_transposed(
|
3067
|
+
float* distances_tmp_buffer,
|
3068
|
+
const float* x,
|
3069
|
+
const float* y,
|
3070
|
+
const float* y_sqlen,
|
3071
|
+
size_t d,
|
3072
|
+
size_t d_offset,
|
3073
|
+
size_t ny) {
|
3074
|
+
return fvec_L2sqr_ny_nearest_y_transposed_ref(
|
3075
|
+
distances_tmp_buffer, x, y, y_sqlen, d, d_offset, ny);
|
3076
|
+
}
|
3077
|
+
|
3078
|
+
float fvec_L1(const float* x, const float* y, size_t d) {
|
3079
|
+
return fvec_L1_ref(x, y, d);
|
3080
|
+
}
|
3081
|
+
|
3082
|
+
float fvec_Linf(const float* x, const float* y, size_t d) {
|
3083
|
+
return fvec_Linf_ref(x, y, d);
|
3084
|
+
}
|
3085
|
+
|
3086
|
+
void fvec_inner_products_ny(
|
3087
|
+
float* dis,
|
3088
|
+
const float* x,
|
3089
|
+
const float* y,
|
3090
|
+
size_t d,
|
3091
|
+
size_t ny) {
|
3092
|
+
const size_t lanes = svcntw();
|
3093
|
+
switch (d) {
|
3094
|
+
case 1:
|
3095
|
+
fvec_op_ny_sve_d1<ElementOpIP>(dis, x, y, ny);
|
3096
|
+
break;
|
3097
|
+
case 2:
|
3098
|
+
fvec_op_ny_sve_d2<ElementOpIP>(dis, x, y, ny);
|
3099
|
+
break;
|
3100
|
+
case 4:
|
3101
|
+
fvec_op_ny_sve_d4<ElementOpIP>(dis, x, y, ny);
|
3102
|
+
break;
|
3103
|
+
case 8:
|
3104
|
+
fvec_op_ny_sve_d8<ElementOpIP>(dis, x, y, ny);
|
3105
|
+
break;
|
3106
|
+
default:
|
3107
|
+
if (d == lanes)
|
3108
|
+
fvec_op_ny_sve_lanes1<ElementOpIP>(dis, x, y, ny);
|
3109
|
+
else if (d == lanes * 2)
|
3110
|
+
fvec_op_ny_sve_lanes2<ElementOpIP>(dis, x, y, ny);
|
3111
|
+
else if (d == lanes * 3)
|
3112
|
+
fvec_op_ny_sve_lanes3<ElementOpIP>(dis, x, y, ny);
|
3113
|
+
else if (d == lanes * 4)
|
3114
|
+
fvec_op_ny_sve_lanes4<ElementOpIP>(dis, x, y, ny);
|
3115
|
+
else
|
3116
|
+
fvec_inner_products_ny_ref(dis, x, y, d, ny);
|
3117
|
+
break;
|
3118
|
+
}
|
3119
|
+
}
|
3120
|
+
|
2676
3121
|
#elif defined(__aarch64__)
|
2677
3122
|
|
2678
3123
|
// not optimized for ARM
|
@@ -2934,6 +3379,60 @@ void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c) {
|
|
2934
3379
|
#endif
|
2935
3380
|
}
|
2936
3381
|
|
3382
|
+
#elif defined(__ARM_FEATURE_SVE)
|
3383
|
+
|
3384
|
+
void fvec_madd(
|
3385
|
+
const size_t n,
|
3386
|
+
const float* __restrict a,
|
3387
|
+
const float bf,
|
3388
|
+
const float* __restrict b,
|
3389
|
+
float* __restrict c) {
|
3390
|
+
const size_t lanes = static_cast<size_t>(svcntw());
|
3391
|
+
const size_t lanes2 = lanes * 2;
|
3392
|
+
const size_t lanes3 = lanes * 3;
|
3393
|
+
const size_t lanes4 = lanes * 4;
|
3394
|
+
size_t i = 0;
|
3395
|
+
for (; i + lanes4 < n; i += lanes4) {
|
3396
|
+
const auto mask = svptrue_b32();
|
3397
|
+
const auto ai0 = svld1_f32(mask, a + i);
|
3398
|
+
const auto ai1 = svld1_f32(mask, a + i + lanes);
|
3399
|
+
const auto ai2 = svld1_f32(mask, a + i + lanes2);
|
3400
|
+
const auto ai3 = svld1_f32(mask, a + i + lanes3);
|
3401
|
+
const auto bi0 = svld1_f32(mask, b + i);
|
3402
|
+
const auto bi1 = svld1_f32(mask, b + i + lanes);
|
3403
|
+
const auto bi2 = svld1_f32(mask, b + i + lanes2);
|
3404
|
+
const auto bi3 = svld1_f32(mask, b + i + lanes3);
|
3405
|
+
const auto ci0 = svmla_n_f32_x(mask, ai0, bi0, bf);
|
3406
|
+
const auto ci1 = svmla_n_f32_x(mask, ai1, bi1, bf);
|
3407
|
+
const auto ci2 = svmla_n_f32_x(mask, ai2, bi2, bf);
|
3408
|
+
const auto ci3 = svmla_n_f32_x(mask, ai3, bi3, bf);
|
3409
|
+
svst1_f32(mask, c + i, ci0);
|
3410
|
+
svst1_f32(mask, c + i + lanes, ci1);
|
3411
|
+
svst1_f32(mask, c + i + lanes2, ci2);
|
3412
|
+
svst1_f32(mask, c + i + lanes3, ci3);
|
3413
|
+
}
|
3414
|
+
const auto mask0 = svwhilelt_b32_u64(i, n);
|
3415
|
+
const auto mask1 = svwhilelt_b32_u64(i + lanes, n);
|
3416
|
+
const auto mask2 = svwhilelt_b32_u64(i + lanes2, n);
|
3417
|
+
const auto mask3 = svwhilelt_b32_u64(i + lanes3, n);
|
3418
|
+
const auto ai0 = svld1_f32(mask0, a + i);
|
3419
|
+
const auto ai1 = svld1_f32(mask1, a + i + lanes);
|
3420
|
+
const auto ai2 = svld1_f32(mask2, a + i + lanes2);
|
3421
|
+
const auto ai3 = svld1_f32(mask3, a + i + lanes3);
|
3422
|
+
const auto bi0 = svld1_f32(mask0, b + i);
|
3423
|
+
const auto bi1 = svld1_f32(mask1, b + i + lanes);
|
3424
|
+
const auto bi2 = svld1_f32(mask2, b + i + lanes2);
|
3425
|
+
const auto bi3 = svld1_f32(mask3, b + i + lanes3);
|
3426
|
+
const auto ci0 = svmla_n_f32_x(mask0, ai0, bi0, bf);
|
3427
|
+
const auto ci1 = svmla_n_f32_x(mask1, ai1, bi1, bf);
|
3428
|
+
const auto ci2 = svmla_n_f32_x(mask2, ai2, bi2, bf);
|
3429
|
+
const auto ci3 = svmla_n_f32_x(mask3, ai3, bi3, bf);
|
3430
|
+
svst1_f32(mask0, c + i, ci0);
|
3431
|
+
svst1_f32(mask1, c + i + lanes, ci1);
|
3432
|
+
svst1_f32(mask2, c + i + lanes2, ci2);
|
3433
|
+
svst1_f32(mask3, c + i + lanes3, ci3);
|
3434
|
+
}
|
3435
|
+
|
2937
3436
|
#elif defined(__aarch64__)
|
2938
3437
|
|
2939
3438
|
void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c) {
|
@@ -3266,7 +3765,7 @@ void fvec_add(size_t d, const float* a, float b, float* c) {
|
|
3266
3765
|
size_t i;
|
3267
3766
|
simd8float32 bv(b);
|
3268
3767
|
for (i = 0; i + 7 < d; i += 8) {
|
3269
|
-
simd8float32 ci, ai
|
3768
|
+
simd8float32 ci, ai;
|
3270
3769
|
ai.loadu(a + i);
|
3271
3770
|
ci = ai + bv;
|
3272
3771
|
ci.storeu(c + i);
|
@@ -1,5 +1,5 @@
|
|
1
|
-
|
2
|
-
* Copyright (c)
|
1
|
+
/*
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
3
3
|
*
|
4
4
|
* This source code is licensed under the MIT license found in the
|
5
5
|
* LICENSE file in the root directory of this source tree.
|
@@ -15,7 +15,6 @@
|
|
15
15
|
|
16
16
|
#include <faiss/impl/AuxIndexStructures.h>
|
17
17
|
#include <faiss/impl/DistanceComputer.h>
|
18
|
-
#include <faiss/impl/FaissAssert.h>
|
19
18
|
#include <faiss/utils/utils.h>
|
20
19
|
|
21
20
|
namespace faiss {
|
@@ -1,5 +1,5 @@
|
|
1
|
-
|
2
|
-
* Copyright (c)
|
1
|
+
/*
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
3
3
|
*
|
4
4
|
* This source code is licensed under the MIT license found in the
|
5
5
|
* LICENSE file in the root directory of this source tree.
|
@@ -24,7 +24,6 @@
|
|
24
24
|
#include <faiss/utils/hamming.h>
|
25
25
|
|
26
26
|
#include <algorithm>
|
27
|
-
#include <cmath>
|
28
27
|
#include <cstdio>
|
29
28
|
#include <memory>
|
30
29
|
#include <vector>
|