faiss 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/faiss/extconf.rb +2 -1
- data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
- data/ext/faiss/index_binary.cpp +1 -1
- data/ext/faiss/kmeans.cpp +1 -1
- data/ext/faiss/pca_matrix.cpp +1 -1
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +93 -80
- data/vendor/faiss/faiss/Clustering.cpp +39 -240
- data/vendor/faiss/faiss/Clustering.h +6 -0
- data/vendor/faiss/faiss/IVFlib.cpp +41 -21
- data/vendor/faiss/faiss/Index.cpp +6 -5
- data/vendor/faiss/faiss/Index.h +5 -5
- data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
- data/vendor/faiss/faiss/IndexBinary.h +4 -4
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +84 -92
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
- data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +87 -415
- data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
- data/vendor/faiss/faiss/IndexFastScan.h +25 -23
- data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
- data/vendor/faiss/faiss/IndexFlat.h +21 -18
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
- data/vendor/faiss/faiss/IndexHNSW.cpp +283 -145
- data/vendor/faiss/faiss/IndexHNSW.h +16 -2
- data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
- data/vendor/faiss/faiss/IndexIDMap.h +9 -7
- data/vendor/faiss/faiss/IndexIVF.cpp +465 -362
- data/vendor/faiss/faiss/IndexIVF.h +33 -12
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
- data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +36 -68
- data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +53 -30
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +71 -843
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +21 -17
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +26 -39
- data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
- data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
- data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
- data/vendor/faiss/faiss/IndexNSG.cpp +39 -23
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
- data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
- data/vendor/faiss/faiss/IndexPQ.h +3 -2
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
- data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
- data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
- data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
- data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
- data/vendor/faiss/faiss/IndexRefine.h +4 -4
- data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
- data/vendor/faiss/faiss/IndexShards.cpp +10 -9
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
- data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
- data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
- data/vendor/faiss/faiss/MetaIndexes.h +1 -1
- data/vendor/faiss/faiss/MetricType.h +14 -7
- data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
- data/vendor/faiss/faiss/SuperKMeans.h +97 -0
- data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
- data/vendor/faiss/faiss/VectorTransform.h +16 -16
- data/vendor/faiss/faiss/build.cpp +23 -0
- data/vendor/faiss/faiss/build.h +15 -0
- data/vendor/faiss/faiss/clone_index.cpp +48 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
- data/vendor/faiss/faiss/factory_tools.cpp +5 -0
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
- data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
- data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
- data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
- data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
- data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
- data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
- data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
- data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
- data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
- data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
- data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
- data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
- data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
- data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
- data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
- data/vendor/faiss/faiss/impl/FaissException.h +50 -3
- data/vendor/faiss/faiss/impl/HNSW.cpp +92 -317
- data/vendor/faiss/faiss/impl/HNSW.h +13 -34
- data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
- data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
- data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
- data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
- data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
- data/vendor/faiss/faiss/impl/NSG.cpp +38 -21
- data/vendor/faiss/faiss/impl/NSG.h +4 -4
- data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
- data/vendor/faiss/faiss/impl/Panorama.h +258 -87
- data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
- data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +30 -23
- data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
- data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +296 -283
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +99 -75
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +52 -4
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -1
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
- data/vendor/faiss/faiss/impl/VisitedTable.h +7 -0
- data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
- data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
- data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
- data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
- data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
- data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
- data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
- data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
- data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
- data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
- data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
- data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
- data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
- data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +1132 -45
- data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
- data/vendor/faiss/faiss/impl/index_write.cpp +95 -13
- data/vendor/faiss/faiss/impl/io.cpp +6 -6
- data/vendor/faiss/faiss/impl/io_macros.h +33 -16
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
- data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
- data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
- data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +29 -111
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +238 -5
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -7
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +3 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +102 -11
- data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +148 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +167 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +59 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +163 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +100 -66
- data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
- data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
- data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
- data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
- data/vendor/faiss/faiss/impl/svs_io.h +8 -2
- data/vendor/faiss/faiss/index_factory.cpp +86 -18
- data/vendor/faiss/faiss/index_io.h +24 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
- data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
- data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +18 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +12 -3
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +7 -2
- data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
- data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
- data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
- data/vendor/faiss/faiss/utils/distances.cpp +390 -560
- data/vendor/faiss/faiss/utils/distances.h +20 -1
- data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -177
- data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
- data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
- data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
- data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
- data/vendor/faiss/faiss/utils/hamming.h +92 -2
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
- data/vendor/faiss/faiss/utils/partitioning.h +31 -0
- data/vendor/faiss/faiss/utils/popcount.h +29 -0
- data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
- data/vendor/faiss/faiss/utils/prefetch.h +2 -2
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
- data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
- data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
- data/vendor/faiss/faiss/utils/random.cpp +6 -6
- data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
- data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
- data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
- data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +17 -5
- data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
- data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
- data/vendor/faiss/faiss/utils/utils.cpp +5 -5
- data/vendor/faiss/faiss/utils/utils.h +3 -3
- metadata +119 -34
- data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
- data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
- data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
- data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
- data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
- data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
- data/vendor/faiss/faiss/utils/simdlib.h +0 -42
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
- /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
#include <arm_neon.h>
|
|
20
20
|
|
|
21
21
|
#include <faiss/impl/FaissAssert.h>
|
|
22
|
+
#include <faiss/impl/simdlib/simdlib.h>
|
|
22
23
|
|
|
23
24
|
namespace faiss {
|
|
24
25
|
|
|
@@ -269,18 +270,20 @@ static inline uint16x8_t vshrq(uint16x8_t vec) {
|
|
|
269
270
|
} // namespace detail
|
|
270
271
|
|
|
271
272
|
/// vector of 16 elements in uint16
|
|
272
|
-
|
|
273
|
+
template <>
|
|
274
|
+
struct simd16uint16_tpl<SIMDLevel::ARM_NEON> {
|
|
273
275
|
uint16x8x2_t data;
|
|
274
276
|
|
|
275
|
-
|
|
277
|
+
simd16uint16_tpl() = default;
|
|
276
278
|
|
|
277
|
-
explicit
|
|
279
|
+
explicit simd16uint16_tpl(int x) : data{vdupq_n_u16(x), vdupq_n_u16(x)} {}
|
|
278
280
|
|
|
279
|
-
explicit
|
|
281
|
+
explicit simd16uint16_tpl(uint16_t x)
|
|
282
|
+
: data{vdupq_n_u16(x), vdupq_n_u16(x)} {}
|
|
280
283
|
|
|
281
|
-
explicit
|
|
284
|
+
explicit simd16uint16_tpl(const uint16x8x2_t& v) : data{v} {}
|
|
282
285
|
|
|
283
|
-
explicit
|
|
286
|
+
explicit simd16uint16_tpl(
|
|
284
287
|
uint16_t u0,
|
|
285
288
|
uint16_t u1,
|
|
286
289
|
uint16_t u2,
|
|
@@ -323,10 +326,10 @@ struct simd16uint16 {
|
|
|
323
326
|
typename std::enable_if<
|
|
324
327
|
detail::simdlib::is_simd256bit<T>::value,
|
|
325
328
|
std::nullptr_t>::type = nullptr>
|
|
326
|
-
explicit
|
|
329
|
+
explicit simd16uint16_tpl(const T& x)
|
|
327
330
|
: data{detail::simdlib::reinterpret_u16(x.data)} {}
|
|
328
331
|
|
|
329
|
-
explicit
|
|
332
|
+
explicit simd16uint16_tpl(const uint16_t* x)
|
|
330
333
|
: data{vld1q_u16(x), vld1q_u16(x + 8)} {}
|
|
331
334
|
|
|
332
335
|
void clear() {
|
|
@@ -372,65 +375,74 @@ struct simd16uint16 {
|
|
|
372
375
|
detail::simdlib::set1(data, x).call<&detail::simdlib::vdupq_n_u16>();
|
|
373
376
|
}
|
|
374
377
|
|
|
375
|
-
|
|
376
|
-
return
|
|
377
|
-
|
|
378
|
+
simd16uint16_tpl operator*(const simd16uint16_tpl& other) const {
|
|
379
|
+
return simd16uint16_tpl{detail::simdlib::binary_func(data, other.data)
|
|
380
|
+
.call<&vmulq_u16>()};
|
|
378
381
|
}
|
|
379
382
|
|
|
380
383
|
// shift must be known at compile time
|
|
381
|
-
|
|
384
|
+
simd16uint16_tpl operator>>(const int shift) const {
|
|
382
385
|
switch (shift) {
|
|
383
386
|
case 0:
|
|
384
387
|
return *this;
|
|
385
388
|
case 1:
|
|
386
|
-
return
|
|
387
|
-
|
|
389
|
+
return simd16uint16_tpl{
|
|
390
|
+
detail::simdlib::unary_func(data)
|
|
391
|
+
.call<detail::simdlib::vshrq<1>>()};
|
|
388
392
|
case 2:
|
|
389
|
-
return
|
|
390
|
-
|
|
393
|
+
return simd16uint16_tpl{
|
|
394
|
+
detail::simdlib::unary_func(data)
|
|
395
|
+
.call<detail::simdlib::vshrq<2>>()};
|
|
391
396
|
case 3:
|
|
392
|
-
return
|
|
393
|
-
|
|
397
|
+
return simd16uint16_tpl{
|
|
398
|
+
detail::simdlib::unary_func(data)
|
|
399
|
+
.call<detail::simdlib::vshrq<3>>()};
|
|
394
400
|
case 4:
|
|
395
|
-
return
|
|
396
|
-
|
|
401
|
+
return simd16uint16_tpl{
|
|
402
|
+
detail::simdlib::unary_func(data)
|
|
403
|
+
.call<detail::simdlib::vshrq<4>>()};
|
|
397
404
|
case 5:
|
|
398
|
-
return
|
|
399
|
-
|
|
405
|
+
return simd16uint16_tpl{
|
|
406
|
+
detail::simdlib::unary_func(data)
|
|
407
|
+
.call<detail::simdlib::vshrq<5>>()};
|
|
400
408
|
case 6:
|
|
401
|
-
return
|
|
402
|
-
|
|
409
|
+
return simd16uint16_tpl{
|
|
410
|
+
detail::simdlib::unary_func(data)
|
|
411
|
+
.call<detail::simdlib::vshrq<6>>()};
|
|
403
412
|
case 7:
|
|
404
|
-
return
|
|
405
|
-
|
|
413
|
+
return simd16uint16_tpl{
|
|
414
|
+
detail::simdlib::unary_func(data)
|
|
415
|
+
.call<detail::simdlib::vshrq<7>>()};
|
|
406
416
|
case 8:
|
|
407
|
-
return
|
|
408
|
-
|
|
417
|
+
return simd16uint16_tpl{
|
|
418
|
+
detail::simdlib::unary_func(data)
|
|
419
|
+
.call<detail::simdlib::vshrq<8>>()};
|
|
409
420
|
case 9:
|
|
410
|
-
return
|
|
411
|
-
|
|
421
|
+
return simd16uint16_tpl{
|
|
422
|
+
detail::simdlib::unary_func(data)
|
|
423
|
+
.call<detail::simdlib::vshrq<9>>()};
|
|
412
424
|
case 10:
|
|
413
|
-
return
|
|
425
|
+
return simd16uint16_tpl{
|
|
414
426
|
detail::simdlib::unary_func(data)
|
|
415
427
|
.call<detail::simdlib::vshrq<10>>()};
|
|
416
428
|
case 11:
|
|
417
|
-
return
|
|
429
|
+
return simd16uint16_tpl{
|
|
418
430
|
detail::simdlib::unary_func(data)
|
|
419
431
|
.call<detail::simdlib::vshrq<11>>()};
|
|
420
432
|
case 12:
|
|
421
|
-
return
|
|
433
|
+
return simd16uint16_tpl{
|
|
422
434
|
detail::simdlib::unary_func(data)
|
|
423
435
|
.call<detail::simdlib::vshrq<12>>()};
|
|
424
436
|
case 13:
|
|
425
|
-
return
|
|
437
|
+
return simd16uint16_tpl{
|
|
426
438
|
detail::simdlib::unary_func(data)
|
|
427
439
|
.call<detail::simdlib::vshrq<13>>()};
|
|
428
440
|
case 14:
|
|
429
|
-
return
|
|
441
|
+
return simd16uint16_tpl{
|
|
430
442
|
detail::simdlib::unary_func(data)
|
|
431
443
|
.call<detail::simdlib::vshrq<14>>()};
|
|
432
444
|
case 15:
|
|
433
|
-
return
|
|
445
|
+
return simd16uint16_tpl{
|
|
434
446
|
detail::simdlib::unary_func(data)
|
|
435
447
|
.call<detail::simdlib::vshrq<15>>()};
|
|
436
448
|
default:
|
|
@@ -439,59 +451,68 @@ struct simd16uint16 {
|
|
|
439
451
|
}
|
|
440
452
|
|
|
441
453
|
// shift must be known at compile time
|
|
442
|
-
|
|
454
|
+
simd16uint16_tpl operator<<(const int shift) const {
|
|
443
455
|
switch (shift) {
|
|
444
456
|
case 0:
|
|
445
457
|
return *this;
|
|
446
458
|
case 1:
|
|
447
|
-
return
|
|
448
|
-
|
|
459
|
+
return simd16uint16_tpl{
|
|
460
|
+
detail::simdlib::unary_func(data)
|
|
461
|
+
.call<detail::simdlib::vshlq<1>>()};
|
|
449
462
|
case 2:
|
|
450
|
-
return
|
|
451
|
-
|
|
463
|
+
return simd16uint16_tpl{
|
|
464
|
+
detail::simdlib::unary_func(data)
|
|
465
|
+
.call<detail::simdlib::vshlq<2>>()};
|
|
452
466
|
case 3:
|
|
453
|
-
return
|
|
454
|
-
|
|
467
|
+
return simd16uint16_tpl{
|
|
468
|
+
detail::simdlib::unary_func(data)
|
|
469
|
+
.call<detail::simdlib::vshlq<3>>()};
|
|
455
470
|
case 4:
|
|
456
|
-
return
|
|
457
|
-
|
|
471
|
+
return simd16uint16_tpl{
|
|
472
|
+
detail::simdlib::unary_func(data)
|
|
473
|
+
.call<detail::simdlib::vshlq<4>>()};
|
|
458
474
|
case 5:
|
|
459
|
-
return
|
|
460
|
-
|
|
475
|
+
return simd16uint16_tpl{
|
|
476
|
+
detail::simdlib::unary_func(data)
|
|
477
|
+
.call<detail::simdlib::vshlq<5>>()};
|
|
461
478
|
case 6:
|
|
462
|
-
return
|
|
463
|
-
|
|
479
|
+
return simd16uint16_tpl{
|
|
480
|
+
detail::simdlib::unary_func(data)
|
|
481
|
+
.call<detail::simdlib::vshlq<6>>()};
|
|
464
482
|
case 7:
|
|
465
|
-
return
|
|
466
|
-
|
|
483
|
+
return simd16uint16_tpl{
|
|
484
|
+
detail::simdlib::unary_func(data)
|
|
485
|
+
.call<detail::simdlib::vshlq<7>>()};
|
|
467
486
|
case 8:
|
|
468
|
-
return
|
|
469
|
-
|
|
487
|
+
return simd16uint16_tpl{
|
|
488
|
+
detail::simdlib::unary_func(data)
|
|
489
|
+
.call<detail::simdlib::vshlq<8>>()};
|
|
470
490
|
case 9:
|
|
471
|
-
return
|
|
472
|
-
|
|
491
|
+
return simd16uint16_tpl{
|
|
492
|
+
detail::simdlib::unary_func(data)
|
|
493
|
+
.call<detail::simdlib::vshlq<9>>()};
|
|
473
494
|
case 10:
|
|
474
|
-
return
|
|
495
|
+
return simd16uint16_tpl{
|
|
475
496
|
detail::simdlib::unary_func(data)
|
|
476
497
|
.call<detail::simdlib::vshlq<10>>()};
|
|
477
498
|
case 11:
|
|
478
|
-
return
|
|
499
|
+
return simd16uint16_tpl{
|
|
479
500
|
detail::simdlib::unary_func(data)
|
|
480
501
|
.call<detail::simdlib::vshlq<11>>()};
|
|
481
502
|
case 12:
|
|
482
|
-
return
|
|
503
|
+
return simd16uint16_tpl{
|
|
483
504
|
detail::simdlib::unary_func(data)
|
|
484
505
|
.call<detail::simdlib::vshlq<12>>()};
|
|
485
506
|
case 13:
|
|
486
|
-
return
|
|
507
|
+
return simd16uint16_tpl{
|
|
487
508
|
detail::simdlib::unary_func(data)
|
|
488
509
|
.call<detail::simdlib::vshlq<13>>()};
|
|
489
510
|
case 14:
|
|
490
|
-
return
|
|
511
|
+
return simd16uint16_tpl{
|
|
491
512
|
detail::simdlib::unary_func(data)
|
|
492
513
|
.call<detail::simdlib::vshlq<14>>()};
|
|
493
514
|
case 15:
|
|
494
|
-
return
|
|
515
|
+
return simd16uint16_tpl{
|
|
495
516
|
detail::simdlib::unary_func(data)
|
|
496
517
|
.call<detail::simdlib::vshlq<15>>()};
|
|
497
518
|
default:
|
|
@@ -499,24 +520,24 @@ struct simd16uint16 {
|
|
|
499
520
|
}
|
|
500
521
|
}
|
|
501
522
|
|
|
502
|
-
|
|
523
|
+
simd16uint16_tpl& operator+=(const simd16uint16_tpl& other) {
|
|
503
524
|
*this = *this + other;
|
|
504
525
|
return *this;
|
|
505
526
|
}
|
|
506
527
|
|
|
507
|
-
|
|
528
|
+
simd16uint16_tpl& operator-=(const simd16uint16_tpl& other) {
|
|
508
529
|
*this = *this - other;
|
|
509
530
|
return *this;
|
|
510
531
|
}
|
|
511
532
|
|
|
512
|
-
|
|
513
|
-
return
|
|
514
|
-
|
|
533
|
+
simd16uint16_tpl operator+(const simd16uint16_tpl& other) const {
|
|
534
|
+
return simd16uint16_tpl{detail::simdlib::binary_func(data, other.data)
|
|
535
|
+
.call<&vaddq_u16>()};
|
|
515
536
|
}
|
|
516
537
|
|
|
517
|
-
|
|
518
|
-
return
|
|
519
|
-
|
|
538
|
+
simd16uint16_tpl operator-(const simd16uint16_tpl& other) const {
|
|
539
|
+
return simd16uint16_tpl{detail::simdlib::binary_func(data, other.data)
|
|
540
|
+
.call<&vsubq_u16>()};
|
|
520
541
|
}
|
|
521
542
|
|
|
522
543
|
template <
|
|
@@ -524,8 +545,8 @@ struct simd16uint16 {
|
|
|
524
545
|
typename std::enable_if<
|
|
525
546
|
detail::simdlib::is_simd256bit<T>::value,
|
|
526
547
|
std::nullptr_t>::type = nullptr>
|
|
527
|
-
|
|
528
|
-
return
|
|
548
|
+
simd16uint16_tpl operator&(const T& other) const {
|
|
549
|
+
return simd16uint16_tpl{
|
|
529
550
|
detail::simdlib::binary_func(
|
|
530
551
|
data, detail::simdlib::reinterpret_u16(other.data))
|
|
531
552
|
.template call<&vandq_u16>()};
|
|
@@ -536,8 +557,8 @@ struct simd16uint16 {
|
|
|
536
557
|
typename std::enable_if<
|
|
537
558
|
detail::simdlib::is_simd256bit<T>::value,
|
|
538
559
|
std::nullptr_t>::type = nullptr>
|
|
539
|
-
|
|
540
|
-
return
|
|
560
|
+
simd16uint16_tpl operator|(const T& other) const {
|
|
561
|
+
return simd16uint16_tpl{
|
|
541
562
|
detail::simdlib::binary_func(
|
|
542
563
|
data, detail::simdlib::reinterpret_u16(other.data))
|
|
543
564
|
.template call<&vorrq_u16>()};
|
|
@@ -548,17 +569,17 @@ struct simd16uint16 {
|
|
|
548
569
|
typename std::enable_if<
|
|
549
570
|
detail::simdlib::is_simd256bit<T>::value,
|
|
550
571
|
std::nullptr_t>::type = nullptr>
|
|
551
|
-
|
|
552
|
-
return
|
|
572
|
+
simd16uint16_tpl operator^(const T& other) const {
|
|
573
|
+
return simd16uint16_tpl{
|
|
553
574
|
detail::simdlib::binary_func(
|
|
554
575
|
data, detail::simdlib::reinterpret_u16(other.data))
|
|
555
576
|
.template call<&veorq_u16>()};
|
|
556
577
|
}
|
|
557
578
|
|
|
558
579
|
// returns binary masks
|
|
559
|
-
|
|
560
|
-
return
|
|
561
|
-
|
|
580
|
+
simd16uint16_tpl operator==(const simd16uint16_tpl& other) const {
|
|
581
|
+
return simd16uint16_tpl{detail::simdlib::binary_func(data, other.data)
|
|
582
|
+
.call<&vceqq_u16>()};
|
|
562
583
|
}
|
|
563
584
|
|
|
564
585
|
// Checks whether the other holds exactly the same bytes.
|
|
@@ -571,8 +592,8 @@ struct simd16uint16 {
|
|
|
571
592
|
return vminvq_u16(equal) == 0xffffu;
|
|
572
593
|
}
|
|
573
594
|
|
|
574
|
-
|
|
575
|
-
return
|
|
595
|
+
simd16uint16_tpl operator~() const {
|
|
596
|
+
return simd16uint16_tpl{
|
|
576
597
|
detail::simdlib::unary_func(data).call<&vmvnq_u16>()};
|
|
577
598
|
}
|
|
578
599
|
|
|
@@ -583,7 +604,7 @@ struct simd16uint16 {
|
|
|
583
604
|
|
|
584
605
|
// mask of elements where this >= thresh
|
|
585
606
|
// 2 bit per component: 16 * 2 = 32 bit
|
|
586
|
-
uint32_t ge_mask(const
|
|
607
|
+
uint32_t ge_mask(const simd16uint16_tpl& thresh) const {
|
|
587
608
|
const auto input = detail::simdlib::binary_func(data, thresh.data)
|
|
588
609
|
.call<&vcgeq_u16>();
|
|
589
610
|
const auto vmovmask_u16 = [](uint16x8_t v) -> uint16_t {
|
|
@@ -597,15 +618,15 @@ struct simd16uint16 {
|
|
|
597
618
|
vmovmask_u16(input.val[0]);
|
|
598
619
|
}
|
|
599
620
|
|
|
600
|
-
uint32_t le_mask(const
|
|
621
|
+
uint32_t le_mask(const simd16uint16_tpl& thresh) const {
|
|
601
622
|
return thresh.ge_mask(*this);
|
|
602
623
|
}
|
|
603
624
|
|
|
604
|
-
uint32_t gt_mask(const
|
|
625
|
+
uint32_t gt_mask(const simd16uint16_tpl& thresh) const {
|
|
605
626
|
return ~le_mask(thresh);
|
|
606
627
|
}
|
|
607
628
|
|
|
608
|
-
bool all_gt(const
|
|
629
|
+
bool all_gt(const simd16uint16_tpl& thresh) const {
|
|
609
630
|
return le_mask(thresh) == 0;
|
|
610
631
|
}
|
|
611
632
|
|
|
@@ -617,33 +638,39 @@ struct simd16uint16 {
|
|
|
617
638
|
return tab[i - high * 8];
|
|
618
639
|
}
|
|
619
640
|
|
|
620
|
-
void accu_min(const
|
|
641
|
+
void accu_min(const simd16uint16_tpl& incoming) {
|
|
621
642
|
data = detail::simdlib::binary_func(incoming.data, data)
|
|
622
643
|
.call<&vminq_u16>();
|
|
623
644
|
}
|
|
624
645
|
|
|
625
|
-
void accu_max(const
|
|
646
|
+
void accu_max(const simd16uint16_tpl& incoming) {
|
|
626
647
|
data = detail::simdlib::binary_func(incoming.data, data)
|
|
627
648
|
.call<&vmaxq_u16>();
|
|
628
649
|
}
|
|
629
650
|
};
|
|
630
651
|
|
|
631
652
|
// not really a std::min because it returns an elementwise min
|
|
632
|
-
inline
|
|
633
|
-
|
|
653
|
+
inline simd16uint16_tpl<SIMDLevel::ARM_NEON> min(
|
|
654
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& av,
|
|
655
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& bv) {
|
|
656
|
+
return simd16uint16_tpl<SIMDLevel::ARM_NEON>{
|
|
634
657
|
detail::simdlib::binary_func(av.data, bv.data).call<&vminq_u16>()};
|
|
635
658
|
}
|
|
636
659
|
|
|
637
|
-
inline
|
|
638
|
-
|
|
660
|
+
inline simd16uint16_tpl<SIMDLevel::ARM_NEON> max(
|
|
661
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& av,
|
|
662
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& bv) {
|
|
663
|
+
return simd16uint16_tpl<SIMDLevel::ARM_NEON>{
|
|
639
664
|
detail::simdlib::binary_func(av.data, bv.data).call<&vmaxq_u16>()};
|
|
640
665
|
}
|
|
641
666
|
|
|
642
667
|
// decompose in 128-lanes: a = (a0, a1), b = (b0, b1)
|
|
643
668
|
// return (a0 + a1, b0 + b1)
|
|
644
669
|
// TODO find a better name
|
|
645
|
-
inline
|
|
646
|
-
|
|
670
|
+
inline simd16uint16_tpl<SIMDLevel::ARM_NEON> combine2x2(
|
|
671
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& a,
|
|
672
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& b) {
|
|
673
|
+
return simd16uint16_tpl<SIMDLevel::ARM_NEON>{uint16x8x2_t{
|
|
647
674
|
vaddq_u16(a.data.val[0], a.data.val[1]),
|
|
648
675
|
vaddq_u16(b.data.val[0], b.data.val[1])}};
|
|
649
676
|
}
|
|
@@ -651,22 +678,24 @@ inline simd16uint16 combine2x2(const simd16uint16& a, const simd16uint16& b) {
|
|
|
651
678
|
// compare d0 and d1 to thr, return 32 bits corresponding to the concatenation
|
|
652
679
|
// of d0 and d1 with thr
|
|
653
680
|
inline uint32_t cmp_ge32(
|
|
654
|
-
const
|
|
655
|
-
const
|
|
656
|
-
const
|
|
681
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& d0,
|
|
682
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& d1,
|
|
683
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& thr) {
|
|
657
684
|
return detail::simdlib::cmp_xe32<&vcgeq_u16>(d0.data, d1.data, thr.data);
|
|
658
685
|
}
|
|
659
686
|
|
|
660
687
|
inline uint32_t cmp_le32(
|
|
661
|
-
const
|
|
662
|
-
const
|
|
663
|
-
const
|
|
688
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& d0,
|
|
689
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& d1,
|
|
690
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& thr) {
|
|
664
691
|
return detail::simdlib::cmp_xe32<&vcleq_u16>(d0.data, d1.data, thr.data);
|
|
665
692
|
}
|
|
666
693
|
|
|
667
694
|
// hadd does not cross lanes
|
|
668
|
-
inline
|
|
669
|
-
|
|
695
|
+
inline simd16uint16_tpl<SIMDLevel::ARM_NEON> hadd(
|
|
696
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& a,
|
|
697
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& b) {
|
|
698
|
+
return simd16uint16_tpl<SIMDLevel::ARM_NEON>{
|
|
670
699
|
detail::simdlib::binary_func(a.data, b.data).call<&vpaddq_u16>()};
|
|
671
700
|
}
|
|
672
701
|
|
|
@@ -682,14 +711,14 @@ inline simd16uint16 hadd(const simd16uint16& a, const simd16uint16& b) {
|
|
|
682
711
|
// the last equal value is saved instead of the first one), but this behavior
|
|
683
712
|
// saves instructions.
|
|
684
713
|
inline void cmplt_min_max_fast(
|
|
685
|
-
const
|
|
686
|
-
const
|
|
687
|
-
const
|
|
688
|
-
const
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
714
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON> candidateValues,
|
|
715
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON> candidateIndices,
|
|
716
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON> currentValues,
|
|
717
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON> currentIndices,
|
|
718
|
+
simd16uint16_tpl<SIMDLevel::ARM_NEON>& minValues,
|
|
719
|
+
simd16uint16_tpl<SIMDLevel::ARM_NEON>& minIndices,
|
|
720
|
+
simd16uint16_tpl<SIMDLevel::ARM_NEON>& maxValues,
|
|
721
|
+
simd16uint16_tpl<SIMDLevel::ARM_NEON>& maxIndices) {
|
|
693
722
|
const uint16x8x2_t comparison =
|
|
694
723
|
detail::simdlib::binary_func(
|
|
695
724
|
candidateValues.data, currentValues.data)
|
|
@@ -719,16 +748,17 @@ inline void cmplt_min_max_fast(
|
|
|
719
748
|
}
|
|
720
749
|
|
|
721
750
|
// vector of 32 unsigned 8-bit integers
|
|
722
|
-
|
|
751
|
+
template <>
|
|
752
|
+
struct simd32uint8_tpl<SIMDLevel::ARM_NEON> {
|
|
723
753
|
uint8x16x2_t data;
|
|
724
754
|
|
|
725
|
-
|
|
755
|
+
simd32uint8_tpl() = default;
|
|
726
756
|
|
|
727
|
-
explicit
|
|
757
|
+
explicit simd32uint8_tpl(int x) : data{vdupq_n_u8(x), vdupq_n_u8(x)} {}
|
|
728
758
|
|
|
729
|
-
explicit
|
|
759
|
+
explicit simd32uint8_tpl(uint8_t x) : data{vdupq_n_u8(x), vdupq_n_u8(x)} {}
|
|
730
760
|
|
|
731
|
-
explicit
|
|
761
|
+
explicit simd32uint8_tpl(const uint8x16x2_t& v) : data{v} {}
|
|
732
762
|
|
|
733
763
|
template <
|
|
734
764
|
uint8_t _0,
|
|
@@ -763,12 +793,12 @@ struct simd32uint8 {
|
|
|
763
793
|
uint8_t _29,
|
|
764
794
|
uint8_t _30,
|
|
765
795
|
uint8_t _31>
|
|
766
|
-
static
|
|
796
|
+
static simd32uint8_tpl create() {
|
|
767
797
|
constexpr uint8_t ds[32] = {_0, _1, _2, _3, _4, _5, _6, _7,
|
|
768
798
|
_8, _9, _10, _11, _12, _13, _14, _15,
|
|
769
799
|
_16, _17, _18, _19, _20, _21, _22, _23,
|
|
770
800
|
_24, _25, _26, _27, _28, _29, _30, _31};
|
|
771
|
-
return
|
|
801
|
+
return simd32uint8_tpl{ds};
|
|
772
802
|
}
|
|
773
803
|
|
|
774
804
|
template <
|
|
@@ -776,10 +806,10 @@ struct simd32uint8 {
|
|
|
776
806
|
typename std::enable_if<
|
|
777
807
|
detail::simdlib::is_simd256bit<T>::value,
|
|
778
808
|
std::nullptr_t>::type = nullptr>
|
|
779
|
-
explicit
|
|
809
|
+
explicit simd32uint8_tpl(const T& x)
|
|
780
810
|
: data{detail::simdlib::reinterpret_u8(x.data)} {}
|
|
781
811
|
|
|
782
|
-
explicit
|
|
812
|
+
explicit simd32uint8_tpl(const uint8_t* x)
|
|
783
813
|
: data{vld1q_u8(x), vld1q_u8(x + 16)} {}
|
|
784
814
|
|
|
785
815
|
void clear() {
|
|
@@ -835,25 +865,25 @@ struct simd32uint8 {
|
|
|
835
865
|
typename std::enable_if<
|
|
836
866
|
detail::simdlib::is_simd256bit<T>::value,
|
|
837
867
|
std::nullptr_t>::type = nullptr>
|
|
838
|
-
|
|
839
|
-
return
|
|
868
|
+
simd32uint8_tpl operator&(const T& other) const {
|
|
869
|
+
return simd32uint8_tpl{
|
|
840
870
|
detail::simdlib::binary_func(
|
|
841
871
|
data, detail::simdlib::reinterpret_u8(other.data))
|
|
842
872
|
.template call<&vandq_u8>()};
|
|
843
873
|
}
|
|
844
874
|
|
|
845
|
-
|
|
846
|
-
return
|
|
847
|
-
|
|
875
|
+
simd32uint8_tpl operator+(const simd32uint8_tpl& other) const {
|
|
876
|
+
return simd32uint8_tpl{detail::simdlib::binary_func(data, other.data)
|
|
877
|
+
.call<&vaddq_u8>()};
|
|
848
878
|
}
|
|
849
879
|
|
|
850
880
|
// The very important operation that everything relies on
|
|
851
|
-
|
|
852
|
-
return
|
|
853
|
-
|
|
881
|
+
simd32uint8_tpl lookup_2_lanes(const simd32uint8_tpl& idx) const {
|
|
882
|
+
return simd32uint8_tpl{detail::simdlib::binary_func(data, idx.data)
|
|
883
|
+
.call<&vqtbl1q_u8>()};
|
|
854
884
|
}
|
|
855
885
|
|
|
856
|
-
|
|
886
|
+
simd32uint8_tpl& operator+=(const simd32uint8_tpl& other) {
|
|
857
887
|
*this = *this + other;
|
|
858
888
|
return *this;
|
|
859
889
|
}
|
|
@@ -879,56 +909,59 @@ struct simd32uint8 {
|
|
|
879
909
|
|
|
880
910
|
// convert with saturation
|
|
881
911
|
// careful: this does not cross lanes, so the order is weird
|
|
882
|
-
inline
|
|
883
|
-
const
|
|
884
|
-
const
|
|
885
|
-
return
|
|
912
|
+
inline simd32uint8_tpl<SIMDLevel::ARM_NEON> uint16_to_uint8_saturate(
|
|
913
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& a,
|
|
914
|
+
const simd16uint16_tpl<SIMDLevel::ARM_NEON>& b) {
|
|
915
|
+
return simd32uint8_tpl<SIMDLevel::ARM_NEON>{uint8x16x2_t{
|
|
886
916
|
vqmovn_high_u16(vqmovn_u16(a.data.val[0]), b.data.val[0]),
|
|
887
917
|
vqmovn_high_u16(vqmovn_u16(a.data.val[1]), b.data.val[1])}};
|
|
888
918
|
}
|
|
889
919
|
|
|
890
920
|
/// get most significant bit of each byte
|
|
891
|
-
inline uint32_t get_MSBs(const
|
|
921
|
+
inline uint32_t get_MSBs(const simd32uint8_tpl<SIMDLevel::ARM_NEON>& a) {
|
|
892
922
|
using detail::simdlib::vmovmask_u8;
|
|
893
923
|
return vmovmask_u8(a.data.val[0]) |
|
|
894
924
|
static_cast<uint32_t>(vmovmask_u8(a.data.val[1])) << 16u;
|
|
895
925
|
}
|
|
896
926
|
|
|
897
927
|
/// use MSB of each byte of mask to select a byte between a and b
|
|
898
|
-
inline
|
|
899
|
-
const
|
|
900
|
-
const
|
|
901
|
-
const
|
|
928
|
+
inline simd32uint8_tpl<SIMDLevel::ARM_NEON> blendv(
|
|
929
|
+
const simd32uint8_tpl<SIMDLevel::ARM_NEON>& a,
|
|
930
|
+
const simd32uint8_tpl<SIMDLevel::ARM_NEON>& b,
|
|
931
|
+
const simd32uint8_tpl<SIMDLevel::ARM_NEON>& mask) {
|
|
902
932
|
const auto msb = vdupq_n_u8(0x80);
|
|
903
933
|
const uint8x16x2_t msb_mask = {
|
|
904
934
|
vtstq_u8(mask.data.val[0], msb), vtstq_u8(mask.data.val[1], msb)};
|
|
905
935
|
const uint8x16x2_t selected = {
|
|
906
936
|
vbslq_u8(msb_mask.val[0], b.data.val[0], a.data.val[0]),
|
|
907
937
|
vbslq_u8(msb_mask.val[1], b.data.val[1], a.data.val[1])};
|
|
908
|
-
return
|
|
938
|
+
return simd32uint8_tpl<SIMDLevel::ARM_NEON>{selected};
|
|
909
939
|
}
|
|
910
940
|
|
|
911
941
|
/// vector of 8 unsigned 32-bit integers
|
|
912
|
-
|
|
942
|
+
template <>
|
|
943
|
+
struct simd8uint32_tpl<SIMDLevel::ARM_NEON> {
|
|
913
944
|
uint32x4x2_t data;
|
|
914
945
|
|
|
915
|
-
|
|
946
|
+
simd8uint32_tpl() = default;
|
|
916
947
|
|
|
917
|
-
explicit
|
|
948
|
+
explicit simd8uint32_tpl(uint32_t x)
|
|
949
|
+
: data{vdupq_n_u32(x), vdupq_n_u32(x)} {}
|
|
918
950
|
|
|
919
|
-
explicit
|
|
951
|
+
explicit simd8uint32_tpl(const uint32x4x2_t& v) : data{v} {}
|
|
920
952
|
|
|
921
953
|
template <
|
|
922
954
|
typename T,
|
|
923
955
|
typename std::enable_if<
|
|
924
956
|
detail::simdlib::is_simd256bit<T>::value,
|
|
925
957
|
std::nullptr_t>::type = nullptr>
|
|
926
|
-
explicit
|
|
958
|
+
explicit simd8uint32_tpl(const T& x)
|
|
927
959
|
: data{detail::simdlib::reinterpret_u32(x.data)} {}
|
|
928
960
|
|
|
929
|
-
explicit
|
|
961
|
+
explicit simd8uint32_tpl(const uint8_t* x)
|
|
962
|
+
: simd8uint32_tpl(simd32uint8_tpl<SIMDLevel::ARM_NEON>(x)) {}
|
|
930
963
|
|
|
931
|
-
explicit
|
|
964
|
+
explicit simd8uint32_tpl(
|
|
932
965
|
uint32_t u0,
|
|
933
966
|
uint32_t u1,
|
|
934
967
|
uint32_t u2,
|
|
@@ -942,33 +975,33 @@ struct simd8uint32 {
|
|
|
942
975
|
data.val[1] = vld1q_u32(temp + 4);
|
|
943
976
|
}
|
|
944
977
|
|
|
945
|
-
|
|
946
|
-
return
|
|
947
|
-
|
|
978
|
+
simd8uint32_tpl operator+(simd8uint32_tpl other) const {
|
|
979
|
+
return simd8uint32_tpl{detail::simdlib::binary_func(data, other.data)
|
|
980
|
+
.call<&vaddq_u32>()};
|
|
948
981
|
}
|
|
949
982
|
|
|
950
|
-
|
|
951
|
-
return
|
|
952
|
-
|
|
983
|
+
simd8uint32_tpl operator-(simd8uint32_tpl other) const {
|
|
984
|
+
return simd8uint32_tpl{detail::simdlib::binary_func(data, other.data)
|
|
985
|
+
.call<&vsubq_u32>()};
|
|
953
986
|
}
|
|
954
987
|
|
|
955
|
-
|
|
988
|
+
simd8uint32_tpl& operator+=(const simd8uint32_tpl& other) {
|
|
956
989
|
data.val[0] = vaddq_u32(data.val[0], other.data.val[0]);
|
|
957
990
|
data.val[1] = vaddq_u32(data.val[1], other.data.val[1]);
|
|
958
991
|
return *this;
|
|
959
992
|
}
|
|
960
993
|
|
|
961
|
-
|
|
962
|
-
return
|
|
963
|
-
|
|
994
|
+
simd8uint32_tpl operator==(simd8uint32_tpl other) const {
|
|
995
|
+
return simd8uint32_tpl{detail::simdlib::binary_func(data, other.data)
|
|
996
|
+
.call<&vceqq_u32>()};
|
|
964
997
|
}
|
|
965
998
|
|
|
966
|
-
|
|
967
|
-
return
|
|
999
|
+
simd8uint32_tpl operator~() const {
|
|
1000
|
+
return simd8uint32_tpl{
|
|
968
1001
|
detail::simdlib::unary_func(data).call<&vmvnq_u32>()};
|
|
969
1002
|
}
|
|
970
1003
|
|
|
971
|
-
|
|
1004
|
+
simd8uint32_tpl operator!=(simd8uint32_tpl other) const {
|
|
972
1005
|
return ~(*this == other);
|
|
973
1006
|
}
|
|
974
1007
|
|
|
@@ -1025,8 +1058,8 @@ struct simd8uint32 {
|
|
|
1025
1058
|
detail::simdlib::set1(data, x).call<&vdupq_n_u32>();
|
|
1026
1059
|
}
|
|
1027
1060
|
|
|
1028
|
-
|
|
1029
|
-
return
|
|
1061
|
+
simd8uint32_tpl unzip() const {
|
|
1062
|
+
return simd8uint32_tpl{uint32x4x2_t{
|
|
1030
1063
|
vuzp1q_u32(data.val[0], data.val[1]),
|
|
1031
1064
|
vuzp2q_u32(data.val[0], data.val[1])}};
|
|
1032
1065
|
}
|
|
@@ -1044,14 +1077,14 @@ struct simd8uint32 {
|
|
|
1044
1077
|
// the last equal value is saved instead of the first one), but this behavior
|
|
1045
1078
|
// saves instructions.
|
|
1046
1079
|
inline void cmplt_min_max_fast(
|
|
1047
|
-
const
|
|
1048
|
-
const
|
|
1049
|
-
const
|
|
1050
|
-
const
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1080
|
+
const simd8uint32_tpl<SIMDLevel::ARM_NEON> candidateValues,
|
|
1081
|
+
const simd8uint32_tpl<SIMDLevel::ARM_NEON> candidateIndices,
|
|
1082
|
+
const simd8uint32_tpl<SIMDLevel::ARM_NEON> currentValues,
|
|
1083
|
+
const simd8uint32_tpl<SIMDLevel::ARM_NEON> currentIndices,
|
|
1084
|
+
simd8uint32_tpl<SIMDLevel::ARM_NEON>& minValues,
|
|
1085
|
+
simd8uint32_tpl<SIMDLevel::ARM_NEON>& minIndices,
|
|
1086
|
+
simd8uint32_tpl<SIMDLevel::ARM_NEON>& maxValues,
|
|
1087
|
+
simd8uint32_tpl<SIMDLevel::ARM_NEON>& maxIndices) {
|
|
1055
1088
|
const uint32x4x2_t comparison =
|
|
1056
1089
|
detail::simdlib::binary_func(
|
|
1057
1090
|
candidateValues.data, currentValues.data)
|
|
@@ -1084,27 +1117,28 @@ inline void cmplt_min_max_fast(
|
|
|
1084
1117
|
candidateIndices.data.val[1])};
|
|
1085
1118
|
}
|
|
1086
1119
|
|
|
1087
|
-
|
|
1120
|
+
template <>
|
|
1121
|
+
struct simd8float32_tpl<SIMDLevel::ARM_NEON> {
|
|
1088
1122
|
float32x4x2_t data;
|
|
1089
1123
|
|
|
1090
|
-
|
|
1124
|
+
simd8float32_tpl() = default;
|
|
1091
1125
|
|
|
1092
|
-
explicit
|
|
1126
|
+
explicit simd8float32_tpl(float x) : data{vdupq_n_f32(x), vdupq_n_f32(x)} {}
|
|
1093
1127
|
|
|
1094
|
-
explicit
|
|
1128
|
+
explicit simd8float32_tpl(const float32x4x2_t& v) : data{v} {}
|
|
1095
1129
|
|
|
1096
1130
|
template <
|
|
1097
1131
|
typename T,
|
|
1098
1132
|
typename std::enable_if<
|
|
1099
1133
|
detail::simdlib::is_simd256bit<T>::value,
|
|
1100
1134
|
std::nullptr_t>::type = nullptr>
|
|
1101
|
-
explicit
|
|
1135
|
+
explicit simd8float32_tpl(const T& x)
|
|
1102
1136
|
: data{detail::simdlib::reinterpret_f32(x.data)} {}
|
|
1103
1137
|
|
|
1104
|
-
explicit
|
|
1138
|
+
explicit simd8float32_tpl(const float* x)
|
|
1105
1139
|
: data{vld1q_f32(x), vld1q_f32(x + 4)} {}
|
|
1106
1140
|
|
|
1107
|
-
explicit
|
|
1141
|
+
explicit simd8float32_tpl(
|
|
1108
1142
|
float f0,
|
|
1109
1143
|
float f1,
|
|
1110
1144
|
float f2,
|
|
@@ -1144,22 +1178,22 @@ struct simd8float32 {
|
|
|
1144
1178
|
return detail::simdlib::bin(*this);
|
|
1145
1179
|
}
|
|
1146
1180
|
|
|
1147
|
-
|
|
1148
|
-
return
|
|
1149
|
-
|
|
1181
|
+
simd8float32_tpl operator*(const simd8float32_tpl& other) const {
|
|
1182
|
+
return simd8float32_tpl{detail::simdlib::binary_func(data, other.data)
|
|
1183
|
+
.call<&vmulq_f32>()};
|
|
1150
1184
|
}
|
|
1151
1185
|
|
|
1152
|
-
|
|
1153
|
-
return
|
|
1154
|
-
|
|
1186
|
+
simd8float32_tpl operator+(const simd8float32_tpl& other) const {
|
|
1187
|
+
return simd8float32_tpl{detail::simdlib::binary_func(data, other.data)
|
|
1188
|
+
.call<&vaddq_f32>()};
|
|
1155
1189
|
}
|
|
1156
1190
|
|
|
1157
|
-
|
|
1158
|
-
return
|
|
1159
|
-
|
|
1191
|
+
simd8float32_tpl operator-(const simd8float32_tpl& other) const {
|
|
1192
|
+
return simd8float32_tpl{detail::simdlib::binary_func(data, other.data)
|
|
1193
|
+
.call<&vsubq_f32>()};
|
|
1160
1194
|
}
|
|
1161
1195
|
|
|
1162
|
-
|
|
1196
|
+
simd8float32_tpl& operator+=(const simd8float32_tpl& other) {
|
|
1163
1197
|
// In this context, it is more compiler friendly to write intrinsics
|
|
1164
1198
|
// directly instead of using binary_func
|
|
1165
1199
|
data.val[0] = vaddq_f32(data.val[0], other.data.val[0]);
|
|
@@ -1167,13 +1201,15 @@ struct simd8float32 {
|
|
|
1167
1201
|
return *this;
|
|
1168
1202
|
}
|
|
1169
1203
|
|
|
1170
|
-
|
|
1171
|
-
|
|
1204
|
+
simd8uint32_tpl<SIMDLevel::ARM_NEON> operator==(
|
|
1205
|
+
simd8float32_tpl other) const {
|
|
1206
|
+
return simd8uint32_tpl<SIMDLevel::ARM_NEON>{
|
|
1172
1207
|
detail::simdlib::binary_func<::uint32x4x2_t>(data, other.data)
|
|
1173
1208
|
.call<&vceqq_f32>()};
|
|
1174
1209
|
}
|
|
1175
1210
|
|
|
1176
|
-
|
|
1211
|
+
simd8uint32_tpl<SIMDLevel::ARM_NEON> operator!=(
|
|
1212
|
+
simd8float32_tpl other) const {
|
|
1177
1213
|
return ~(*this == other);
|
|
1178
1214
|
}
|
|
1179
1215
|
|
|
@@ -1194,27 +1230,33 @@ struct simd8float32 {
|
|
|
1194
1230
|
};
|
|
1195
1231
|
|
|
1196
1232
|
// hadd does not cross lanes
|
|
1197
|
-
inline
|
|
1198
|
-
|
|
1233
|
+
inline simd8float32_tpl<SIMDLevel::ARM_NEON> hadd(
|
|
1234
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& a,
|
|
1235
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& b) {
|
|
1236
|
+
return simd8float32_tpl<SIMDLevel::ARM_NEON>{
|
|
1199
1237
|
detail::simdlib::binary_func(a.data, b.data).call<&vpaddq_f32>()};
|
|
1200
1238
|
}
|
|
1201
1239
|
|
|
1202
|
-
inline
|
|
1203
|
-
|
|
1240
|
+
inline simd8float32_tpl<SIMDLevel::ARM_NEON> unpacklo(
|
|
1241
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& a,
|
|
1242
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& b) {
|
|
1243
|
+
return simd8float32_tpl<SIMDLevel::ARM_NEON>{
|
|
1204
1244
|
detail::simdlib::binary_func(a.data, b.data).call<&vzip1q_f32>()};
|
|
1205
1245
|
}
|
|
1206
1246
|
|
|
1207
|
-
inline
|
|
1208
|
-
|
|
1247
|
+
inline simd8float32_tpl<SIMDLevel::ARM_NEON> unpackhi(
|
|
1248
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& a,
|
|
1249
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& b) {
|
|
1250
|
+
return simd8float32_tpl<SIMDLevel::ARM_NEON>{
|
|
1209
1251
|
detail::simdlib::binary_func(a.data, b.data).call<&vzip2q_f32>()};
|
|
1210
1252
|
}
|
|
1211
1253
|
|
|
1212
1254
|
// compute a * b + c
|
|
1213
|
-
inline
|
|
1214
|
-
const
|
|
1215
|
-
const
|
|
1216
|
-
const
|
|
1217
|
-
return
|
|
1255
|
+
inline simd8float32_tpl<SIMDLevel::ARM_NEON> fmadd(
|
|
1256
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& a,
|
|
1257
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& b,
|
|
1258
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& c) {
|
|
1259
|
+
return simd8float32_tpl<SIMDLevel::ARM_NEON>{float32x4x2_t{
|
|
1218
1260
|
vfmaq_f32(c.data.val[0], a.data.val[0], b.data.val[0]),
|
|
1219
1261
|
vfmaq_f32(c.data.val[1], a.data.val[1], b.data.val[1])}};
|
|
1220
1262
|
}
|
|
@@ -1251,10 +1293,10 @@ inline simd8float32 fmadd(
|
|
|
1251
1293
|
// confusion for ppl who write in low-level SIMD instructions. Additionally,
|
|
1252
1294
|
// these two ops (cmp and blend) are very often used together.
|
|
1253
1295
|
inline void cmplt_and_blend_inplace(
|
|
1254
|
-
const
|
|
1255
|
-
const
|
|
1256
|
-
|
|
1257
|
-
|
|
1296
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON> candidateValues,
|
|
1297
|
+
const simd8uint32_tpl<SIMDLevel::ARM_NEON> candidateIndices,
|
|
1298
|
+
simd8float32_tpl<SIMDLevel::ARM_NEON>& lowestValues,
|
|
1299
|
+
simd8uint32_tpl<SIMDLevel::ARM_NEON>& lowestIndices) {
|
|
1258
1300
|
const auto comparison = detail::simdlib::binary_func<::uint32x4x2_t>(
|
|
1259
1301
|
candidateValues.data, lowestValues.data)
|
|
1260
1302
|
.call<&vcltq_f32>();
|
|
@@ -1291,14 +1333,14 @@ inline void cmplt_and_blend_inplace(
|
|
|
1291
1333
|
// the last equal value is saved instead of the first one), but this behavior
|
|
1292
1334
|
// saves instructions.
|
|
1293
1335
|
inline void cmplt_min_max_fast(
|
|
1294
|
-
const
|
|
1295
|
-
const
|
|
1296
|
-
const
|
|
1297
|
-
const
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1336
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON> candidateValues,
|
|
1337
|
+
const simd8uint32_tpl<SIMDLevel::ARM_NEON> candidateIndices,
|
|
1338
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON> currentValues,
|
|
1339
|
+
const simd8uint32_tpl<SIMDLevel::ARM_NEON> currentIndices,
|
|
1340
|
+
simd8float32_tpl<SIMDLevel::ARM_NEON>& minValues,
|
|
1341
|
+
simd8uint32_tpl<SIMDLevel::ARM_NEON>& minIndices,
|
|
1342
|
+
simd8float32_tpl<SIMDLevel::ARM_NEON>& maxValues,
|
|
1343
|
+
simd8uint32_tpl<SIMDLevel::ARM_NEON>& maxIndices) {
|
|
1302
1344
|
const uint32x4x2_t comparison =
|
|
1303
1345
|
detail::simdlib::binary_func<::uint32x4x2_t>(
|
|
1304
1346
|
candidateValues.data, currentValues.data)
|
|
@@ -1334,29 +1376,39 @@ inline void cmplt_min_max_fast(
|
|
|
1334
1376
|
namespace {
|
|
1335
1377
|
|
|
1336
1378
|
// get even float32's of a and b, interleaved
|
|
1337
|
-
|
|
1338
|
-
|
|
1379
|
+
simd8float32_tpl<SIMDLevel::ARM_NEON> geteven(
|
|
1380
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& a,
|
|
1381
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& b) {
|
|
1382
|
+
return simd8float32_tpl<SIMDLevel::ARM_NEON>{
|
|
1339
1383
|
detail::simdlib::binary_func(a.data, b.data).call<&vuzp1q_f32>()};
|
|
1340
1384
|
}
|
|
1341
1385
|
|
|
1342
1386
|
// get odd float32's of a and b, interleaved
|
|
1343
|
-
|
|
1344
|
-
|
|
1387
|
+
simd8float32_tpl<SIMDLevel::ARM_NEON> getodd(
|
|
1388
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& a,
|
|
1389
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& b) {
|
|
1390
|
+
return simd8float32_tpl<SIMDLevel::ARM_NEON>{
|
|
1345
1391
|
detail::simdlib::binary_func(a.data, b.data).call<&vuzp2q_f32>()};
|
|
1346
1392
|
}
|
|
1347
1393
|
|
|
1348
1394
|
// 3 cycles
|
|
1349
1395
|
// if the lanes are a = [a0 a1] and b = [b0 b1], return [a0 b0]
|
|
1350
|
-
|
|
1351
|
-
|
|
1396
|
+
simd8float32_tpl<SIMDLevel::ARM_NEON> getlow128(
|
|
1397
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& a,
|
|
1398
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& b) {
|
|
1399
|
+
return simd8float32_tpl<SIMDLevel::ARM_NEON>{
|
|
1400
|
+
float32x4x2_t{a.data.val[0], b.data.val[0]}};
|
|
1352
1401
|
}
|
|
1353
1402
|
|
|
1354
|
-
|
|
1355
|
-
|
|
1403
|
+
simd8float32_tpl<SIMDLevel::ARM_NEON> gethigh128(
|
|
1404
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& a,
|
|
1405
|
+
const simd8float32_tpl<SIMDLevel::ARM_NEON>& b) {
|
|
1406
|
+
return simd8float32_tpl<SIMDLevel::ARM_NEON>{
|
|
1407
|
+
float32x4x2_t{a.data.val[1], b.data.val[1]}};
|
|
1356
1408
|
}
|
|
1357
1409
|
|
|
1358
1410
|
// horizontal add: sum all 8 floats in the register
|
|
1359
|
-
inline float horizontal_add(const
|
|
1411
|
+
inline float horizontal_add(const simd8float32_tpl<SIMDLevel::ARM_NEON>& a) {
|
|
1360
1412
|
float32x4_t sum = vaddq_f32(a.data.val[0], a.data.val[1]);
|
|
1361
1413
|
return vaddvq_f32(sum);
|
|
1362
1414
|
}
|