faiss 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/faiss/extconf.rb +2 -1
- data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
- data/ext/faiss/index_binary.cpp +1 -1
- data/ext/faiss/kmeans.cpp +1 -1
- data/ext/faiss/pca_matrix.cpp +1 -1
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +93 -80
- data/vendor/faiss/faiss/Clustering.cpp +39 -240
- data/vendor/faiss/faiss/Clustering.h +6 -0
- data/vendor/faiss/faiss/IVFlib.cpp +41 -21
- data/vendor/faiss/faiss/Index.cpp +6 -5
- data/vendor/faiss/faiss/Index.h +5 -5
- data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
- data/vendor/faiss/faiss/IndexBinary.h +4 -4
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +84 -92
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
- data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +87 -415
- data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
- data/vendor/faiss/faiss/IndexFastScan.h +25 -23
- data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
- data/vendor/faiss/faiss/IndexFlat.h +21 -18
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
- data/vendor/faiss/faiss/IndexHNSW.cpp +283 -145
- data/vendor/faiss/faiss/IndexHNSW.h +16 -2
- data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
- data/vendor/faiss/faiss/IndexIDMap.h +9 -7
- data/vendor/faiss/faiss/IndexIVF.cpp +465 -362
- data/vendor/faiss/faiss/IndexIVF.h +33 -12
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
- data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +36 -68
- data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +53 -30
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +71 -843
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +21 -17
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +26 -39
- data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
- data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
- data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
- data/vendor/faiss/faiss/IndexNSG.cpp +39 -23
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
- data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
- data/vendor/faiss/faiss/IndexPQ.h +3 -2
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
- data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
- data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
- data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
- data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
- data/vendor/faiss/faiss/IndexRefine.h +4 -4
- data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
- data/vendor/faiss/faiss/IndexShards.cpp +10 -9
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
- data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
- data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
- data/vendor/faiss/faiss/MetaIndexes.h +1 -1
- data/vendor/faiss/faiss/MetricType.h +14 -7
- data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
- data/vendor/faiss/faiss/SuperKMeans.h +97 -0
- data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
- data/vendor/faiss/faiss/VectorTransform.h +16 -16
- data/vendor/faiss/faiss/build.cpp +23 -0
- data/vendor/faiss/faiss/build.h +15 -0
- data/vendor/faiss/faiss/clone_index.cpp +48 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
- data/vendor/faiss/faiss/factory_tools.cpp +5 -0
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
- data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
- data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
- data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
- data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
- data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
- data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
- data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
- data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
- data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
- data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
- data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
- data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
- data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
- data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
- data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
- data/vendor/faiss/faiss/impl/FaissException.h +50 -3
- data/vendor/faiss/faiss/impl/HNSW.cpp +92 -317
- data/vendor/faiss/faiss/impl/HNSW.h +13 -34
- data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
- data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
- data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
- data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
- data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
- data/vendor/faiss/faiss/impl/NSG.cpp +38 -21
- data/vendor/faiss/faiss/impl/NSG.h +4 -4
- data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
- data/vendor/faiss/faiss/impl/Panorama.h +258 -87
- data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
- data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +30 -23
- data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
- data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +296 -283
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +99 -75
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +52 -4
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -1
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
- data/vendor/faiss/faiss/impl/VisitedTable.h +7 -0
- data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
- data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
- data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
- data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
- data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
- data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
- data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
- data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
- data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
- data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
- data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
- data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
- data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
- data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +1132 -45
- data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
- data/vendor/faiss/faiss/impl/index_write.cpp +95 -13
- data/vendor/faiss/faiss/impl/io.cpp +6 -6
- data/vendor/faiss/faiss/impl/io_macros.h +33 -16
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
- data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
- data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
- data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +29 -111
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +238 -5
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -7
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +3 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +102 -11
- data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +148 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +167 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +59 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +163 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +100 -66
- data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
- data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
- data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
- data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
- data/vendor/faiss/faiss/impl/svs_io.h +8 -2
- data/vendor/faiss/faiss/index_factory.cpp +86 -18
- data/vendor/faiss/faiss/index_io.h +24 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
- data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
- data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +18 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +12 -3
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +7 -2
- data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
- data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
- data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
- data/vendor/faiss/faiss/utils/distances.cpp +390 -560
- data/vendor/faiss/faiss/utils/distances.h +20 -1
- data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -177
- data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
- data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
- data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
- data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
- data/vendor/faiss/faiss/utils/hamming.h +92 -2
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
- data/vendor/faiss/faiss/utils/partitioning.h +31 -0
- data/vendor/faiss/faiss/utils/popcount.h +29 -0
- data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
- data/vendor/faiss/faiss/utils/prefetch.h +2 -2
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
- data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
- data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
- data/vendor/faiss/faiss/utils/random.cpp +6 -6
- data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
- data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
- data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
- data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +17 -5
- data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
- data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
- data/vendor/faiss/faiss/utils/utils.cpp +5 -5
- data/vendor/faiss/faiss/utils/utils.h +3 -3
- metadata +119 -34
- data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
- data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
- data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
- data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
- data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
- data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
- data/vendor/faiss/faiss/utils/simdlib.h +0 -42
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
- /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
|
@@ -12,9 +12,12 @@
|
|
|
12
12
|
#include <cstring>
|
|
13
13
|
#include <string>
|
|
14
14
|
|
|
15
|
+
#include <faiss/impl/simdlib/simdlib.h>
|
|
16
|
+
|
|
15
17
|
namespace faiss {
|
|
16
18
|
|
|
17
|
-
|
|
19
|
+
template <>
|
|
20
|
+
struct simd256bit_tpl<SIMDLevel::NONE> {
|
|
18
21
|
union {
|
|
19
22
|
uint8_t u8[32];
|
|
20
23
|
uint16_t u16[16];
|
|
@@ -22,9 +25,9 @@ struct simd256bit {
|
|
|
22
25
|
float f32[8];
|
|
23
26
|
};
|
|
24
27
|
|
|
25
|
-
|
|
28
|
+
simd256bit_tpl() {}
|
|
26
29
|
|
|
27
|
-
explicit
|
|
30
|
+
explicit simd256bit_tpl(const void* x) {
|
|
28
31
|
memcpy(u8, x, 32);
|
|
29
32
|
}
|
|
30
33
|
|
|
@@ -59,7 +62,7 @@ struct simd256bit {
|
|
|
59
62
|
}
|
|
60
63
|
|
|
61
64
|
// Checks whether the other holds exactly the same bytes.
|
|
62
|
-
bool is_same_as(
|
|
65
|
+
bool is_same_as(simd256bit_tpl other) const {
|
|
63
66
|
for (size_t i = 0; i < 8; i++) {
|
|
64
67
|
if (u32[i] != other.u32[i]) {
|
|
65
68
|
return false;
|
|
@@ -71,22 +74,25 @@ struct simd256bit {
|
|
|
71
74
|
};
|
|
72
75
|
|
|
73
76
|
/// vector of 16 elements in uint16
|
|
74
|
-
|
|
75
|
-
|
|
77
|
+
template <>
|
|
78
|
+
struct simd16uint16_tpl<SIMDLevel::NONE> : simd256bit_tpl<SIMDLevel::NONE> {
|
|
79
|
+
simd16uint16_tpl() {}
|
|
76
80
|
|
|
77
|
-
explicit
|
|
81
|
+
explicit simd16uint16_tpl(int x) {
|
|
78
82
|
set1(x);
|
|
79
83
|
}
|
|
80
84
|
|
|
81
|
-
explicit
|
|
85
|
+
explicit simd16uint16_tpl(uint16_t x) {
|
|
82
86
|
set1(x);
|
|
83
87
|
}
|
|
84
88
|
|
|
85
|
-
explicit
|
|
89
|
+
explicit simd16uint16_tpl(const simd256bit_tpl<SIMDLevel::NONE>& x)
|
|
90
|
+
: simd256bit_tpl<SIMDLevel::NONE>(x) {}
|
|
86
91
|
|
|
87
|
-
explicit
|
|
92
|
+
explicit simd16uint16_tpl(const uint16_t* x)
|
|
93
|
+
: simd256bit_tpl<SIMDLevel::NONE>((const void*)x) {}
|
|
88
94
|
|
|
89
|
-
explicit
|
|
95
|
+
explicit simd16uint16_tpl(
|
|
90
96
|
uint16_t u0,
|
|
91
97
|
uint16_t u1,
|
|
92
98
|
uint16_t u2,
|
|
@@ -95,7 +101,7 @@ struct simd16uint16 : simd256bit {
|
|
|
95
101
|
uint16_t u5,
|
|
96
102
|
uint16_t u6,
|
|
97
103
|
uint16_t u7,
|
|
98
|
-
uint16_t
|
|
104
|
+
uint16_t v8,
|
|
99
105
|
uint16_t u9,
|
|
100
106
|
uint16_t u10,
|
|
101
107
|
uint16_t u11,
|
|
@@ -111,7 +117,7 @@ struct simd16uint16 : simd256bit {
|
|
|
111
117
|
this->u16[5] = u5;
|
|
112
118
|
this->u16[6] = u6;
|
|
113
119
|
this->u16[7] = u7;
|
|
114
|
-
this->u16[8] =
|
|
120
|
+
this->u16[8] = v8;
|
|
115
121
|
this->u16[9] = u9;
|
|
116
122
|
this->u16[10] = u10;
|
|
117
123
|
this->u16[11] = u11;
|
|
@@ -123,9 +129,17 @@ struct simd16uint16 : simd256bit {
|
|
|
123
129
|
|
|
124
130
|
std::string elements_to_string(const char* fmt) const {
|
|
125
131
|
char res[1000], *ptr = res;
|
|
132
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
133
|
+
#pragma GCC diagnostic push
|
|
134
|
+
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
|
|
135
|
+
#endif
|
|
126
136
|
for (int i = 0; i < 16; i++) {
|
|
127
|
-
ptr +=
|
|
137
|
+
ptr += snprintf(
|
|
138
|
+
ptr, (size_t)(res + sizeof(res) - ptr), fmt, u16[i]);
|
|
128
139
|
}
|
|
140
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
141
|
+
#pragma GCC diagnostic pop
|
|
142
|
+
#endif
|
|
129
143
|
// strip last ,
|
|
130
144
|
ptr[-1] = 0;
|
|
131
145
|
return std::string(res);
|
|
@@ -140,8 +154,8 @@ struct simd16uint16 : simd256bit {
|
|
|
140
154
|
}
|
|
141
155
|
|
|
142
156
|
template <typename F>
|
|
143
|
-
static
|
|
144
|
-
|
|
157
|
+
static simd16uint16_tpl unary_func(const simd16uint16_tpl& a, F&& f) {
|
|
158
|
+
simd16uint16_tpl c;
|
|
145
159
|
for (int j = 0; j < 16; j++) {
|
|
146
160
|
c.u16[j] = f(a.u16[j]);
|
|
147
161
|
}
|
|
@@ -149,11 +163,11 @@ struct simd16uint16 : simd256bit {
|
|
|
149
163
|
}
|
|
150
164
|
|
|
151
165
|
template <typename F>
|
|
152
|
-
static
|
|
153
|
-
const
|
|
154
|
-
const
|
|
166
|
+
static simd16uint16_tpl binary_func(
|
|
167
|
+
const simd16uint16_tpl& a,
|
|
168
|
+
const simd16uint16_tpl& b,
|
|
155
169
|
F&& f) {
|
|
156
|
-
|
|
170
|
+
simd16uint16_tpl c;
|
|
157
171
|
for (int j = 0; j < 16; j++) {
|
|
158
172
|
c.u16[j] = f(a.u16[j], b.u16[j]);
|
|
159
173
|
}
|
|
@@ -166,70 +180,73 @@ struct simd16uint16 : simd256bit {
|
|
|
166
180
|
}
|
|
167
181
|
}
|
|
168
182
|
|
|
169
|
-
|
|
183
|
+
simd16uint16_tpl operator*(const simd16uint16_tpl& other) const {
|
|
170
184
|
return binary_func(
|
|
171
185
|
*this, other, [](uint16_t a, uint16_t b) { return a * b; });
|
|
172
186
|
}
|
|
173
187
|
|
|
174
188
|
// shift must be known at compile time
|
|
175
|
-
|
|
189
|
+
simd16uint16_tpl operator>>(const int shift) const {
|
|
176
190
|
return unary_func(*this, [shift](uint16_t a) { return a >> shift; });
|
|
177
191
|
}
|
|
178
192
|
|
|
179
193
|
// shift must be known at compile time
|
|
180
|
-
|
|
194
|
+
simd16uint16_tpl operator<<(const int shift) const {
|
|
181
195
|
return unary_func(*this, [shift](uint16_t a) { return a << shift; });
|
|
182
196
|
}
|
|
183
197
|
|
|
184
|
-
|
|
198
|
+
simd16uint16_tpl& operator+=(const simd16uint16_tpl& other) {
|
|
185
199
|
*this = *this + other;
|
|
186
200
|
return *this;
|
|
187
201
|
}
|
|
188
202
|
|
|
189
|
-
|
|
203
|
+
simd16uint16_tpl& operator-=(const simd16uint16_tpl& other) {
|
|
190
204
|
*this = *this - other;
|
|
191
205
|
return *this;
|
|
192
206
|
}
|
|
193
207
|
|
|
194
|
-
|
|
208
|
+
simd16uint16_tpl operator+(const simd16uint16_tpl& other) const {
|
|
195
209
|
return binary_func(
|
|
196
210
|
*this, other, [](uint16_t a, uint16_t b) { return a + b; });
|
|
197
211
|
}
|
|
198
212
|
|
|
199
|
-
|
|
213
|
+
simd16uint16_tpl operator-(const simd16uint16_tpl& other) const {
|
|
200
214
|
return binary_func(
|
|
201
215
|
*this, other, [](uint16_t a, uint16_t b) { return a - b; });
|
|
202
216
|
}
|
|
203
217
|
|
|
204
|
-
|
|
218
|
+
simd16uint16_tpl operator&(
|
|
219
|
+
const simd256bit_tpl<SIMDLevel::NONE>& other) const {
|
|
205
220
|
return binary_func(
|
|
206
|
-
*this,
|
|
221
|
+
*this, simd16uint16_tpl(other), [](uint16_t a, uint16_t b) {
|
|
207
222
|
return a & b;
|
|
208
223
|
});
|
|
209
224
|
}
|
|
210
225
|
|
|
211
|
-
|
|
226
|
+
simd16uint16_tpl operator|(
|
|
227
|
+
const simd256bit_tpl<SIMDLevel::NONE>& other) const {
|
|
212
228
|
return binary_func(
|
|
213
|
-
*this,
|
|
229
|
+
*this, simd16uint16_tpl(other), [](uint16_t a, uint16_t b) {
|
|
214
230
|
return a | b;
|
|
215
231
|
});
|
|
216
232
|
}
|
|
217
233
|
|
|
218
|
-
|
|
234
|
+
simd16uint16_tpl operator^(
|
|
235
|
+
const simd256bit_tpl<SIMDLevel::NONE>& other) const {
|
|
219
236
|
return binary_func(
|
|
220
|
-
*this,
|
|
237
|
+
*this, simd16uint16_tpl(other), [](uint16_t a, uint16_t b) {
|
|
221
238
|
return a ^ b;
|
|
222
239
|
});
|
|
223
240
|
}
|
|
224
241
|
|
|
225
242
|
// returns binary masks
|
|
226
|
-
|
|
243
|
+
simd16uint16_tpl operator==(const simd16uint16_tpl& other) const {
|
|
227
244
|
return binary_func(*this, other, [](uint16_t a, uint16_t b) {
|
|
228
245
|
return a == b ? 0xffff : 0;
|
|
229
246
|
});
|
|
230
247
|
}
|
|
231
248
|
|
|
232
|
-
|
|
249
|
+
simd16uint16_tpl operator~() const {
|
|
233
250
|
return unary_func(*this, [](uint16_t a) { return ~a; });
|
|
234
251
|
}
|
|
235
252
|
|
|
@@ -240,7 +257,7 @@ struct simd16uint16 : simd256bit {
|
|
|
240
257
|
|
|
241
258
|
// mask of elements where this >= thresh
|
|
242
259
|
// 2 bit per component: 16 * 2 = 32 bit
|
|
243
|
-
uint32_t ge_mask(const
|
|
260
|
+
uint32_t ge_mask(const simd16uint16_tpl& thresh) const {
|
|
244
261
|
uint32_t gem = 0;
|
|
245
262
|
for (int j = 0; j < 16; j++) {
|
|
246
263
|
if (u16[j] >= thresh.u16[j]) {
|
|
@@ -250,15 +267,15 @@ struct simd16uint16 : simd256bit {
|
|
|
250
267
|
return gem;
|
|
251
268
|
}
|
|
252
269
|
|
|
253
|
-
uint32_t le_mask(const
|
|
270
|
+
uint32_t le_mask(const simd16uint16_tpl& thresh) const {
|
|
254
271
|
return thresh.ge_mask(*this);
|
|
255
272
|
}
|
|
256
273
|
|
|
257
|
-
uint32_t gt_mask(const
|
|
274
|
+
uint32_t gt_mask(const simd16uint16_tpl& thresh) const {
|
|
258
275
|
return ~le_mask(thresh);
|
|
259
276
|
}
|
|
260
277
|
|
|
261
|
-
bool all_gt(const
|
|
278
|
+
bool all_gt(const simd16uint16_tpl& thresh) const {
|
|
262
279
|
return le_mask(thresh) == 0;
|
|
263
280
|
}
|
|
264
281
|
|
|
@@ -267,7 +284,7 @@ struct simd16uint16 : simd256bit {
|
|
|
267
284
|
return u16[i];
|
|
268
285
|
}
|
|
269
286
|
|
|
270
|
-
void accu_min(const
|
|
287
|
+
void accu_min(const simd16uint16_tpl& incoming) {
|
|
271
288
|
for (int j = 0; j < 16; j++) {
|
|
272
289
|
if (incoming.u16[j] < u16[j]) {
|
|
273
290
|
u16[j] = incoming.u16[j];
|
|
@@ -275,7 +292,7 @@ struct simd16uint16 : simd256bit {
|
|
|
275
292
|
}
|
|
276
293
|
}
|
|
277
294
|
|
|
278
|
-
void accu_max(const
|
|
295
|
+
void accu_max(const simd16uint16_tpl& incoming) {
|
|
279
296
|
for (int j = 0; j < 16; j++) {
|
|
280
297
|
if (incoming.u16[j] > u16[j]) {
|
|
281
298
|
u16[j] = incoming.u16[j];
|
|
@@ -285,21 +302,27 @@ struct simd16uint16 : simd256bit {
|
|
|
285
302
|
};
|
|
286
303
|
|
|
287
304
|
// not really a std::min because it returns an elementwise min
|
|
288
|
-
inline
|
|
289
|
-
|
|
305
|
+
inline simd16uint16_tpl<SIMDLevel::NONE> min(
|
|
306
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& av,
|
|
307
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& bv) {
|
|
308
|
+
return simd16uint16_tpl<SIMDLevel::NONE>::binary_func(
|
|
290
309
|
av, bv, [](uint16_t a, uint16_t b) { return std::min(a, b); });
|
|
291
310
|
}
|
|
292
311
|
|
|
293
|
-
inline
|
|
294
|
-
|
|
312
|
+
inline simd16uint16_tpl<SIMDLevel::NONE> max(
|
|
313
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& av,
|
|
314
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& bv) {
|
|
315
|
+
return simd16uint16_tpl<SIMDLevel::NONE>::binary_func(
|
|
295
316
|
av, bv, [](uint16_t a, uint16_t b) { return std::max(a, b); });
|
|
296
317
|
}
|
|
297
318
|
|
|
298
319
|
// decompose in 128-lanes: a = (a0, a1), b = (b0, b1)
|
|
299
320
|
// return (a0 + a1, b0 + b1)
|
|
300
321
|
// TODO find a better name
|
|
301
|
-
inline
|
|
302
|
-
|
|
322
|
+
inline simd16uint16_tpl<SIMDLevel::NONE> combine2x2(
|
|
323
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& a,
|
|
324
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& b) {
|
|
325
|
+
simd16uint16_tpl<SIMDLevel::NONE> c;
|
|
303
326
|
for (int j = 0; j < 8; j++) {
|
|
304
327
|
c.u16[j] = a.u16[j] + a.u16[j + 8];
|
|
305
328
|
c.u16[j + 8] = b.u16[j] + b.u16[j + 8];
|
|
@@ -310,9 +333,9 @@ inline simd16uint16 combine2x2(const simd16uint16& a, const simd16uint16& b) {
|
|
|
310
333
|
// compare d0 and d1 to thr, return 32 bits corresponding to the concatenation
|
|
311
334
|
// of d0 and d1 with thr
|
|
312
335
|
inline uint32_t cmp_ge32(
|
|
313
|
-
const
|
|
314
|
-
const
|
|
315
|
-
const
|
|
336
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& d0,
|
|
337
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& d1,
|
|
338
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& thr) {
|
|
316
339
|
uint32_t gem = 0;
|
|
317
340
|
for (int j = 0; j < 16; j++) {
|
|
318
341
|
if (d0.u16[j] >= thr.u16[j]) {
|
|
@@ -326,9 +349,9 @@ inline uint32_t cmp_ge32(
|
|
|
326
349
|
}
|
|
327
350
|
|
|
328
351
|
inline uint32_t cmp_le32(
|
|
329
|
-
const
|
|
330
|
-
const
|
|
331
|
-
const
|
|
352
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& d0,
|
|
353
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& d1,
|
|
354
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& thr) {
|
|
332
355
|
uint32_t gem = 0;
|
|
333
356
|
for (int j = 0; j < 16; j++) {
|
|
334
357
|
if (d0.u16[j] <= thr.u16[j]) {
|
|
@@ -342,8 +365,10 @@ inline uint32_t cmp_le32(
|
|
|
342
365
|
}
|
|
343
366
|
|
|
344
367
|
// hadd does not cross lanes
|
|
345
|
-
inline
|
|
346
|
-
|
|
368
|
+
inline simd16uint16_tpl<SIMDLevel::NONE> hadd(
|
|
369
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& a,
|
|
370
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& b) {
|
|
371
|
+
simd16uint16_tpl<SIMDLevel::NONE> c;
|
|
347
372
|
c.u16[0] = a.u16[0] + a.u16[1];
|
|
348
373
|
c.u16[1] = a.u16[2] + a.u16[3];
|
|
349
374
|
c.u16[2] = a.u16[4] + a.u16[5];
|
|
@@ -377,14 +402,14 @@ inline simd16uint16 hadd(const simd16uint16& a, const simd16uint16& b) {
|
|
|
377
402
|
// the last equal value is saved instead of the first one), but this behavior
|
|
378
403
|
// saves instructions.
|
|
379
404
|
inline void cmplt_min_max_fast(
|
|
380
|
-
const
|
|
381
|
-
const
|
|
382
|
-
const
|
|
383
|
-
const
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
405
|
+
const simd16uint16_tpl<SIMDLevel::NONE> candidateValues,
|
|
406
|
+
const simd16uint16_tpl<SIMDLevel::NONE> candidateIndices,
|
|
407
|
+
const simd16uint16_tpl<SIMDLevel::NONE> currentValues,
|
|
408
|
+
const simd16uint16_tpl<SIMDLevel::NONE> currentIndices,
|
|
409
|
+
simd16uint16_tpl<SIMDLevel::NONE>& minValues,
|
|
410
|
+
simd16uint16_tpl<SIMDLevel::NONE>& minIndices,
|
|
411
|
+
simd16uint16_tpl<SIMDLevel::NONE>& maxValues,
|
|
412
|
+
simd16uint16_tpl<SIMDLevel::NONE>& maxIndices) {
|
|
388
413
|
for (size_t i = 0; i < 16; i++) {
|
|
389
414
|
bool flag = (candidateValues.u16[i] < currentValues.u16[i]);
|
|
390
415
|
minValues.u16[i] = flag ? candidateValues.u16[i] : currentValues.u16[i];
|
|
@@ -398,14 +423,15 @@ inline void cmplt_min_max_fast(
|
|
|
398
423
|
}
|
|
399
424
|
|
|
400
425
|
// vector of 32 unsigned 8-bit integers
|
|
401
|
-
|
|
402
|
-
|
|
426
|
+
template <>
|
|
427
|
+
struct simd32uint8_tpl<SIMDLevel::NONE> : simd256bit_tpl<SIMDLevel::NONE> {
|
|
428
|
+
simd32uint8_tpl() {}
|
|
403
429
|
|
|
404
|
-
explicit
|
|
430
|
+
explicit simd32uint8_tpl(int x) {
|
|
405
431
|
set1(x);
|
|
406
432
|
}
|
|
407
433
|
|
|
408
|
-
explicit
|
|
434
|
+
explicit simd32uint8_tpl(uint8_t x) {
|
|
409
435
|
set1(x);
|
|
410
436
|
}
|
|
411
437
|
template <
|
|
@@ -441,8 +467,8 @@ struct simd32uint8 : simd256bit {
|
|
|
441
467
|
uint8_t _29,
|
|
442
468
|
uint8_t _30,
|
|
443
469
|
uint8_t _31>
|
|
444
|
-
static
|
|
445
|
-
|
|
470
|
+
static simd32uint8_tpl create() {
|
|
471
|
+
simd32uint8_tpl ret;
|
|
446
472
|
ret.u8[0] = _0;
|
|
447
473
|
ret.u8[1] = _1;
|
|
448
474
|
ret.u8[2] = _2;
|
|
@@ -478,15 +504,24 @@ struct simd32uint8 : simd256bit {
|
|
|
478
504
|
return ret;
|
|
479
505
|
}
|
|
480
506
|
|
|
481
|
-
explicit
|
|
507
|
+
explicit simd32uint8_tpl(const simd256bit_tpl<SIMDLevel::NONE>& x)
|
|
508
|
+
: simd256bit_tpl<SIMDLevel::NONE>(x) {}
|
|
482
509
|
|
|
483
|
-
explicit
|
|
510
|
+
explicit simd32uint8_tpl(const uint8_t* x)
|
|
511
|
+
: simd256bit_tpl<SIMDLevel::NONE>((const void*)x) {}
|
|
484
512
|
|
|
485
513
|
std::string elements_to_string(const char* fmt) const {
|
|
486
514
|
char res[1000], *ptr = res;
|
|
515
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
516
|
+
#pragma GCC diagnostic push
|
|
517
|
+
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
|
|
518
|
+
#endif
|
|
487
519
|
for (int i = 0; i < 32; i++) {
|
|
488
|
-
ptr +=
|
|
520
|
+
ptr += snprintf(ptr, (size_t)(res + sizeof(res) - ptr), fmt, u8[i]);
|
|
489
521
|
}
|
|
522
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
523
|
+
#pragma GCC diagnostic pop
|
|
524
|
+
#endif
|
|
490
525
|
// strip last ,
|
|
491
526
|
ptr[-1] = 0;
|
|
492
527
|
return std::string(res);
|
|
@@ -507,31 +542,33 @@ struct simd32uint8 : simd256bit {
|
|
|
507
542
|
}
|
|
508
543
|
|
|
509
544
|
template <typename F>
|
|
510
|
-
static
|
|
511
|
-
const
|
|
512
|
-
const
|
|
545
|
+
static simd32uint8_tpl binary_func(
|
|
546
|
+
const simd32uint8_tpl& a,
|
|
547
|
+
const simd32uint8_tpl& b,
|
|
513
548
|
F&& f) {
|
|
514
|
-
|
|
549
|
+
simd32uint8_tpl c;
|
|
515
550
|
for (int j = 0; j < 32; j++) {
|
|
516
551
|
c.u8[j] = f(a.u8[j], b.u8[j]);
|
|
517
552
|
}
|
|
518
553
|
return c;
|
|
519
554
|
}
|
|
520
555
|
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
556
|
+
simd32uint8_tpl operator&(
|
|
557
|
+
const simd256bit_tpl<SIMDLevel::NONE>& other) const {
|
|
558
|
+
return binary_func(
|
|
559
|
+
*this, simd32uint8_tpl(other), [](uint8_t a, uint8_t b) {
|
|
560
|
+
return a & b;
|
|
561
|
+
});
|
|
525
562
|
}
|
|
526
563
|
|
|
527
|
-
|
|
564
|
+
simd32uint8_tpl operator+(const simd32uint8_tpl& other) const {
|
|
528
565
|
return binary_func(
|
|
529
566
|
*this, other, [](uint8_t a, uint8_t b) { return a + b; });
|
|
530
567
|
}
|
|
531
568
|
|
|
532
569
|
// The very important operation that everything relies on
|
|
533
|
-
|
|
534
|
-
|
|
570
|
+
simd32uint8_tpl lookup_2_lanes(const simd32uint8_tpl& idx) const {
|
|
571
|
+
simd32uint8_tpl c;
|
|
535
572
|
for (int j = 0; j < 32; j++) {
|
|
536
573
|
if (idx.u8[j] & 0x80) {
|
|
537
574
|
c.u8[j] = 0;
|
|
@@ -550,7 +587,7 @@ struct simd32uint8 : simd256bit {
|
|
|
550
587
|
// extract + 0-extend lane
|
|
551
588
|
// this operation is slow (3 cycles)
|
|
552
589
|
|
|
553
|
-
|
|
590
|
+
simd32uint8_tpl& operator+=(const simd32uint8_tpl& other) {
|
|
554
591
|
*this = *this + other;
|
|
555
592
|
return *this;
|
|
556
593
|
}
|
|
@@ -563,10 +600,10 @@ struct simd32uint8 : simd256bit {
|
|
|
563
600
|
|
|
564
601
|
// convert with saturation
|
|
565
602
|
// careful: this does not cross lanes, so the order is weird
|
|
566
|
-
inline
|
|
567
|
-
const
|
|
568
|
-
const
|
|
569
|
-
|
|
603
|
+
inline simd32uint8_tpl<SIMDLevel::NONE> uint16_to_uint8_saturate(
|
|
604
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& a,
|
|
605
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& b) {
|
|
606
|
+
simd32uint8_tpl<SIMDLevel::NONE> c;
|
|
570
607
|
|
|
571
608
|
auto saturate_16_to_8 = [](uint16_t x) { return x >= 256 ? 0xff : x; };
|
|
572
609
|
|
|
@@ -580,7 +617,7 @@ inline simd32uint8 uint16_to_uint8_saturate(
|
|
|
580
617
|
}
|
|
581
618
|
|
|
582
619
|
/// get most significant bit of each byte
|
|
583
|
-
inline uint32_t get_MSBs(const
|
|
620
|
+
inline uint32_t get_MSBs(const simd32uint8_tpl<SIMDLevel::NONE>& a) {
|
|
584
621
|
uint32_t res = 0;
|
|
585
622
|
for (int i = 0; i < 32; i++) {
|
|
586
623
|
if (a.u8[i] & 0x80) {
|
|
@@ -591,11 +628,11 @@ inline uint32_t get_MSBs(const simd32uint8& a) {
|
|
|
591
628
|
}
|
|
592
629
|
|
|
593
630
|
/// use MSB of each byte of mask to select a byte between a and b
|
|
594
|
-
inline
|
|
595
|
-
const
|
|
596
|
-
const
|
|
597
|
-
const
|
|
598
|
-
|
|
631
|
+
inline simd32uint8_tpl<SIMDLevel::NONE> blendv(
|
|
632
|
+
const simd32uint8_tpl<SIMDLevel::NONE>& a,
|
|
633
|
+
const simd32uint8_tpl<SIMDLevel::NONE>& b,
|
|
634
|
+
const simd32uint8_tpl<SIMDLevel::NONE>& mask) {
|
|
635
|
+
simd32uint8_tpl<SIMDLevel::NONE> c;
|
|
599
636
|
for (int i = 0; i < 32; i++) {
|
|
600
637
|
if (mask.u8[i] & 0x80) {
|
|
601
638
|
c.u8[i] = b.u8[i];
|
|
@@ -607,18 +644,21 @@ inline simd32uint8 blendv(
|
|
|
607
644
|
}
|
|
608
645
|
|
|
609
646
|
/// vector of 8 unsigned 32-bit integers
|
|
610
|
-
|
|
611
|
-
|
|
647
|
+
template <>
|
|
648
|
+
struct simd8uint32_tpl<SIMDLevel::NONE> : simd256bit_tpl<SIMDLevel::NONE> {
|
|
649
|
+
simd8uint32_tpl() {}
|
|
612
650
|
|
|
613
|
-
explicit
|
|
651
|
+
explicit simd8uint32_tpl(uint32_t x) {
|
|
614
652
|
set1(x);
|
|
615
653
|
}
|
|
616
654
|
|
|
617
|
-
explicit
|
|
655
|
+
explicit simd8uint32_tpl(const simd256bit_tpl<SIMDLevel::NONE>& x)
|
|
656
|
+
: simd256bit_tpl<SIMDLevel::NONE>(x) {}
|
|
618
657
|
|
|
619
|
-
explicit
|
|
658
|
+
explicit simd8uint32_tpl(const uint32_t* x)
|
|
659
|
+
: simd256bit_tpl<SIMDLevel::NONE>((const void*)x) {}
|
|
620
660
|
|
|
621
|
-
explicit
|
|
661
|
+
explicit simd8uint32_tpl(
|
|
622
662
|
uint32_t u0,
|
|
623
663
|
uint32_t u1,
|
|
624
664
|
uint32_t u2,
|
|
@@ -637,30 +677,30 @@ struct simd8uint32 : simd256bit {
|
|
|
637
677
|
u32[7] = u7;
|
|
638
678
|
}
|
|
639
679
|
|
|
640
|
-
|
|
641
|
-
|
|
680
|
+
simd8uint32_tpl operator+(simd8uint32_tpl other) const {
|
|
681
|
+
simd8uint32_tpl result;
|
|
642
682
|
for (int i = 0; i < 8; i++) {
|
|
643
683
|
result.u32[i] = u32[i] + other.u32[i];
|
|
644
684
|
}
|
|
645
685
|
return result;
|
|
646
686
|
}
|
|
647
687
|
|
|
648
|
-
|
|
649
|
-
|
|
688
|
+
simd8uint32_tpl operator-(simd8uint32_tpl other) const {
|
|
689
|
+
simd8uint32_tpl result;
|
|
650
690
|
for (int i = 0; i < 8; i++) {
|
|
651
691
|
result.u32[i] = u32[i] - other.u32[i];
|
|
652
692
|
}
|
|
653
693
|
return result;
|
|
654
694
|
}
|
|
655
695
|
|
|
656
|
-
|
|
696
|
+
simd8uint32_tpl& operator+=(const simd8uint32_tpl& other) {
|
|
657
697
|
for (int i = 0; i < 8; i++) {
|
|
658
698
|
u32[i] += other.u32[i];
|
|
659
699
|
}
|
|
660
700
|
return *this;
|
|
661
701
|
}
|
|
662
702
|
|
|
663
|
-
bool operator==(
|
|
703
|
+
bool operator==(simd8uint32_tpl other) const {
|
|
664
704
|
for (size_t i = 0; i < 8; i++) {
|
|
665
705
|
if (u32[i] != other.u32[i]) {
|
|
666
706
|
return false;
|
|
@@ -670,15 +710,23 @@ struct simd8uint32 : simd256bit {
|
|
|
670
710
|
return true;
|
|
671
711
|
}
|
|
672
712
|
|
|
673
|
-
bool operator!=(
|
|
713
|
+
bool operator!=(simd8uint32_tpl other) const {
|
|
674
714
|
return !(*this == other);
|
|
675
715
|
}
|
|
676
716
|
|
|
677
717
|
std::string elements_to_string(const char* fmt) const {
|
|
678
718
|
char res[1000], *ptr = res;
|
|
719
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
720
|
+
#pragma GCC diagnostic push
|
|
721
|
+
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
|
|
722
|
+
#endif
|
|
679
723
|
for (int i = 0; i < 8; i++) {
|
|
680
|
-
ptr +=
|
|
724
|
+
ptr += snprintf(
|
|
725
|
+
ptr, (size_t)(res + sizeof(res) - ptr), fmt, u32[i]);
|
|
681
726
|
}
|
|
727
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
728
|
+
#pragma GCC diagnostic pop
|
|
729
|
+
#endif
|
|
682
730
|
// strip last ,
|
|
683
731
|
ptr[-1] = 0;
|
|
684
732
|
return std::string(res);
|
|
@@ -698,10 +746,10 @@ struct simd8uint32 : simd256bit {
|
|
|
698
746
|
}
|
|
699
747
|
}
|
|
700
748
|
|
|
701
|
-
|
|
749
|
+
simd8uint32_tpl unzip() const {
|
|
702
750
|
const uint32_t ret[] = {
|
|
703
751
|
u32[0], u32[2], u32[4], u32[6], u32[1], u32[3], u32[5], u32[7]};
|
|
704
|
-
return
|
|
752
|
+
return simd8uint32_tpl{ret};
|
|
705
753
|
}
|
|
706
754
|
};
|
|
707
755
|
|
|
@@ -717,14 +765,14 @@ struct simd8uint32 : simd256bit {
|
|
|
717
765
|
// the last equal value is saved instead of the first one), but this behavior
|
|
718
766
|
// saves instructions.
|
|
719
767
|
inline void cmplt_min_max_fast(
|
|
720
|
-
const
|
|
721
|
-
const
|
|
722
|
-
const
|
|
723
|
-
const
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
768
|
+
const simd8uint32_tpl<SIMDLevel::NONE> candidateValues,
|
|
769
|
+
const simd8uint32_tpl<SIMDLevel::NONE> candidateIndices,
|
|
770
|
+
const simd8uint32_tpl<SIMDLevel::NONE> currentValues,
|
|
771
|
+
const simd8uint32_tpl<SIMDLevel::NONE> currentIndices,
|
|
772
|
+
simd8uint32_tpl<SIMDLevel::NONE>& minValues,
|
|
773
|
+
simd8uint32_tpl<SIMDLevel::NONE>& minIndices,
|
|
774
|
+
simd8uint32_tpl<SIMDLevel::NONE>& maxValues,
|
|
775
|
+
simd8uint32_tpl<SIMDLevel::NONE>& maxIndices) {
|
|
728
776
|
for (size_t i = 0; i < 8; i++) {
|
|
729
777
|
bool flag = (candidateValues.u32[i] < currentValues.u32[i]);
|
|
730
778
|
minValues.u32[i] = flag ? candidateValues.u32[i] : currentValues.u32[i];
|
|
@@ -737,16 +785,18 @@ inline void cmplt_min_max_fast(
|
|
|
737
785
|
}
|
|
738
786
|
}
|
|
739
787
|
|
|
740
|
-
|
|
741
|
-
|
|
788
|
+
template <>
|
|
789
|
+
struct simd8float32_tpl<SIMDLevel::NONE> : simd256bit_tpl<SIMDLevel::NONE> {
|
|
790
|
+
simd8float32_tpl() {}
|
|
742
791
|
|
|
743
|
-
explicit
|
|
792
|
+
explicit simd8float32_tpl(const simd256bit_tpl<SIMDLevel::NONE>& x)
|
|
793
|
+
: simd256bit_tpl<SIMDLevel::NONE>(x) {}
|
|
744
794
|
|
|
745
|
-
explicit
|
|
795
|
+
explicit simd8float32_tpl(float x) {
|
|
746
796
|
set1(x);
|
|
747
797
|
}
|
|
748
798
|
|
|
749
|
-
explicit
|
|
799
|
+
explicit simd8float32_tpl(const float* x) {
|
|
750
800
|
loadu((void*)x);
|
|
751
801
|
}
|
|
752
802
|
|
|
@@ -756,7 +806,7 @@ struct simd8float32 : simd256bit {
|
|
|
756
806
|
}
|
|
757
807
|
}
|
|
758
808
|
|
|
759
|
-
explicit
|
|
809
|
+
explicit simd8float32_tpl(
|
|
760
810
|
float f0,
|
|
761
811
|
float f1,
|
|
762
812
|
float f2,
|
|
@@ -776,33 +826,33 @@ struct simd8float32 : simd256bit {
|
|
|
776
826
|
}
|
|
777
827
|
|
|
778
828
|
template <typename F>
|
|
779
|
-
static
|
|
780
|
-
const
|
|
781
|
-
const
|
|
829
|
+
static simd8float32_tpl binary_func(
|
|
830
|
+
const simd8float32_tpl& a,
|
|
831
|
+
const simd8float32_tpl& b,
|
|
782
832
|
F&& f) {
|
|
783
|
-
|
|
833
|
+
simd8float32_tpl c;
|
|
784
834
|
for (int j = 0; j < 8; j++) {
|
|
785
835
|
c.f32[j] = f(a.f32[j], b.f32[j]);
|
|
786
836
|
}
|
|
787
837
|
return c;
|
|
788
838
|
}
|
|
789
839
|
|
|
790
|
-
|
|
840
|
+
simd8float32_tpl operator*(const simd8float32_tpl& other) const {
|
|
791
841
|
return binary_func(
|
|
792
842
|
*this, other, [](float a, float b) { return a * b; });
|
|
793
843
|
}
|
|
794
844
|
|
|
795
|
-
|
|
845
|
+
simd8float32_tpl operator+(const simd8float32_tpl& other) const {
|
|
796
846
|
return binary_func(
|
|
797
847
|
*this, other, [](float a, float b) { return a + b; });
|
|
798
848
|
}
|
|
799
849
|
|
|
800
|
-
|
|
850
|
+
simd8float32_tpl operator-(const simd8float32_tpl& other) const {
|
|
801
851
|
return binary_func(
|
|
802
852
|
*this, other, [](float a, float b) { return a - b; });
|
|
803
853
|
}
|
|
804
854
|
|
|
805
|
-
|
|
855
|
+
simd8float32_tpl& operator+=(const simd8float32_tpl& other) {
|
|
806
856
|
for (size_t i = 0; i < 8; i++) {
|
|
807
857
|
f32[i] += other.f32[i];
|
|
808
858
|
}
|
|
@@ -810,7 +860,7 @@ struct simd8float32 : simd256bit {
|
|
|
810
860
|
return *this;
|
|
811
861
|
}
|
|
812
862
|
|
|
813
|
-
bool operator==(
|
|
863
|
+
bool operator==(simd8float32_tpl other) const {
|
|
814
864
|
for (size_t i = 0; i < 8; i++) {
|
|
815
865
|
if (f32[i] != other.f32[i]) {
|
|
816
866
|
return false;
|
|
@@ -820,14 +870,15 @@ struct simd8float32 : simd256bit {
|
|
|
820
870
|
return true;
|
|
821
871
|
}
|
|
822
872
|
|
|
823
|
-
bool operator!=(
|
|
873
|
+
bool operator!=(simd8float32_tpl other) const {
|
|
824
874
|
return !(*this == other);
|
|
825
875
|
}
|
|
826
876
|
|
|
827
877
|
std::string tostring() const {
|
|
828
878
|
char res[1000], *ptr = res;
|
|
829
879
|
for (int i = 0; i < 8; i++) {
|
|
830
|
-
ptr +=
|
|
880
|
+
ptr += snprintf(
|
|
881
|
+
ptr, (size_t)(res + sizeof(res) - ptr), "%g,", f32[i]);
|
|
831
882
|
}
|
|
832
883
|
// strip last ,
|
|
833
884
|
ptr[-1] = 0;
|
|
@@ -836,8 +887,10 @@ struct simd8float32 : simd256bit {
|
|
|
836
887
|
};
|
|
837
888
|
|
|
838
889
|
// hadd does not cross lanes
|
|
839
|
-
inline
|
|
840
|
-
|
|
890
|
+
inline simd8float32_tpl<SIMDLevel::NONE> hadd(
|
|
891
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
892
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b) {
|
|
893
|
+
simd8float32_tpl<SIMDLevel::NONE> c;
|
|
841
894
|
c.f32[0] = a.f32[0] + a.f32[1];
|
|
842
895
|
c.f32[1] = a.f32[2] + a.f32[3];
|
|
843
896
|
c.f32[2] = b.f32[0] + b.f32[1];
|
|
@@ -851,8 +904,10 @@ inline simd8float32 hadd(const simd8float32& a, const simd8float32& b) {
|
|
|
851
904
|
return c;
|
|
852
905
|
}
|
|
853
906
|
|
|
854
|
-
inline
|
|
855
|
-
|
|
907
|
+
inline simd8float32_tpl<SIMDLevel::NONE> unpacklo(
|
|
908
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
909
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b) {
|
|
910
|
+
simd8float32_tpl<SIMDLevel::NONE> c;
|
|
856
911
|
c.f32[0] = a.f32[0];
|
|
857
912
|
c.f32[1] = b.f32[0];
|
|
858
913
|
c.f32[2] = a.f32[1];
|
|
@@ -866,8 +921,10 @@ inline simd8float32 unpacklo(const simd8float32& a, const simd8float32& b) {
|
|
|
866
921
|
return c;
|
|
867
922
|
}
|
|
868
923
|
|
|
869
|
-
inline
|
|
870
|
-
|
|
924
|
+
inline simd8float32_tpl<SIMDLevel::NONE> unpackhi(
|
|
925
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
926
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b) {
|
|
927
|
+
simd8float32_tpl<SIMDLevel::NONE> c;
|
|
871
928
|
c.f32[0] = a.f32[2];
|
|
872
929
|
c.f32[1] = b.f32[2];
|
|
873
930
|
c.f32[2] = a.f32[3];
|
|
@@ -882,11 +939,11 @@ inline simd8float32 unpackhi(const simd8float32& a, const simd8float32& b) {
|
|
|
882
939
|
}
|
|
883
940
|
|
|
884
941
|
// compute a * b + c
|
|
885
|
-
inline
|
|
886
|
-
const
|
|
887
|
-
const
|
|
888
|
-
const
|
|
889
|
-
|
|
942
|
+
inline simd8float32_tpl<SIMDLevel::NONE> fmadd(
|
|
943
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
944
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b,
|
|
945
|
+
const simd8float32_tpl<SIMDLevel::NONE>& c) {
|
|
946
|
+
simd8float32_tpl<SIMDLevel::NONE> res;
|
|
890
947
|
for (int i = 0; i < 8; i++) {
|
|
891
948
|
res.f32[i] = a.f32[i] * b.f32[i] + c.f32[i];
|
|
892
949
|
}
|
|
@@ -896,8 +953,10 @@ inline simd8float32 fmadd(
|
|
|
896
953
|
namespace {
|
|
897
954
|
|
|
898
955
|
// get even float32's of a and b, interleaved
|
|
899
|
-
|
|
900
|
-
|
|
956
|
+
[[maybe_unused]] simd8float32_tpl<SIMDLevel::NONE> geteven(
|
|
957
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
958
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b) {
|
|
959
|
+
simd8float32_tpl<SIMDLevel::NONE> c;
|
|
901
960
|
|
|
902
961
|
c.f32[0] = a.f32[0];
|
|
903
962
|
c.f32[1] = a.f32[2];
|
|
@@ -913,8 +972,10 @@ simd8float32 geteven(const simd8float32& a, const simd8float32& b) {
|
|
|
913
972
|
}
|
|
914
973
|
|
|
915
974
|
// get odd float32's of a and b, interleaved
|
|
916
|
-
|
|
917
|
-
|
|
975
|
+
[[maybe_unused]] simd8float32_tpl<SIMDLevel::NONE> getodd(
|
|
976
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
977
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b) {
|
|
978
|
+
simd8float32_tpl<SIMDLevel::NONE> c;
|
|
918
979
|
|
|
919
980
|
c.f32[0] = a.f32[1];
|
|
920
981
|
c.f32[1] = a.f32[3];
|
|
@@ -931,8 +992,10 @@ simd8float32 getodd(const simd8float32& a, const simd8float32& b) {
|
|
|
931
992
|
|
|
932
993
|
// 3 cycles
|
|
933
994
|
// if the lanes are a = [a0 a1] and b = [b0 b1], return [a0 b0]
|
|
934
|
-
|
|
935
|
-
|
|
995
|
+
[[maybe_unused]] simd8float32_tpl<SIMDLevel::NONE> getlow128(
|
|
996
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
997
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b) {
|
|
998
|
+
simd8float32_tpl<SIMDLevel::NONE> c;
|
|
936
999
|
|
|
937
1000
|
c.f32[0] = a.f32[0];
|
|
938
1001
|
c.f32[1] = a.f32[1];
|
|
@@ -947,8 +1010,10 @@ simd8float32 getlow128(const simd8float32& a, const simd8float32& b) {
|
|
|
947
1010
|
return c;
|
|
948
1011
|
}
|
|
949
1012
|
|
|
950
|
-
|
|
951
|
-
|
|
1013
|
+
[[maybe_unused]] simd8float32_tpl<SIMDLevel::NONE> gethigh128(
|
|
1014
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
1015
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b) {
|
|
1016
|
+
simd8float32_tpl<SIMDLevel::NONE> c;
|
|
952
1017
|
|
|
953
1018
|
c.f32[0] = a.f32[4];
|
|
954
1019
|
c.f32[1] = a.f32[5];
|
|
@@ -995,10 +1060,10 @@ simd8float32 gethigh128(const simd8float32& a, const simd8float32& b) {
|
|
|
995
1060
|
// confusion for ppl who write in low-level SIMD instructions. Additionally,
|
|
996
1061
|
// these two ops (cmp and blend) are very often used together.
|
|
997
1062
|
inline void cmplt_and_blend_inplace(
|
|
998
|
-
const
|
|
999
|
-
const
|
|
1000
|
-
|
|
1001
|
-
|
|
1063
|
+
const simd8float32_tpl<SIMDLevel::NONE> candidateValues,
|
|
1064
|
+
const simd8uint32_tpl<SIMDLevel::NONE> candidateIndices,
|
|
1065
|
+
simd8float32_tpl<SIMDLevel::NONE>& lowestValues,
|
|
1066
|
+
simd8uint32_tpl<SIMDLevel::NONE>& lowestIndices) {
|
|
1002
1067
|
for (size_t j = 0; j < 8; j++) {
|
|
1003
1068
|
bool comparison = (candidateValues.f32[j] < lowestValues.f32[j]);
|
|
1004
1069
|
if (comparison) {
|
|
@@ -1020,14 +1085,14 @@ inline void cmplt_and_blend_inplace(
|
|
|
1020
1085
|
// the last equal value is saved instead of the first one), but this behavior
|
|
1021
1086
|
// saves instructions.
|
|
1022
1087
|
inline void cmplt_min_max_fast(
|
|
1023
|
-
const
|
|
1024
|
-
const
|
|
1025
|
-
const
|
|
1026
|
-
const
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1088
|
+
const simd8float32_tpl<SIMDLevel::NONE> candidateValues,
|
|
1089
|
+
const simd8uint32_tpl<SIMDLevel::NONE> candidateIndices,
|
|
1090
|
+
const simd8float32_tpl<SIMDLevel::NONE> currentValues,
|
|
1091
|
+
const simd8uint32_tpl<SIMDLevel::NONE> currentIndices,
|
|
1092
|
+
simd8float32_tpl<SIMDLevel::NONE>& minValues,
|
|
1093
|
+
simd8uint32_tpl<SIMDLevel::NONE>& minIndices,
|
|
1094
|
+
simd8float32_tpl<SIMDLevel::NONE>& maxValues,
|
|
1095
|
+
simd8uint32_tpl<SIMDLevel::NONE>& maxIndices) {
|
|
1031
1096
|
for (size_t i = 0; i < 8; i++) {
|
|
1032
1097
|
bool flag = (candidateValues.f32[i] < currentValues.f32[i]);
|
|
1033
1098
|
minValues.f32[i] = flag ? candidateValues.f32[i] : currentValues.f32[i];
|