faiss 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/faiss/extconf.rb +2 -1
- data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
- data/ext/faiss/index_binary.cpp +1 -1
- data/ext/faiss/kmeans.cpp +1 -1
- data/ext/faiss/pca_matrix.cpp +1 -1
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +93 -80
- data/vendor/faiss/faiss/Clustering.cpp +39 -240
- data/vendor/faiss/faiss/Clustering.h +6 -0
- data/vendor/faiss/faiss/IVFlib.cpp +41 -21
- data/vendor/faiss/faiss/Index.cpp +6 -5
- data/vendor/faiss/faiss/Index.h +5 -5
- data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
- data/vendor/faiss/faiss/IndexBinary.h +4 -4
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +84 -92
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
- data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +87 -415
- data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
- data/vendor/faiss/faiss/IndexFastScan.h +25 -23
- data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
- data/vendor/faiss/faiss/IndexFlat.h +21 -18
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
- data/vendor/faiss/faiss/IndexHNSW.cpp +283 -145
- data/vendor/faiss/faiss/IndexHNSW.h +16 -2
- data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
- data/vendor/faiss/faiss/IndexIDMap.h +9 -7
- data/vendor/faiss/faiss/IndexIVF.cpp +465 -362
- data/vendor/faiss/faiss/IndexIVF.h +33 -12
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
- data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +36 -68
- data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +53 -30
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +71 -843
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +21 -17
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +26 -39
- data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
- data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
- data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
- data/vendor/faiss/faiss/IndexNSG.cpp +39 -23
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
- data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
- data/vendor/faiss/faiss/IndexPQ.h +3 -2
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
- data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
- data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
- data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
- data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
- data/vendor/faiss/faiss/IndexRefine.h +4 -4
- data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
- data/vendor/faiss/faiss/IndexShards.cpp +10 -9
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
- data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
- data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
- data/vendor/faiss/faiss/MetaIndexes.h +1 -1
- data/vendor/faiss/faiss/MetricType.h +14 -7
- data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
- data/vendor/faiss/faiss/SuperKMeans.h +97 -0
- data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
- data/vendor/faiss/faiss/VectorTransform.h +16 -16
- data/vendor/faiss/faiss/build.cpp +23 -0
- data/vendor/faiss/faiss/build.h +15 -0
- data/vendor/faiss/faiss/clone_index.cpp +48 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
- data/vendor/faiss/faiss/factory_tools.cpp +5 -0
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
- data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
- data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
- data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
- data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
- data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
- data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
- data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
- data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
- data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
- data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
- data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
- data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
- data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
- data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
- data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
- data/vendor/faiss/faiss/impl/FaissException.h +50 -3
- data/vendor/faiss/faiss/impl/HNSW.cpp +92 -317
- data/vendor/faiss/faiss/impl/HNSW.h +13 -34
- data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
- data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
- data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
- data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
- data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
- data/vendor/faiss/faiss/impl/NSG.cpp +38 -21
- data/vendor/faiss/faiss/impl/NSG.h +4 -4
- data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
- data/vendor/faiss/faiss/impl/Panorama.h +258 -87
- data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
- data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +30 -23
- data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
- data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +296 -283
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +99 -75
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +52 -4
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -1
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
- data/vendor/faiss/faiss/impl/VisitedTable.h +7 -0
- data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
- data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
- data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
- data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
- data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
- data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
- data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
- data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
- data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
- data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
- data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
- data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
- data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
- data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +1132 -45
- data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
- data/vendor/faiss/faiss/impl/index_write.cpp +95 -13
- data/vendor/faiss/faiss/impl/io.cpp +6 -6
- data/vendor/faiss/faiss/impl/io_macros.h +33 -16
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
- data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
- data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
- data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +29 -111
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +238 -5
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -7
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +3 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +102 -11
- data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +148 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +167 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +59 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +163 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +100 -66
- data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
- data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
- data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
- data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
- data/vendor/faiss/faiss/impl/svs_io.h +8 -2
- data/vendor/faiss/faiss/index_factory.cpp +86 -18
- data/vendor/faiss/faiss/index_io.h +24 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
- data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
- data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +18 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +12 -3
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +7 -2
- data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
- data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
- data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
- data/vendor/faiss/faiss/utils/distances.cpp +390 -560
- data/vendor/faiss/faiss/utils/distances.h +20 -1
- data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -177
- data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
- data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
- data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
- data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
- data/vendor/faiss/faiss/utils/hamming.h +92 -2
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
- data/vendor/faiss/faiss/utils/partitioning.h +31 -0
- data/vendor/faiss/faiss/utils/popcount.h +29 -0
- data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
- data/vendor/faiss/faiss/utils/prefetch.h +2 -2
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
- data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
- data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
- data/vendor/faiss/faiss/utils/random.cpp +6 -6
- data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
- data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
- data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
- data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +17 -5
- data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
- data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
- data/vendor/faiss/faiss/utils/utils.cpp +5 -5
- data/vendor/faiss/faiss/utils/utils.h +3 -3
- metadata +119 -34
- data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
- data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
- data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
- data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
- data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
- data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
- data/vendor/faiss/faiss/utils/simdlib.h +0 -42
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
- /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
|
@@ -12,9 +12,12 @@
|
|
|
12
12
|
#include <cstring>
|
|
13
13
|
#include <string>
|
|
14
14
|
|
|
15
|
+
#include <faiss/impl/simdlib/simdlib.h>
|
|
16
|
+
|
|
15
17
|
namespace faiss {
|
|
16
18
|
|
|
17
|
-
|
|
19
|
+
template <>
|
|
20
|
+
struct simd256bit_tpl<SIMDLevel::NONE> {
|
|
18
21
|
union {
|
|
19
22
|
uint8_t u8[32];
|
|
20
23
|
uint16_t u16[16];
|
|
@@ -22,9 +25,9 @@ struct simd256bit {
|
|
|
22
25
|
float f32[8];
|
|
23
26
|
};
|
|
24
27
|
|
|
25
|
-
|
|
28
|
+
simd256bit_tpl() {}
|
|
26
29
|
|
|
27
|
-
explicit
|
|
30
|
+
explicit simd256bit_tpl(const void* x) {
|
|
28
31
|
memcpy(u8, x, 32);
|
|
29
32
|
}
|
|
30
33
|
|
|
@@ -59,7 +62,7 @@ struct simd256bit {
|
|
|
59
62
|
}
|
|
60
63
|
|
|
61
64
|
// Checks whether the other holds exactly the same bytes.
|
|
62
|
-
bool is_same_as(
|
|
65
|
+
bool is_same_as(simd256bit_tpl other) const {
|
|
63
66
|
for (size_t i = 0; i < 8; i++) {
|
|
64
67
|
if (u32[i] != other.u32[i]) {
|
|
65
68
|
return false;
|
|
@@ -71,22 +74,25 @@ struct simd256bit {
|
|
|
71
74
|
};
|
|
72
75
|
|
|
73
76
|
/// vector of 16 elements in uint16
|
|
74
|
-
|
|
75
|
-
|
|
77
|
+
template <>
|
|
78
|
+
struct simd16uint16_tpl<SIMDLevel::NONE> : simd256bit_tpl<SIMDLevel::NONE> {
|
|
79
|
+
simd16uint16_tpl() {}
|
|
76
80
|
|
|
77
|
-
explicit
|
|
81
|
+
explicit simd16uint16_tpl(int x) {
|
|
78
82
|
set1(x);
|
|
79
83
|
}
|
|
80
84
|
|
|
81
|
-
explicit
|
|
85
|
+
explicit simd16uint16_tpl(uint16_t x) {
|
|
82
86
|
set1(x);
|
|
83
87
|
}
|
|
84
88
|
|
|
85
|
-
explicit
|
|
89
|
+
explicit simd16uint16_tpl(const simd256bit_tpl<SIMDLevel::NONE>& x)
|
|
90
|
+
: simd256bit_tpl<SIMDLevel::NONE>(x) {}
|
|
86
91
|
|
|
87
|
-
explicit
|
|
92
|
+
explicit simd16uint16_tpl(const uint16_t* x)
|
|
93
|
+
: simd256bit_tpl<SIMDLevel::NONE>((const void*)x) {}
|
|
88
94
|
|
|
89
|
-
explicit
|
|
95
|
+
explicit simd16uint16_tpl(
|
|
90
96
|
uint16_t u0,
|
|
91
97
|
uint16_t u1,
|
|
92
98
|
uint16_t u2,
|
|
@@ -140,8 +146,8 @@ struct simd16uint16 : simd256bit {
|
|
|
140
146
|
}
|
|
141
147
|
|
|
142
148
|
template <typename F>
|
|
143
|
-
static
|
|
144
|
-
|
|
149
|
+
static simd16uint16_tpl unary_func(const simd16uint16_tpl& a, F&& f) {
|
|
150
|
+
simd16uint16_tpl c;
|
|
145
151
|
for (int j = 0; j < 16; j++) {
|
|
146
152
|
c.u16[j] = f(a.u16[j]);
|
|
147
153
|
}
|
|
@@ -149,11 +155,11 @@ struct simd16uint16 : simd256bit {
|
|
|
149
155
|
}
|
|
150
156
|
|
|
151
157
|
template <typename F>
|
|
152
|
-
static
|
|
153
|
-
const
|
|
154
|
-
const
|
|
158
|
+
static simd16uint16_tpl binary_func(
|
|
159
|
+
const simd16uint16_tpl& a,
|
|
160
|
+
const simd16uint16_tpl& b,
|
|
155
161
|
F&& f) {
|
|
156
|
-
|
|
162
|
+
simd16uint16_tpl c;
|
|
157
163
|
for (int j = 0; j < 16; j++) {
|
|
158
164
|
c.u16[j] = f(a.u16[j], b.u16[j]);
|
|
159
165
|
}
|
|
@@ -166,70 +172,73 @@ struct simd16uint16 : simd256bit {
|
|
|
166
172
|
}
|
|
167
173
|
}
|
|
168
174
|
|
|
169
|
-
|
|
175
|
+
simd16uint16_tpl operator*(const simd16uint16_tpl& other) const {
|
|
170
176
|
return binary_func(
|
|
171
177
|
*this, other, [](uint16_t a, uint16_t b) { return a * b; });
|
|
172
178
|
}
|
|
173
179
|
|
|
174
180
|
// shift must be known at compile time
|
|
175
|
-
|
|
181
|
+
simd16uint16_tpl operator>>(const int shift) const {
|
|
176
182
|
return unary_func(*this, [shift](uint16_t a) { return a >> shift; });
|
|
177
183
|
}
|
|
178
184
|
|
|
179
185
|
// shift must be known at compile time
|
|
180
|
-
|
|
186
|
+
simd16uint16_tpl operator<<(const int shift) const {
|
|
181
187
|
return unary_func(*this, [shift](uint16_t a) { return a << shift; });
|
|
182
188
|
}
|
|
183
189
|
|
|
184
|
-
|
|
190
|
+
simd16uint16_tpl& operator+=(const simd16uint16_tpl& other) {
|
|
185
191
|
*this = *this + other;
|
|
186
192
|
return *this;
|
|
187
193
|
}
|
|
188
194
|
|
|
189
|
-
|
|
195
|
+
simd16uint16_tpl& operator-=(const simd16uint16_tpl& other) {
|
|
190
196
|
*this = *this - other;
|
|
191
197
|
return *this;
|
|
192
198
|
}
|
|
193
199
|
|
|
194
|
-
|
|
200
|
+
simd16uint16_tpl operator+(const simd16uint16_tpl& other) const {
|
|
195
201
|
return binary_func(
|
|
196
202
|
*this, other, [](uint16_t a, uint16_t b) { return a + b; });
|
|
197
203
|
}
|
|
198
204
|
|
|
199
|
-
|
|
205
|
+
simd16uint16_tpl operator-(const simd16uint16_tpl& other) const {
|
|
200
206
|
return binary_func(
|
|
201
207
|
*this, other, [](uint16_t a, uint16_t b) { return a - b; });
|
|
202
208
|
}
|
|
203
209
|
|
|
204
|
-
|
|
210
|
+
simd16uint16_tpl operator&(
|
|
211
|
+
const simd256bit_tpl<SIMDLevel::NONE>& other) const {
|
|
205
212
|
return binary_func(
|
|
206
|
-
*this,
|
|
213
|
+
*this, simd16uint16_tpl(other), [](uint16_t a, uint16_t b) {
|
|
207
214
|
return a & b;
|
|
208
215
|
});
|
|
209
216
|
}
|
|
210
217
|
|
|
211
|
-
|
|
218
|
+
simd16uint16_tpl operator|(
|
|
219
|
+
const simd256bit_tpl<SIMDLevel::NONE>& other) const {
|
|
212
220
|
return binary_func(
|
|
213
|
-
*this,
|
|
221
|
+
*this, simd16uint16_tpl(other), [](uint16_t a, uint16_t b) {
|
|
214
222
|
return a | b;
|
|
215
223
|
});
|
|
216
224
|
}
|
|
217
225
|
|
|
218
|
-
|
|
226
|
+
simd16uint16_tpl operator^(
|
|
227
|
+
const simd256bit_tpl<SIMDLevel::NONE>& other) const {
|
|
219
228
|
return binary_func(
|
|
220
|
-
*this,
|
|
229
|
+
*this, simd16uint16_tpl(other), [](uint16_t a, uint16_t b) {
|
|
221
230
|
return a ^ b;
|
|
222
231
|
});
|
|
223
232
|
}
|
|
224
233
|
|
|
225
234
|
// returns binary masks
|
|
226
|
-
|
|
235
|
+
simd16uint16_tpl operator==(const simd16uint16_tpl& other) const {
|
|
227
236
|
return binary_func(*this, other, [](uint16_t a, uint16_t b) {
|
|
228
237
|
return a == b ? 0xffff : 0;
|
|
229
238
|
});
|
|
230
239
|
}
|
|
231
240
|
|
|
232
|
-
|
|
241
|
+
simd16uint16_tpl operator~() const {
|
|
233
242
|
return unary_func(*this, [](uint16_t a) { return ~a; });
|
|
234
243
|
}
|
|
235
244
|
|
|
@@ -240,7 +249,7 @@ struct simd16uint16 : simd256bit {
|
|
|
240
249
|
|
|
241
250
|
// mask of elements where this >= thresh
|
|
242
251
|
// 2 bit per component: 16 * 2 = 32 bit
|
|
243
|
-
uint32_t ge_mask(const
|
|
252
|
+
uint32_t ge_mask(const simd16uint16_tpl& thresh) const {
|
|
244
253
|
uint32_t gem = 0;
|
|
245
254
|
for (int j = 0; j < 16; j++) {
|
|
246
255
|
if (u16[j] >= thresh.u16[j]) {
|
|
@@ -250,15 +259,15 @@ struct simd16uint16 : simd256bit {
|
|
|
250
259
|
return gem;
|
|
251
260
|
}
|
|
252
261
|
|
|
253
|
-
uint32_t le_mask(const
|
|
262
|
+
uint32_t le_mask(const simd16uint16_tpl& thresh) const {
|
|
254
263
|
return thresh.ge_mask(*this);
|
|
255
264
|
}
|
|
256
265
|
|
|
257
|
-
uint32_t gt_mask(const
|
|
266
|
+
uint32_t gt_mask(const simd16uint16_tpl& thresh) const {
|
|
258
267
|
return ~le_mask(thresh);
|
|
259
268
|
}
|
|
260
269
|
|
|
261
|
-
bool all_gt(const
|
|
270
|
+
bool all_gt(const simd16uint16_tpl& thresh) const {
|
|
262
271
|
return le_mask(thresh) == 0;
|
|
263
272
|
}
|
|
264
273
|
|
|
@@ -267,7 +276,7 @@ struct simd16uint16 : simd256bit {
|
|
|
267
276
|
return u16[i];
|
|
268
277
|
}
|
|
269
278
|
|
|
270
|
-
void accu_min(const
|
|
279
|
+
void accu_min(const simd16uint16_tpl& incoming) {
|
|
271
280
|
for (int j = 0; j < 16; j++) {
|
|
272
281
|
if (incoming.u16[j] < u16[j]) {
|
|
273
282
|
u16[j] = incoming.u16[j];
|
|
@@ -275,7 +284,7 @@ struct simd16uint16 : simd256bit {
|
|
|
275
284
|
}
|
|
276
285
|
}
|
|
277
286
|
|
|
278
|
-
void accu_max(const
|
|
287
|
+
void accu_max(const simd16uint16_tpl& incoming) {
|
|
279
288
|
for (int j = 0; j < 16; j++) {
|
|
280
289
|
if (incoming.u16[j] > u16[j]) {
|
|
281
290
|
u16[j] = incoming.u16[j];
|
|
@@ -285,21 +294,27 @@ struct simd16uint16 : simd256bit {
|
|
|
285
294
|
};
|
|
286
295
|
|
|
287
296
|
// not really a std::min because it returns an elementwise min
|
|
288
|
-
inline
|
|
289
|
-
|
|
297
|
+
inline simd16uint16_tpl<SIMDLevel::NONE> min(
|
|
298
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& av,
|
|
299
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& bv) {
|
|
300
|
+
return simd16uint16_tpl<SIMDLevel::NONE>::binary_func(
|
|
290
301
|
av, bv, [](uint16_t a, uint16_t b) { return std::min(a, b); });
|
|
291
302
|
}
|
|
292
303
|
|
|
293
|
-
inline
|
|
294
|
-
|
|
304
|
+
inline simd16uint16_tpl<SIMDLevel::NONE> max(
|
|
305
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& av,
|
|
306
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& bv) {
|
|
307
|
+
return simd16uint16_tpl<SIMDLevel::NONE>::binary_func(
|
|
295
308
|
av, bv, [](uint16_t a, uint16_t b) { return std::max(a, b); });
|
|
296
309
|
}
|
|
297
310
|
|
|
298
311
|
// decompose in 128-lanes: a = (a0, a1), b = (b0, b1)
|
|
299
312
|
// return (a0 + a1, b0 + b1)
|
|
300
313
|
// TODO find a better name
|
|
301
|
-
inline
|
|
302
|
-
|
|
314
|
+
inline simd16uint16_tpl<SIMDLevel::NONE> combine2x2(
|
|
315
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& a,
|
|
316
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& b) {
|
|
317
|
+
simd16uint16_tpl<SIMDLevel::NONE> c;
|
|
303
318
|
for (int j = 0; j < 8; j++) {
|
|
304
319
|
c.u16[j] = a.u16[j] + a.u16[j + 8];
|
|
305
320
|
c.u16[j + 8] = b.u16[j] + b.u16[j + 8];
|
|
@@ -310,9 +325,9 @@ inline simd16uint16 combine2x2(const simd16uint16& a, const simd16uint16& b) {
|
|
|
310
325
|
// compare d0 and d1 to thr, return 32 bits corresponding to the concatenation
|
|
311
326
|
// of d0 and d1 with thr
|
|
312
327
|
inline uint32_t cmp_ge32(
|
|
313
|
-
const
|
|
314
|
-
const
|
|
315
|
-
const
|
|
328
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& d0,
|
|
329
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& d1,
|
|
330
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& thr) {
|
|
316
331
|
uint32_t gem = 0;
|
|
317
332
|
for (int j = 0; j < 16; j++) {
|
|
318
333
|
if (d0.u16[j] >= thr.u16[j]) {
|
|
@@ -326,9 +341,9 @@ inline uint32_t cmp_ge32(
|
|
|
326
341
|
}
|
|
327
342
|
|
|
328
343
|
inline uint32_t cmp_le32(
|
|
329
|
-
const
|
|
330
|
-
const
|
|
331
|
-
const
|
|
344
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& d0,
|
|
345
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& d1,
|
|
346
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& thr) {
|
|
332
347
|
uint32_t gem = 0;
|
|
333
348
|
for (int j = 0; j < 16; j++) {
|
|
334
349
|
if (d0.u16[j] <= thr.u16[j]) {
|
|
@@ -342,8 +357,10 @@ inline uint32_t cmp_le32(
|
|
|
342
357
|
}
|
|
343
358
|
|
|
344
359
|
// hadd does not cross lanes
|
|
345
|
-
inline
|
|
346
|
-
|
|
360
|
+
inline simd16uint16_tpl<SIMDLevel::NONE> hadd(
|
|
361
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& a,
|
|
362
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& b) {
|
|
363
|
+
simd16uint16_tpl<SIMDLevel::NONE> c;
|
|
347
364
|
c.u16[0] = a.u16[0] + a.u16[1];
|
|
348
365
|
c.u16[1] = a.u16[2] + a.u16[3];
|
|
349
366
|
c.u16[2] = a.u16[4] + a.u16[5];
|
|
@@ -377,14 +394,14 @@ inline simd16uint16 hadd(const simd16uint16& a, const simd16uint16& b) {
|
|
|
377
394
|
// the last equal value is saved instead of the first one), but this behavior
|
|
378
395
|
// saves instructions.
|
|
379
396
|
inline void cmplt_min_max_fast(
|
|
380
|
-
const
|
|
381
|
-
const
|
|
382
|
-
const
|
|
383
|
-
const
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
397
|
+
const simd16uint16_tpl<SIMDLevel::NONE> candidateValues,
|
|
398
|
+
const simd16uint16_tpl<SIMDLevel::NONE> candidateIndices,
|
|
399
|
+
const simd16uint16_tpl<SIMDLevel::NONE> currentValues,
|
|
400
|
+
const simd16uint16_tpl<SIMDLevel::NONE> currentIndices,
|
|
401
|
+
simd16uint16_tpl<SIMDLevel::NONE>& minValues,
|
|
402
|
+
simd16uint16_tpl<SIMDLevel::NONE>& minIndices,
|
|
403
|
+
simd16uint16_tpl<SIMDLevel::NONE>& maxValues,
|
|
404
|
+
simd16uint16_tpl<SIMDLevel::NONE>& maxIndices) {
|
|
388
405
|
for (size_t i = 0; i < 16; i++) {
|
|
389
406
|
bool flag = (candidateValues.u16[i] < currentValues.u16[i]);
|
|
390
407
|
minValues.u16[i] = flag ? candidateValues.u16[i] : currentValues.u16[i];
|
|
@@ -398,14 +415,15 @@ inline void cmplt_min_max_fast(
|
|
|
398
415
|
}
|
|
399
416
|
|
|
400
417
|
// vector of 32 unsigned 8-bit integers
|
|
401
|
-
|
|
402
|
-
|
|
418
|
+
template <>
|
|
419
|
+
struct simd32uint8_tpl<SIMDLevel::NONE> : simd256bit_tpl<SIMDLevel::NONE> {
|
|
420
|
+
simd32uint8_tpl() {}
|
|
403
421
|
|
|
404
|
-
explicit
|
|
422
|
+
explicit simd32uint8_tpl(int x) {
|
|
405
423
|
set1(x);
|
|
406
424
|
}
|
|
407
425
|
|
|
408
|
-
explicit
|
|
426
|
+
explicit simd32uint8_tpl(uint8_t x) {
|
|
409
427
|
set1(x);
|
|
410
428
|
}
|
|
411
429
|
template <
|
|
@@ -441,8 +459,8 @@ struct simd32uint8 : simd256bit {
|
|
|
441
459
|
uint8_t _29,
|
|
442
460
|
uint8_t _30,
|
|
443
461
|
uint8_t _31>
|
|
444
|
-
static
|
|
445
|
-
|
|
462
|
+
static simd32uint8_tpl create() {
|
|
463
|
+
simd32uint8_tpl ret;
|
|
446
464
|
ret.u8[0] = _0;
|
|
447
465
|
ret.u8[1] = _1;
|
|
448
466
|
ret.u8[2] = _2;
|
|
@@ -478,9 +496,11 @@ struct simd32uint8 : simd256bit {
|
|
|
478
496
|
return ret;
|
|
479
497
|
}
|
|
480
498
|
|
|
481
|
-
explicit
|
|
499
|
+
explicit simd32uint8_tpl(const simd256bit_tpl<SIMDLevel::NONE>& x)
|
|
500
|
+
: simd256bit_tpl<SIMDLevel::NONE>(x) {}
|
|
482
501
|
|
|
483
|
-
explicit
|
|
502
|
+
explicit simd32uint8_tpl(const uint8_t* x)
|
|
503
|
+
: simd256bit_tpl<SIMDLevel::NONE>((const void*)x) {}
|
|
484
504
|
|
|
485
505
|
std::string elements_to_string(const char* fmt) const {
|
|
486
506
|
char res[1000], *ptr = res;
|
|
@@ -507,31 +527,33 @@ struct simd32uint8 : simd256bit {
|
|
|
507
527
|
}
|
|
508
528
|
|
|
509
529
|
template <typename F>
|
|
510
|
-
static
|
|
511
|
-
const
|
|
512
|
-
const
|
|
530
|
+
static simd32uint8_tpl binary_func(
|
|
531
|
+
const simd32uint8_tpl& a,
|
|
532
|
+
const simd32uint8_tpl& b,
|
|
513
533
|
F&& f) {
|
|
514
|
-
|
|
534
|
+
simd32uint8_tpl c;
|
|
515
535
|
for (int j = 0; j < 32; j++) {
|
|
516
536
|
c.u8[j] = f(a.u8[j], b.u8[j]);
|
|
517
537
|
}
|
|
518
538
|
return c;
|
|
519
539
|
}
|
|
520
540
|
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
541
|
+
simd32uint8_tpl operator&(
|
|
542
|
+
const simd256bit_tpl<SIMDLevel::NONE>& other) const {
|
|
543
|
+
return binary_func(
|
|
544
|
+
*this, simd32uint8_tpl(other), [](uint8_t a, uint8_t b) {
|
|
545
|
+
return a & b;
|
|
546
|
+
});
|
|
525
547
|
}
|
|
526
548
|
|
|
527
|
-
|
|
549
|
+
simd32uint8_tpl operator+(const simd32uint8_tpl& other) const {
|
|
528
550
|
return binary_func(
|
|
529
551
|
*this, other, [](uint8_t a, uint8_t b) { return a + b; });
|
|
530
552
|
}
|
|
531
553
|
|
|
532
554
|
// The very important operation that everything relies on
|
|
533
|
-
|
|
534
|
-
|
|
555
|
+
simd32uint8_tpl lookup_2_lanes(const simd32uint8_tpl& idx) const {
|
|
556
|
+
simd32uint8_tpl c;
|
|
535
557
|
// The original for loop:
|
|
536
558
|
// for (int j = 0; j < 32; j++) {
|
|
537
559
|
// if (idx.u8[j] & 0x80) {
|
|
@@ -589,7 +611,7 @@ struct simd32uint8 : simd256bit {
|
|
|
589
611
|
// extract + 0-extend lane
|
|
590
612
|
// this operation is slow (3 cycles)
|
|
591
613
|
|
|
592
|
-
|
|
614
|
+
simd32uint8_tpl& operator+=(const simd32uint8_tpl& other) {
|
|
593
615
|
*this = *this + other;
|
|
594
616
|
return *this;
|
|
595
617
|
}
|
|
@@ -602,10 +624,10 @@ struct simd32uint8 : simd256bit {
|
|
|
602
624
|
|
|
603
625
|
// convert with saturation
|
|
604
626
|
// careful: this does not cross lanes, so the order is weird
|
|
605
|
-
inline
|
|
606
|
-
const
|
|
607
|
-
const
|
|
608
|
-
|
|
627
|
+
inline simd32uint8_tpl<SIMDLevel::NONE> uint16_to_uint8_saturate(
|
|
628
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& a,
|
|
629
|
+
const simd16uint16_tpl<SIMDLevel::NONE>& b) {
|
|
630
|
+
simd32uint8_tpl<SIMDLevel::NONE> c;
|
|
609
631
|
|
|
610
632
|
auto saturate_16_to_8 = [](uint16_t x) { return x >= 256 ? 0xff : x; };
|
|
611
633
|
|
|
@@ -619,7 +641,7 @@ inline simd32uint8 uint16_to_uint8_saturate(
|
|
|
619
641
|
}
|
|
620
642
|
|
|
621
643
|
/// get most significant bit of each byte
|
|
622
|
-
inline uint32_t get_MSBs(const
|
|
644
|
+
inline uint32_t get_MSBs(const simd32uint8_tpl<SIMDLevel::NONE>& a) {
|
|
623
645
|
uint32_t res = 0;
|
|
624
646
|
for (int i = 0; i < 32; i++) {
|
|
625
647
|
if (a.u8[i] & 0x80) {
|
|
@@ -630,11 +652,11 @@ inline uint32_t get_MSBs(const simd32uint8& a) {
|
|
|
630
652
|
}
|
|
631
653
|
|
|
632
654
|
/// use MSB of each byte of mask to select a byte between a and b
|
|
633
|
-
inline
|
|
634
|
-
const
|
|
635
|
-
const
|
|
636
|
-
const
|
|
637
|
-
|
|
655
|
+
inline simd32uint8_tpl<SIMDLevel::NONE> blendv(
|
|
656
|
+
const simd32uint8_tpl<SIMDLevel::NONE>& a,
|
|
657
|
+
const simd32uint8_tpl<SIMDLevel::NONE>& b,
|
|
658
|
+
const simd32uint8_tpl<SIMDLevel::NONE>& mask) {
|
|
659
|
+
simd32uint8_tpl<SIMDLevel::NONE> c;
|
|
638
660
|
for (int i = 0; i < 32; i++) {
|
|
639
661
|
if (mask.u8[i] & 0x80) {
|
|
640
662
|
c.u8[i] = b.u8[i];
|
|
@@ -646,18 +668,21 @@ inline simd32uint8 blendv(
|
|
|
646
668
|
}
|
|
647
669
|
|
|
648
670
|
/// vector of 8 unsigned 32-bit integers
|
|
649
|
-
|
|
650
|
-
|
|
671
|
+
template <>
|
|
672
|
+
struct simd8uint32_tpl<SIMDLevel::NONE> : simd256bit_tpl<SIMDLevel::NONE> {
|
|
673
|
+
simd8uint32_tpl() {}
|
|
651
674
|
|
|
652
|
-
explicit
|
|
675
|
+
explicit simd8uint32_tpl(uint32_t x) {
|
|
653
676
|
set1(x);
|
|
654
677
|
}
|
|
655
678
|
|
|
656
|
-
explicit
|
|
679
|
+
explicit simd8uint32_tpl(const simd256bit_tpl<SIMDLevel::NONE>& x)
|
|
680
|
+
: simd256bit_tpl<SIMDLevel::NONE>(x) {}
|
|
657
681
|
|
|
658
|
-
explicit
|
|
682
|
+
explicit simd8uint32_tpl(const uint32_t* x)
|
|
683
|
+
: simd256bit_tpl<SIMDLevel::NONE>((const void*)x) {}
|
|
659
684
|
|
|
660
|
-
explicit
|
|
685
|
+
explicit simd8uint32_tpl(
|
|
661
686
|
uint32_t u0,
|
|
662
687
|
uint32_t u1,
|
|
663
688
|
uint32_t u2,
|
|
@@ -676,30 +701,30 @@ struct simd8uint32 : simd256bit {
|
|
|
676
701
|
u32[7] = u7;
|
|
677
702
|
}
|
|
678
703
|
|
|
679
|
-
|
|
680
|
-
|
|
704
|
+
simd8uint32_tpl operator+(simd8uint32_tpl other) const {
|
|
705
|
+
simd8uint32_tpl result;
|
|
681
706
|
for (int i = 0; i < 8; i++) {
|
|
682
707
|
result.u32[i] = u32[i] + other.u32[i];
|
|
683
708
|
}
|
|
684
709
|
return result;
|
|
685
710
|
}
|
|
686
711
|
|
|
687
|
-
|
|
688
|
-
|
|
712
|
+
simd8uint32_tpl operator-(simd8uint32_tpl other) const {
|
|
713
|
+
simd8uint32_tpl result;
|
|
689
714
|
for (int i = 0; i < 8; i++) {
|
|
690
715
|
result.u32[i] = u32[i] - other.u32[i];
|
|
691
716
|
}
|
|
692
717
|
return result;
|
|
693
718
|
}
|
|
694
719
|
|
|
695
|
-
|
|
720
|
+
simd8uint32_tpl& operator+=(const simd8uint32_tpl& other) {
|
|
696
721
|
for (int i = 0; i < 8; i++) {
|
|
697
722
|
u32[i] += other.u32[i];
|
|
698
723
|
}
|
|
699
724
|
return *this;
|
|
700
725
|
}
|
|
701
726
|
|
|
702
|
-
bool operator==(
|
|
727
|
+
bool operator==(simd8uint32_tpl other) const {
|
|
703
728
|
for (size_t i = 0; i < 8; i++) {
|
|
704
729
|
if (u32[i] != other.u32[i]) {
|
|
705
730
|
return false;
|
|
@@ -709,7 +734,7 @@ struct simd8uint32 : simd256bit {
|
|
|
709
734
|
return true;
|
|
710
735
|
}
|
|
711
736
|
|
|
712
|
-
bool operator!=(
|
|
737
|
+
bool operator!=(simd8uint32_tpl other) const {
|
|
713
738
|
return !(*this == other);
|
|
714
739
|
}
|
|
715
740
|
|
|
@@ -737,10 +762,10 @@ struct simd8uint32 : simd256bit {
|
|
|
737
762
|
}
|
|
738
763
|
}
|
|
739
764
|
|
|
740
|
-
|
|
765
|
+
simd8uint32_tpl unzip() const {
|
|
741
766
|
const uint32_t ret[] = {
|
|
742
767
|
u32[0], u32[2], u32[4], u32[6], u32[1], u32[3], u32[5], u32[7]};
|
|
743
|
-
return
|
|
768
|
+
return simd8uint32_tpl{ret};
|
|
744
769
|
}
|
|
745
770
|
};
|
|
746
771
|
|
|
@@ -756,14 +781,14 @@ struct simd8uint32 : simd256bit {
|
|
|
756
781
|
// the last equal value is saved instead of the first one), but this behavior
|
|
757
782
|
// saves instructions.
|
|
758
783
|
inline void cmplt_min_max_fast(
|
|
759
|
-
const
|
|
760
|
-
const
|
|
761
|
-
const
|
|
762
|
-
const
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
784
|
+
const simd8uint32_tpl<SIMDLevel::NONE> candidateValues,
|
|
785
|
+
const simd8uint32_tpl<SIMDLevel::NONE> candidateIndices,
|
|
786
|
+
const simd8uint32_tpl<SIMDLevel::NONE> currentValues,
|
|
787
|
+
const simd8uint32_tpl<SIMDLevel::NONE> currentIndices,
|
|
788
|
+
simd8uint32_tpl<SIMDLevel::NONE>& minValues,
|
|
789
|
+
simd8uint32_tpl<SIMDLevel::NONE>& minIndices,
|
|
790
|
+
simd8uint32_tpl<SIMDLevel::NONE>& maxValues,
|
|
791
|
+
simd8uint32_tpl<SIMDLevel::NONE>& maxIndices) {
|
|
767
792
|
for (size_t i = 0; i < 8; i++) {
|
|
768
793
|
bool flag = (candidateValues.u32[i] < currentValues.u32[i]);
|
|
769
794
|
minValues.u32[i] = flag ? candidateValues.u32[i] : currentValues.u32[i];
|
|
@@ -776,16 +801,18 @@ inline void cmplt_min_max_fast(
|
|
|
776
801
|
}
|
|
777
802
|
}
|
|
778
803
|
|
|
779
|
-
|
|
780
|
-
|
|
804
|
+
template <>
|
|
805
|
+
struct simd8float32_tpl<SIMDLevel::NONE> : simd256bit_tpl<SIMDLevel::NONE> {
|
|
806
|
+
simd8float32_tpl() {}
|
|
781
807
|
|
|
782
|
-
explicit
|
|
808
|
+
explicit simd8float32_tpl(const simd256bit_tpl<SIMDLevel::NONE>& x)
|
|
809
|
+
: simd256bit_tpl<SIMDLevel::NONE>(x) {}
|
|
783
810
|
|
|
784
|
-
explicit
|
|
811
|
+
explicit simd8float32_tpl(float x) {
|
|
785
812
|
set1(x);
|
|
786
813
|
}
|
|
787
814
|
|
|
788
|
-
explicit
|
|
815
|
+
explicit simd8float32_tpl(const float* x) {
|
|
789
816
|
loadu((void*)x);
|
|
790
817
|
}
|
|
791
818
|
|
|
@@ -795,7 +822,7 @@ struct simd8float32 : simd256bit {
|
|
|
795
822
|
}
|
|
796
823
|
}
|
|
797
824
|
|
|
798
|
-
explicit
|
|
825
|
+
explicit simd8float32_tpl(
|
|
799
826
|
float f0,
|
|
800
827
|
float f1,
|
|
801
828
|
float f2,
|
|
@@ -815,33 +842,33 @@ struct simd8float32 : simd256bit {
|
|
|
815
842
|
}
|
|
816
843
|
|
|
817
844
|
template <typename F>
|
|
818
|
-
static
|
|
819
|
-
const
|
|
820
|
-
const
|
|
845
|
+
static simd8float32_tpl binary_func(
|
|
846
|
+
const simd8float32_tpl& a,
|
|
847
|
+
const simd8float32_tpl& b,
|
|
821
848
|
F&& f) {
|
|
822
|
-
|
|
849
|
+
simd8float32_tpl c;
|
|
823
850
|
for (int j = 0; j < 8; j++) {
|
|
824
851
|
c.f32[j] = f(a.f32[j], b.f32[j]);
|
|
825
852
|
}
|
|
826
853
|
return c;
|
|
827
854
|
}
|
|
828
855
|
|
|
829
|
-
|
|
856
|
+
simd8float32_tpl operator*(const simd8float32_tpl& other) const {
|
|
830
857
|
return binary_func(
|
|
831
858
|
*this, other, [](float a, float b) { return a * b; });
|
|
832
859
|
}
|
|
833
860
|
|
|
834
|
-
|
|
861
|
+
simd8float32_tpl operator+(const simd8float32_tpl& other) const {
|
|
835
862
|
return binary_func(
|
|
836
863
|
*this, other, [](float a, float b) { return a + b; });
|
|
837
864
|
}
|
|
838
865
|
|
|
839
|
-
|
|
866
|
+
simd8float32_tpl operator-(const simd8float32_tpl& other) const {
|
|
840
867
|
return binary_func(
|
|
841
868
|
*this, other, [](float a, float b) { return a - b; });
|
|
842
869
|
}
|
|
843
870
|
|
|
844
|
-
|
|
871
|
+
simd8float32_tpl& operator+=(const simd8float32_tpl& other) {
|
|
845
872
|
for (size_t i = 0; i < 8; i++) {
|
|
846
873
|
f32[i] += other.f32[i];
|
|
847
874
|
}
|
|
@@ -849,7 +876,7 @@ struct simd8float32 : simd256bit {
|
|
|
849
876
|
return *this;
|
|
850
877
|
}
|
|
851
878
|
|
|
852
|
-
bool operator==(
|
|
879
|
+
bool operator==(simd8float32_tpl other) const {
|
|
853
880
|
for (size_t i = 0; i < 8; i++) {
|
|
854
881
|
if (f32[i] != other.f32[i]) {
|
|
855
882
|
return false;
|
|
@@ -859,7 +886,7 @@ struct simd8float32 : simd256bit {
|
|
|
859
886
|
return true;
|
|
860
887
|
}
|
|
861
888
|
|
|
862
|
-
bool operator!=(
|
|
889
|
+
bool operator!=(simd8float32_tpl other) const {
|
|
863
890
|
return !(*this == other);
|
|
864
891
|
}
|
|
865
892
|
|
|
@@ -875,8 +902,10 @@ struct simd8float32 : simd256bit {
|
|
|
875
902
|
};
|
|
876
903
|
|
|
877
904
|
// hadd does not cross lanes
|
|
878
|
-
inline
|
|
879
|
-
|
|
905
|
+
inline simd8float32_tpl<SIMDLevel::NONE> hadd(
|
|
906
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
907
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b) {
|
|
908
|
+
simd8float32_tpl<SIMDLevel::NONE> c;
|
|
880
909
|
c.f32[0] = a.f32[0] + a.f32[1];
|
|
881
910
|
c.f32[1] = a.f32[2] + a.f32[3];
|
|
882
911
|
c.f32[2] = b.f32[0] + b.f32[1];
|
|
@@ -890,8 +919,10 @@ inline simd8float32 hadd(const simd8float32& a, const simd8float32& b) {
|
|
|
890
919
|
return c;
|
|
891
920
|
}
|
|
892
921
|
|
|
893
|
-
inline
|
|
894
|
-
|
|
922
|
+
inline simd8float32_tpl<SIMDLevel::NONE> unpacklo(
|
|
923
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
924
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b) {
|
|
925
|
+
simd8float32_tpl<SIMDLevel::NONE> c;
|
|
895
926
|
c.f32[0] = a.f32[0];
|
|
896
927
|
c.f32[1] = b.f32[0];
|
|
897
928
|
c.f32[2] = a.f32[1];
|
|
@@ -905,8 +936,10 @@ inline simd8float32 unpacklo(const simd8float32& a, const simd8float32& b) {
|
|
|
905
936
|
return c;
|
|
906
937
|
}
|
|
907
938
|
|
|
908
|
-
inline
|
|
909
|
-
|
|
939
|
+
inline simd8float32_tpl<SIMDLevel::NONE> unpackhi(
|
|
940
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
941
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b) {
|
|
942
|
+
simd8float32_tpl<SIMDLevel::NONE> c;
|
|
910
943
|
c.f32[0] = a.f32[2];
|
|
911
944
|
c.f32[1] = b.f32[2];
|
|
912
945
|
c.f32[2] = a.f32[3];
|
|
@@ -921,11 +954,11 @@ inline simd8float32 unpackhi(const simd8float32& a, const simd8float32& b) {
|
|
|
921
954
|
}
|
|
922
955
|
|
|
923
956
|
// compute a * b + c
|
|
924
|
-
inline
|
|
925
|
-
const
|
|
926
|
-
const
|
|
927
|
-
const
|
|
928
|
-
|
|
957
|
+
inline simd8float32_tpl<SIMDLevel::NONE> fmadd(
|
|
958
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
959
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b,
|
|
960
|
+
const simd8float32_tpl<SIMDLevel::NONE>& c) {
|
|
961
|
+
simd8float32_tpl<SIMDLevel::NONE> res;
|
|
929
962
|
for (int i = 0; i < 8; i++) {
|
|
930
963
|
res.f32[i] = a.f32[i] * b.f32[i] + c.f32[i];
|
|
931
964
|
}
|
|
@@ -935,8 +968,10 @@ inline simd8float32 fmadd(
|
|
|
935
968
|
namespace {
|
|
936
969
|
|
|
937
970
|
// get even float32's of a and b, interleaved
|
|
938
|
-
|
|
939
|
-
|
|
971
|
+
simd8float32_tpl<SIMDLevel::NONE> geteven(
|
|
972
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
973
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b) {
|
|
974
|
+
simd8float32_tpl<SIMDLevel::NONE> c;
|
|
940
975
|
|
|
941
976
|
c.f32[0] = a.f32[0];
|
|
942
977
|
c.f32[1] = a.f32[2];
|
|
@@ -952,8 +987,10 @@ simd8float32 geteven(const simd8float32& a, const simd8float32& b) {
|
|
|
952
987
|
}
|
|
953
988
|
|
|
954
989
|
// get odd float32's of a and b, interleaved
|
|
955
|
-
|
|
956
|
-
|
|
990
|
+
simd8float32_tpl<SIMDLevel::NONE> getodd(
|
|
991
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
992
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b) {
|
|
993
|
+
simd8float32_tpl<SIMDLevel::NONE> c;
|
|
957
994
|
|
|
958
995
|
c.f32[0] = a.f32[1];
|
|
959
996
|
c.f32[1] = a.f32[3];
|
|
@@ -970,8 +1007,10 @@ simd8float32 getodd(const simd8float32& a, const simd8float32& b) {
|
|
|
970
1007
|
|
|
971
1008
|
// 3 cycles
|
|
972
1009
|
// if the lanes are a = [a0 a1] and b = [b0 b1], return [a0 b0]
|
|
973
|
-
|
|
974
|
-
|
|
1010
|
+
simd8float32_tpl<SIMDLevel::NONE> getlow128(
|
|
1011
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
1012
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b) {
|
|
1013
|
+
simd8float32_tpl<SIMDLevel::NONE> c;
|
|
975
1014
|
|
|
976
1015
|
c.f32[0] = a.f32[0];
|
|
977
1016
|
c.f32[1] = a.f32[1];
|
|
@@ -986,8 +1025,10 @@ simd8float32 getlow128(const simd8float32& a, const simd8float32& b) {
|
|
|
986
1025
|
return c;
|
|
987
1026
|
}
|
|
988
1027
|
|
|
989
|
-
|
|
990
|
-
|
|
1028
|
+
simd8float32_tpl<SIMDLevel::NONE> gethigh128(
|
|
1029
|
+
const simd8float32_tpl<SIMDLevel::NONE>& a,
|
|
1030
|
+
const simd8float32_tpl<SIMDLevel::NONE>& b) {
|
|
1031
|
+
simd8float32_tpl<SIMDLevel::NONE> c;
|
|
991
1032
|
|
|
992
1033
|
c.f32[0] = a.f32[4];
|
|
993
1034
|
c.f32[1] = a.f32[5];
|
|
@@ -1034,10 +1075,10 @@ simd8float32 gethigh128(const simd8float32& a, const simd8float32& b) {
|
|
|
1034
1075
|
// confusion for ppl who write in low-level SIMD instructions. Additionally,
|
|
1035
1076
|
// these two ops (cmp and blend) are very often used together.
|
|
1036
1077
|
inline void cmplt_and_blend_inplace(
|
|
1037
|
-
const
|
|
1038
|
-
const
|
|
1039
|
-
|
|
1040
|
-
|
|
1078
|
+
const simd8float32_tpl<SIMDLevel::NONE> candidateValues,
|
|
1079
|
+
const simd8uint32_tpl<SIMDLevel::NONE> candidateIndices,
|
|
1080
|
+
simd8float32_tpl<SIMDLevel::NONE>& lowestValues,
|
|
1081
|
+
simd8uint32_tpl<SIMDLevel::NONE>& lowestIndices) {
|
|
1041
1082
|
for (size_t j = 0; j < 8; j++) {
|
|
1042
1083
|
bool comparison = (candidateValues.f32[j] < lowestValues.f32[j]);
|
|
1043
1084
|
if (comparison) {
|
|
@@ -1059,14 +1100,14 @@ inline void cmplt_and_blend_inplace(
|
|
|
1059
1100
|
// the last equal value is saved instead of the first one), but this behavior
|
|
1060
1101
|
// saves instructions.
|
|
1061
1102
|
inline void cmplt_min_max_fast(
|
|
1062
|
-
const
|
|
1063
|
-
const
|
|
1064
|
-
const
|
|
1065
|
-
const
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1103
|
+
const simd8float32_tpl<SIMDLevel::NONE> candidateValues,
|
|
1104
|
+
const simd8uint32_tpl<SIMDLevel::NONE> candidateIndices,
|
|
1105
|
+
const simd8float32_tpl<SIMDLevel::NONE> currentValues,
|
|
1106
|
+
const simd8uint32_tpl<SIMDLevel::NONE> currentIndices,
|
|
1107
|
+
simd8float32_tpl<SIMDLevel::NONE>& minValues,
|
|
1108
|
+
simd8uint32_tpl<SIMDLevel::NONE>& minIndices,
|
|
1109
|
+
simd8float32_tpl<SIMDLevel::NONE>& maxValues,
|
|
1110
|
+
simd8uint32_tpl<SIMDLevel::NONE>& maxIndices) {
|
|
1070
1111
|
for (size_t i = 0; i < 8; i++) {
|
|
1071
1112
|
bool flag = (candidateValues.f32[i] < currentValues.f32[i]);
|
|
1072
1113
|
minValues.f32[i] = flag ? candidateValues.f32[i] : currentValues.f32[i];
|