faiss 0.6.0 → 0.6.1
This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/faiss/extconf.rb +2 -1
- data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
- data/ext/faiss/index_binary.cpp +1 -1
- data/ext/faiss/kmeans.cpp +1 -1
- data/ext/faiss/pca_matrix.cpp +1 -1
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +93 -80
- data/vendor/faiss/faiss/Clustering.cpp +39 -240
- data/vendor/faiss/faiss/Clustering.h +6 -0
- data/vendor/faiss/faiss/IVFlib.cpp +41 -21
- data/vendor/faiss/faiss/Index.cpp +6 -5
- data/vendor/faiss/faiss/Index.h +5 -5
- data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
- data/vendor/faiss/faiss/IndexBinary.h +4 -4
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +84 -92
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
- data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +87 -415
- data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
- data/vendor/faiss/faiss/IndexFastScan.h +25 -23
- data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
- data/vendor/faiss/faiss/IndexFlat.h +21 -18
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
- data/vendor/faiss/faiss/IndexHNSW.cpp +283 -145
- data/vendor/faiss/faiss/IndexHNSW.h +16 -2
- data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
- data/vendor/faiss/faiss/IndexIDMap.h +9 -7
- data/vendor/faiss/faiss/IndexIVF.cpp +465 -362
- data/vendor/faiss/faiss/IndexIVF.h +33 -12
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
- data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +36 -68
- data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +53 -30
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +71 -843
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +21 -17
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +26 -39
- data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
- data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
- data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
- data/vendor/faiss/faiss/IndexNSG.cpp +39 -23
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
- data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
- data/vendor/faiss/faiss/IndexPQ.h +3 -2
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
- data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
- data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
- data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
- data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
- data/vendor/faiss/faiss/IndexRefine.h +4 -4
- data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
- data/vendor/faiss/faiss/IndexShards.cpp +10 -9
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
- data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
- data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
- data/vendor/faiss/faiss/MetaIndexes.h +1 -1
- data/vendor/faiss/faiss/MetricType.h +14 -7
- data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
- data/vendor/faiss/faiss/SuperKMeans.h +97 -0
- data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
- data/vendor/faiss/faiss/VectorTransform.h +16 -16
- data/vendor/faiss/faiss/build.cpp +23 -0
- data/vendor/faiss/faiss/build.h +15 -0
- data/vendor/faiss/faiss/clone_index.cpp +48 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
- data/vendor/faiss/faiss/factory_tools.cpp +5 -0
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
- data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
- data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
- data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
- data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
- data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
- data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
- data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
- data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
- data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
- data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
- data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
- data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
- data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
- data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
- data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
- data/vendor/faiss/faiss/impl/FaissException.h +50 -3
- data/vendor/faiss/faiss/impl/HNSW.cpp +92 -317
- data/vendor/faiss/faiss/impl/HNSW.h +13 -34
- data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
- data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
- data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
- data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
- data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
- data/vendor/faiss/faiss/impl/NSG.cpp +38 -21
- data/vendor/faiss/faiss/impl/NSG.h +4 -4
- data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
- data/vendor/faiss/faiss/impl/Panorama.h +258 -87
- data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
- data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +30 -23
- data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
- data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +296 -283
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +99 -75
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +52 -4
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -1
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
- data/vendor/faiss/faiss/impl/VisitedTable.h +7 -0
- data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
- data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
- data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
- data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
- data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
- data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
- data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
- data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
- data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
- data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
- data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
- data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
- data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
- data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +1132 -45
- data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
- data/vendor/faiss/faiss/impl/index_write.cpp +95 -13
- data/vendor/faiss/faiss/impl/io.cpp +6 -6
- data/vendor/faiss/faiss/impl/io_macros.h +33 -16
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
- data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
- data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
- data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +29 -111
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +238 -5
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -7
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +3 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +102 -11
- data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +148 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +167 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +59 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +163 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +100 -66
- data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
- data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
- data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
- data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
- data/vendor/faiss/faiss/impl/svs_io.h +8 -2
- data/vendor/faiss/faiss/index_factory.cpp +86 -18
- data/vendor/faiss/faiss/index_io.h +24 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
- data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
- data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +18 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +12 -3
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +7 -2
- data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
- data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
- data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
- data/vendor/faiss/faiss/utils/distances.cpp +390 -560
- data/vendor/faiss/faiss/utils/distances.h +20 -1
- data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -177
- data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
- data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
- data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
- data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
- data/vendor/faiss/faiss/utils/hamming.h +92 -2
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
- data/vendor/faiss/faiss/utils/partitioning.h +31 -0
- data/vendor/faiss/faiss/utils/popcount.h +29 -0
- data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
- data/vendor/faiss/faiss/utils/prefetch.h +2 -2
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
- data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
- data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
- data/vendor/faiss/faiss/utils/random.cpp +6 -6
- data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
- data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
- data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
- data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +17 -5
- data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
- data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
- data/vendor/faiss/faiss/utils/utils.cpp +5 -5
- data/vendor/faiss/faiss/utils/utils.h +3 -3
- metadata +119 -34
- data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
- data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
- data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
- data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
- data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
- data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
- data/vendor/faiss/faiss/utils/simdlib.h +0 -42
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
- data/ext/faiss/{utils_rb.h → utils.h} +0 -0
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
#include <immintrin.h>
|
|
14
14
|
|
|
15
15
|
#include <faiss/impl/platform_macros.h>
|
|
16
|
+
#include <faiss/impl/simdlib/simdlib.h>
|
|
16
17
|
|
|
17
18
|
namespace faiss {
|
|
18
19
|
|
|
@@ -25,20 +26,21 @@ namespace faiss {
|
|
|
25
26
|
*/
|
|
26
27
|
|
|
27
28
|
/// 256-bit representation without interpretation as a vector
|
|
28
|
-
|
|
29
|
+
template <>
|
|
30
|
+
struct simd256bit_tpl<SIMDLevel::AVX2> {
|
|
29
31
|
union {
|
|
30
32
|
__m256i i;
|
|
31
33
|
__m256 f;
|
|
32
34
|
};
|
|
33
35
|
|
|
34
|
-
|
|
36
|
+
simd256bit_tpl() {}
|
|
35
37
|
|
|
36
|
-
explicit
|
|
38
|
+
explicit simd256bit_tpl(__m256i val) : i(val) {}
|
|
37
39
|
|
|
38
|
-
explicit
|
|
40
|
+
explicit simd256bit_tpl(__m256 val) : f(val) {}
|
|
39
41
|
|
|
40
|
-
explicit
|
|
41
|
-
: i(
|
|
42
|
+
explicit simd256bit_tpl(const void* x)
|
|
43
|
+
: i(_mm256_loadu_si256((__m256i const*)x)) {}
|
|
42
44
|
|
|
43
45
|
void clear() {
|
|
44
46
|
i = _mm256_setzero_si256();
|
|
@@ -59,8 +61,8 @@ struct simd256bit {
|
|
|
59
61
|
void bin(char bits[257]) const {
|
|
60
62
|
char bytes[32];
|
|
61
63
|
storeu((void*)bytes);
|
|
62
|
-
for (int
|
|
63
|
-
bits[
|
|
64
|
+
for (int idx = 0; idx < 256; idx++) {
|
|
65
|
+
bits[idx] = '0' + ((bytes[idx / 8] >> (idx % 8)) & 1);
|
|
64
66
|
}
|
|
65
67
|
bits[256] = 0;
|
|
66
68
|
}
|
|
@@ -72,7 +74,7 @@ struct simd256bit {
|
|
|
72
74
|
}
|
|
73
75
|
|
|
74
76
|
// Checks whether the other holds exactly the same bytes.
|
|
75
|
-
bool is_same_as(
|
|
77
|
+
bool is_same_as(simd256bit_tpl other) const {
|
|
76
78
|
const __m256i pcmp = _mm256_cmpeq_epi32(i, other.i);
|
|
77
79
|
unsigned bitmask = _mm256_movemask_epi8(pcmp);
|
|
78
80
|
return (bitmask == 0xffffffffU);
|
|
@@ -80,20 +82,26 @@ struct simd256bit {
|
|
|
80
82
|
};
|
|
81
83
|
|
|
82
84
|
/// vector of 16 elements in uint16
|
|
83
|
-
|
|
84
|
-
|
|
85
|
+
template <>
|
|
86
|
+
struct simd16uint16_tpl<SIMDLevel::AVX2> : simd256bit_tpl<SIMDLevel::AVX2> {
|
|
87
|
+
simd16uint16_tpl() {}
|
|
85
88
|
|
|
86
|
-
explicit
|
|
89
|
+
explicit simd16uint16_tpl(__m256i val)
|
|
90
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(val) {}
|
|
87
91
|
|
|
88
|
-
explicit
|
|
92
|
+
explicit simd16uint16_tpl(int x)
|
|
93
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(_mm256_set1_epi16(x)) {}
|
|
89
94
|
|
|
90
|
-
explicit
|
|
95
|
+
explicit simd16uint16_tpl(uint16_t x)
|
|
96
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(_mm256_set1_epi16(x)) {}
|
|
91
97
|
|
|
92
|
-
explicit
|
|
98
|
+
explicit simd16uint16_tpl(simd256bit_tpl<SIMDLevel::AVX2> x)
|
|
99
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(x) {}
|
|
93
100
|
|
|
94
|
-
explicit
|
|
101
|
+
explicit simd16uint16_tpl(const uint16_t* x)
|
|
102
|
+
: simd256bit_tpl<SIMDLevel::AVX2>((const void*)x) {}
|
|
95
103
|
|
|
96
|
-
explicit
|
|
104
|
+
explicit simd16uint16_tpl(
|
|
97
105
|
uint16_t u0,
|
|
98
106
|
uint16_t u1,
|
|
99
107
|
uint16_t u2,
|
|
@@ -110,7 +118,7 @@ struct simd16uint16 : simd256bit {
|
|
|
110
118
|
uint16_t u13,
|
|
111
119
|
uint16_t u14,
|
|
112
120
|
uint16_t u15)
|
|
113
|
-
:
|
|
121
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(_mm256_setr_epi16(
|
|
114
122
|
u0,
|
|
115
123
|
u1,
|
|
116
124
|
u2,
|
|
@@ -133,9 +141,17 @@ struct simd16uint16 : simd256bit {
|
|
|
133
141
|
storeu((void*)bytes);
|
|
134
142
|
char res[1000];
|
|
135
143
|
char* ptr = res;
|
|
136
|
-
|
|
137
|
-
|
|
144
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
145
|
+
#pragma GCC diagnostic push
|
|
146
|
+
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
|
|
147
|
+
#endif
|
|
148
|
+
for (int idx = 0; idx < 16; idx++) {
|
|
149
|
+
ptr += snprintf(
|
|
150
|
+
ptr, (size_t)(res + sizeof(res) - ptr), fmt, bytes[idx]);
|
|
138
151
|
}
|
|
152
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
153
|
+
#pragma GCC diagnostic pop
|
|
154
|
+
#endif
|
|
139
155
|
// strip last ,
|
|
140
156
|
ptr[-1] = 0;
|
|
141
157
|
return std::string(res);
|
|
@@ -153,57 +169,59 @@ struct simd16uint16 : simd256bit {
|
|
|
153
169
|
i = _mm256_set1_epi16((short)x);
|
|
154
170
|
}
|
|
155
171
|
|
|
156
|
-
|
|
157
|
-
return
|
|
172
|
+
simd16uint16_tpl operator*(const simd16uint16_tpl& other) const {
|
|
173
|
+
return simd16uint16_tpl(_mm256_mullo_epi16(i, other.i));
|
|
158
174
|
}
|
|
159
175
|
|
|
160
176
|
// shift must be known at compile time
|
|
161
|
-
|
|
162
|
-
return
|
|
177
|
+
simd16uint16_tpl operator>>(const int shift) const {
|
|
178
|
+
return simd16uint16_tpl(_mm256_srli_epi16(i, shift));
|
|
163
179
|
}
|
|
164
180
|
|
|
165
181
|
// shift must be known at compile time
|
|
166
|
-
|
|
167
|
-
return
|
|
182
|
+
simd16uint16_tpl operator<<(const int shift) const {
|
|
183
|
+
return simd16uint16_tpl(_mm256_slli_epi16(i, shift));
|
|
168
184
|
}
|
|
169
185
|
|
|
170
|
-
|
|
186
|
+
simd16uint16_tpl& operator+=(simd16uint16_tpl other) {
|
|
171
187
|
i = _mm256_add_epi16(i, other.i);
|
|
172
188
|
return *this;
|
|
173
189
|
}
|
|
174
190
|
|
|
175
|
-
|
|
191
|
+
simd16uint16_tpl& operator-=(simd16uint16_tpl other) {
|
|
176
192
|
i = _mm256_sub_epi16(i, other.i);
|
|
177
193
|
return *this;
|
|
178
194
|
}
|
|
179
195
|
|
|
180
|
-
|
|
181
|
-
return
|
|
196
|
+
simd16uint16_tpl operator+(simd16uint16_tpl other) const {
|
|
197
|
+
return simd16uint16_tpl(_mm256_add_epi16(i, other.i));
|
|
182
198
|
}
|
|
183
199
|
|
|
184
|
-
|
|
185
|
-
return
|
|
200
|
+
simd16uint16_tpl operator-(simd16uint16_tpl other) const {
|
|
201
|
+
return simd16uint16_tpl(_mm256_sub_epi16(i, other.i));
|
|
186
202
|
}
|
|
187
203
|
|
|
188
|
-
|
|
189
|
-
return
|
|
204
|
+
simd16uint16_tpl operator&(simd256bit_tpl<SIMDLevel::AVX2> other) const {
|
|
205
|
+
return simd16uint16_tpl(_mm256_and_si256(i, other.i));
|
|
190
206
|
}
|
|
191
207
|
|
|
192
|
-
|
|
193
|
-
return
|
|
208
|
+
simd16uint16_tpl operator|(simd256bit_tpl<SIMDLevel::AVX2> other) const {
|
|
209
|
+
return simd16uint16_tpl(_mm256_or_si256(i, other.i));
|
|
194
210
|
}
|
|
195
211
|
|
|
196
|
-
|
|
197
|
-
return
|
|
212
|
+
simd16uint16_tpl operator^(simd256bit_tpl<SIMDLevel::AVX2> other) const {
|
|
213
|
+
return simd16uint16_tpl(_mm256_xor_si256(i, other.i));
|
|
198
214
|
}
|
|
199
215
|
|
|
200
216
|
// returns binary masks
|
|
201
|
-
friend
|
|
202
|
-
|
|
217
|
+
friend simd16uint16_tpl operator==(
|
|
218
|
+
const simd256bit_tpl<SIMDLevel::AVX2> lhs,
|
|
219
|
+
const simd256bit_tpl<SIMDLevel::AVX2> rhs) {
|
|
220
|
+
return simd16uint16_tpl(_mm256_cmpeq_epi16(lhs.i, rhs.i));
|
|
203
221
|
}
|
|
204
222
|
|
|
205
|
-
|
|
206
|
-
return
|
|
223
|
+
simd16uint16_tpl operator~() const {
|
|
224
|
+
return simd16uint16_tpl(_mm256_xor_si256(i, _mm256_set1_epi32(-1)));
|
|
207
225
|
}
|
|
208
226
|
|
|
209
227
|
// get scalar at index 0
|
|
@@ -213,63 +231,73 @@ struct simd16uint16 : simd256bit {
|
|
|
213
231
|
|
|
214
232
|
// mask of elements where this >= thresh
|
|
215
233
|
// 2 bit per component: 16 * 2 = 32 bit
|
|
216
|
-
uint32_t ge_mask(
|
|
234
|
+
uint32_t ge_mask(simd16uint16_tpl thresh) const {
|
|
217
235
|
__m256i j = thresh.i;
|
|
218
236
|
__m256i max = _mm256_max_epu16(i, j);
|
|
219
237
|
__m256i ge = _mm256_cmpeq_epi16(i, max);
|
|
220
238
|
return _mm256_movemask_epi8(ge);
|
|
221
239
|
}
|
|
222
240
|
|
|
223
|
-
uint32_t le_mask(
|
|
241
|
+
uint32_t le_mask(simd16uint16_tpl thresh) const {
|
|
224
242
|
return thresh.ge_mask(*this);
|
|
225
243
|
}
|
|
226
244
|
|
|
227
|
-
uint32_t gt_mask(
|
|
245
|
+
uint32_t gt_mask(simd16uint16_tpl thresh) const {
|
|
228
246
|
return ~le_mask(thresh);
|
|
229
247
|
}
|
|
230
248
|
|
|
231
|
-
bool all_gt(
|
|
249
|
+
bool all_gt(simd16uint16_tpl thresh) const {
|
|
232
250
|
return le_mask(thresh) == 0;
|
|
233
251
|
}
|
|
234
252
|
|
|
235
253
|
// for debugging only
|
|
236
|
-
uint16_t operator[](int
|
|
254
|
+
uint16_t operator[](int idx) const {
|
|
237
255
|
ALIGNED(32) uint16_t tab[16];
|
|
238
256
|
store(tab);
|
|
239
|
-
return tab[
|
|
257
|
+
return tab[idx];
|
|
240
258
|
}
|
|
241
259
|
|
|
242
|
-
void accu_min(
|
|
260
|
+
void accu_min(simd16uint16_tpl incoming) {
|
|
243
261
|
i = _mm256_min_epu16(i, incoming.i);
|
|
244
262
|
}
|
|
245
263
|
|
|
246
|
-
void accu_max(
|
|
264
|
+
void accu_max(simd16uint16_tpl incoming) {
|
|
247
265
|
i = _mm256_max_epu16(i, incoming.i);
|
|
248
266
|
}
|
|
249
267
|
};
|
|
250
268
|
|
|
251
269
|
// not really a std::min because it returns an elementwise min
|
|
252
|
-
inline
|
|
253
|
-
|
|
270
|
+
inline simd16uint16_tpl<SIMDLevel::AVX2> min(
|
|
271
|
+
simd16uint16_tpl<SIMDLevel::AVX2> a,
|
|
272
|
+
simd16uint16_tpl<SIMDLevel::AVX2> b) {
|
|
273
|
+
return simd16uint16_tpl<SIMDLevel::AVX2>(_mm256_min_epu16(a.i, b.i));
|
|
254
274
|
}
|
|
255
275
|
|
|
256
|
-
inline
|
|
257
|
-
|
|
276
|
+
inline simd16uint16_tpl<SIMDLevel::AVX2> max(
|
|
277
|
+
simd16uint16_tpl<SIMDLevel::AVX2> a,
|
|
278
|
+
simd16uint16_tpl<SIMDLevel::AVX2> b) {
|
|
279
|
+
return simd16uint16_tpl<SIMDLevel::AVX2>(_mm256_max_epu16(a.i, b.i));
|
|
258
280
|
}
|
|
259
281
|
|
|
260
282
|
// decompose in 128-lanes: a = (a0, a1), b = (b0, b1)
|
|
261
283
|
// return (a0 + a1, b0 + b1)
|
|
262
284
|
// TODO find a better name
|
|
263
|
-
inline
|
|
285
|
+
inline simd16uint16_tpl<SIMDLevel::AVX2> combine2x2(
|
|
286
|
+
simd16uint16_tpl<SIMDLevel::AVX2> a,
|
|
287
|
+
simd16uint16_tpl<SIMDLevel::AVX2> b) {
|
|
264
288
|
__m256i a1b0 = _mm256_permute2f128_si256(a.i, b.i, 0x21);
|
|
265
289
|
__m256i a0b1 = _mm256_blend_epi32(a.i, b.i, 0xF0);
|
|
266
290
|
|
|
267
|
-
return
|
|
291
|
+
return simd16uint16_tpl<SIMDLevel::AVX2>(a1b0) +
|
|
292
|
+
simd16uint16_tpl<SIMDLevel::AVX2>(a0b1);
|
|
268
293
|
}
|
|
269
294
|
|
|
270
295
|
// compare d0 and d1 to thr, return 32 bits corresponding to the concatenation
|
|
271
296
|
// of d0 and d1 with thr
|
|
272
|
-
inline uint32_t cmp_ge32(
|
|
297
|
+
inline uint32_t cmp_ge32(
|
|
298
|
+
simd16uint16_tpl<SIMDLevel::AVX2> d0,
|
|
299
|
+
simd16uint16_tpl<SIMDLevel::AVX2> d1,
|
|
300
|
+
simd16uint16_tpl<SIMDLevel::AVX2> thr) {
|
|
273
301
|
__m256i max0 = _mm256_max_epu16(d0.i, thr.i);
|
|
274
302
|
__m256i ge0 = _mm256_cmpeq_epi16(d0.i, max0);
|
|
275
303
|
|
|
@@ -285,7 +313,10 @@ inline uint32_t cmp_ge32(simd16uint16 d0, simd16uint16 d1, simd16uint16 thr) {
|
|
|
285
313
|
return ge;
|
|
286
314
|
}
|
|
287
315
|
|
|
288
|
-
inline uint32_t cmp_le32(
|
|
316
|
+
inline uint32_t cmp_le32(
|
|
317
|
+
simd16uint16_tpl<SIMDLevel::AVX2> d0,
|
|
318
|
+
simd16uint16_tpl<SIMDLevel::AVX2> d1,
|
|
319
|
+
simd16uint16_tpl<SIMDLevel::AVX2> thr) {
|
|
289
320
|
__m256i max0 = _mm256_min_epu16(d0.i, thr.i);
|
|
290
321
|
__m256i ge0 = _mm256_cmpeq_epi16(d0.i, max0);
|
|
291
322
|
|
|
@@ -301,8 +332,10 @@ inline uint32_t cmp_le32(simd16uint16 d0, simd16uint16 d1, simd16uint16 thr) {
|
|
|
301
332
|
return ge;
|
|
302
333
|
}
|
|
303
334
|
|
|
304
|
-
inline
|
|
305
|
-
|
|
335
|
+
inline simd16uint16_tpl<SIMDLevel::AVX2> hadd(
|
|
336
|
+
const simd16uint16_tpl<SIMDLevel::AVX2>& a,
|
|
337
|
+
const simd16uint16_tpl<SIMDLevel::AVX2>& b) {
|
|
338
|
+
return simd16uint16_tpl<SIMDLevel::AVX2>(_mm256_hadd_epi16(a.i, b.i));
|
|
306
339
|
}
|
|
307
340
|
|
|
308
341
|
// Vectorized version of the following code:
|
|
@@ -320,14 +353,14 @@ inline simd16uint16 hadd(const simd16uint16& a, const simd16uint16& b) {
|
|
|
320
353
|
// Works in i16 mode in order to save instructions. One may
|
|
321
354
|
// switch from i16 to u16.
|
|
322
355
|
inline void cmplt_min_max_fast(
|
|
323
|
-
const
|
|
324
|
-
const
|
|
325
|
-
const
|
|
326
|
-
const
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
356
|
+
const simd16uint16_tpl<SIMDLevel::AVX2> candidateValues,
|
|
357
|
+
const simd16uint16_tpl<SIMDLevel::AVX2> candidateIndices,
|
|
358
|
+
const simd16uint16_tpl<SIMDLevel::AVX2> currentValues,
|
|
359
|
+
const simd16uint16_tpl<SIMDLevel::AVX2> currentIndices,
|
|
360
|
+
simd16uint16_tpl<SIMDLevel::AVX2>& minValues,
|
|
361
|
+
simd16uint16_tpl<SIMDLevel::AVX2>& minIndices,
|
|
362
|
+
simd16uint16_tpl<SIMDLevel::AVX2>& maxValues,
|
|
363
|
+
simd16uint16_tpl<SIMDLevel::AVX2>& maxIndices) {
|
|
331
364
|
// there's no lt instruction, so we'll need to emulate one
|
|
332
365
|
__m256i comparison = _mm256_cmpgt_epi16(currentValues.i, candidateValues.i);
|
|
333
366
|
comparison = _mm256_andnot_si256(comparison, _mm256_set1_epi16(-1));
|
|
@@ -341,14 +374,18 @@ inline void cmplt_min_max_fast(
|
|
|
341
374
|
}
|
|
342
375
|
|
|
343
376
|
// vector of 32 unsigned 8-bit integers
|
|
344
|
-
|
|
345
|
-
|
|
377
|
+
template <>
|
|
378
|
+
struct simd32uint8_tpl<SIMDLevel::AVX2> : simd256bit_tpl<SIMDLevel::AVX2> {
|
|
379
|
+
simd32uint8_tpl() {}
|
|
346
380
|
|
|
347
|
-
explicit
|
|
381
|
+
explicit simd32uint8_tpl(__m256i val)
|
|
382
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(val) {}
|
|
348
383
|
|
|
349
|
-
explicit
|
|
384
|
+
explicit simd32uint8_tpl(int x)
|
|
385
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(_mm256_set1_epi8(x)) {}
|
|
350
386
|
|
|
351
|
-
explicit
|
|
387
|
+
explicit simd32uint8_tpl(uint8_t x)
|
|
388
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(_mm256_set1_epi8(x)) {}
|
|
352
389
|
|
|
353
390
|
template <
|
|
354
391
|
uint8_t _0,
|
|
@@ -383,8 +420,8 @@ struct simd32uint8 : simd256bit {
|
|
|
383
420
|
uint8_t _29,
|
|
384
421
|
uint8_t _30,
|
|
385
422
|
uint8_t _31>
|
|
386
|
-
static
|
|
387
|
-
return
|
|
423
|
+
static simd32uint8_tpl create() {
|
|
424
|
+
return simd32uint8_tpl(_mm256_setr_epi8(
|
|
388
425
|
(char)_0,
|
|
389
426
|
(char)_1,
|
|
390
427
|
(char)_2,
|
|
@@ -419,18 +456,28 @@ struct simd32uint8 : simd256bit {
|
|
|
419
456
|
(char)_31));
|
|
420
457
|
}
|
|
421
458
|
|
|
422
|
-
explicit
|
|
459
|
+
explicit simd32uint8_tpl(simd256bit_tpl<SIMDLevel::AVX2> x)
|
|
460
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(x) {}
|
|
423
461
|
|
|
424
|
-
explicit
|
|
462
|
+
explicit simd32uint8_tpl(const uint8_t* x)
|
|
463
|
+
: simd256bit_tpl<SIMDLevel::AVX2>((const void*)x) {}
|
|
425
464
|
|
|
426
465
|
std::string elements_to_string(const char* fmt) const {
|
|
427
466
|
uint8_t bytes[32];
|
|
428
467
|
storeu((void*)bytes);
|
|
429
468
|
char res[1000];
|
|
430
469
|
char* ptr = res;
|
|
431
|
-
|
|
432
|
-
|
|
470
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
471
|
+
#pragma GCC diagnostic push
|
|
472
|
+
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
|
|
473
|
+
#endif
|
|
474
|
+
for (int idx = 0; idx < 32; idx++) {
|
|
475
|
+
ptr += snprintf(
|
|
476
|
+
ptr, (size_t)(res + sizeof(res) - ptr), fmt, bytes[idx]);
|
|
433
477
|
}
|
|
478
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
479
|
+
#pragma GCC diagnostic pop
|
|
480
|
+
#endif
|
|
434
481
|
// strip last ,
|
|
435
482
|
ptr[-1] = 0;
|
|
436
483
|
return std::string(res);
|
|
@@ -448,72 +495,83 @@ struct simd32uint8 : simd256bit {
|
|
|
448
495
|
i = _mm256_set1_epi8((char)x);
|
|
449
496
|
}
|
|
450
497
|
|
|
451
|
-
|
|
452
|
-
return
|
|
498
|
+
simd32uint8_tpl operator&(simd256bit_tpl<SIMDLevel::AVX2> other) const {
|
|
499
|
+
return simd32uint8_tpl(_mm256_and_si256(i, other.i));
|
|
453
500
|
}
|
|
454
501
|
|
|
455
|
-
|
|
456
|
-
return
|
|
502
|
+
simd32uint8_tpl operator+(simd32uint8_tpl other) const {
|
|
503
|
+
return simd32uint8_tpl(_mm256_add_epi8(i, other.i));
|
|
457
504
|
}
|
|
458
505
|
|
|
459
|
-
|
|
460
|
-
return
|
|
506
|
+
simd32uint8_tpl lookup_2_lanes(simd32uint8_tpl idx) const {
|
|
507
|
+
return simd32uint8_tpl(_mm256_shuffle_epi8(i, idx.i));
|
|
461
508
|
}
|
|
462
509
|
|
|
463
510
|
// extract + 0-extend lane
|
|
464
511
|
// this operation is slow (3 cycles)
|
|
465
|
-
|
|
512
|
+
simd16uint16_tpl<SIMDLevel::AVX2> lane0_as_uint16() const {
|
|
466
513
|
__m128i x = _mm256_extracti128_si256(i, 0);
|
|
467
|
-
return
|
|
514
|
+
return simd16uint16_tpl<SIMDLevel::AVX2>(_mm256_cvtepu8_epi16(x));
|
|
468
515
|
}
|
|
469
516
|
|
|
470
|
-
|
|
517
|
+
simd16uint16_tpl<SIMDLevel::AVX2> lane1_as_uint16() const {
|
|
471
518
|
__m128i x = _mm256_extracti128_si256(i, 1);
|
|
472
|
-
return
|
|
519
|
+
return simd16uint16_tpl<SIMDLevel::AVX2>(_mm256_cvtepu8_epi16(x));
|
|
473
520
|
}
|
|
474
521
|
|
|
475
|
-
|
|
522
|
+
simd32uint8_tpl& operator+=(simd32uint8_tpl other) {
|
|
476
523
|
i = _mm256_add_epi8(i, other.i);
|
|
477
524
|
return *this;
|
|
478
525
|
}
|
|
479
526
|
|
|
480
527
|
// for debugging only
|
|
481
|
-
uint8_t operator[](int
|
|
528
|
+
uint8_t operator[](int idx) const {
|
|
482
529
|
ALIGNED(32) uint8_t tab[32];
|
|
483
530
|
store(tab);
|
|
484
|
-
return tab[
|
|
531
|
+
return tab[idx];
|
|
485
532
|
}
|
|
486
533
|
};
|
|
487
534
|
|
|
488
535
|
// convert with saturation
|
|
489
536
|
// careful: this does not cross lanes, so the order is weird
|
|
490
|
-
inline
|
|
491
|
-
|
|
537
|
+
inline simd32uint8_tpl<SIMDLevel::AVX2> uint16_to_uint8_saturate(
|
|
538
|
+
simd16uint16_tpl<SIMDLevel::AVX2> a,
|
|
539
|
+
simd16uint16_tpl<SIMDLevel::AVX2> b) {
|
|
540
|
+
return simd32uint8_tpl<SIMDLevel::AVX2>(_mm256_packs_epi16(a.i, b.i));
|
|
492
541
|
}
|
|
493
542
|
|
|
494
543
|
/// get most significant bit of each byte
|
|
495
|
-
inline uint32_t get_MSBs(
|
|
544
|
+
inline uint32_t get_MSBs(simd32uint8_tpl<SIMDLevel::AVX2> a) {
|
|
496
545
|
return _mm256_movemask_epi8(a.i);
|
|
497
546
|
}
|
|
498
547
|
|
|
499
548
|
/// use MSB of each byte of mask to select a byte between a and b
|
|
500
|
-
inline
|
|
501
|
-
|
|
549
|
+
inline simd32uint8_tpl<SIMDLevel::AVX2> blendv(
|
|
550
|
+
simd32uint8_tpl<SIMDLevel::AVX2> a,
|
|
551
|
+
simd32uint8_tpl<SIMDLevel::AVX2> b,
|
|
552
|
+
simd32uint8_tpl<SIMDLevel::AVX2> mask) {
|
|
553
|
+
return simd32uint8_tpl<SIMDLevel::AVX2>(
|
|
554
|
+
_mm256_blendv_epi8(a.i, b.i, mask.i));
|
|
502
555
|
}
|
|
503
556
|
|
|
504
557
|
/// vector of 8 unsigned 32-bit integers
|
|
505
|
-
|
|
506
|
-
|
|
558
|
+
template <>
|
|
559
|
+
struct simd8uint32_tpl<SIMDLevel::AVX2> : simd256bit_tpl<SIMDLevel::AVX2> {
|
|
560
|
+
simd8uint32_tpl() {}
|
|
507
561
|
|
|
508
|
-
explicit
|
|
562
|
+
explicit simd8uint32_tpl(__m256i val)
|
|
563
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(val) {}
|
|
509
564
|
|
|
510
|
-
explicit
|
|
565
|
+
explicit simd8uint32_tpl(uint32_t x)
|
|
566
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(_mm256_set1_epi32(x)) {}
|
|
511
567
|
|
|
512
|
-
explicit
|
|
568
|
+
explicit simd8uint32_tpl(simd256bit_tpl<SIMDLevel::AVX2> x)
|
|
569
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(x) {}
|
|
513
570
|
|
|
514
|
-
explicit
|
|
571
|
+
explicit simd8uint32_tpl(const uint8_t* x)
|
|
572
|
+
: simd256bit_tpl<SIMDLevel::AVX2>((const void*)x) {}
|
|
515
573
|
|
|
516
|
-
explicit
|
|
574
|
+
explicit simd8uint32_tpl(
|
|
517
575
|
uint32_t u0,
|
|
518
576
|
uint32_t u1,
|
|
519
577
|
uint32_t u2,
|
|
@@ -522,28 +580,29 @@ struct simd8uint32 : simd256bit {
|
|
|
522
580
|
uint32_t u5,
|
|
523
581
|
uint32_t u6,
|
|
524
582
|
uint32_t u7)
|
|
525
|
-
:
|
|
583
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(
|
|
584
|
+
_mm256_setr_epi32(u0, u1, u2, u3, u4, u5, u6, u7)) {}
|
|
526
585
|
|
|
527
|
-
|
|
528
|
-
return
|
|
586
|
+
simd8uint32_tpl operator+(simd8uint32_tpl other) const {
|
|
587
|
+
return simd8uint32_tpl(_mm256_add_epi32(i, other.i));
|
|
529
588
|
}
|
|
530
589
|
|
|
531
|
-
|
|
532
|
-
return
|
|
590
|
+
simd8uint32_tpl operator-(simd8uint32_tpl other) const {
|
|
591
|
+
return simd8uint32_tpl(_mm256_sub_epi32(i, other.i));
|
|
533
592
|
}
|
|
534
593
|
|
|
535
|
-
|
|
594
|
+
simd8uint32_tpl& operator+=(const simd8uint32_tpl& other) {
|
|
536
595
|
i = _mm256_add_epi32(i, other.i);
|
|
537
596
|
return *this;
|
|
538
597
|
}
|
|
539
598
|
|
|
540
|
-
bool operator==(
|
|
599
|
+
bool operator==(simd8uint32_tpl other) const {
|
|
541
600
|
const __m256i pcmp = _mm256_cmpeq_epi32(i, other.i);
|
|
542
601
|
unsigned bitmask = _mm256_movemask_epi8(pcmp);
|
|
543
602
|
return (bitmask == 0xffffffffU);
|
|
544
603
|
}
|
|
545
604
|
|
|
546
|
-
bool operator!=(
|
|
605
|
+
bool operator!=(simd8uint32_tpl other) const {
|
|
547
606
|
return !(*this == other);
|
|
548
607
|
}
|
|
549
608
|
|
|
@@ -552,9 +611,17 @@ struct simd8uint32 : simd256bit {
|
|
|
552
611
|
storeu((void*)bytes);
|
|
553
612
|
char res[1000];
|
|
554
613
|
char* ptr = res;
|
|
555
|
-
|
|
556
|
-
|
|
614
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
615
|
+
#pragma GCC diagnostic push
|
|
616
|
+
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
|
|
617
|
+
#endif
|
|
618
|
+
for (int idx = 0; idx < 8; idx++) {
|
|
619
|
+
ptr += snprintf(
|
|
620
|
+
ptr, (size_t)(res + sizeof(res) - ptr), fmt, bytes[idx]);
|
|
557
621
|
}
|
|
622
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
623
|
+
#pragma GCC diagnostic pop
|
|
624
|
+
#endif
|
|
558
625
|
// strip last ,
|
|
559
626
|
ptr[-1] = 0;
|
|
560
627
|
return std::string(res);
|
|
@@ -572,8 +639,8 @@ struct simd8uint32 : simd256bit {
|
|
|
572
639
|
i = _mm256_set1_epi32((int)x);
|
|
573
640
|
}
|
|
574
641
|
|
|
575
|
-
|
|
576
|
-
return
|
|
642
|
+
simd8uint32_tpl unzip() const {
|
|
643
|
+
return simd8uint32_tpl(_mm256_permutevar8x32_epi32(
|
|
577
644
|
i, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)));
|
|
578
645
|
}
|
|
579
646
|
};
|
|
@@ -590,14 +657,14 @@ struct simd8uint32 : simd256bit {
|
|
|
590
657
|
// the last equal value is saved instead of the first one), but this behavior
|
|
591
658
|
// saves instructions.
|
|
592
659
|
inline void cmplt_min_max_fast(
|
|
593
|
-
const
|
|
594
|
-
const
|
|
595
|
-
const
|
|
596
|
-
const
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
660
|
+
const simd8uint32_tpl<SIMDLevel::AVX2> candidateValues,
|
|
661
|
+
const simd8uint32_tpl<SIMDLevel::AVX2> candidateIndices,
|
|
662
|
+
const simd8uint32_tpl<SIMDLevel::AVX2> currentValues,
|
|
663
|
+
const simd8uint32_tpl<SIMDLevel::AVX2> currentIndices,
|
|
664
|
+
simd8uint32_tpl<SIMDLevel::AVX2>& minValues,
|
|
665
|
+
simd8uint32_tpl<SIMDLevel::AVX2>& minIndices,
|
|
666
|
+
simd8uint32_tpl<SIMDLevel::AVX2>& maxValues,
|
|
667
|
+
simd8uint32_tpl<SIMDLevel::AVX2>& maxIndices) {
|
|
601
668
|
// there's no lt instruction, so we'll need to emulate one
|
|
602
669
|
__m256i comparison = _mm256_cmpgt_epi32(currentValues.i, candidateValues.i);
|
|
603
670
|
comparison = _mm256_andnot_si256(comparison, _mm256_set1_epi32(-1));
|
|
@@ -614,18 +681,22 @@ inline void cmplt_min_max_fast(
|
|
|
614
681
|
_mm256_castsi256_ps(comparison)));
|
|
615
682
|
}
|
|
616
683
|
|
|
617
|
-
|
|
618
|
-
|
|
684
|
+
template <>
|
|
685
|
+
struct simd8float32_tpl<SIMDLevel::AVX2> : simd256bit_tpl<SIMDLevel::AVX2> {
|
|
686
|
+
simd8float32_tpl() {}
|
|
619
687
|
|
|
620
|
-
explicit
|
|
688
|
+
explicit simd8float32_tpl(simd256bit_tpl<SIMDLevel::AVX2> x)
|
|
689
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(x) {}
|
|
621
690
|
|
|
622
|
-
explicit
|
|
691
|
+
explicit simd8float32_tpl(__m256 x) : simd256bit_tpl<SIMDLevel::AVX2>(x) {}
|
|
623
692
|
|
|
624
|
-
explicit
|
|
693
|
+
explicit simd8float32_tpl(float x)
|
|
694
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(_mm256_set1_ps(x)) {}
|
|
625
695
|
|
|
626
|
-
explicit
|
|
696
|
+
explicit simd8float32_tpl(const float* x)
|
|
697
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(_mm256_loadu_ps(x)) {}
|
|
627
698
|
|
|
628
|
-
explicit
|
|
699
|
+
explicit simd8float32_tpl(
|
|
629
700
|
float f0,
|
|
630
701
|
float f1,
|
|
631
702
|
float f2,
|
|
@@ -634,33 +705,34 @@ struct simd8float32 : simd256bit {
|
|
|
634
705
|
float f5,
|
|
635
706
|
float f6,
|
|
636
707
|
float f7)
|
|
637
|
-
:
|
|
708
|
+
: simd256bit_tpl<SIMDLevel::AVX2>(
|
|
709
|
+
_mm256_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7)) {}
|
|
638
710
|
|
|
639
|
-
|
|
640
|
-
return
|
|
711
|
+
simd8float32_tpl operator*(simd8float32_tpl other) const {
|
|
712
|
+
return simd8float32_tpl(_mm256_mul_ps(f, other.f));
|
|
641
713
|
}
|
|
642
714
|
|
|
643
|
-
|
|
644
|
-
return
|
|
715
|
+
simd8float32_tpl operator+(simd8float32_tpl other) const {
|
|
716
|
+
return simd8float32_tpl(_mm256_add_ps(f, other.f));
|
|
645
717
|
}
|
|
646
718
|
|
|
647
|
-
|
|
648
|
-
return
|
|
719
|
+
simd8float32_tpl operator-(simd8float32_tpl other) const {
|
|
720
|
+
return simd8float32_tpl(_mm256_sub_ps(f, other.f));
|
|
649
721
|
}
|
|
650
722
|
|
|
651
|
-
|
|
723
|
+
simd8float32_tpl& operator+=(const simd8float32_tpl& other) {
|
|
652
724
|
f = _mm256_add_ps(f, other.f);
|
|
653
725
|
return *this;
|
|
654
726
|
}
|
|
655
727
|
|
|
656
|
-
bool operator==(
|
|
728
|
+
bool operator==(simd8float32_tpl other) const {
|
|
657
729
|
const __m256i pcmp =
|
|
658
730
|
_mm256_castps_si256(_mm256_cmp_ps(f, other.f, _CMP_EQ_OQ));
|
|
659
731
|
unsigned bitmask = _mm256_movemask_epi8(pcmp);
|
|
660
732
|
return (bitmask == 0xffffffffU);
|
|
661
733
|
}
|
|
662
734
|
|
|
663
|
-
bool operator!=(
|
|
735
|
+
bool operator!=(simd8float32_tpl other) const {
|
|
664
736
|
return !(*this == other);
|
|
665
737
|
}
|
|
666
738
|
|
|
@@ -669,8 +741,9 @@ struct simd8float32 : simd256bit {
|
|
|
669
741
|
storeu((void*)tab);
|
|
670
742
|
char res[1000];
|
|
671
743
|
char* ptr = res;
|
|
672
|
-
for (int
|
|
673
|
-
ptr +=
|
|
744
|
+
for (int idx = 0; idx < 8; idx++) {
|
|
745
|
+
ptr += snprintf(
|
|
746
|
+
ptr, (size_t)(res + sizeof(res) - ptr), "%g,", tab[idx]);
|
|
674
747
|
}
|
|
675
748
|
// strip last ,
|
|
676
749
|
ptr[-1] = 0;
|
|
@@ -678,21 +751,30 @@ struct simd8float32 : simd256bit {
|
|
|
678
751
|
}
|
|
679
752
|
};
|
|
680
753
|
|
|
681
|
-
inline
|
|
682
|
-
|
|
754
|
+
inline simd8float32_tpl<SIMDLevel::AVX2> hadd(
|
|
755
|
+
simd8float32_tpl<SIMDLevel::AVX2> a,
|
|
756
|
+
simd8float32_tpl<SIMDLevel::AVX2> b) {
|
|
757
|
+
return simd8float32_tpl<SIMDLevel::AVX2>(_mm256_hadd_ps(a.f, b.f));
|
|
683
758
|
}
|
|
684
759
|
|
|
685
|
-
inline
|
|
686
|
-
|
|
760
|
+
inline simd8float32_tpl<SIMDLevel::AVX2> unpacklo(
|
|
761
|
+
simd8float32_tpl<SIMDLevel::AVX2> a,
|
|
762
|
+
simd8float32_tpl<SIMDLevel::AVX2> b) {
|
|
763
|
+
return simd8float32_tpl<SIMDLevel::AVX2>(_mm256_unpacklo_ps(a.f, b.f));
|
|
687
764
|
}
|
|
688
765
|
|
|
689
|
-
inline
|
|
690
|
-
|
|
766
|
+
inline simd8float32_tpl<SIMDLevel::AVX2> unpackhi(
|
|
767
|
+
simd8float32_tpl<SIMDLevel::AVX2> a,
|
|
768
|
+
simd8float32_tpl<SIMDLevel::AVX2> b) {
|
|
769
|
+
return simd8float32_tpl<SIMDLevel::AVX2>(_mm256_unpackhi_ps(a.f, b.f));
|
|
691
770
|
}
|
|
692
771
|
|
|
693
772
|
// compute a * b + c
|
|
694
|
-
inline
|
|
695
|
-
|
|
773
|
+
inline simd8float32_tpl<SIMDLevel::AVX2> fmadd(
|
|
774
|
+
simd8float32_tpl<SIMDLevel::AVX2> a,
|
|
775
|
+
simd8float32_tpl<SIMDLevel::AVX2> b,
|
|
776
|
+
simd8float32_tpl<SIMDLevel::AVX2> c) {
|
|
777
|
+
return simd8float32_tpl<SIMDLevel::AVX2>(_mm256_fmadd_ps(a.f, b.f, c.f));
|
|
696
778
|
}
|
|
697
779
|
|
|
698
780
|
// The following primitive is a vectorized version of the following code
|
|
@@ -727,10 +809,10 @@ inline simd8float32 fmadd(simd8float32 a, simd8float32 b, simd8float32 c) {
|
|
|
727
809
|
// confusion for ppl who write in low-level SIMD instructions. Additionally,
|
|
728
810
|
// these two ops (cmp and blend) are very often used together.
|
|
729
811
|
inline void cmplt_and_blend_inplace(
|
|
730
|
-
const
|
|
731
|
-
const
|
|
732
|
-
|
|
733
|
-
|
|
812
|
+
const simd8float32_tpl<SIMDLevel::AVX2> candidateValues,
|
|
813
|
+
const simd8uint32_tpl<SIMDLevel::AVX2> candidateIndices,
|
|
814
|
+
simd8float32_tpl<SIMDLevel::AVX2>& lowestValues,
|
|
815
|
+
simd8uint32_tpl<SIMDLevel::AVX2>& lowestIndices) {
|
|
734
816
|
const __m256 comparison =
|
|
735
817
|
_mm256_cmp_ps(lowestValues.f, candidateValues.f, _CMP_LE_OS);
|
|
736
818
|
lowestValues.f = _mm256_min_ps(candidateValues.f, lowestValues.f);
|
|
@@ -752,14 +834,14 @@ inline void cmplt_and_blend_inplace(
|
|
|
752
834
|
// the last equal value is saved instead of the first one), but this behavior
|
|
753
835
|
// saves instructions.
|
|
754
836
|
inline void cmplt_min_max_fast(
|
|
755
|
-
const
|
|
756
|
-
const
|
|
757
|
-
const
|
|
758
|
-
const
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
837
|
+
const simd8float32_tpl<SIMDLevel::AVX2> candidateValues,
|
|
838
|
+
const simd8uint32_tpl<SIMDLevel::AVX2> candidateIndices,
|
|
839
|
+
const simd8float32_tpl<SIMDLevel::AVX2> currentValues,
|
|
840
|
+
const simd8uint32_tpl<SIMDLevel::AVX2> currentIndices,
|
|
841
|
+
simd8float32_tpl<SIMDLevel::AVX2>& minValues,
|
|
842
|
+
simd8uint32_tpl<SIMDLevel::AVX2>& minIndices,
|
|
843
|
+
simd8float32_tpl<SIMDLevel::AVX2>& maxValues,
|
|
844
|
+
simd8uint32_tpl<SIMDLevel::AVX2>& maxIndices) {
|
|
763
845
|
const __m256 comparison =
|
|
764
846
|
_mm256_cmp_ps(currentValues.f, candidateValues.f, _CMP_LE_OS);
|
|
765
847
|
minValues.f = _mm256_min_ps(candidateValues.f, currentValues.f);
|
|
@@ -777,29 +859,39 @@ inline void cmplt_min_max_fast(
|
|
|
777
859
|
namespace {
|
|
778
860
|
|
|
779
861
|
// get even float32's of a and b, interleaved
|
|
780
|
-
inline
|
|
781
|
-
|
|
862
|
+
inline simd8float32_tpl<SIMDLevel::AVX2> geteven(
|
|
863
|
+
simd8float32_tpl<SIMDLevel::AVX2> a,
|
|
864
|
+
simd8float32_tpl<SIMDLevel::AVX2> b) {
|
|
865
|
+
return simd8float32_tpl<SIMDLevel::AVX2>(
|
|
782
866
|
_mm256_shuffle_ps(a.f, b.f, 0 << 0 | 2 << 2 | 0 << 4 | 2 << 6));
|
|
783
867
|
}
|
|
784
868
|
|
|
785
869
|
// get odd float32's of a and b, interleaved
|
|
786
|
-
inline
|
|
787
|
-
|
|
870
|
+
inline simd8float32_tpl<SIMDLevel::AVX2> getodd(
|
|
871
|
+
simd8float32_tpl<SIMDLevel::AVX2> a,
|
|
872
|
+
simd8float32_tpl<SIMDLevel::AVX2> b) {
|
|
873
|
+
return simd8float32_tpl<SIMDLevel::AVX2>(
|
|
788
874
|
_mm256_shuffle_ps(a.f, b.f, 1 << 0 | 3 << 2 | 1 << 4 | 3 << 6));
|
|
789
875
|
}
|
|
790
876
|
|
|
791
877
|
// 3 cycles
|
|
792
878
|
// if the lanes are a = [a0 a1] and b = [b0 b1], return [a0 b0]
|
|
793
|
-
inline
|
|
794
|
-
|
|
879
|
+
inline simd8float32_tpl<SIMDLevel::AVX2> getlow128(
|
|
880
|
+
simd8float32_tpl<SIMDLevel::AVX2> a,
|
|
881
|
+
simd8float32_tpl<SIMDLevel::AVX2> b) {
|
|
882
|
+
return simd8float32_tpl<SIMDLevel::AVX2>(
|
|
883
|
+
_mm256_permute2f128_ps(a.f, b.f, 0 | 2 << 4));
|
|
795
884
|
}
|
|
796
885
|
|
|
797
|
-
inline
|
|
798
|
-
|
|
886
|
+
inline simd8float32_tpl<SIMDLevel::AVX2> gethigh128(
|
|
887
|
+
const simd8float32_tpl<SIMDLevel::AVX2>& a,
|
|
888
|
+
const simd8float32_tpl<SIMDLevel::AVX2>& b) {
|
|
889
|
+
return simd8float32_tpl<SIMDLevel::AVX2>(
|
|
890
|
+
_mm256_permute2f128_ps(a.f, b.f, 1 | 3 << 4));
|
|
799
891
|
}
|
|
800
892
|
|
|
801
893
|
// horizontal add: sum all 8 floats in the register
|
|
802
|
-
inline float horizontal_add(const
|
|
894
|
+
inline float horizontal_add(const simd8float32_tpl<SIMDLevel::AVX2>& a) {
|
|
803
895
|
__m128 sum = _mm_add_ps(
|
|
804
896
|
_mm256_castps256_ps128(a.f), _mm256_extractf128_ps(a.f, 1));
|
|
805
897
|
__m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2));
|