faiss 0.6.0 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/faiss/extconf.rb +2 -1
- data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
- data/ext/faiss/index_binary.cpp +1 -1
- data/ext/faiss/kmeans.cpp +1 -1
- data/ext/faiss/pca_matrix.cpp +1 -1
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +93 -80
- data/vendor/faiss/faiss/Clustering.cpp +39 -240
- data/vendor/faiss/faiss/Clustering.h +6 -0
- data/vendor/faiss/faiss/IVFlib.cpp +41 -21
- data/vendor/faiss/faiss/Index.cpp +6 -5
- data/vendor/faiss/faiss/Index.h +5 -5
- data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
- data/vendor/faiss/faiss/IndexBinary.h +4 -4
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +88 -97
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
- data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +89 -417
- data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
- data/vendor/faiss/faiss/IndexFastScan.h +25 -23
- data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
- data/vendor/faiss/faiss/IndexFlat.h +21 -18
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
- data/vendor/faiss/faiss/IndexHNSW.cpp +374 -206
- data/vendor/faiss/faiss/IndexHNSW.h +16 -2
- data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
- data/vendor/faiss/faiss/IndexIDMap.h +9 -7
- data/vendor/faiss/faiss/IndexIVF.cpp +467 -364
- data/vendor/faiss/faiss/IndexIVF.h +33 -12
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +79 -76
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
- data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +39 -69
- data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +56 -33
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +73 -846
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +23 -20
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +30 -52
- data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
- data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
- data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
- data/vendor/faiss/faiss/IndexNSG.cpp +38 -23
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
- data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
- data/vendor/faiss/faiss/IndexPQ.h +3 -2
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
- data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
- data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
- data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
- data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
- data/vendor/faiss/faiss/IndexRefine.h +4 -4
- data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +150 -20
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
- data/vendor/faiss/faiss/IndexShards.cpp +10 -9
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
- data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
- data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
- data/vendor/faiss/faiss/MetaIndexes.h +1 -1
- data/vendor/faiss/faiss/MetricType.h +14 -7
- data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
- data/vendor/faiss/faiss/SuperKMeans.h +97 -0
- data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
- data/vendor/faiss/faiss/VectorTransform.h +16 -16
- data/vendor/faiss/faiss/build.cpp +23 -0
- data/vendor/faiss/faiss/build.h +15 -0
- data/vendor/faiss/faiss/clone_index.cpp +48 -47
- data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
- data/vendor/faiss/faiss/factory_tools.cpp +9 -0
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
- data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +15 -16
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +5 -4
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
- data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
- data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
- data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
- data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
- data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
- data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +58 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +111 -0
- data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
- data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
- data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
- data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
- data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
- data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
- data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
- data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
- data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
- data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
- data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
- data/vendor/faiss/faiss/impl/FaissException.h +50 -3
- data/vendor/faiss/faiss/impl/HNSW.cpp +639 -507
- data/vendor/faiss/faiss/impl/HNSW.h +61 -44
- data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
- data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
- data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
- data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
- data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
- data/vendor/faiss/faiss/impl/NSG.cpp +53 -32
- data/vendor/faiss/faiss/impl/NSG.h +4 -4
- data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
- data/vendor/faiss/faiss/impl/Panorama.h +269 -87
- data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
- data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +55 -25
- data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
- data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +302 -283
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +100 -75
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +318 -7
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +77 -1
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
- data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
- data/vendor/faiss/faiss/impl/VisitedTable.h +70 -28
- data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
- data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
- data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
- data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
- data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
- data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
- data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
- data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
- data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +270 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
- data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
- data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
- data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
- data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
- data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +83 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +113 -0
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +150 -0
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +142 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +1227 -79
- data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
- data/vendor/faiss/faiss/impl/index_write.cpp +96 -13
- data/vendor/faiss/faiss/impl/io.cpp +6 -6
- data/vendor/faiss/faiss/impl/io_macros.h +58 -16
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
- data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
- data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
- data/vendor/faiss/faiss/impl/platform_macros.h +15 -4
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +23 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +23 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +23 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +45 -107
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +274 -5
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +10 -7
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +70 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +9 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +419 -19
- data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +387 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +341 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +425 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +290 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +337 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +157 -66
- data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
- data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
- data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
- data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
- data/vendor/faiss/faiss/impl/svs_io.h +8 -2
- data/vendor/faiss/faiss/index_factory.cpp +90 -18
- data/vendor/faiss/faiss/index_io.h +40 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +28 -15
- data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +170 -86
- data/vendor/faiss/faiss/invlists/InvertedLists.h +88 -25
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +142 -21
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +33 -7
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +77 -27
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +10 -4
- data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
- data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
- data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
- data/vendor/faiss/faiss/utils/bf16.h +34 -0
- data/vendor/faiss/faiss/utils/distances.cpp +390 -560
- data/vendor/faiss/faiss/utils/distances.h +20 -1
- data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -178
- data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
- data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
- data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
- data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
- data/vendor/faiss/faiss/utils/hamming.h +92 -2
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +16 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +210 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +66 -989
- data/vendor/faiss/faiss/utils/partitioning.h +31 -0
- data/vendor/faiss/faiss/utils/popcount.h +29 -0
- data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
- data/vendor/faiss/faiss/utils/prefetch.h +2 -2
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
- data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
- data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
- data/vendor/faiss/faiss/utils/random.cpp +6 -6
- data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
- data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
- data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
- data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1031 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +29 -7
- data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
- data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
- data/vendor/faiss/faiss/utils/utils.cpp +5 -5
- data/vendor/faiss/faiss/utils/utils.h +3 -3
- metadata +129 -34
- data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
- data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
- data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
- data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
- data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
- data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
- data/vendor/faiss/faiss/utils/simdlib.h +0 -42
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
- /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
|
@@ -8,14 +8,15 @@
|
|
|
8
8
|
#include <faiss/utils/partitioning.h>
|
|
9
9
|
|
|
10
10
|
#include <cassert>
|
|
11
|
+
#include <cinttypes>
|
|
11
12
|
#include <cmath>
|
|
13
|
+
#include <cstring>
|
|
14
|
+
#include <type_traits>
|
|
12
15
|
|
|
13
16
|
#include <faiss/impl/FaissAssert.h>
|
|
17
|
+
#include <faiss/impl/simd_dispatch.h>
|
|
14
18
|
#include <faiss/utils/AlignedTable.h>
|
|
15
19
|
#include <faiss/utils/ordered_key_value.h>
|
|
16
|
-
#include <faiss/utils/simdlib.h>
|
|
17
|
-
|
|
18
|
-
#include <faiss/impl/platform_macros.h>
|
|
19
20
|
|
|
20
21
|
namespace faiss {
|
|
21
22
|
|
|
@@ -50,8 +51,8 @@ typename C::T sample_threshold_median3(
|
|
|
50
51
|
T val3[3];
|
|
51
52
|
int vi = 0;
|
|
52
53
|
|
|
53
|
-
for (size_t i = 0; i < n; i++) {
|
|
54
|
-
T v = vals[(i * big_prime) % n];
|
|
54
|
+
for (size_t i = 0; i < static_cast<size_t>(n); i++) {
|
|
55
|
+
T v = vals[(i * big_prime) % static_cast<size_t>(n)];
|
|
55
56
|
// thresh_inf < v < thresh_sup (for CMax)
|
|
56
57
|
if (C::cmp(v, thresh_inf) && C::cmp(thresh_sup, v)) {
|
|
57
58
|
val3[vi++] = v;
|
|
@@ -217,527 +218,9 @@ typename C::T partition_fuzzy_median3(
|
|
|
217
218
|
return thresh;
|
|
218
219
|
}
|
|
219
220
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
/******************************************************************
|
|
223
|
-
* SIMD routines when vals is an aligned array of uint16_t
|
|
224
|
-
******************************************************************/
|
|
225
|
-
|
|
226
|
-
namespace simd_partitioning {
|
|
227
|
-
|
|
228
|
-
void find_minimax(
|
|
229
|
-
const uint16_t* vals,
|
|
230
|
-
size_t n,
|
|
231
|
-
uint16_t& smin,
|
|
232
|
-
uint16_t& smax) {
|
|
233
|
-
simd16uint16 vmin(0xffff), vmax(0);
|
|
234
|
-
for (size_t i = 0; i + 15 < n; i += 16) {
|
|
235
|
-
simd16uint16 v(vals + i);
|
|
236
|
-
vmin.accu_min(v);
|
|
237
|
-
vmax.accu_max(v);
|
|
238
|
-
}
|
|
239
|
-
|
|
240
|
-
ALIGNED(32) uint16_t tab32[32];
|
|
241
|
-
vmin.store(tab32);
|
|
242
|
-
vmax.store(tab32 + 16);
|
|
243
|
-
|
|
244
|
-
smin = tab32[0], smax = tab32[16];
|
|
245
|
-
|
|
246
|
-
for (int i = 1; i < 16; i++) {
|
|
247
|
-
smin = std::min(smin, tab32[i]);
|
|
248
|
-
smax = std::max(smax, tab32[i + 16]);
|
|
249
|
-
}
|
|
250
|
-
|
|
251
|
-
// missing values
|
|
252
|
-
for (size_t i = (n & ~15); i < n; i++) {
|
|
253
|
-
smin = std::min(smin, vals[i]);
|
|
254
|
-
smax = std::max(smax, vals[i]);
|
|
255
|
-
}
|
|
256
|
-
}
|
|
257
|
-
|
|
258
|
-
// max func differentiates between CMin and CMax (keep lowest or largest)
|
|
259
|
-
template <class C>
|
|
260
|
-
simd16uint16 max_func(simd16uint16 v, simd16uint16 thr16) {
|
|
261
|
-
constexpr bool is_max = C::is_max;
|
|
262
|
-
if (is_max) {
|
|
263
|
-
return max(v, thr16);
|
|
264
|
-
} else {
|
|
265
|
-
return min(v, thr16);
|
|
266
|
-
}
|
|
267
|
-
}
|
|
268
|
-
|
|
269
|
-
template <class C>
|
|
270
|
-
void count_lt_and_eq(
|
|
271
|
-
const uint16_t* vals,
|
|
272
|
-
int n,
|
|
273
|
-
uint16_t thresh,
|
|
274
|
-
size_t& n_lt,
|
|
275
|
-
size_t& n_eq) {
|
|
276
|
-
n_lt = n_eq = 0;
|
|
277
|
-
simd16uint16 thr16(thresh);
|
|
278
|
-
|
|
279
|
-
size_t n1 = n / 16;
|
|
280
|
-
|
|
281
|
-
for (size_t i = 0; i < n1; i++) {
|
|
282
|
-
simd16uint16 v(vals);
|
|
283
|
-
vals += 16;
|
|
284
|
-
simd16uint16 eqmask = (v == thr16);
|
|
285
|
-
simd16uint16 max2 = max_func<C>(v, thr16);
|
|
286
|
-
simd16uint16 gemask = (v == max2);
|
|
287
|
-
uint32_t bits = get_MSBs(uint16_to_uint8_saturate(eqmask, gemask));
|
|
288
|
-
int i_eq = __builtin_popcount(bits & 0x00ff00ff);
|
|
289
|
-
int i_ge = __builtin_popcount(bits) - i_eq;
|
|
290
|
-
n_eq += i_eq;
|
|
291
|
-
n_lt += 16 - i_ge;
|
|
292
|
-
}
|
|
293
|
-
|
|
294
|
-
for (size_t i = n1 * 16; i < n; i++) {
|
|
295
|
-
uint16_t v = *vals++;
|
|
296
|
-
if (C::cmp(thresh, v)) {
|
|
297
|
-
n_lt++;
|
|
298
|
-
} else if (v == thresh) {
|
|
299
|
-
n_eq++;
|
|
300
|
-
}
|
|
301
|
-
}
|
|
302
|
-
}
|
|
303
|
-
|
|
304
|
-
/* compress separated values and ids table, keeping all values < thresh and at
|
|
305
|
-
* most n_eq equal values */
|
|
306
|
-
template <class C>
|
|
307
|
-
int simd_compress_array(
|
|
308
|
-
uint16_t* vals,
|
|
309
|
-
typename C::TI* ids,
|
|
310
|
-
size_t n,
|
|
311
|
-
uint16_t thresh,
|
|
312
|
-
int n_eq) {
|
|
313
|
-
simd16uint16 thr16(thresh);
|
|
314
|
-
simd16uint16 mixmask(0xff00);
|
|
315
|
-
|
|
316
|
-
int wp = 0;
|
|
317
|
-
size_t i0;
|
|
318
|
-
|
|
319
|
-
// loop while there are eqs to collect
|
|
320
|
-
for (i0 = 0; i0 + 15 < n && n_eq > 0; i0 += 16) {
|
|
321
|
-
simd16uint16 v(vals + i0);
|
|
322
|
-
simd16uint16 max2 = max_func<C>(v, thr16);
|
|
323
|
-
simd16uint16 gemask = (v == max2);
|
|
324
|
-
simd16uint16 eqmask = (v == thr16);
|
|
325
|
-
uint32_t bits = get_MSBs(
|
|
326
|
-
blendv(simd32uint8(eqmask),
|
|
327
|
-
simd32uint8(gemask),
|
|
328
|
-
simd32uint8(mixmask)));
|
|
329
|
-
bits ^= 0xAAAAAAAA;
|
|
330
|
-
// bit 2*i : eq
|
|
331
|
-
// bit 2*i + 1 : lt
|
|
332
|
-
|
|
333
|
-
while (bits) {
|
|
334
|
-
int j = __builtin_ctz(bits) & (~1);
|
|
335
|
-
bool is_eq = (bits >> j) & 1;
|
|
336
|
-
bool is_lt = (bits >> j) & 2;
|
|
337
|
-
bits &= ~(3 << j);
|
|
338
|
-
j >>= 1;
|
|
339
|
-
|
|
340
|
-
if (is_lt) {
|
|
341
|
-
vals[wp] = vals[i0 + j];
|
|
342
|
-
ids[wp] = ids[i0 + j];
|
|
343
|
-
wp++;
|
|
344
|
-
} else if (is_eq && n_eq > 0) {
|
|
345
|
-
vals[wp] = vals[i0 + j];
|
|
346
|
-
ids[wp] = ids[i0 + j];
|
|
347
|
-
wp++;
|
|
348
|
-
n_eq--;
|
|
349
|
-
}
|
|
350
|
-
}
|
|
351
|
-
}
|
|
352
|
-
|
|
353
|
-
// handle remaining, only strictly lt ones.
|
|
354
|
-
for (; i0 + 15 < n; i0 += 16) {
|
|
355
|
-
simd16uint16 v(vals + i0);
|
|
356
|
-
simd16uint16 max2 = max_func<C>(v, thr16);
|
|
357
|
-
simd16uint16 gemask = (v == max2);
|
|
358
|
-
uint32_t bits = ~get_MSBs(simd32uint8(gemask));
|
|
359
|
-
|
|
360
|
-
while (bits) {
|
|
361
|
-
int j = __builtin_ctz(bits);
|
|
362
|
-
bits &= ~(3 << j);
|
|
363
|
-
j >>= 1;
|
|
364
|
-
|
|
365
|
-
vals[wp] = vals[i0 + j];
|
|
366
|
-
ids[wp] = ids[i0 + j];
|
|
367
|
-
wp++;
|
|
368
|
-
}
|
|
369
|
-
}
|
|
370
|
-
|
|
371
|
-
// end with scalar
|
|
372
|
-
for (int i = (n & ~15); i < n; i++) {
|
|
373
|
-
if (C::cmp(thresh, vals[i])) {
|
|
374
|
-
vals[wp] = vals[i];
|
|
375
|
-
ids[wp] = ids[i];
|
|
376
|
-
wp++;
|
|
377
|
-
} else if (vals[i] == thresh && n_eq > 0) {
|
|
378
|
-
vals[wp] = vals[i];
|
|
379
|
-
ids[wp] = ids[i];
|
|
380
|
-
wp++;
|
|
381
|
-
n_eq--;
|
|
382
|
-
}
|
|
383
|
-
}
|
|
384
|
-
assert(n_eq == 0);
|
|
385
|
-
return wp;
|
|
386
|
-
}
|
|
387
|
-
|
|
388
|
-
// #define MICRO_BENCHMARK
|
|
389
|
-
|
|
390
|
-
static uint64_t get_cy() {
|
|
391
|
-
#ifdef MICRO_BENCHMARK
|
|
392
|
-
uint32_t high, low;
|
|
393
|
-
asm volatile("rdtsc \n\t" : "=a"(low), "=d"(high));
|
|
394
|
-
return ((uint64_t)high << 32) | (low);
|
|
395
|
-
#else
|
|
396
|
-
return 0;
|
|
397
|
-
#endif
|
|
398
|
-
}
|
|
399
|
-
|
|
400
|
-
#define IFV if (false)
|
|
401
|
-
|
|
402
|
-
template <class C>
|
|
403
|
-
uint16_t simd_partition_fuzzy_with_bounds(
|
|
404
|
-
uint16_t* vals,
|
|
405
|
-
typename C::TI* ids,
|
|
406
|
-
size_t n,
|
|
407
|
-
size_t q_min,
|
|
408
|
-
size_t q_max,
|
|
409
|
-
size_t* q_out,
|
|
410
|
-
uint16_t s0i,
|
|
411
|
-
uint16_t s1i) {
|
|
412
|
-
if (q_min == 0) {
|
|
413
|
-
if (q_out) {
|
|
414
|
-
*q_out = 0;
|
|
415
|
-
}
|
|
416
|
-
return 0;
|
|
417
|
-
}
|
|
418
|
-
if (q_max >= n) {
|
|
419
|
-
if (q_out) {
|
|
420
|
-
*q_out = q_max;
|
|
421
|
-
}
|
|
422
|
-
return 0xffff;
|
|
423
|
-
}
|
|
424
|
-
if (s0i == s1i) {
|
|
425
|
-
if (q_out) {
|
|
426
|
-
*q_out = q_min;
|
|
427
|
-
}
|
|
428
|
-
return s0i;
|
|
429
|
-
}
|
|
430
|
-
uint64_t t0 = get_cy();
|
|
431
|
-
|
|
432
|
-
// lower bound inclusive, upper exclusive
|
|
433
|
-
size_t s0 = s0i, s1 = s1i + 1;
|
|
434
|
-
|
|
435
|
-
IFV printf("bounds: %ld %ld\n", s0, s1 - 1);
|
|
436
|
-
|
|
437
|
-
int thresh;
|
|
438
|
-
size_t n_eq = 0, n_lt = 0;
|
|
439
|
-
size_t q = 0;
|
|
440
|
-
|
|
441
|
-
for (int it = 0; it < 200; it++) {
|
|
442
|
-
// while(s0 + 1 < s1) {
|
|
443
|
-
thresh = (s0 + s1) / 2;
|
|
444
|
-
count_lt_and_eq<C>(vals, n, thresh, n_lt, n_eq);
|
|
445
|
-
|
|
446
|
-
IFV printf(
|
|
447
|
-
" [%ld %ld] thresh=%d n_lt=%ld n_eq=%ld, q=%ld:%ld/%ld\n",
|
|
448
|
-
s0,
|
|
449
|
-
s1,
|
|
450
|
-
thresh,
|
|
451
|
-
n_lt,
|
|
452
|
-
n_eq,
|
|
453
|
-
q_min,
|
|
454
|
-
q_max,
|
|
455
|
-
n);
|
|
456
|
-
if (n_lt <= q_min) {
|
|
457
|
-
if (n_lt + n_eq >= q_min) {
|
|
458
|
-
q = q_min;
|
|
459
|
-
break;
|
|
460
|
-
} else {
|
|
461
|
-
if (C::is_max) {
|
|
462
|
-
s0 = thresh;
|
|
463
|
-
} else {
|
|
464
|
-
s1 = thresh;
|
|
465
|
-
}
|
|
466
|
-
}
|
|
467
|
-
} else if (n_lt <= q_max) {
|
|
468
|
-
q = n_lt;
|
|
469
|
-
break;
|
|
470
|
-
} else {
|
|
471
|
-
if (C::is_max) {
|
|
472
|
-
s1 = thresh;
|
|
473
|
-
} else {
|
|
474
|
-
s0 = thresh;
|
|
475
|
-
}
|
|
476
|
-
}
|
|
477
|
-
}
|
|
478
|
-
|
|
479
|
-
uint64_t t1 = get_cy();
|
|
480
|
-
|
|
481
|
-
// number of equal values to keep
|
|
482
|
-
int64_t n_eq_1 = q - n_lt;
|
|
483
|
-
|
|
484
|
-
IFV printf("shrink: thresh=%d q=%ld n_eq_1=%ld\n", thresh, q, n_eq_1);
|
|
485
|
-
if (n_eq_1 < 0) { // happens when > q elements are at lower bound
|
|
486
|
-
assert(s0 + 1 == s1);
|
|
487
|
-
q = q_min;
|
|
488
|
-
if (C::is_max) {
|
|
489
|
-
thresh--;
|
|
490
|
-
} else {
|
|
491
|
-
thresh++;
|
|
492
|
-
}
|
|
493
|
-
n_eq_1 = q;
|
|
494
|
-
IFV printf(" override: thresh=%d n_eq_1=%ld\n", thresh, n_eq_1);
|
|
495
|
-
} else {
|
|
496
|
-
assert(n_eq_1 <= n_eq);
|
|
497
|
-
}
|
|
498
|
-
|
|
499
|
-
size_t wp = simd_compress_array<C>(vals, ids, n, thresh, n_eq_1);
|
|
500
|
-
|
|
501
|
-
IFV printf("wp=%ld\n", wp);
|
|
502
|
-
assert(wp == q);
|
|
503
|
-
if (q_out) {
|
|
504
|
-
*q_out = q;
|
|
505
|
-
}
|
|
506
|
-
|
|
507
|
-
uint64_t t2 = get_cy();
|
|
508
|
-
|
|
509
|
-
partition_stats.bisect_cycles += t1 - t0;
|
|
510
|
-
partition_stats.compress_cycles += t2 - t1;
|
|
511
|
-
|
|
512
|
-
return thresh;
|
|
513
|
-
}
|
|
514
|
-
|
|
515
|
-
template <class C>
|
|
516
|
-
uint16_t simd_partition_fuzzy_with_bounds_histogram(
|
|
517
|
-
uint16_t* vals,
|
|
518
|
-
typename C::TI* ids,
|
|
519
|
-
size_t n,
|
|
520
|
-
size_t q_min,
|
|
521
|
-
size_t q_max,
|
|
522
|
-
size_t* q_out,
|
|
523
|
-
uint16_t s0i,
|
|
524
|
-
uint16_t s1i) {
|
|
525
|
-
if (q_min == 0) {
|
|
526
|
-
if (q_out) {
|
|
527
|
-
*q_out = 0;
|
|
528
|
-
}
|
|
529
|
-
return 0;
|
|
530
|
-
}
|
|
531
|
-
if (q_max >= n) {
|
|
532
|
-
if (q_out) {
|
|
533
|
-
*q_out = q_max;
|
|
534
|
-
}
|
|
535
|
-
return 0xffff;
|
|
536
|
-
}
|
|
537
|
-
if (s0i == s1i) {
|
|
538
|
-
if (q_out) {
|
|
539
|
-
*q_out = q_min;
|
|
540
|
-
}
|
|
541
|
-
return s0i;
|
|
542
|
-
}
|
|
543
|
-
|
|
544
|
-
IFV printf(
|
|
545
|
-
"partition fuzzy, q=%ld:%ld / %ld, bounds=%d %d\n",
|
|
546
|
-
q_min,
|
|
547
|
-
q_max,
|
|
548
|
-
n,
|
|
549
|
-
s0i,
|
|
550
|
-
s1i);
|
|
551
|
-
|
|
552
|
-
if (!C::is_max) {
|
|
553
|
-
IFV printf(
|
|
554
|
-
"revert due to CMin, q_min:q_max -> %ld:%ld\n", q_min, q_max);
|
|
555
|
-
q_min = n - q_min;
|
|
556
|
-
q_max = n - q_max;
|
|
557
|
-
}
|
|
558
|
-
|
|
559
|
-
// lower and upper bound of range, inclusive
|
|
560
|
-
int s0 = s0i, s1 = s1i;
|
|
561
|
-
// number of values < s0 and > s1
|
|
562
|
-
size_t n_lt = 0, n_gt = 0;
|
|
563
|
-
|
|
564
|
-
// output of loop:
|
|
565
|
-
int thresh; // final threshold
|
|
566
|
-
uint64_t tot_eq = 0; // total nb of equal values
|
|
567
|
-
uint64_t n_eq = 0; // nb of equal values to keep
|
|
568
|
-
size_t q; // final quantile
|
|
569
|
-
|
|
570
|
-
// buffer for the histograms
|
|
571
|
-
int hist[16];
|
|
572
|
-
|
|
573
|
-
for (int it = 0; it < 20; it++) {
|
|
574
|
-
// otherwise we would be done already
|
|
575
|
-
|
|
576
|
-
int shift = 0;
|
|
577
|
-
|
|
578
|
-
IFV printf(
|
|
579
|
-
" it %d bounds: %d %d n_lt=%ld n_gt=%ld\n",
|
|
580
|
-
it,
|
|
581
|
-
s0,
|
|
582
|
-
s1,
|
|
583
|
-
n_lt,
|
|
584
|
-
n_gt);
|
|
585
|
-
|
|
586
|
-
int maxval = s1 - s0;
|
|
587
|
-
|
|
588
|
-
while (maxval > 15) {
|
|
589
|
-
shift++;
|
|
590
|
-
maxval >>= 1;
|
|
591
|
-
}
|
|
592
|
-
|
|
593
|
-
IFV printf(
|
|
594
|
-
" histogram shift %d maxval %d ?= %d\n",
|
|
595
|
-
shift,
|
|
596
|
-
maxval,
|
|
597
|
-
int((s1 - s0) >> shift));
|
|
598
|
-
|
|
599
|
-
if (maxval > 7) {
|
|
600
|
-
simd_histogram_16(vals, n, s0, shift, hist);
|
|
601
|
-
} else {
|
|
602
|
-
simd_histogram_8(vals, n, s0, shift, hist);
|
|
603
|
-
}
|
|
604
|
-
IFV {
|
|
605
|
-
int sum = n_lt + n_gt;
|
|
606
|
-
printf(" n_lt=%ld hist=[", n_lt);
|
|
607
|
-
for (int i = 0; i <= maxval; i++) {
|
|
608
|
-
printf("%d ", hist[i]);
|
|
609
|
-
sum += hist[i];
|
|
610
|
-
}
|
|
611
|
-
printf("] n_gt=%ld sum=%d\n", n_gt, sum);
|
|
612
|
-
assert(sum == n);
|
|
613
|
-
}
|
|
614
|
-
|
|
615
|
-
size_t sum_below = n_lt;
|
|
616
|
-
int i;
|
|
617
|
-
for (i = 0; i <= maxval; i++) {
|
|
618
|
-
sum_below += hist[i];
|
|
619
|
-
if (sum_below >= q_min) {
|
|
620
|
-
break;
|
|
621
|
-
}
|
|
622
|
-
}
|
|
623
|
-
IFV printf(" i=%d sum_below=%ld\n", i, sum_below);
|
|
624
|
-
if (i <= maxval) {
|
|
625
|
-
s0 = s0 + (i << shift);
|
|
626
|
-
s1 = s0 + (1 << shift) - 1;
|
|
627
|
-
n_lt = sum_below - hist[i];
|
|
628
|
-
n_gt = n - sum_below;
|
|
629
|
-
} else {
|
|
630
|
-
assert(false && "not implemented");
|
|
631
|
-
}
|
|
632
|
-
|
|
633
|
-
IFV printf(
|
|
634
|
-
" new bin: s0=%d s1=%d n_lt=%ld n_gt=%ld\n",
|
|
635
|
-
s0,
|
|
636
|
-
s1,
|
|
637
|
-
n_lt,
|
|
638
|
-
n_gt);
|
|
639
|
-
|
|
640
|
-
if (s1 > s0) {
|
|
641
|
-
if (n_lt >= q_min && q_max >= n_lt) {
|
|
642
|
-
IFV printf(" FOUND1\n");
|
|
643
|
-
thresh = s0;
|
|
644
|
-
q = n_lt;
|
|
645
|
-
break;
|
|
646
|
-
}
|
|
647
|
-
|
|
648
|
-
size_t n_lt_2 = n - n_gt;
|
|
649
|
-
if (n_lt_2 >= q_min && q_max >= n_lt_2) {
|
|
650
|
-
thresh = s1 + 1;
|
|
651
|
-
q = n_lt_2;
|
|
652
|
-
IFV printf(" FOUND2\n");
|
|
653
|
-
break;
|
|
654
|
-
}
|
|
655
|
-
} else {
|
|
656
|
-
thresh = s0;
|
|
657
|
-
q = q_min;
|
|
658
|
-
tot_eq = n - n_gt - n_lt;
|
|
659
|
-
n_eq = q_min - n_lt;
|
|
660
|
-
IFV printf(" FOUND3\n");
|
|
661
|
-
break;
|
|
662
|
-
}
|
|
663
|
-
}
|
|
664
|
-
|
|
665
|
-
IFV printf("end bisection: thresh=%d q=%ld n_eq=%ld\n", thresh, q, n_eq);
|
|
666
|
-
|
|
667
|
-
if (!C::is_max) {
|
|
668
|
-
if (n_eq == 0) {
|
|
669
|
-
thresh--;
|
|
670
|
-
} else {
|
|
671
|
-
// thresh unchanged
|
|
672
|
-
n_eq = tot_eq - n_eq;
|
|
673
|
-
}
|
|
674
|
-
q = n - q;
|
|
675
|
-
IFV printf("revert due to CMin, q->%ld n_eq->%ld\n", q, n_eq);
|
|
676
|
-
}
|
|
677
|
-
|
|
678
|
-
size_t wp = simd_compress_array<C>(vals, ids, n, thresh, n_eq);
|
|
679
|
-
IFV printf("wp=%ld ?= %ld\n", wp, q);
|
|
680
|
-
assert(wp == q);
|
|
681
|
-
if (q_out) {
|
|
682
|
-
*q_out = wp;
|
|
683
|
-
}
|
|
684
|
-
|
|
685
|
-
return thresh;
|
|
686
|
-
}
|
|
687
|
-
|
|
688
|
-
template <class C>
|
|
689
|
-
uint16_t simd_partition_fuzzy(
|
|
690
|
-
uint16_t* vals,
|
|
691
|
-
typename C::TI* ids,
|
|
692
|
-
size_t n,
|
|
693
|
-
size_t q_min,
|
|
694
|
-
size_t q_max,
|
|
695
|
-
size_t* q_out) {
|
|
696
|
-
assert(is_aligned_pointer(vals));
|
|
697
|
-
|
|
698
|
-
uint16_t s0i, s1i;
|
|
699
|
-
find_minimax(vals, n, s0i, s1i);
|
|
700
|
-
// QSelect_stats.t0 += get_cy() - t0;
|
|
701
|
-
|
|
702
|
-
return simd_partition_fuzzy_with_bounds<C>(
|
|
703
|
-
vals, ids, n, q_min, q_max, q_out, s0i, s1i);
|
|
704
|
-
}
|
|
705
|
-
|
|
706
|
-
template <class C>
|
|
707
|
-
uint16_t simd_partition(
|
|
708
|
-
uint16_t* vals,
|
|
709
|
-
typename C::TI* ids,
|
|
710
|
-
size_t n,
|
|
711
|
-
size_t q) {
|
|
712
|
-
assert(is_aligned_pointer(vals));
|
|
713
|
-
|
|
714
|
-
if (q == 0) {
|
|
715
|
-
return 0;
|
|
716
|
-
}
|
|
717
|
-
if (q >= n) {
|
|
718
|
-
return 0xffff;
|
|
719
|
-
}
|
|
720
|
-
|
|
721
|
-
uint16_t s0i, s1i;
|
|
722
|
-
find_minimax(vals, n, s0i, s1i);
|
|
723
|
-
|
|
724
|
-
return simd_partition_fuzzy_with_bounds<C>(
|
|
725
|
-
vals, ids, n, q, q, nullptr, s0i, s1i);
|
|
726
|
-
}
|
|
727
|
-
|
|
728
|
-
template <class C>
|
|
729
|
-
uint16_t simd_partition_with_bounds(
|
|
730
|
-
uint16_t* vals,
|
|
731
|
-
typename C::TI* ids,
|
|
732
|
-
size_t n,
|
|
733
|
-
size_t q,
|
|
734
|
-
uint16_t s0i,
|
|
735
|
-
uint16_t s1i) {
|
|
736
|
-
return simd_partition_fuzzy_with_bounds<C>(
|
|
737
|
-
vals, ids, n, q, q, nullptr, s0i, s1i);
|
|
738
|
-
}
|
|
221
|
+
#undef IFV
|
|
739
222
|
|
|
740
|
-
} // namespace
|
|
223
|
+
} // namespace partitioning
|
|
741
224
|
|
|
742
225
|
/******************************************************************
|
|
743
226
|
* Driver routine
|
|
@@ -751,13 +234,20 @@ typename C::T partition_fuzzy(
|
|
|
751
234
|
size_t q_min,
|
|
752
235
|
size_t q_max,
|
|
753
236
|
size_t* q_out) {
|
|
754
|
-
#ifdef __AVX2__
|
|
755
237
|
constexpr bool is_uint16 = std::is_same<typename C::T, uint16_t>::value;
|
|
756
|
-
if (is_uint16
|
|
757
|
-
|
|
758
|
-
|
|
238
|
+
if constexpr (is_uint16) {
|
|
239
|
+
if (is_aligned_pointer(vals)) {
|
|
240
|
+
return with_simd_level_256bit([&]<SIMDLevel SL>() -> typename C::T {
|
|
241
|
+
if constexpr (SL == SIMDLevel::NONE) {
|
|
242
|
+
return partitioning::partition_fuzzy_median3<C>(
|
|
243
|
+
vals, ids, n, q_min, q_max, q_out);
|
|
244
|
+
} else {
|
|
245
|
+
return partition_fuzzy_simd<SL, C>(
|
|
246
|
+
(uint16_t*)vals, ids, n, q_min, q_max, q_out);
|
|
247
|
+
}
|
|
248
|
+
});
|
|
249
|
+
}
|
|
759
250
|
}
|
|
760
|
-
#endif
|
|
761
251
|
return partitioning::partition_fuzzy_median3<C>(
|
|
762
252
|
vals, ids, n, q_min, q_max, q_out);
|
|
763
253
|
}
|
|
@@ -813,457 +303,12 @@ template uint16_t partition_fuzzy<CMax<uint16_t, int>>(
|
|
|
813
303
|
size_t* q_out);
|
|
814
304
|
|
|
815
305
|
/******************************************************************
|
|
816
|
-
* Histogram subroutines
|
|
306
|
+
* Histogram subroutines — scalar fallbacks
|
|
817
307
|
******************************************************************/
|
|
818
308
|
|
|
819
|
-
#if defined(__AVX2__) || defined(__aarch64__)
|
|
820
|
-
/// FIXME when MSB of uint16 is set
|
|
821
|
-
// this code does not compile properly with GCC 7.4.0
|
|
822
|
-
|
|
823
309
|
namespace {
|
|
824
310
|
|
|
825
|
-
|
|
826
|
-
* 8 bins
|
|
827
|
-
************************************************************/
|
|
828
|
-
|
|
829
|
-
simd32uint8 accu4to8(simd16uint16 a4) {
|
|
830
|
-
simd16uint16 mask4(0x0f0f);
|
|
831
|
-
|
|
832
|
-
simd16uint16 a8_0 = a4 & mask4;
|
|
833
|
-
simd16uint16 a8_1 = (a4 >> 4) & mask4;
|
|
834
|
-
|
|
835
|
-
return simd32uint8(hadd(a8_0, a8_1));
|
|
836
|
-
}
|
|
837
|
-
|
|
838
|
-
simd16uint16 accu8to16(simd32uint8 a8) {
|
|
839
|
-
simd16uint16 mask8(0x00ff);
|
|
840
|
-
|
|
841
|
-
simd16uint16 a8_0 = simd16uint16(a8) & mask8;
|
|
842
|
-
simd16uint16 a8_1 = (simd16uint16(a8) >> 8) & mask8;
|
|
843
|
-
|
|
844
|
-
return hadd(a8_0, a8_1);
|
|
845
|
-
}
|
|
846
|
-
|
|
847
|
-
static const simd32uint8 shifts = simd32uint8::create<
|
|
848
|
-
1,
|
|
849
|
-
16,
|
|
850
|
-
0,
|
|
851
|
-
0,
|
|
852
|
-
4,
|
|
853
|
-
64,
|
|
854
|
-
0,
|
|
855
|
-
0,
|
|
856
|
-
0,
|
|
857
|
-
0,
|
|
858
|
-
1,
|
|
859
|
-
16,
|
|
860
|
-
0,
|
|
861
|
-
0,
|
|
862
|
-
4,
|
|
863
|
-
64,
|
|
864
|
-
1,
|
|
865
|
-
16,
|
|
866
|
-
0,
|
|
867
|
-
0,
|
|
868
|
-
4,
|
|
869
|
-
64,
|
|
870
|
-
0,
|
|
871
|
-
0,
|
|
872
|
-
0,
|
|
873
|
-
0,
|
|
874
|
-
1,
|
|
875
|
-
16,
|
|
876
|
-
0,
|
|
877
|
-
0,
|
|
878
|
-
4,
|
|
879
|
-
64>();
|
|
880
|
-
|
|
881
|
-
// 2-bit accumulator: we can add only up to 3 elements
|
|
882
|
-
// on output we return 2*4-bit results
|
|
883
|
-
// preproc returns either an index in 0..7 or 0xffff
|
|
884
|
-
// that yields a 0 when used in the table look-up
|
|
885
|
-
template <int N, class Preproc>
|
|
886
|
-
void compute_accu2(
|
|
887
|
-
const uint16_t*& data,
|
|
888
|
-
Preproc& pp,
|
|
889
|
-
simd16uint16& a4lo,
|
|
890
|
-
simd16uint16& a4hi) {
|
|
891
|
-
simd16uint16 mask2(0x3333);
|
|
892
|
-
simd16uint16 a2((uint16_t)0); // 2-bit accu
|
|
893
|
-
for (int j = 0; j < N; j++) {
|
|
894
|
-
simd16uint16 v(data);
|
|
895
|
-
data += 16;
|
|
896
|
-
v = pp(v);
|
|
897
|
-
// 0x800 -> force second half of table
|
|
898
|
-
simd16uint16 idx = v | (v << 8) | simd16uint16(0x800);
|
|
899
|
-
a2 += simd16uint16(shifts.lookup_2_lanes(simd32uint8(idx)));
|
|
900
|
-
}
|
|
901
|
-
a4lo += a2 & mask2;
|
|
902
|
-
a4hi += (a2 >> 2) & mask2;
|
|
903
|
-
}
|
|
904
|
-
|
|
905
|
-
template <class Preproc>
|
|
906
|
-
simd16uint16 histogram_8(const uint16_t* data, Preproc pp, size_t n_in) {
|
|
907
|
-
assert(n_in % 16 == 0);
|
|
908
|
-
int n = n_in / 16;
|
|
909
|
-
|
|
910
|
-
simd32uint8 a8lo(0);
|
|
911
|
-
simd32uint8 a8hi(0);
|
|
912
|
-
|
|
913
|
-
for (int i0 = 0; i0 < n; i0 += 15) {
|
|
914
|
-
simd16uint16 a4lo(0); // 4-bit accus
|
|
915
|
-
simd16uint16 a4hi(0);
|
|
916
|
-
|
|
917
|
-
int i1 = std::min(i0 + 15, n);
|
|
918
|
-
int i;
|
|
919
|
-
for (i = i0; i + 2 < i1; i += 3) {
|
|
920
|
-
compute_accu2<3>(data, pp, a4lo, a4hi); // adds 3 max
|
|
921
|
-
}
|
|
922
|
-
switch (i1 - i) {
|
|
923
|
-
case 2:
|
|
924
|
-
compute_accu2<2>(data, pp, a4lo, a4hi);
|
|
925
|
-
break;
|
|
926
|
-
case 1:
|
|
927
|
-
compute_accu2<1>(data, pp, a4lo, a4hi);
|
|
928
|
-
break;
|
|
929
|
-
}
|
|
930
|
-
|
|
931
|
-
a8lo += accu4to8(a4lo);
|
|
932
|
-
a8hi += accu4to8(a4hi);
|
|
933
|
-
}
|
|
934
|
-
|
|
935
|
-
// move to 16-bit accu
|
|
936
|
-
simd16uint16 a16lo = accu8to16(a8lo);
|
|
937
|
-
simd16uint16 a16hi = accu8to16(a8hi);
|
|
938
|
-
|
|
939
|
-
simd16uint16 a16 = hadd(a16lo, a16hi);
|
|
940
|
-
|
|
941
|
-
// the 2 lanes must still be combined
|
|
942
|
-
return a16;
|
|
943
|
-
}
|
|
944
|
-
|
|
945
|
-
/************************************************************
|
|
946
|
-
* 16 bins
|
|
947
|
-
************************************************************/
|
|
948
|
-
|
|
949
|
-
static const simd32uint8 shifts2 = simd32uint8::create<
|
|
950
|
-
1,
|
|
951
|
-
2,
|
|
952
|
-
4,
|
|
953
|
-
8,
|
|
954
|
-
16,
|
|
955
|
-
32,
|
|
956
|
-
64,
|
|
957
|
-
128,
|
|
958
|
-
1,
|
|
959
|
-
2,
|
|
960
|
-
4,
|
|
961
|
-
8,
|
|
962
|
-
16,
|
|
963
|
-
32,
|
|
964
|
-
64,
|
|
965
|
-
128,
|
|
966
|
-
1,
|
|
967
|
-
2,
|
|
968
|
-
4,
|
|
969
|
-
8,
|
|
970
|
-
16,
|
|
971
|
-
32,
|
|
972
|
-
64,
|
|
973
|
-
128,
|
|
974
|
-
1,
|
|
975
|
-
2,
|
|
976
|
-
4,
|
|
977
|
-
8,
|
|
978
|
-
16,
|
|
979
|
-
32,
|
|
980
|
-
64,
|
|
981
|
-
128>();
|
|
982
|
-
|
|
983
|
-
simd32uint8 shiftr_16(simd32uint8 x, int n) {
|
|
984
|
-
return simd32uint8(simd16uint16(x) >> n);
|
|
985
|
-
}
|
|
986
|
-
|
|
987
|
-
// 2-bit accumulator: we can add only up to 3 elements
|
|
988
|
-
// on output we return 2*4-bit results
|
|
989
|
-
template <int N, class Preproc>
|
|
990
|
-
void compute_accu2_16(
|
|
991
|
-
const uint16_t*& data,
|
|
992
|
-
Preproc pp,
|
|
993
|
-
simd32uint8& a4_0,
|
|
994
|
-
simd32uint8& a4_1,
|
|
995
|
-
simd32uint8& a4_2,
|
|
996
|
-
simd32uint8& a4_3) {
|
|
997
|
-
simd32uint8 mask1(0x55);
|
|
998
|
-
simd32uint8 a2_0; // 2-bit accu
|
|
999
|
-
simd32uint8 a2_1; // 2-bit accu
|
|
1000
|
-
a2_0.clear();
|
|
1001
|
-
a2_1.clear();
|
|
1002
|
-
|
|
1003
|
-
for (int j = 0; j < N; j++) {
|
|
1004
|
-
simd16uint16 v(data);
|
|
1005
|
-
data += 16;
|
|
1006
|
-
v = pp(v);
|
|
1007
|
-
|
|
1008
|
-
simd16uint16 idx = v | (v << 8);
|
|
1009
|
-
simd32uint8 a1 = shifts2.lookup_2_lanes(simd32uint8(idx));
|
|
1010
|
-
// contains 0s for out-of-bounds elements
|
|
1011
|
-
|
|
1012
|
-
simd16uint16 lt8 = (v >> 3) == simd16uint16(0);
|
|
1013
|
-
lt8 = lt8 ^ simd16uint16(0xff00);
|
|
1014
|
-
|
|
1015
|
-
a1 = a1 & lt8;
|
|
1016
|
-
|
|
1017
|
-
a2_0 += a1 & mask1;
|
|
1018
|
-
a2_1 += shiftr_16(a1, 1) & mask1;
|
|
1019
|
-
}
|
|
1020
|
-
simd32uint8 mask2(0x33);
|
|
1021
|
-
|
|
1022
|
-
a4_0 += a2_0 & mask2;
|
|
1023
|
-
a4_1 += a2_1 & mask2;
|
|
1024
|
-
a4_2 += shiftr_16(a2_0, 2) & mask2;
|
|
1025
|
-
a4_3 += shiftr_16(a2_1, 2) & mask2;
|
|
1026
|
-
}
|
|
1027
|
-
|
|
1028
|
-
simd32uint8 accu4to8_2(simd32uint8 a4_0, simd32uint8 a4_1) {
|
|
1029
|
-
simd32uint8 mask4(0x0f);
|
|
1030
|
-
|
|
1031
|
-
simd16uint16 a8_0 = combine2x2(
|
|
1032
|
-
(simd16uint16)(a4_0 & mask4),
|
|
1033
|
-
(simd16uint16)(shiftr_16(a4_0, 4) & mask4));
|
|
1034
|
-
|
|
1035
|
-
simd16uint16 a8_1 = combine2x2(
|
|
1036
|
-
(simd16uint16)(a4_1 & mask4),
|
|
1037
|
-
(simd16uint16)(shiftr_16(a4_1, 4) & mask4));
|
|
1038
|
-
|
|
1039
|
-
return simd32uint8(hadd(a8_0, a8_1));
|
|
1040
|
-
}
|
|
1041
|
-
|
|
1042
|
-
template <class Preproc>
|
|
1043
|
-
simd16uint16 histogram_16(const uint16_t* data, Preproc pp, size_t n_in) {
|
|
1044
|
-
assert(n_in % 16 == 0);
|
|
1045
|
-
int n = n_in / 16;
|
|
1046
|
-
|
|
1047
|
-
simd32uint8 a8lo((uint8_t)0);
|
|
1048
|
-
simd32uint8 a8hi((uint8_t)0);
|
|
1049
|
-
|
|
1050
|
-
for (int i0 = 0; i0 < n; i0 += 7) {
|
|
1051
|
-
simd32uint8 a4_0(0); // 0, 4, 8, 12
|
|
1052
|
-
simd32uint8 a4_1(0); // 1, 5, 9, 13
|
|
1053
|
-
simd32uint8 a4_2(0); // 2, 6, 10, 14
|
|
1054
|
-
simd32uint8 a4_3(0); // 3, 7, 11, 15
|
|
1055
|
-
|
|
1056
|
-
int i1 = std::min(i0 + 7, n);
|
|
1057
|
-
int i;
|
|
1058
|
-
for (i = i0; i + 2 < i1; i += 3) {
|
|
1059
|
-
compute_accu2_16<3>(data, pp, a4_0, a4_1, a4_2, a4_3);
|
|
1060
|
-
}
|
|
1061
|
-
switch (i1 - i) {
|
|
1062
|
-
case 2:
|
|
1063
|
-
compute_accu2_16<2>(data, pp, a4_0, a4_1, a4_2, a4_3);
|
|
1064
|
-
break;
|
|
1065
|
-
case 1:
|
|
1066
|
-
compute_accu2_16<1>(data, pp, a4_0, a4_1, a4_2, a4_3);
|
|
1067
|
-
break;
|
|
1068
|
-
}
|
|
1069
|
-
|
|
1070
|
-
a8lo += accu4to8_2(a4_0, a4_1);
|
|
1071
|
-
a8hi += accu4to8_2(a4_2, a4_3);
|
|
1072
|
-
}
|
|
1073
|
-
|
|
1074
|
-
// move to 16-bit accu
|
|
1075
|
-
simd16uint16 a16lo = accu8to16(a8lo);
|
|
1076
|
-
simd16uint16 a16hi = accu8to16(a8hi);
|
|
1077
|
-
|
|
1078
|
-
simd16uint16 a16 = hadd(a16lo, a16hi);
|
|
1079
|
-
|
|
1080
|
-
a16 = simd16uint16{simd8uint32{a16}.unzip()};
|
|
1081
|
-
|
|
1082
|
-
return a16;
|
|
1083
|
-
}
|
|
1084
|
-
|
|
1085
|
-
struct PreprocNOP {
|
|
1086
|
-
simd16uint16 operator()(simd16uint16 x) {
|
|
1087
|
-
return x;
|
|
1088
|
-
}
|
|
1089
|
-
};
|
|
1090
|
-
|
|
1091
|
-
template <int shift, int nbin>
|
|
1092
|
-
struct PreprocMinShift {
|
|
1093
|
-
simd16uint16 min16;
|
|
1094
|
-
simd16uint16 max16;
|
|
1095
|
-
|
|
1096
|
-
explicit PreprocMinShift(uint16_t min) {
|
|
1097
|
-
min16.set1(min);
|
|
1098
|
-
int vmax0 = std::min((nbin << shift) + min, 65536);
|
|
1099
|
-
uint16_t vmax = uint16_t(vmax0 - 1 - min);
|
|
1100
|
-
max16.set1(vmax); // vmax inclusive
|
|
1101
|
-
}
|
|
1102
|
-
|
|
1103
|
-
simd16uint16 operator()(simd16uint16 x) {
|
|
1104
|
-
x = x - min16;
|
|
1105
|
-
simd16uint16 mask = (x == max(x, max16)) - (x == max16);
|
|
1106
|
-
return (x >> shift) | mask;
|
|
1107
|
-
}
|
|
1108
|
-
};
|
|
1109
|
-
|
|
1110
|
-
/* unbounded versions of the functions */
|
|
1111
|
-
|
|
1112
|
-
void simd_histogram_8_unbounded(const uint16_t* data, int n, int* hist) {
|
|
1113
|
-
PreprocNOP pp;
|
|
1114
|
-
simd16uint16 a16 = histogram_8(data, pp, (n & ~15));
|
|
1115
|
-
|
|
1116
|
-
ALIGNED(32) uint16_t a16_tab[16];
|
|
1117
|
-
a16.store(a16_tab);
|
|
1118
|
-
|
|
1119
|
-
for (int i = 0; i < 8; i++) {
|
|
1120
|
-
hist[i] = a16_tab[i] + a16_tab[i + 8];
|
|
1121
|
-
}
|
|
1122
|
-
|
|
1123
|
-
for (int i = (n & ~15); i < n; i++) {
|
|
1124
|
-
hist[data[i]]++;
|
|
1125
|
-
}
|
|
1126
|
-
}
|
|
1127
|
-
|
|
1128
|
-
void simd_histogram_16_unbounded(const uint16_t* data, int n, int* hist) {
|
|
1129
|
-
simd16uint16 a16 = histogram_16(data, PreprocNOP(), (n & ~15));
|
|
1130
|
-
|
|
1131
|
-
ALIGNED(32) uint16_t a16_tab[16];
|
|
1132
|
-
a16.store(a16_tab);
|
|
1133
|
-
|
|
1134
|
-
for (int i = 0; i < 16; i++) {
|
|
1135
|
-
hist[i] = a16_tab[i];
|
|
1136
|
-
}
|
|
1137
|
-
|
|
1138
|
-
for (int i = (n & ~15); i < n; i++) {
|
|
1139
|
-
hist[data[i]]++;
|
|
1140
|
-
}
|
|
1141
|
-
}
|
|
1142
|
-
|
|
1143
|
-
} // anonymous namespace
|
|
1144
|
-
|
|
1145
|
-
/************************************************************
|
|
1146
|
-
* Driver routines
|
|
1147
|
-
************************************************************/
|
|
1148
|
-
|
|
1149
|
-
void simd_histogram_8(
|
|
1150
|
-
const uint16_t* data,
|
|
1151
|
-
int n,
|
|
1152
|
-
uint16_t min,
|
|
1153
|
-
int shift,
|
|
1154
|
-
int* hist) {
|
|
1155
|
-
if (shift < 0) {
|
|
1156
|
-
simd_histogram_8_unbounded(data, n, hist);
|
|
1157
|
-
return;
|
|
1158
|
-
}
|
|
1159
|
-
|
|
1160
|
-
simd16uint16 a16;
|
|
1161
|
-
|
|
1162
|
-
#define DISPATCH(s) \
|
|
1163
|
-
case s: \
|
|
1164
|
-
a16 = histogram_8(data, PreprocMinShift<s, 8>(min), (n & ~15)); \
|
|
1165
|
-
break
|
|
1166
|
-
|
|
1167
|
-
switch (shift) {
|
|
1168
|
-
DISPATCH(0);
|
|
1169
|
-
DISPATCH(1);
|
|
1170
|
-
DISPATCH(2);
|
|
1171
|
-
DISPATCH(3);
|
|
1172
|
-
DISPATCH(4);
|
|
1173
|
-
DISPATCH(5);
|
|
1174
|
-
DISPATCH(6);
|
|
1175
|
-
DISPATCH(7);
|
|
1176
|
-
DISPATCH(8);
|
|
1177
|
-
DISPATCH(9);
|
|
1178
|
-
DISPATCH(10);
|
|
1179
|
-
DISPATCH(11);
|
|
1180
|
-
DISPATCH(12);
|
|
1181
|
-
DISPATCH(13);
|
|
1182
|
-
default:
|
|
1183
|
-
FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift);
|
|
1184
|
-
}
|
|
1185
|
-
#undef DISPATCH
|
|
1186
|
-
|
|
1187
|
-
ALIGNED(32) uint16_t a16_tab[16];
|
|
1188
|
-
a16.store(a16_tab);
|
|
1189
|
-
|
|
1190
|
-
for (int i = 0; i < 8; i++) {
|
|
1191
|
-
hist[i] = a16_tab[i] + a16_tab[i + 8];
|
|
1192
|
-
}
|
|
1193
|
-
|
|
1194
|
-
// complete with remaining bins
|
|
1195
|
-
for (int i = (n & ~15); i < n; i++) {
|
|
1196
|
-
if (data[i] < min) {
|
|
1197
|
-
continue;
|
|
1198
|
-
}
|
|
1199
|
-
uint16_t v = data[i] - min;
|
|
1200
|
-
v >>= shift;
|
|
1201
|
-
if (v < 8) {
|
|
1202
|
-
hist[v]++;
|
|
1203
|
-
}
|
|
1204
|
-
}
|
|
1205
|
-
}
|
|
1206
|
-
|
|
1207
|
-
void simd_histogram_16(
|
|
1208
|
-
const uint16_t* data,
|
|
1209
|
-
int n,
|
|
1210
|
-
uint16_t min,
|
|
1211
|
-
int shift,
|
|
1212
|
-
int* hist) {
|
|
1213
|
-
if (shift < 0) {
|
|
1214
|
-
simd_histogram_16_unbounded(data, n, hist);
|
|
1215
|
-
return;
|
|
1216
|
-
}
|
|
1217
|
-
|
|
1218
|
-
simd16uint16 a16;
|
|
1219
|
-
|
|
1220
|
-
#define DISPATCH(s) \
|
|
1221
|
-
case s: \
|
|
1222
|
-
a16 = histogram_16(data, PreprocMinShift<s, 16>(min), (n & ~15)); \
|
|
1223
|
-
break
|
|
1224
|
-
|
|
1225
|
-
switch (shift) {
|
|
1226
|
-
DISPATCH(0);
|
|
1227
|
-
DISPATCH(1);
|
|
1228
|
-
DISPATCH(2);
|
|
1229
|
-
DISPATCH(3);
|
|
1230
|
-
DISPATCH(4);
|
|
1231
|
-
DISPATCH(5);
|
|
1232
|
-
DISPATCH(6);
|
|
1233
|
-
DISPATCH(7);
|
|
1234
|
-
DISPATCH(8);
|
|
1235
|
-
DISPATCH(9);
|
|
1236
|
-
DISPATCH(10);
|
|
1237
|
-
DISPATCH(11);
|
|
1238
|
-
DISPATCH(12);
|
|
1239
|
-
default:
|
|
1240
|
-
FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift);
|
|
1241
|
-
}
|
|
1242
|
-
#undef DISPATCH
|
|
1243
|
-
|
|
1244
|
-
ALIGNED(32) uint16_t a16_tab[16];
|
|
1245
|
-
a16.store(a16_tab);
|
|
1246
|
-
|
|
1247
|
-
for (int i = 0; i < 16; i++) {
|
|
1248
|
-
hist[i] = a16_tab[i];
|
|
1249
|
-
}
|
|
1250
|
-
|
|
1251
|
-
for (int i = (n & ~15); i < n; i++) {
|
|
1252
|
-
if (data[i] < min) {
|
|
1253
|
-
continue;
|
|
1254
|
-
}
|
|
1255
|
-
uint16_t v = data[i] - min;
|
|
1256
|
-
v >>= shift;
|
|
1257
|
-
if (v < 16) {
|
|
1258
|
-
hist[v]++;
|
|
1259
|
-
}
|
|
1260
|
-
}
|
|
1261
|
-
}
|
|
1262
|
-
|
|
1263
|
-
// no AVX2
|
|
1264
|
-
#else
|
|
1265
|
-
|
|
1266
|
-
void simd_histogram_16(
|
|
311
|
+
void simd_histogram_16_scalar(
|
|
1267
312
|
const uint16_t* data,
|
|
1268
313
|
int n,
|
|
1269
314
|
uint16_t min,
|
|
@@ -1271,32 +316,25 @@ void simd_histogram_16(
|
|
|
1271
316
|
int* hist) {
|
|
1272
317
|
memset(hist, 0, sizeof(*hist) * 16);
|
|
1273
318
|
if (shift < 0) {
|
|
1274
|
-
for (size_t i = 0; i < n; i++) {
|
|
319
|
+
for (size_t i = 0; i < static_cast<size_t>(n); i++) {
|
|
1275
320
|
hist[data[i]]++;
|
|
1276
321
|
}
|
|
1277
322
|
} else {
|
|
1278
323
|
int vmax0 = std::min((16 << shift) + min, 65536);
|
|
1279
324
|
uint16_t vmax = uint16_t(vmax0 - 1 - min);
|
|
1280
325
|
|
|
1281
|
-
for (size_t i = 0; i < n; i++) {
|
|
326
|
+
for (size_t i = 0; i < static_cast<size_t>(n); i++) {
|
|
1282
327
|
uint16_t v = data[i];
|
|
1283
328
|
v -= min;
|
|
1284
329
|
if (!(v <= vmax))
|
|
1285
330
|
continue;
|
|
1286
331
|
v >>= shift;
|
|
1287
332
|
hist[v]++;
|
|
1288
|
-
|
|
1289
|
-
/*
|
|
1290
|
-
if (data[i] < min) continue;
|
|
1291
|
-
uint16_t v = data[i] - min;
|
|
1292
|
-
v >>= shift;
|
|
1293
|
-
if (v < 16) hist[v]++;
|
|
1294
|
-
*/
|
|
1295
333
|
}
|
|
1296
334
|
}
|
|
1297
335
|
}
|
|
1298
336
|
|
|
1299
|
-
void
|
|
337
|
+
void simd_histogram_8_scalar(
|
|
1300
338
|
const uint16_t* data,
|
|
1301
339
|
int n,
|
|
1302
340
|
uint16_t min,
|
|
@@ -1304,11 +342,11 @@ void simd_histogram_8(
|
|
|
1304
342
|
int* hist) {
|
|
1305
343
|
memset(hist, 0, sizeof(*hist) * 8);
|
|
1306
344
|
if (shift < 0) {
|
|
1307
|
-
for (size_t i = 0; i < n; i++) {
|
|
345
|
+
for (size_t i = 0; i < static_cast<size_t>(n); i++) {
|
|
1308
346
|
hist[data[i]]++;
|
|
1309
347
|
}
|
|
1310
348
|
} else {
|
|
1311
|
-
for (size_t i = 0; i < n; i++) {
|
|
349
|
+
for (size_t i = 0; i < static_cast<size_t>(n); i++) {
|
|
1312
350
|
if (data[i] < min)
|
|
1313
351
|
continue;
|
|
1314
352
|
uint16_t v = data[i] - min;
|
|
@@ -1319,7 +357,46 @@ void simd_histogram_8(
|
|
|
1319
357
|
}
|
|
1320
358
|
}
|
|
1321
359
|
|
|
360
|
+
} // anonymous namespace
|
|
361
|
+
|
|
362
|
+
/******************************************************************
|
|
363
|
+
* Histogram subroutines — dispatch to SIMD or scalar
|
|
364
|
+
******************************************************************/
|
|
365
|
+
|
|
366
|
+
void simd_histogram_8(
|
|
367
|
+
const uint16_t* data,
|
|
368
|
+
int n,
|
|
369
|
+
uint16_t min,
|
|
370
|
+
int shift,
|
|
371
|
+
int* hist) {
|
|
372
|
+
with_simd_level_256bit([&]<SIMDLevel SL>() {
|
|
373
|
+
if constexpr (SL == SIMDLevel::NONE) {
|
|
374
|
+
simd_histogram_8_scalar(data, n, min, shift, hist);
|
|
375
|
+
} else {
|
|
376
|
+
faiss::simd_histogram_8<SL>(data, n, min, shift, hist);
|
|
377
|
+
}
|
|
378
|
+
});
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
void simd_histogram_16(
|
|
382
|
+
const uint16_t* data,
|
|
383
|
+
int n,
|
|
384
|
+
uint16_t min,
|
|
385
|
+
int shift,
|
|
386
|
+
int* hist) {
|
|
387
|
+
// GCC 12 miscompiles the AVX2 SIMD histogram — fall back to scalar.
|
|
388
|
+
#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ == 12
|
|
389
|
+
simd_histogram_16_scalar(data, n, min, shift, hist);
|
|
390
|
+
#else
|
|
391
|
+
with_simd_level_256bit([&]<SIMDLevel SL>() {
|
|
392
|
+
if constexpr (SL == SIMDLevel::NONE) {
|
|
393
|
+
simd_histogram_16_scalar(data, n, min, shift, hist);
|
|
394
|
+
} else {
|
|
395
|
+
faiss::simd_histogram_16<SL>(data, n, min, shift, hist);
|
|
396
|
+
}
|
|
397
|
+
});
|
|
1322
398
|
#endif
|
|
399
|
+
}
|
|
1323
400
|
|
|
1324
401
|
void PartitionStats::reset() {
|
|
1325
402
|
memset(this, 0, sizeof(*this));
|