faiss 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/faiss/extconf.rb +2 -1
- data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
- data/ext/faiss/index_binary.cpp +1 -1
- data/ext/faiss/kmeans.cpp +1 -1
- data/ext/faiss/pca_matrix.cpp +1 -1
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +93 -80
- data/vendor/faiss/faiss/Clustering.cpp +39 -240
- data/vendor/faiss/faiss/Clustering.h +6 -0
- data/vendor/faiss/faiss/IVFlib.cpp +41 -21
- data/vendor/faiss/faiss/Index.cpp +6 -5
- data/vendor/faiss/faiss/Index.h +5 -5
- data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
- data/vendor/faiss/faiss/IndexBinary.h +4 -4
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +84 -92
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
- data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +87 -415
- data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
- data/vendor/faiss/faiss/IndexFastScan.h +25 -23
- data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
- data/vendor/faiss/faiss/IndexFlat.h +21 -18
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
- data/vendor/faiss/faiss/IndexHNSW.cpp +283 -145
- data/vendor/faiss/faiss/IndexHNSW.h +16 -2
- data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
- data/vendor/faiss/faiss/IndexIDMap.h +9 -7
- data/vendor/faiss/faiss/IndexIVF.cpp +465 -362
- data/vendor/faiss/faiss/IndexIVF.h +33 -12
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
- data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +36 -68
- data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +53 -30
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +71 -843
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +21 -17
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +26 -39
- data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
- data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
- data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
- data/vendor/faiss/faiss/IndexNSG.cpp +39 -23
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
- data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
- data/vendor/faiss/faiss/IndexPQ.h +3 -2
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
- data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
- data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
- data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
- data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
- data/vendor/faiss/faiss/IndexRefine.h +4 -4
- data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
- data/vendor/faiss/faiss/IndexShards.cpp +10 -9
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
- data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
- data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
- data/vendor/faiss/faiss/MetaIndexes.h +1 -1
- data/vendor/faiss/faiss/MetricType.h +14 -7
- data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
- data/vendor/faiss/faiss/SuperKMeans.h +97 -0
- data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
- data/vendor/faiss/faiss/VectorTransform.h +16 -16
- data/vendor/faiss/faiss/build.cpp +23 -0
- data/vendor/faiss/faiss/build.h +15 -0
- data/vendor/faiss/faiss/clone_index.cpp +48 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
- data/vendor/faiss/faiss/factory_tools.cpp +5 -0
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
- data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
- data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
- data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
- data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
- data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
- data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
- data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
- data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
- data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
- data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
- data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
- data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
- data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
- data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
- data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
- data/vendor/faiss/faiss/impl/FaissException.h +50 -3
- data/vendor/faiss/faiss/impl/HNSW.cpp +92 -317
- data/vendor/faiss/faiss/impl/HNSW.h +13 -34
- data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
- data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
- data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
- data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
- data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
- data/vendor/faiss/faiss/impl/NSG.cpp +38 -21
- data/vendor/faiss/faiss/impl/NSG.h +4 -4
- data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
- data/vendor/faiss/faiss/impl/Panorama.h +258 -87
- data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
- data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +30 -23
- data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
- data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +296 -283
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +99 -75
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +52 -4
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -1
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
- data/vendor/faiss/faiss/impl/VisitedTable.h +7 -0
- data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
- data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
- data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
- data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
- data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
- data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
- data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
- data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
- data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
- data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
- data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
- data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
- data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
- data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +1132 -45
- data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
- data/vendor/faiss/faiss/impl/index_write.cpp +95 -13
- data/vendor/faiss/faiss/impl/io.cpp +6 -6
- data/vendor/faiss/faiss/impl/io_macros.h +33 -16
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
- data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
- data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
- data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +29 -111
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +238 -5
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -7
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +3 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +102 -11
- data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +148 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +167 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +59 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +163 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +100 -66
- data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
- data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
- data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
- data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
- data/vendor/faiss/faiss/impl/svs_io.h +8 -2
- data/vendor/faiss/faiss/index_factory.cpp +86 -18
- data/vendor/faiss/faiss/index_io.h +24 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
- data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
- data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +18 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +12 -3
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +7 -2
- data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
- data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
- data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
- data/vendor/faiss/faiss/utils/distances.cpp +390 -560
- data/vendor/faiss/faiss/utils/distances.h +20 -1
- data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -177
- data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
- data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
- data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
- data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
- data/vendor/faiss/faiss/utils/hamming.h +92 -2
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
- data/vendor/faiss/faiss/utils/partitioning.h +31 -0
- data/vendor/faiss/faiss/utils/popcount.h +29 -0
- data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
- data/vendor/faiss/faiss/utils/prefetch.h +2 -2
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
- data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
- data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
- data/vendor/faiss/faiss/utils/random.cpp +6 -6
- data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
- data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
- data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
- data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +17 -5
- data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
- data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
- data/vendor/faiss/faiss/utils/utils.cpp +5 -5
- data/vendor/faiss/faiss/utils/utils.h +3 -3
- metadata +119 -34
- data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
- data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
- data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
- data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
- data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
- data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
- data/vendor/faiss/faiss/utils/simdlib.h +0 -42
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
- /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#ifdef COMPILE_SIMD_AVX2
|
|
9
|
+
|
|
10
|
+
#include <faiss/utils/simd_impl/super_kmeans_kernels.h>
|
|
11
|
+
|
|
12
|
+
#include <immintrin.h>
|
|
13
|
+
|
|
14
|
+
namespace faiss {
|
|
15
|
+
namespace detail {
|
|
16
|
+
|
|
17
|
+
namespace {
|
|
18
|
+
|
|
19
|
+
// Reduce 8 float lanes of an AVX2 register to a scalar sum.
|
|
20
|
+
// Uses a shuffle+add tree instead of two _mm_hadd_ps. On Skylake-class
|
|
21
|
+
// cores, hadd is 3-cycle latency / 2-uop, while movehdup/movehl/add_ss
|
|
22
|
+
// are single-uop, single-cycle ops.
|
|
23
|
+
inline float horizontal_sum_avx2(__m256 v) {
|
|
24
|
+
__m128 lo = _mm256_castps256_ps128(v);
|
|
25
|
+
__m128 hi = _mm256_extractf128_ps(v, 1);
|
|
26
|
+
__m128 sum128 = _mm_add_ps(lo, hi); // 4 lanes
|
|
27
|
+
__m128 shuf = _mm_movehdup_ps(sum128); // [s1, s1, s3, s3]
|
|
28
|
+
__m128 sums = _mm_add_ps(sum128, shuf); // [s0+s1, _, s2+s3, _]
|
|
29
|
+
shuf = _mm_movehl_ps(shuf, sums); // [s2+s3, s3, _, _]
|
|
30
|
+
sums = _mm_add_ss(sums, shuf); // (s0+s1) + (s2+s3)
|
|
31
|
+
return _mm_cvtss_f32(sums);
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
} // namespace
|
|
35
|
+
|
|
36
|
+
template <>
|
|
37
|
+
float block_l2<SIMDLevel::AVX2>(const float* x, const float* y, int n) {
|
|
38
|
+
__m256 acc = _mm256_setzero_ps();
|
|
39
|
+
int m = 0;
|
|
40
|
+
for (; m + 8 <= n; m += 8) {
|
|
41
|
+
__m256 xv = _mm256_loadu_ps(x + m);
|
|
42
|
+
__m256 yv = _mm256_loadu_ps(y + m);
|
|
43
|
+
__m256 diff = _mm256_sub_ps(xv, yv);
|
|
44
|
+
acc = _mm256_fmadd_ps(diff, diff, acc);
|
|
45
|
+
}
|
|
46
|
+
float result = horizontal_sum_avx2(acc);
|
|
47
|
+
for (; m < n; ++m) {
|
|
48
|
+
const float d = x[m] - y[m];
|
|
49
|
+
result += d * d;
|
|
50
|
+
}
|
|
51
|
+
return result;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
} // namespace detail
|
|
55
|
+
} // namespace faiss
|
|
56
|
+
|
|
57
|
+
#endif // COMPILE_SIMD_AVX2
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#ifdef COMPILE_SIMD_AVX512
|
|
9
|
+
|
|
10
|
+
#include <faiss/utils/simd_impl/super_kmeans_kernels.h>
|
|
11
|
+
|
|
12
|
+
#include <immintrin.h>
|
|
13
|
+
|
|
14
|
+
namespace faiss {
|
|
15
|
+
namespace detail {
|
|
16
|
+
|
|
17
|
+
template <>
|
|
18
|
+
float block_l2<SIMDLevel::AVX512>(const float* x, const float* y, int n) {
|
|
19
|
+
__m512 acc = _mm512_setzero_ps();
|
|
20
|
+
int m = 0;
|
|
21
|
+
for (; m + 16 <= n; m += 16) {
|
|
22
|
+
__m512 xv = _mm512_loadu_ps(x + m);
|
|
23
|
+
__m512 yv = _mm512_loadu_ps(y + m);
|
|
24
|
+
__m512 diff = _mm512_sub_ps(xv, yv);
|
|
25
|
+
acc = _mm512_fmadd_ps(diff, diff, acc);
|
|
26
|
+
}
|
|
27
|
+
// _mm512_reduce_add_ps: on modern AVX-512 SKUs (Cascade Lake+, Sapphire
|
|
28
|
+
// Rapids) GCC/Clang lower this to a shuffle+add tree, ~5-cycle latency.
|
|
29
|
+
// On older AVX-512 SKUs (Skylake-X, Ice Lake) the cross-lane reduction
|
|
30
|
+
// can be ~20 cycles. Acceptable here because n ~ pdx_block_size = 64
|
|
31
|
+
// (4 iterations of 16-wide accumulation), so per-block work dominates
|
|
32
|
+
// the reduction cost. AVX2 uses a manual shuffle+add tree explicitly
|
|
33
|
+
// to avoid `_mm_hadd_ps` overhead, where the ratio is reversed.
|
|
34
|
+
float result = _mm512_reduce_add_ps(acc);
|
|
35
|
+
for (; m < n; ++m) {
|
|
36
|
+
const float d = x[m] - y[m];
|
|
37
|
+
result += d * d;
|
|
38
|
+
}
|
|
39
|
+
return result;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
} // namespace detail
|
|
43
|
+
} // namespace faiss
|
|
44
|
+
|
|
45
|
+
#endif // COMPILE_SIMD_AVX512
|
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
#include <cstdlib>
|
|
11
11
|
|
|
12
12
|
#include <faiss/impl/FaissAssert.h>
|
|
13
|
+
#include <faiss/impl/simd_dispatch.h>
|
|
13
14
|
|
|
14
15
|
namespace faiss {
|
|
15
16
|
|
|
@@ -47,7 +48,7 @@ static bool has_sve() {
|
|
|
47
48
|
#endif // __linux__ / __APPLE__ / other
|
|
48
49
|
|
|
49
50
|
#else // Not ARM64
|
|
50
|
-
static bool has_sve() {
|
|
51
|
+
[[maybe_unused]] static bool has_sve() {
|
|
51
52
|
return false;
|
|
52
53
|
}
|
|
53
54
|
#endif
|
|
@@ -189,12 +190,15 @@ SIMDLevel SIMDConfig::auto_detect_simd_level() {
|
|
|
189
190
|
}
|
|
190
191
|
#endif
|
|
191
192
|
|
|
193
|
+
#if defined(__riscv) && defined(COMPILE_SIMD_RISCV_RVV)
|
|
194
|
+
// RVV is always available on RISC-V builds compiled with rv64gcv.
|
|
195
|
+
supported_simd_levels |= (1 << static_cast<int>(SIMDLevel::RISCV_RVV));
|
|
196
|
+
detected_level = SIMDLevel::RISCV_RVV;
|
|
197
|
+
#endif
|
|
198
|
+
|
|
192
199
|
return detected_level;
|
|
193
200
|
}
|
|
194
201
|
|
|
195
|
-
// Include private header for DISPATCH_SIMDLevel macro
|
|
196
|
-
#include <faiss/impl/simd_dispatch.h>
|
|
197
|
-
|
|
198
202
|
namespace {
|
|
199
203
|
|
|
200
204
|
template <SIMDLevel Level>
|
|
@@ -205,7 +209,8 @@ SIMDLevel get_dispatched_level_impl() {
|
|
|
205
209
|
} // namespace
|
|
206
210
|
|
|
207
211
|
SIMDLevel SIMDConfig::get_dispatched_level() {
|
|
208
|
-
|
|
212
|
+
return with_selected_simd_levels<AVAILABLE_SIMD_LEVELS_ALL>(
|
|
213
|
+
[&]<SIMDLevel SL>() { return get_dispatched_level_impl<SL>(); });
|
|
209
214
|
}
|
|
210
215
|
|
|
211
216
|
#else // Static mode
|
|
@@ -260,6 +265,8 @@ SIMDLevel SIMDConfig::auto_detect_simd_level() {
|
|
|
260
265
|
return SIMDLevel::ARM_SVE;
|
|
261
266
|
#elif defined(COMPILE_SIMD_ARM_NEON)
|
|
262
267
|
return SIMDLevel::ARM_NEON;
|
|
268
|
+
#elif defined(COMPILE_SIMD_RISCV_RVV)
|
|
269
|
+
return SIMDLevel::RISCV_RVV;
|
|
263
270
|
#else
|
|
264
271
|
return SIMDLevel::NONE;
|
|
265
272
|
#endif
|
|
@@ -290,6 +297,8 @@ std::string to_string(SIMDLevel level) {
|
|
|
290
297
|
return "ARM_NEON";
|
|
291
298
|
case SIMDLevel::ARM_SVE:
|
|
292
299
|
return "ARM_SVE";
|
|
300
|
+
case SIMDLevel::RISCV_RVV:
|
|
301
|
+
return "RISCV_RVV";
|
|
293
302
|
case SIMDLevel::COUNT:
|
|
294
303
|
default:
|
|
295
304
|
throw FaissException("Invalid SIMDLevel");
|
|
@@ -315,6 +324,9 @@ SIMDLevel to_simd_level(const std::string& level_str) {
|
|
|
315
324
|
if (level_str == "ARM_SVE") {
|
|
316
325
|
return SIMDLevel::ARM_SVE;
|
|
317
326
|
}
|
|
327
|
+
if (level_str == "RISCV_RVV") {
|
|
328
|
+
return SIMDLevel::RISCV_RVV;
|
|
329
|
+
}
|
|
318
330
|
|
|
319
331
|
throw FaissException("Invalid SIMD level string: " + level_str);
|
|
320
332
|
}
|
|
@@ -25,13 +25,105 @@ enum class SIMDLevel {
|
|
|
25
25
|
// arm & aarch64
|
|
26
26
|
ARM_NEON,
|
|
27
27
|
ARM_SVE, // Scalable Vector Extension (ARMv8.2+)
|
|
28
|
+
// riscv
|
|
29
|
+
RISCV_RVV, // RISC-V Vector Extension (rv64gcv)
|
|
28
30
|
|
|
29
31
|
COUNT
|
|
30
32
|
};
|
|
31
33
|
|
|
34
|
+
/***************************************************************
|
|
35
|
+
* SINGLE_SIMD_LEVEL: the SIMD level for code without explicit SL context.
|
|
36
|
+
*
|
|
37
|
+
* In static mode: resolves to the compiled-in level (zero overhead).
|
|
38
|
+
* In DD mode: resolves to the baseline level (ARM_NEON when NEON is compiled in, otherwise NONE, i.e. emulated scalar). Code using
|
|
39
|
+
* SINGLE_SIMD_LEVEL is meant to be incrementally migrated to use
|
|
40
|
+
* proper SL dispatch — SINGLE_SIMD_LEVEL is migration scaffolding,
|
|
41
|
+
* not permanent API.
|
|
42
|
+
***************************************************************/
|
|
43
|
+
#ifdef FAISS_ENABLE_DD
|
|
44
|
+
// DD dispatches to the highest optional SIMD level at runtime.
|
|
45
|
+
// On ARM64, NEON is mandatory (always available via COMPILE_SIMD_ARM_NEON),
|
|
46
|
+
// so the baseline is ARM_NEON. On x86, the baseline is NONE.
|
|
47
|
+
#if defined(COMPILE_SIMD_ARM_NEON)
|
|
48
|
+
inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::ARM_NEON;
|
|
49
|
+
#else
|
|
50
|
+
inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::NONE;
|
|
51
|
+
#endif
|
|
52
|
+
#else
|
|
53
|
+
#if defined(COMPILE_SIMD_AVX512_SPR)
|
|
54
|
+
inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::AVX512_SPR;
|
|
55
|
+
#elif defined(COMPILE_SIMD_AVX512)
|
|
56
|
+
inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::AVX512;
|
|
57
|
+
#elif defined(COMPILE_SIMD_AVX2)
|
|
58
|
+
inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::AVX2;
|
|
59
|
+
#elif defined(COMPILE_SIMD_ARM_SVE)
|
|
60
|
+
inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::ARM_SVE;
|
|
61
|
+
#elif defined(COMPILE_SIMD_ARM_NEON)
|
|
62
|
+
inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::ARM_NEON;
|
|
63
|
+
#elif defined(COMPILE_SIMD_RISCV_RVV)
|
|
64
|
+
inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::RISCV_RVV;
|
|
65
|
+
#else
|
|
66
|
+
inline constexpr SIMDLevel SINGLE_SIMD_LEVEL = SIMDLevel::NONE;
|
|
67
|
+
#endif
|
|
68
|
+
#endif
|
|
69
|
+
|
|
70
|
+
/***************************************************************
|
|
71
|
+
* Helper to select the appropriate 256-bit SIMD level.
|
|
72
|
+
*
|
|
73
|
+
* For 256-bit SIMD types (simd16uint16, simd32uint8, etc.), maps:
|
|
74
|
+
* AVX512/AVX512_SPR → AVX2 (256-bit ops use AVX2 instructions)
|
|
75
|
+
* AVX2 → AVX2
|
|
76
|
+
* ARM_NEON/ARM_SVE → ARM_NEON
|
|
77
|
+
* NONE/RISCV_RVV → NONE
|
|
78
|
+
***************************************************************/
|
|
79
|
+
template <SIMDLevel SL>
|
|
80
|
+
struct simd256_level_selector {
|
|
81
|
+
static constexpr SIMDLevel value =
|
|
82
|
+
(SL == SIMDLevel::AVX512 || SL == SIMDLevel::AVX512_SPR)
|
|
83
|
+
? SIMDLevel::AVX2
|
|
84
|
+
: (SL == SIMDLevel::ARM_SVE ? SIMDLevel::ARM_NEON
|
|
85
|
+
: SL == SIMDLevel::RISCV_RVV ? SIMDLevel::NONE
|
|
86
|
+
: SL);
|
|
87
|
+
};
|
|
88
|
+
|
|
89
|
+
/// SINGLE_SIMD_LEVEL mapped to 256-bit: use this for 256-bit simd types
|
|
90
|
+
/// (simd16uint16, simd32uint8, etc.) which don't have AVX512/SVE
|
|
91
|
+
/// specializations.
|
|
92
|
+
inline constexpr SIMDLevel SINGLE_SIMD_LEVEL_256 =
|
|
93
|
+
simd256_level_selector<SINGLE_SIMD_LEVEL>::value;
|
|
94
|
+
|
|
95
|
+
/***************************************************************
|
|
96
|
+
* Helper to select the appropriate 512-bit SIMD level.
|
|
97
|
+
*
|
|
98
|
+
* For 512-bit SIMD types (simd32uint16, simd64uint8, etc.), maps:
|
|
99
|
+
* AVX512_SPR → AVX512 (512-bit ops share the same instructions)
|
|
100
|
+
* AVX512 → AVX512
|
|
101
|
+
* RISCV_RVV/NONE → NONE (every other level maps to itself)
|
|
102
|
+
***************************************************************/
|
|
103
|
+
template <SIMDLevel SL>
|
|
104
|
+
struct simd512_level_selector {
|
|
105
|
+
static constexpr SIMDLevel value = (SL == SIMDLevel::AVX512_SPR)
|
|
106
|
+
? SIMDLevel::AVX512
|
|
107
|
+
: (SL == SIMDLevel::RISCV_RVV) ? SIMDLevel::NONE
|
|
108
|
+
: SL;
|
|
109
|
+
};
|
|
110
|
+
|
|
111
|
+
/// SINGLE_SIMD_LEVEL mapped to 512-bit: use this for 512-bit simd types
|
|
112
|
+
/// (simd32uint16, simd64uint8, etc.) which don't have AVX512_SPR
|
|
113
|
+
/// specializations (AVX512_SPR uses the same 512-bit integer ops as AVX512).
|
|
114
|
+
inline constexpr SIMDLevel SINGLE_SIMD_LEVEL_512 =
|
|
115
|
+
simd512_level_selector<SINGLE_SIMD_LEVEL>::value;
|
|
116
|
+
|
|
32
117
|
/// Number of float32 lanes for a given SIMD level.
|
|
118
|
+
/// ARM_SVE (128–2048 bits) and RISCV_RVV are variable-width; no single constant is correct, so both are rejected by static_assert.
|
|
33
119
|
template <SIMDLevel SL>
|
|
34
120
|
constexpr int simd_width() {
|
|
121
|
+
static_assert(
|
|
122
|
+
SL != SIMDLevel::ARM_SVE,
|
|
123
|
+
"simd_width<ARM_SVE> is not supported: SVE is variable-width");
|
|
124
|
+
static_assert(
|
|
125
|
+
SL != SIMDLevel::RISCV_RVV,
|
|
126
|
+
"simd_width<RISCV_RVV> is not supported: RVV is variable-width");
|
|
35
127
|
if constexpr (SL == SIMDLevel::AVX512 || SL == SIMDLevel::AVX512_SPR)
|
|
36
128
|
return 16;
|
|
37
129
|
else if constexpr (SL == SIMDLevel::AVX2 || SL == SIMDLevel::ARM_NEON)
|
|
@@ -82,7 +174,7 @@ struct FAISS_API SIMDConfig {
|
|
|
82
174
|
static bool is_simd_level_available(SIMDLevel level);
|
|
83
175
|
|
|
84
176
|
/// Returns the SIMD level via the dispatch mechanism.
|
|
85
|
-
/// In DD mode, uses
|
|
177
|
+
/// In DD mode, uses with_simd_level internally.
|
|
86
178
|
/// In static mode, returns the compiled-in level.
|
|
87
179
|
/// Useful for verification: get_level() == get_dispatched_level()
|
|
88
180
|
static SIMDLevel get_dispatched_level();
|
|
@@ -134,9 +134,9 @@ void fvec_argsort(size_t n, const float* vals, size_t* perm) {
|
|
|
134
134
|
}
|
|
135
135
|
|
|
136
136
|
void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) {
|
|
137
|
-
size_t
|
|
137
|
+
std::vector<size_t> perm2(n);
|
|
138
138
|
// 2 result tables, during merging, flip between them
|
|
139
|
-
size_t *permB = perm2, *permA = perm;
|
|
139
|
+
size_t *permB = perm2.data(), *permA = perm;
|
|
140
140
|
|
|
141
141
|
int nt = omp_get_max_threads();
|
|
142
142
|
{ // prepare correct permutation so that the result ends in perm
|
|
@@ -148,8 +148,8 @@ void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) {
|
|
|
148
148
|
}
|
|
149
149
|
}
|
|
150
150
|
|
|
151
|
-
#pragma omp parallel
|
|
152
|
-
for (
|
|
151
|
+
#pragma omp parallel for
|
|
152
|
+
for (int64_t i = 0; i < static_cast<int64_t>(n); i++) {
|
|
153
153
|
permA[i] = i;
|
|
154
154
|
}
|
|
155
155
|
|
|
@@ -184,7 +184,6 @@ void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) {
|
|
|
184
184
|
} else {
|
|
185
185
|
int t0 = s * sub_nt / sub_nseg1;
|
|
186
186
|
int t1 = (s + 1) * sub_nt / sub_nseg1;
|
|
187
|
-
printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0);
|
|
188
187
|
parallel_merge(
|
|
189
188
|
permA, permB, segs[s], segs[s + 1], t1 - t0, comp);
|
|
190
189
|
}
|
|
@@ -197,7 +196,6 @@ void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) {
|
|
|
197
196
|
}
|
|
198
197
|
assert(permA == perm);
|
|
199
198
|
omp_set_nested(prev_nested);
|
|
200
|
-
delete[] perm2;
|
|
201
199
|
}
|
|
202
200
|
|
|
203
201
|
/*****************************************************************************
|
|
@@ -226,7 +224,7 @@ void bucket_sort_ref(
|
|
|
226
224
|
for (size_t i = 0; i < vmax; i++) {
|
|
227
225
|
lims[i + 1] += lims[i];
|
|
228
226
|
}
|
|
229
|
-
FAISS_THROW_IF_NOT(lims[vmax] == nval);
|
|
227
|
+
FAISS_THROW_IF_NOT(static_cast<size_t>(lims[vmax]) == nval);
|
|
230
228
|
double t2 = getmillisecs();
|
|
231
229
|
// populate buckets
|
|
232
230
|
for (size_t i = 0; i < nval; i++) {
|
|
@@ -286,7 +284,7 @@ void bucket_sort_parallel(
|
|
|
286
284
|
for (size_t i = 0; i < vmax; i++) {
|
|
287
285
|
lims[i + 1] += lims[i];
|
|
288
286
|
}
|
|
289
|
-
FAISS_THROW_IF_NOT(lims[vmax] == nval);
|
|
287
|
+
FAISS_THROW_IF_NOT(static_cast<size_t>(lims[vmax]) == nval);
|
|
290
288
|
}
|
|
291
289
|
#pragma omp barrier
|
|
292
290
|
|
|
@@ -341,7 +339,8 @@ void bucket_sort_inplace_ref(
|
|
|
341
339
|
double t0 = getmillisecs();
|
|
342
340
|
size_t nval = nrow * ncol;
|
|
343
341
|
FAISS_THROW_IF_NOT(
|
|
344
|
-
nbucket <
|
|
342
|
+
static_cast<size_t>(nbucket) <
|
|
343
|
+
nval); // unclear what would happen in this case...
|
|
345
344
|
|
|
346
345
|
memset(lims, 0, sizeof(*lims) * (nbucket + 1));
|
|
347
346
|
for (size_t i = 0; i < nval; i++) {
|
|
@@ -350,14 +349,14 @@ void bucket_sort_inplace_ref(
|
|
|
350
349
|
}
|
|
351
350
|
double t1 = getmillisecs();
|
|
352
351
|
// compute cumulative sum
|
|
353
|
-
for (size_t i = 0; i < nbucket; i++) {
|
|
352
|
+
for (size_t i = 0; i < static_cast<size_t>(nbucket); i++) {
|
|
354
353
|
lims[i + 1] += lims[i];
|
|
355
354
|
}
|
|
356
|
-
FAISS_THROW_IF_NOT(lims[nbucket] == nval);
|
|
355
|
+
FAISS_THROW_IF_NOT(static_cast<size_t>(lims[nbucket]) == nval);
|
|
357
356
|
double t2 = getmillisecs();
|
|
358
357
|
|
|
359
358
|
std::vector<size_t> ptrs(nbucket);
|
|
360
|
-
for (size_t i = 0; i < nbucket; i++) {
|
|
359
|
+
for (size_t i = 0; i < static_cast<size_t>(nbucket); i++) {
|
|
361
360
|
ptrs[i] = lims[i];
|
|
362
361
|
}
|
|
363
362
|
|
|
@@ -378,7 +377,8 @@ void bucket_sort_inplace_ref(
|
|
|
378
377
|
} else {
|
|
379
378
|
// start new loop
|
|
380
379
|
for (; init_bucket_no < nbucket; init_bucket_no++) {
|
|
381
|
-
if (ptrs[init_bucket_no] <
|
|
380
|
+
if (ptrs[init_bucket_no] <
|
|
381
|
+
static_cast<size_t>(lims[init_bucket_no + 1])) {
|
|
382
382
|
break;
|
|
383
383
|
}
|
|
384
384
|
}
|
|
@@ -390,7 +390,7 @@ void bucket_sort_inplace_ref(
|
|
|
390
390
|
}
|
|
391
391
|
}
|
|
392
392
|
|
|
393
|
-
for (size_t i = 0; i < nbucket; i++) {
|
|
393
|
+
for (size_t i = 0; i < static_cast<size_t>(nbucket); i++) {
|
|
394
394
|
assert(ptrs[i] == lims[i + 1]);
|
|
395
395
|
}
|
|
396
396
|
double t3 = getmillisecs();
|
|
@@ -407,8 +407,8 @@ struct ToWrite {
|
|
|
407
407
|
std::vector<TI> rows;
|
|
408
408
|
std::vector<size_t> lims;
|
|
409
409
|
|
|
410
|
-
explicit ToWrite(TI
|
|
411
|
-
lims.resize(
|
|
410
|
+
explicit ToWrite(TI nbucket_in) : nbucket(nbucket_in) {
|
|
411
|
+
lims.resize(nbucket_in + 1);
|
|
412
412
|
}
|
|
413
413
|
|
|
414
414
|
/// add one element (row) to write in bucket b
|
|
@@ -428,7 +428,7 @@ struct ToWrite {
|
|
|
428
428
|
lims[buckets[i] + 1]++;
|
|
429
429
|
}
|
|
430
430
|
// compute cumulative sum
|
|
431
|
-
for (size_t i = 0; i < nbucket; i++) {
|
|
431
|
+
for (size_t i = 0; i < static_cast<size_t>(nbucket); i++) {
|
|
432
432
|
lims[i + 1] += lims[i];
|
|
433
433
|
}
|
|
434
434
|
FAISS_THROW_IF_NOT(lims[nbucket] == buckets.size());
|
|
@@ -466,7 +466,8 @@ void bucket_sort_inplace_parallel(
|
|
|
466
466
|
std::vector<ToWrite<TI>> all_to_write;
|
|
467
467
|
size_t nval = nrow * ncol;
|
|
468
468
|
FAISS_THROW_IF_NOT(
|
|
469
|
-
nbucket <
|
|
469
|
+
static_cast<size_t>(nbucket) <
|
|
470
|
+
nval); // unclear what would happen in this case...
|
|
470
471
|
|
|
471
472
|
// try to keep size of all_to_write < 5GiB
|
|
472
473
|
// but we need at least one element per bucket
|
|
@@ -498,7 +499,7 @@ void bucket_sort_inplace_parallel(
|
|
|
498
499
|
}
|
|
499
500
|
#pragma omp critical
|
|
500
501
|
{ // accumulate histograms (not shifted indices to prepare cumsum)
|
|
501
|
-
for (size_t i = 0; i < nbucket; i++) {
|
|
502
|
+
for (size_t i = 0; i < static_cast<size_t>(nbucket); i++) {
|
|
502
503
|
lims[i + 1] += local_lims[i];
|
|
503
504
|
}
|
|
504
505
|
all_to_write.push_back(ToWrite<TI>(nbucket));
|
|
@@ -511,10 +512,10 @@ void bucket_sort_inplace_parallel(
|
|
|
511
512
|
#pragma omp master
|
|
512
513
|
{
|
|
513
514
|
// compute cumulative sum
|
|
514
|
-
for (size_t i = 0; i < nbucket; i++) {
|
|
515
|
+
for (size_t i = 0; i < static_cast<size_t>(nbucket); i++) {
|
|
515
516
|
lims[i + 1] += lims[i];
|
|
516
517
|
}
|
|
517
|
-
FAISS_THROW_IF_NOT(lims[nbucket] == nval);
|
|
518
|
+
FAISS_THROW_IF_NOT(static_cast<size_t>(lims[nbucket]) == nval);
|
|
518
519
|
// at this point lims is final (read only!)
|
|
519
520
|
|
|
520
521
|
memcpy(ptrs.data(), lims, sizeof(lims[0]) * nbucket);
|
|
@@ -559,19 +560,22 @@ void bucket_sort_inplace_parallel(
|
|
|
559
560
|
printf("ROUND %d n_to_write=%zd\n", round, n_to_write);
|
|
560
561
|
}
|
|
561
562
|
if (verbose > 2) {
|
|
562
|
-
for (size_t b = 0; b < nbucket; b++) {
|
|
563
|
+
for (size_t b = 0; b < static_cast<size_t>(nbucket); b++) {
|
|
563
564
|
printf(" b=%zd [", b);
|
|
564
|
-
for (size_t i =
|
|
565
|
+
for (size_t i = static_cast<size_t>(lims[b]);
|
|
566
|
+
i < static_cast<size_t>(lims[b + 1]);
|
|
567
|
+
i++) {
|
|
565
568
|
printf(" %s%d",
|
|
566
569
|
ptrs[b] == i ? ">" : "",
|
|
567
570
|
int(vals[i]));
|
|
568
571
|
}
|
|
569
572
|
printf(" %s] %s\n",
|
|
570
|
-
ptrs[b] == lims[b + 1] ? ">"
|
|
573
|
+
ptrs[b] == static_cast<size_t>(lims[b + 1]) ? ">"
|
|
574
|
+
: "",
|
|
571
575
|
did_wrap[b] ? "w" : "");
|
|
572
576
|
}
|
|
573
577
|
printf("To write\n");
|
|
574
|
-
for (size_t b = 0; b < nbucket; b++) {
|
|
578
|
+
for (size_t b = 0; b < static_cast<size_t>(nbucket); b++) {
|
|
575
579
|
printf(" b=%zd ", b);
|
|
576
580
|
const char* sep = "[";
|
|
577
581
|
for (const ToWrite<TI>& to_write_2 : all_to_write) {
|
|
@@ -609,7 +613,7 @@ void bucket_sort_inplace_parallel(
|
|
|
609
613
|
rank,
|
|
610
614
|
idx);
|
|
611
615
|
}
|
|
612
|
-
if (idx < lims[b + 1]) {
|
|
616
|
+
if (idx < static_cast<size_t>(lims[b + 1])) {
|
|
613
617
|
ptrs[b]++;
|
|
614
618
|
} else {
|
|
615
619
|
// wrapping around
|
|
@@ -709,7 +713,7 @@ inline int64_t hash_function(int64_t x) {
|
|
|
709
713
|
void hashtable_int64_to_int64_init(int log2_capacity, int64_t* tab) {
|
|
710
714
|
size_t capacity = (size_t)1 << log2_capacity;
|
|
711
715
|
#pragma omp parallel for
|
|
712
|
-
for (int64_t i = 0; i < capacity; i++) {
|
|
716
|
+
for (int64_t i = 0; i < static_cast<int64_t>(capacity); i++) {
|
|
713
717
|
tab[2 * i] = -1;
|
|
714
718
|
tab[2 * i + 1] = -1;
|
|
715
719
|
}
|
|
@@ -729,7 +733,7 @@ void hashtable_int64_to_int64_add(
|
|
|
729
733
|
size_t nbucket = (size_t)1 << log2_nbucket;
|
|
730
734
|
|
|
731
735
|
#pragma omp parallel for
|
|
732
|
-
for (int64_t i = 0; i < n; i++) {
|
|
736
|
+
for (int64_t i = 0; i < static_cast<int64_t>(n); i++) {
|
|
733
737
|
hk[i] = hash_function(keys[i]) & mask;
|
|
734
738
|
bucket_no[i] = hk[i] >> (log2_capacity - log2_nbucket);
|
|
735
739
|
}
|
|
@@ -746,11 +750,13 @@ void hashtable_int64_to_int64_add(
|
|
|
746
750
|
|
|
747
751
|
int num_errors = 0;
|
|
748
752
|
#pragma omp parallel for reduction(+ : num_errors)
|
|
749
|
-
for (int64_t bucket = 0; bucket < nbucket; bucket++) {
|
|
753
|
+
for (int64_t bucket = 0; bucket < static_cast<int64_t>(nbucket); bucket++) {
|
|
750
754
|
size_t k0 = bucket << (log2_capacity - log2_nbucket);
|
|
751
755
|
size_t k1 = (bucket + 1) << (log2_capacity - log2_nbucket);
|
|
752
756
|
|
|
753
|
-
for (size_t i =
|
|
757
|
+
for (size_t i = static_cast<size_t>(lims[bucket]);
|
|
758
|
+
i < static_cast<size_t>(lims[bucket + 1]);
|
|
759
|
+
i++) {
|
|
754
760
|
int64_t j = perm[i];
|
|
755
761
|
assert(bucket_no[j] == bucket);
|
|
756
762
|
assert(hk[j] >= k0 && hk[j] < k1);
|
|
@@ -768,7 +774,8 @@ void hashtable_int64_to_int64_add(
|
|
|
768
774
|
if (slot == k1) {
|
|
769
775
|
slot = k0;
|
|
770
776
|
}
|
|
771
|
-
if (slot ==
|
|
777
|
+
if (slot ==
|
|
778
|
+
static_cast<size_t>(hk[j])) { // no free slot left in bucket
|
|
772
779
|
num_errors++;
|
|
773
780
|
break;
|
|
774
781
|
}
|
|
@@ -793,20 +800,24 @@ void hashtable_int64_to_int64_lookup(
|
|
|
793
800
|
int log2_nbucket = log2_capacity_to_log2_nbucket(log2_capacity);
|
|
794
801
|
|
|
795
802
|
#pragma omp parallel for
|
|
796
|
-
for (int64_t i = 0; i < n; i++) {
|
|
803
|
+
for (int64_t i = 0; i < static_cast<int64_t>(n); i++) {
|
|
797
804
|
int64_t k = keys[i];
|
|
798
|
-
int64_t
|
|
799
|
-
size_t slot =
|
|
805
|
+
int64_t hk_i = hash_function(k) & mask;
|
|
806
|
+
size_t slot = hk_i;
|
|
800
807
|
|
|
801
808
|
if (tab[2 * slot] == -1) { // not in table
|
|
802
809
|
vals[i] = -1;
|
|
803
810
|
} else if (tab[2 * slot] == k) { // found!
|
|
804
811
|
vals[i] = tab[2 * slot + 1];
|
|
805
812
|
} else { // need to search in [k0, k1)
|
|
806
|
-
size_t bucket =
|
|
813
|
+
size_t bucket = hk_i >> (log2_capacity - log2_nbucket);
|
|
807
814
|
size_t k0 = bucket << (log2_capacity - log2_nbucket);
|
|
808
815
|
size_t k1 = (bucket + 1) << (log2_capacity - log2_nbucket);
|
|
809
816
|
for (;;) {
|
|
817
|
+
if (tab[slot * 2] == -1) { // empty slot, key not in table
|
|
818
|
+
vals[i] = -1;
|
|
819
|
+
break;
|
|
820
|
+
}
|
|
810
821
|
if (tab[slot * 2] == k) { // found!
|
|
811
822
|
vals[i] = tab[2 * slot + 1];
|
|
812
823
|
break;
|
|
@@ -815,7 +826,8 @@ void hashtable_int64_to_int64_lookup(
|
|
|
815
826
|
if (slot == k1) {
|
|
816
827
|
slot = k0;
|
|
817
828
|
}
|
|
818
|
-
if (slot ==
|
|
829
|
+
if (slot ==
|
|
830
|
+
static_cast<size_t>(hk_i)) { // bucket is full and not found
|
|
819
831
|
vals[i] = -1;
|
|
820
832
|
break;
|
|
821
833
|
}
|
|
@@ -187,7 +187,7 @@ size_t get_mem_usage_kb() {
|
|
|
187
187
|
char buf[256];
|
|
188
188
|
if (!fgets(buf, 256, f))
|
|
189
189
|
break;
|
|
190
|
-
if (sscanf(buf, "VmRSS: %
|
|
190
|
+
if (sscanf(buf, "VmRSS: %zu kB", &sz) == 1)
|
|
191
191
|
break;
|
|
192
192
|
}
|
|
193
193
|
fclose(f);
|
|
@@ -307,7 +307,7 @@ size_t merge_result_table_with(
|
|
|
307
307
|
std::vector<float> tmpD(k);
|
|
308
308
|
|
|
309
309
|
#pragma omp for
|
|
310
|
-
for (int64_t i = 0; i < n; i++) {
|
|
310
|
+
for (int64_t i = 0; i < static_cast<int64_t>(n); i++) {
|
|
311
311
|
int64_t* lI0 = I0 + i * k;
|
|
312
312
|
float* lD0 = D0 + i * k;
|
|
313
313
|
const int64_t* lI1 = I1 + i * k;
|
|
@@ -437,10 +437,10 @@ void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist) {
|
|
|
437
437
|
std::vector<int> accu(d * 256);
|
|
438
438
|
const uint8_t* c = codes;
|
|
439
439
|
for (size_t i = 0; i < n; i++)
|
|
440
|
-
for (
|
|
440
|
+
for (size_t j = 0; j < d; j++)
|
|
441
441
|
accu[j * 256 + *c++]++;
|
|
442
442
|
memset(hist, 0, sizeof(*hist) * nbits);
|
|
443
|
-
for (
|
|
443
|
+
for (size_t i = 0; i < d; i++) {
|
|
444
444
|
const int* ai = accu.data() + i * 256;
|
|
445
445
|
int* hi = hist + i * 8;
|
|
446
446
|
for (int j = 0; j < 256; j++)
|
|
@@ -500,7 +500,7 @@ const float* fvecs_maybe_subsample(
|
|
|
500
500
|
std::vector<int> subset(*n);
|
|
501
501
|
rand_perm(subset.data(), *n, seed);
|
|
502
502
|
float* x_subset = new float[n2 * d];
|
|
503
|
-
for (int64_t i = 0; i < n2; i++)
|
|
503
|
+
for (int64_t i = 0; i < static_cast<int64_t>(n2); i++)
|
|
504
504
|
memcpy(&x_subset[i * d], &x[subset[i] * size_t(d)], sizeof(x[0]) * d);
|
|
505
505
|
*n = n2;
|
|
506
506
|
return x_subset;
|
|
@@ -172,8 +172,8 @@ struct CombinerRangeKNN {
|
|
|
172
172
|
T r2; /// range search radius
|
|
173
173
|
bool keep_max; /// whether to keep max values instead of min.
|
|
174
174
|
|
|
175
|
-
CombinerRangeKNN(int64_t
|
|
176
|
-
: nq(
|
|
175
|
+
CombinerRangeKNN(int64_t nq_in, size_t k_in, T r2_in, bool keep_max_in)
|
|
176
|
+
: nq(nq_in), k(k_in), r2(r2_in), keep_max(keep_max_in) {}
|
|
177
177
|
|
|
178
178
|
/// Knn search results
|
|
179
179
|
const int64_t* I = nullptr; /// size nq * k
|
|
@@ -200,7 +200,7 @@ struct CodeSet {
|
|
|
200
200
|
size_t d;
|
|
201
201
|
std::set<std::vector<uint8_t>> s;
|
|
202
202
|
|
|
203
|
-
explicit CodeSet(size_t
|
|
203
|
+
explicit CodeSet(size_t d_in) : d(d_in) {}
|
|
204
204
|
void insert(size_t n, const uint8_t* codes, bool* inserted);
|
|
205
205
|
};
|
|
206
206
|
|