faiss 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/faiss/extconf.rb +2 -1
- data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
- data/ext/faiss/index_binary.cpp +1 -1
- data/ext/faiss/kmeans.cpp +1 -1
- data/ext/faiss/pca_matrix.cpp +1 -1
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +93 -80
- data/vendor/faiss/faiss/Clustering.cpp +39 -240
- data/vendor/faiss/faiss/Clustering.h +6 -0
- data/vendor/faiss/faiss/IVFlib.cpp +41 -21
- data/vendor/faiss/faiss/Index.cpp +6 -5
- data/vendor/faiss/faiss/Index.h +5 -5
- data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
- data/vendor/faiss/faiss/IndexBinary.h +4 -4
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +84 -92
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
- data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +87 -415
- data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
- data/vendor/faiss/faiss/IndexFastScan.h +25 -23
- data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
- data/vendor/faiss/faiss/IndexFlat.h +21 -18
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
- data/vendor/faiss/faiss/IndexHNSW.cpp +283 -145
- data/vendor/faiss/faiss/IndexHNSW.h +16 -2
- data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
- data/vendor/faiss/faiss/IndexIDMap.h +9 -7
- data/vendor/faiss/faiss/IndexIVF.cpp +465 -362
- data/vendor/faiss/faiss/IndexIVF.h +33 -12
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
- data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +36 -68
- data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +53 -30
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +71 -843
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +21 -17
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +26 -39
- data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
- data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
- data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
- data/vendor/faiss/faiss/IndexNSG.cpp +39 -23
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
- data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
- data/vendor/faiss/faiss/IndexPQ.h +3 -2
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
- data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
- data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
- data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
- data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
- data/vendor/faiss/faiss/IndexRefine.h +4 -4
- data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
- data/vendor/faiss/faiss/IndexShards.cpp +10 -9
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
- data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
- data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
- data/vendor/faiss/faiss/MetaIndexes.h +1 -1
- data/vendor/faiss/faiss/MetricType.h +14 -7
- data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
- data/vendor/faiss/faiss/SuperKMeans.h +97 -0
- data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
- data/vendor/faiss/faiss/VectorTransform.h +16 -16
- data/vendor/faiss/faiss/build.cpp +23 -0
- data/vendor/faiss/faiss/build.h +15 -0
- data/vendor/faiss/faiss/clone_index.cpp +48 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
- data/vendor/faiss/faiss/factory_tools.cpp +5 -0
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
- data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
- data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
- data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
- data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
- data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
- data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
- data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
- data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
- data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
- data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
- data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
- data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
- data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
- data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
- data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
- data/vendor/faiss/faiss/impl/FaissException.h +50 -3
- data/vendor/faiss/faiss/impl/HNSW.cpp +92 -317
- data/vendor/faiss/faiss/impl/HNSW.h +13 -34
- data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
- data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
- data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
- data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
- data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
- data/vendor/faiss/faiss/impl/NSG.cpp +38 -21
- data/vendor/faiss/faiss/impl/NSG.h +4 -4
- data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
- data/vendor/faiss/faiss/impl/Panorama.h +258 -87
- data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
- data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +30 -23
- data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
- data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +296 -283
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +99 -75
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +52 -4
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -1
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
- data/vendor/faiss/faiss/impl/VisitedTable.h +7 -0
- data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
- data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
- data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
- data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
- data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
- data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
- data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
- data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
- data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
- data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
- data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
- data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
- data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
- data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +1132 -45
- data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
- data/vendor/faiss/faiss/impl/index_write.cpp +95 -13
- data/vendor/faiss/faiss/impl/io.cpp +6 -6
- data/vendor/faiss/faiss/impl/io_macros.h +33 -16
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
- data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
- data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
- data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +29 -111
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +238 -5
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -7
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +3 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +102 -11
- data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +148 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +167 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +59 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +163 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +100 -66
- data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
- data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
- data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
- data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
- data/vendor/faiss/faiss/impl/svs_io.h +8 -2
- data/vendor/faiss/faiss/index_factory.cpp +86 -18
- data/vendor/faiss/faiss/index_io.h +24 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
- data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
- data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +18 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +12 -3
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +7 -2
- data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
- data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
- data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
- data/vendor/faiss/faiss/utils/distances.cpp +390 -560
- data/vendor/faiss/faiss/utils/distances.h +20 -1
- data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -177
- data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
- data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
- data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
- data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
- data/vendor/faiss/faiss/utils/hamming.h +92 -2
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
- data/vendor/faiss/faiss/utils/partitioning.h +31 -0
- data/vendor/faiss/faiss/utils/popcount.h +29 -0
- data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
- data/vendor/faiss/faiss/utils/prefetch.h +2 -2
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
- data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
- data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
- data/vendor/faiss/faiss/utils/random.cpp +6 -6
- data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
- data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
- data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
- data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +17 -5
- data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
- data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
- data/vendor/faiss/faiss/utils/utils.cpp +5 -5
- data/vendor/faiss/faiss/utils/utils.h +3 -3
- metadata +119 -34
- data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
- data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
- data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
- data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
- data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
- data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
- data/vendor/faiss/faiss/utils/simdlib.h +0 -42
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
- /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
|
@@ -22,6 +22,190 @@ static float sqr(float x) {
|
|
|
22
22
|
return x * x;
|
|
23
23
|
}
|
|
24
24
|
|
|
25
|
+
// Largest nbits accepted by the TurboQuant codebook builder.
constexpr size_t kTurboQuantMaxBits = 8;

// The TurboQuant 1-D optimal scalar quantizer is derived analytically: the
// target density is sampled on a uniform grid spanning [-1, 1]. The grid size
// is the larger of an absolute floor and a per-centroid budget, so it stays
// dense both overall and relative to the number of output centroids.
constexpr size_t kTurboQuantGridMin = 32768; // 2^15 grid samples at least
constexpr size_t kTurboQuantGridPerCentroid = 512;

// Stopping criteria of the Lloyd refinement: a hard cap on the number of
// iterations and a threshold on the largest centroid displacement.
constexpr int kTurboQuantMaxIter = 100;
constexpr double kTurboQuantTol = 1e-8;
|
|
33
|
+
|
|
34
|
+
void build_TurboQuantMSECodebook(
|
|
35
|
+
size_t d,
|
|
36
|
+
size_t nbits,
|
|
37
|
+
std::vector<float>& centroids,
|
|
38
|
+
std::vector<float>& boundaries) {
|
|
39
|
+
FAISS_THROW_IF_NOT_FMT(
|
|
40
|
+
nbits <= kTurboQuantMaxBits,
|
|
41
|
+
"invalid TurboQuant nbits %zu (must be in [0, %zu])",
|
|
42
|
+
nbits,
|
|
43
|
+
kTurboQuantMaxBits);
|
|
44
|
+
|
|
45
|
+
if (nbits == 0) {
|
|
46
|
+
centroids.clear();
|
|
47
|
+
boundaries.clear();
|
|
48
|
+
return;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
const size_t k = size_t(1) << nbits;
|
|
52
|
+
|
|
53
|
+
if (d == 1) {
|
|
54
|
+
// In 1-D, a unit vector can only be -1 or +1, so the marginal
|
|
55
|
+
// distribution collapses to two atoms. The TurboQuant codebook is
|
|
56
|
+
// therefore a repeated pair of endpoint centroids.
|
|
57
|
+
centroids.resize(k);
|
|
58
|
+
for (size_t i = 0; i < k; i++) {
|
|
59
|
+
centroids[i] = i < k / 2 ? -1.0f : 1.0f;
|
|
60
|
+
}
|
|
61
|
+
boundaries.resize(k - 1);
|
|
62
|
+
for (size_t i = 0; i + 1 < k; i++) {
|
|
63
|
+
boundaries[i] = 0.5f * (centroids[i] + centroids[i + 1]);
|
|
64
|
+
}
|
|
65
|
+
return;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
// For d > 1, TurboQuant uses the marginal distribution of one coordinate of
|
|
69
|
+
// a random unit vector in R^d. On [-1, 1], this density is proportional to
|
|
70
|
+
// (1 - x^2)^((d - 3) / 2), which is a symmetric beta-law after a change of
|
|
71
|
+
// variables. The code below discretizes that density.
|
|
72
|
+
const size_t ngrid =
|
|
73
|
+
std::max(kTurboQuantGridMin, k * kTurboQuantGridPerCentroid);
|
|
74
|
+
const double step = 2.0 / ngrid;
|
|
75
|
+
const double alpha = 0.5 * (double(d) - 3.0);
|
|
76
|
+
|
|
77
|
+
std::vector<double> xs(ngrid);
|
|
78
|
+
// prefix_w stores the cumulative mass of the discretized density and
|
|
79
|
+
// prefix_wx stores its cumulative first moment, so interval means can be
|
|
80
|
+
// recovered in O(1).
|
|
81
|
+
std::vector<double> prefix_w(ngrid + 1, 0.0);
|
|
82
|
+
std::vector<double> prefix_wx(ngrid + 1, 0.0);
|
|
83
|
+
|
|
84
|
+
for (size_t i = 0; i < ngrid; i++) {
|
|
85
|
+
const double x = -1.0 + (i + 0.5) * step;
|
|
86
|
+
const double one_minus_x2 = std::max(0.0, 1.0 - x * x);
|
|
87
|
+
double w;
|
|
88
|
+
if (alpha == 0.0) { // when d == 3
|
|
89
|
+
w = 1.0;
|
|
90
|
+
} else {
|
|
91
|
+
// (1-x^2)^((d-3)/2)
|
|
92
|
+
w = std::pow(one_minus_x2, alpha);
|
|
93
|
+
}
|
|
94
|
+
if (!std::isfinite(w) || w < 0.0) {
|
|
95
|
+
w = 0.0;
|
|
96
|
+
}
|
|
97
|
+
xs[i] = x;
|
|
98
|
+
prefix_w[i + 1] = prefix_w[i] + w;
|
|
99
|
+
prefix_wx[i + 1] = prefix_wx[i] + w * x;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
auto range_mean = [&](size_t i0, size_t i1, double fallback) {
|
|
103
|
+
const double w = prefix_w[i1] - prefix_w[i0];
|
|
104
|
+
if (w <= 0.0) {
|
|
105
|
+
return fallback;
|
|
106
|
+
}
|
|
107
|
+
return (prefix_wx[i1] - prefix_wx[i0]) / w;
|
|
108
|
+
};
|
|
109
|
+
|
|
110
|
+
const double total_w = prefix_w.back();
|
|
111
|
+
std::vector<size_t> cuts(k + 1, 0);
|
|
112
|
+
cuts[k] = ngrid;
|
|
113
|
+
|
|
114
|
+
// Initialize with k equal-mass cells under the target density. This gives
|
|
115
|
+
// a stable starting point before the Lloyd refinements below.
|
|
116
|
+
for (size_t i = 1; i < k; i++) {
|
|
117
|
+
const double target = total_w * i / k;
|
|
118
|
+
cuts[i] = std::lower_bound(prefix_w.begin(), prefix_w.end(), target) -
|
|
119
|
+
prefix_w.begin();
|
|
120
|
+
cuts[i] = std::min(cuts[i], ngrid);
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
std::vector<double> centroids_d(k);
|
|
124
|
+
for (size_t i = 0; i < k; i++) {
|
|
125
|
+
const double left = -1.0 + 2.0 * i / k;
|
|
126
|
+
const double right = -1.0 + 2.0 * (i + 1) / k;
|
|
127
|
+
// First estimate of each centroid: the conditional mean of its initial
|
|
128
|
+
// equal-mass cell, with a uniform-cell midpoint as a fallback.
|
|
129
|
+
centroids_d[i] = range_mean(cuts[i], cuts[i + 1], 0.5 * (left + right));
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
std::vector<double> boundaries_d(k > 0 ? k - 1 : 0);
|
|
133
|
+
|
|
134
|
+
// Refine the 1-D codebook with a weighted Lloyd iteration over the
|
|
135
|
+
// discretized marginal density on [-1, 1]:
|
|
136
|
+
// 1. boundaries_d are the Voronoi separators implied by neighboring
|
|
137
|
+
// centroids.
|
|
138
|
+
// 2. cuts map each boundary interval back to a contiguous range of the
|
|
139
|
+
// integration grid xs[].
|
|
140
|
+
// 3. each centroid becomes the weighted mean of the samples currently in
|
|
141
|
+
// its cell, clipped to stay within its neighboring boundaries.
|
|
142
|
+
//
|
|
143
|
+
// The loop stops once the largest centroid update is below kTurboQuantTol.
|
|
144
|
+
for (int iter = 0; iter < kTurboQuantMaxIter; iter++) {
|
|
145
|
+
// Midpoints between adjacent centroids define the current Voronoi
|
|
146
|
+
// partition of [-1, 1].
|
|
147
|
+
for (size_t i = 0; i + 1 < k; i++) {
|
|
148
|
+
boundaries_d[i] = 0.5 * (centroids_d[i] + centroids_d[i + 1]);
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
cuts[0] = 0;
|
|
152
|
+
cuts[k] = ngrid;
|
|
153
|
+
// Reassign the discretized density samples to the Voronoi cell induced
|
|
154
|
+
// by each boundary. Because xs is sorted, the reassignment reduces to
|
|
155
|
+
// finding the first grid point strictly greater than each boundary.
|
|
156
|
+
for (size_t i = 1; i < k; i++) {
|
|
157
|
+
cuts[i] = std::upper_bound(
|
|
158
|
+
xs.begin(), xs.end(), boundaries_d[i - 1]) -
|
|
159
|
+
xs.begin();
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
double max_delta = 0.0;
|
|
163
|
+
for (size_t i = 0; i < k; i++) {
|
|
164
|
+
const double left = i == 0 ? -1.0 : boundaries_d[i - 1];
|
|
165
|
+
const double right = i + 1 == k ? 1.0 : boundaries_d[i];
|
|
166
|
+
// Lloyd update: replace the centroid with the weighted average of
|
|
167
|
+
// the mass assigned to its cell. Empty cells fall back to the cell
|
|
168
|
+
// midpoint, and we clamp to [left, right] to preserve ordering.
|
|
169
|
+
double c = range_mean(cuts[i], cuts[i + 1], 0.5 * (left + right));
|
|
170
|
+
c = std::min(std::max(c, left), right);
|
|
171
|
+
max_delta = std::max(max_delta, std::abs(c - centroids_d[i]));
|
|
172
|
+
centroids_d[i] = c;
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
if (max_delta < kTurboQuantTol) {
|
|
176
|
+
break;
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
std::sort(centroids_d.begin(), centroids_d.end());
|
|
181
|
+
|
|
182
|
+
centroids.resize(k);
|
|
183
|
+
boundaries.resize(k - 1);
|
|
184
|
+
for (size_t i = 0; i < k; i++) {
|
|
185
|
+
centroids[i] = centroids_d[i];
|
|
186
|
+
}
|
|
187
|
+
for (size_t i = 0; i + 1 < k; i++) {
|
|
188
|
+
boundaries[i] = 0.5f * (centroids[i] + centroids[i + 1]);
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
/** Produce the flat "trained" table of a TurboQuant MSE scalar quantizer.
 *
 * The table is obtained from build_TurboQuantMSECodebook and laid out as the
 * k centroids followed by the k - 1 boundaries, with k the number of
 * centroids returned by the builder.
 *
 * @param d       vector dimensionality, forwarded to the codebook builder
 * @param nbits   bits per component; must be > 0 (checked through
 *                FAISS_THROW_IF_NOT_FMT)
 * @param trained output: [centroids (k floats), boundaries (k - 1 floats)]
 */
void train_TurboQuantMSE(size_t d, size_t nbits, std::vector<float>& trained) {
    FAISS_THROW_IF_NOT_FMT(
            nbits > 0, "invalid TurboQuant SQ nbits %zu (must be > 0)", nbits);

    std::vector<float> codebook;
    std::vector<float> separators;
    build_TurboQuantMSECodebook(d, nbits, codebook, separators);

    // Concatenate: centroids first, boundaries right after them.
    const size_t k = codebook.size();
    trained.resize(k + (k - 1));
    std::copy(codebook.begin(), codebook.end(), trained.begin());
    for (size_t j = 1; j < k; j++) {
        trained[k + j - 1] = separators[j - 1];
    }
}
|
|
208
|
+
|
|
25
209
|
void train_Uniform(
|
|
26
210
|
RangeStat rs,
|
|
27
211
|
float rs_arg,
|
|
@@ -37,7 +221,7 @@ void train_Uniform(
|
|
|
37
221
|
if (rs == ScalarQuantizer::RS_minmax) {
|
|
38
222
|
vmin = HUGE_VAL;
|
|
39
223
|
vmax = -HUGE_VAL;
|
|
40
|
-
for (
|
|
224
|
+
for (idx_t i = 0; i < n; i++) {
|
|
41
225
|
if (x[i] < vmin) {
|
|
42
226
|
vmin = x[i];
|
|
43
227
|
}
|
|
@@ -50,7 +234,7 @@ void train_Uniform(
|
|
|
50
234
|
vmax += vexp;
|
|
51
235
|
} else if (rs == ScalarQuantizer::RS_meanstd) {
|
|
52
236
|
double sum = 0, sum2 = 0;
|
|
53
|
-
for (
|
|
237
|
+
for (idx_t i = 0; i < n; i++) {
|
|
54
238
|
sum += x[i];
|
|
55
239
|
sum2 += x[i] * x[i];
|
|
56
240
|
}
|
|
@@ -81,7 +265,7 @@ void train_Uniform(
|
|
|
81
265
|
float sx = 0;
|
|
82
266
|
{
|
|
83
267
|
vmin = HUGE_VAL, vmax = -HUGE_VAL;
|
|
84
|
-
for (
|
|
268
|
+
for (idx_t i = 0; i < n; i++) {
|
|
85
269
|
if (x[i] < vmin) {
|
|
86
270
|
vmin = x[i];
|
|
87
271
|
}
|
|
@@ -161,9 +345,9 @@ void train_NonUniform(
|
|
|
161
345
|
if (rs == ScalarQuantizer::RS_minmax) {
|
|
162
346
|
memcpy(vmin, x, sizeof(*x) * d);
|
|
163
347
|
memcpy(vmax, x, sizeof(*x) * d);
|
|
164
|
-
for (
|
|
348
|
+
for (idx_t i = 1; i < n; i++) {
|
|
165
349
|
const float* xi = x + i * d;
|
|
166
|
-
for (
|
|
350
|
+
for (int j = 0; j < d; j++) {
|
|
167
351
|
if (xi[j] < vmin[j]) {
|
|
168
352
|
vmin[j] = xi[j];
|
|
169
353
|
}
|
|
@@ -173,7 +357,7 @@ void train_NonUniform(
|
|
|
173
357
|
}
|
|
174
358
|
}
|
|
175
359
|
float* vdiff = vmax;
|
|
176
|
-
for (
|
|
360
|
+
for (int j = 0; j < d; j++) {
|
|
177
361
|
float vexp = (vmax[j] - vmin[j]) * rs_arg;
|
|
178
362
|
vmin[j] -= vexp;
|
|
179
363
|
vmax[j] += vexp;
|
|
@@ -182,9 +366,9 @@ void train_NonUniform(
|
|
|
182
366
|
} else {
|
|
183
367
|
// transpose
|
|
184
368
|
std::vector<float> xt(n * d);
|
|
185
|
-
for (
|
|
369
|
+
for (idx_t i = 1; i < n; i++) {
|
|
186
370
|
const float* xi = x + i * d;
|
|
187
|
-
for (
|
|
371
|
+
for (int j = 0; j < d; j++) {
|
|
188
372
|
xt[j * n + i] = xi[j];
|
|
189
373
|
}
|
|
190
374
|
}
|
|
@@ -37,6 +37,18 @@ void train_NonUniform(
|
|
|
37
37
|
int k,
|
|
38
38
|
const float* x,
|
|
39
39
|
std::vector<float>& trained);
|
|
40
|
+
|
|
41
|
+
/** Build the TurboQuant MSE codebook using the beta-distribution-optimal
|
|
42
|
+
* quantizer from the TurboQuant paper. The codebook is analytical
|
|
43
|
+
* (depends only on d and nbits, no training data needed).
|
|
44
|
+
*
|
|
45
|
+
* @param d vector dimensionality (used for beta-distribution shape)
|
|
46
|
+
* @param nbits bits per component (1-8)
|
|
47
|
+
* @param trained output: [centroids (k floats), boundaries (k-1 floats)]
|
|
48
|
+
* where k = 2^nbits
|
|
49
|
+
*/
|
|
50
|
+
void train_TurboQuantMSE(size_t d, size_t nbits, std::vector<float>& trained);
|
|
51
|
+
|
|
40
52
|
} // namespace scalar_quantizer
|
|
41
53
|
|
|
42
54
|
} // namespace faiss
|
|
@@ -23,86 +23,107 @@
|
|
|
23
23
|
|
|
24
24
|
namespace faiss {
|
|
25
25
|
|
|
26
|
-
|
|
26
|
+
/** The SIMD levels available for a given function are specified via a
|
|
27
|
+
* binary mask. Here we predefine the most common masks.
|
|
28
|
+
* */
|
|
27
29
|
|
|
28
|
-
|
|
29
|
-
#define DISPATCH_SIMDLevel_AVX2(f, ...) \
|
|
30
|
-
case SIMDLevel::AVX2: \
|
|
31
|
-
return f<SIMDLevel::AVX2>(__VA_ARGS__)
|
|
32
|
-
#else
|
|
33
|
-
#define DISPATCH_SIMDLevel_AVX2(f, ...)
|
|
34
|
-
#endif
|
|
30
|
+
constexpr int AVAILABLE_SIMD_LEVELS_NONE = (1 << int(SIMDLevel::NONE));
|
|
35
31
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
32
|
+
constexpr int AVAILABLE_SIMD_LEVELS_AVX2_NEON = AVAILABLE_SIMD_LEVELS_NONE |
|
|
33
|
+
(1 << int(SIMDLevel::AVX2)) | (1 << int(SIMDLevel::ARM_NEON));
|
|
34
|
+
|
|
35
|
+
// A0: AVX2_NEON levels + AVX512 + RISCV_RVV
|
|
36
|
+
constexpr int AVAILABLE_SIMD_LEVELS_A0 = AVAILABLE_SIMD_LEVELS_AVX2_NEON |
|
|
37
|
+
(1 << int(SIMDLevel::AVX512)) | (1 << int(SIMDLevel::RISCV_RVV));
|
|
38
|
+
|
|
39
|
+
// A1: A0 levels + ARM_SVE (for functions with dedicated SVE implementations)
|
|
40
|
+
constexpr int AVAILABLE_SIMD_LEVELS_A1 =
|
|
41
|
+
AVAILABLE_SIMD_LEVELS_A0 | (1 << int(SIMDLevel::ARM_SVE));
|
|
42
|
+
|
|
43
|
+
// A2: NONE + AVX2 + ARM_SVE only (for functions with only these
|
|
44
|
+
// implementations)
|
|
45
|
+
constexpr int AVAILABLE_SIMD_LEVELS_A2 = AVAILABLE_SIMD_LEVELS_NONE |
|
|
46
|
+
(1 << int(SIMDLevel::AVX2)) | (1 << int(SIMDLevel::ARM_SVE));
|
|
47
|
+
|
|
48
|
+
constexpr int AVAILABLE_SIMD_LEVELS_ALL = -1;
|
|
49
|
+
|
|
50
|
+
/** The complete dispatching function. It takes into account:
|
|
51
|
+
* - the currently selected SIMD level
|
|
52
|
+
* - the compiled-in SIMD levels (given by the COMPILE_SIMD_XXX macros)
|
|
53
|
+
* - the available SIMD implementations for that particular function (given by
|
|
54
|
+
* available_levels)
|
|
55
|
+
*/
|
|
56
|
+
|
|
57
|
+
template <int available_levels, typename LambdaType>
|
|
58
|
+
inline auto with_selected_simd_levels(LambdaType&& action) {
|
|
59
|
+
#ifdef FAISS_ENABLE_DD
|
|
60
|
+
switch (SIMDConfig::level) {
|
|
61
|
+
// For x86 -- try from highest to lowest level
|
|
43
62
|
|
|
44
63
|
#ifdef COMPILE_SIMD_AVX512_SPR
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
64
|
+
case SIMDLevel::AVX512_SPR:
|
|
65
|
+
if constexpr (
|
|
66
|
+
available_levels & (1 << int(SIMDLevel::AVX512_SPR))) {
|
|
67
|
+
return action.template operator()<SIMDLevel::AVX512_SPR>();
|
|
68
|
+
}
|
|
69
|
+
[[fallthrough]];
|
|
50
70
|
#endif
|
|
51
71
|
|
|
52
|
-
|
|
72
|
+
#ifdef COMPILE_SIMD_AVX512
|
|
73
|
+
case SIMDLevel::AVX512:
|
|
74
|
+
if constexpr (available_levels & (1 << int(SIMDLevel::AVX512))) {
|
|
75
|
+
return action.template operator()<SIMDLevel::AVX512>();
|
|
76
|
+
}
|
|
77
|
+
[[fallthrough]];
|
|
78
|
+
#endif
|
|
53
79
|
|
|
54
|
-
#ifdef
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
80
|
+
#ifdef COMPILE_SIMD_AVX2
|
|
81
|
+
case SIMDLevel::AVX2:
|
|
82
|
+
if constexpr (available_levels & (1 << int(SIMDLevel::AVX2))) {
|
|
83
|
+
return action.template operator()<SIMDLevel::AVX2>();
|
|
84
|
+
}
|
|
85
|
+
[[fallthrough]];
|
|
60
86
|
#endif
|
|
61
87
|
|
|
88
|
+
// For ARM, try from highest to lowest level
|
|
62
89
|
#ifdef COMPILE_SIMD_ARM_SVE
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
90
|
+
case SIMDLevel::ARM_SVE:
|
|
91
|
+
if constexpr (available_levels & (1 << int(SIMDLevel::ARM_SVE))) {
|
|
92
|
+
return action.template operator()<SIMDLevel::ARM_SVE>();
|
|
93
|
+
}
|
|
94
|
+
[[fallthrough]];
|
|
68
95
|
#endif
|
|
69
96
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
97
|
+
#ifdef COMPILE_SIMD_ARM_NEON
|
|
98
|
+
case SIMDLevel::ARM_NEON:
|
|
99
|
+
if constexpr (available_levels & (1 << int(SIMDLevel::ARM_NEON))) {
|
|
100
|
+
return action.template operator()<SIMDLevel::ARM_NEON>();
|
|
101
|
+
}
|
|
102
|
+
[[fallthrough]];
|
|
103
|
+
#endif
|
|
73
104
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
105
|
+
#ifdef COMPILE_SIMD_RISCV_RVV
|
|
106
|
+
case SIMDLevel::RISCV_RVV:
|
|
107
|
+
if constexpr (available_levels & (1 << int(SIMDLevel::RISCV_RVV))) {
|
|
108
|
+
return action.template operator()<SIMDLevel::RISCV_RVV>();
|
|
109
|
+
}
|
|
110
|
+
[[fallthrough]];
|
|
111
|
+
#endif
|
|
112
|
+
default:
|
|
113
|
+
return action.template operator()<SIMDLevel::NONE>();
|
|
114
|
+
}
|
|
115
|
+
#else // static dispatch
|
|
116
|
+
// In static mode, SINGLE_SIMD_LEVEL is a constexpr resolved at compile
|
|
117
|
+
// time. If the compiled level is not in the available set, fall through
|
|
118
|
+
// to NONE (mirroring the DD fallthrough behavior). Only SINGLE_SIMD_LEVEL
|
|
119
|
+
// and NONE have compiled specializations.
|
|
120
|
+
if constexpr (available_levels & (1 << int(SINGLE_SIMD_LEVEL))) {
|
|
121
|
+
return action.template operator()<SINGLE_SIMD_LEVEL>();
|
|
122
|
+
} else {
|
|
123
|
+
return action.template operator()<SIMDLevel::NONE>();
|
|
86
124
|
}
|
|
87
|
-
|
|
88
|
-
#else // Static mode
|
|
89
|
-
|
|
90
|
-
// Static mode: direct call to compiled-in SIMD level (no runtime switch)
|
|
91
|
-
#if defined(COMPILE_SIMD_AVX512_SPR)
|
|
92
|
-
#define DISPATCH_SIMDLevel(f, ...) return f<SIMDLevel::AVX512_SPR>(__VA_ARGS__)
|
|
93
|
-
#elif defined(COMPILE_SIMD_AVX512)
|
|
94
|
-
#define DISPATCH_SIMDLevel(f, ...) return f<SIMDLevel::AVX512>(__VA_ARGS__)
|
|
95
|
-
#elif defined(COMPILE_SIMD_AVX2)
|
|
96
|
-
#define DISPATCH_SIMDLevel(f, ...) return f<SIMDLevel::AVX2>(__VA_ARGS__)
|
|
97
|
-
#elif defined(COMPILE_SIMD_ARM_SVE)
|
|
98
|
-
#define DISPATCH_SIMDLevel(f, ...) return f<SIMDLevel::ARM_SVE>(__VA_ARGS__)
|
|
99
|
-
#elif defined(COMPILE_SIMD_ARM_NEON)
|
|
100
|
-
#define DISPATCH_SIMDLevel(f, ...) return f<SIMDLevel::ARM_NEON>(__VA_ARGS__)
|
|
101
|
-
#else
|
|
102
|
-
#define DISPATCH_SIMDLevel(f, ...) return f<SIMDLevel::NONE>(__VA_ARGS__)
|
|
103
125
|
#endif
|
|
104
|
-
|
|
105
|
-
#endif // FAISS_ENABLE_DD
|
|
126
|
+
}
|
|
106
127
|
|
|
107
128
|
/**
|
|
108
129
|
* Dispatch to a lambda with SIMDLevel as a compile-time constant.
|
|
@@ -126,6 +147,8 @@ namespace faiss {
|
|
|
126
147
|
* });
|
|
127
148
|
*
|
|
128
149
|
* The lambda must be a generic lambda with a SIMDLevel template parameter.
|
|
150
|
+
* By default, dispatch covers levels AVX2 + AVX512 + NEON + RVV (NONE is the fallback), since these
|
|
151
|
+
* are the most common cases.
|
|
129
152
|
*
|
|
130
153
|
* @param action A generic lambda with signature `template<SIMDLevel> T
|
|
131
154
|
* operator()()`
|
|
@@ -133,7 +156,18 @@ namespace faiss {
|
|
|
133
156
|
*/
|
|
134
157
|
template <typename LambdaType>
|
|
135
158
|
inline auto with_simd_level(LambdaType&& action) {
|
|
136
|
-
|
|
159
|
+
return with_selected_simd_levels<AVAILABLE_SIMD_LEVELS_A0>(
|
|
160
|
+
std::forward<LambdaType>(action));
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
/**
|
|
164
|
+
* Use for functions implemented with simdXintY (256-bit) operations
|
|
165
|
+
* that don't have dedicated AVX512 or SVE implementations.
|
|
166
|
+
*/
|
|
167
|
+
template <typename LambdaType>
|
|
168
|
+
inline auto with_simd_level_256bit(LambdaType&& action) {
|
|
169
|
+
return with_selected_simd_levels<AVAILABLE_SIMD_LEVELS_AVX2_NEON>(
|
|
170
|
+
std::forward<LambdaType>(action));
|
|
137
171
|
}
|
|
138
172
|
|
|
139
173
|
} // namespace faiss
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#pragma once
|
|
9
|
+
|
|
10
|
+
/** Abstractions for 256-bit and 512-bit SIMD registers.
|
|
11
|
+
*
|
|
12
|
+
* The objective is to separate the different interpretations of the same
|
|
13
|
+
* registers (as a vector of uint8, uint16 or uint32), to provide printing
|
|
14
|
+
* functions.
|
|
15
|
+
*
|
|
16
|
+
* The types are templatized on SIMDLevel. Each platform header provides
|
|
17
|
+
* explicit specializations for the appropriate level. Code without explicit
|
|
18
|
+
* SL context uses SINGLE_SIMD_LEVEL (see simd_levels.h).
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#include <faiss/utils/simd_levels.h>
|
|
22
|
+
|
|
23
|
+
namespace faiss {
|
|
24
|
+
|
|
25
|
+
// 256-bit primary templates
|
|
26
|
+
template <SIMDLevel SL>
|
|
27
|
+
struct simd256bit_tpl {};
|
|
28
|
+
template <SIMDLevel SL>
|
|
29
|
+
struct simd16uint16_tpl : simd256bit_tpl<SL> {};
|
|
30
|
+
template <SIMDLevel SL>
|
|
31
|
+
struct simd32uint8_tpl : simd256bit_tpl<SL> {};
|
|
32
|
+
template <SIMDLevel SL>
|
|
33
|
+
struct simd8uint32_tpl : simd256bit_tpl<SL> {};
|
|
34
|
+
template <SIMDLevel SL>
|
|
35
|
+
struct simd8float32_tpl : simd256bit_tpl<SL> {};
|
|
36
|
+
|
|
37
|
+
// 512-bit primary templates
|
|
38
|
+
template <SIMDLevel SL>
|
|
39
|
+
struct simd512bit_tpl {};
|
|
40
|
+
template <SIMDLevel SL>
|
|
41
|
+
struct simd32uint16_tpl : simd512bit_tpl<SL> {};
|
|
42
|
+
template <SIMDLevel SL>
|
|
43
|
+
struct simd64uint8_tpl : simd512bit_tpl<SL> {};
|
|
44
|
+
template <SIMDLevel SL>
|
|
45
|
+
struct simd16float32_tpl : simd512bit_tpl<SL> {};
|
|
46
|
+
|
|
47
|
+
} // namespace faiss
|
|
48
|
+
|
|
49
|
+
// NONE specialization — always included.
|
|
50
|
+
// Provides simd16uint16_tpl<NONE> etc. (scalar fallback).
|
|
51
|
+
// On PPC64: uses PPC-optimized scalar code (hand-tuned loop unrolling).
|
|
52
|
+
// Elsewhere: generic scalar implementation.
|
|
53
|
+
#if defined(__PPC64__)
|
|
54
|
+
#include <faiss/impl/simdlib/simdlib_ppc64.h>
|
|
55
|
+
#else
|
|
56
|
+
#include <faiss/impl/simdlib/simdlib_emulated.h>
|
|
57
|
+
#endif
|