faiss 0.6.0 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/faiss/extconf.rb +2 -1
- data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
- data/ext/faiss/index_binary.cpp +1 -1
- data/ext/faiss/kmeans.cpp +1 -1
- data/ext/faiss/pca_matrix.cpp +1 -1
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +93 -80
- data/vendor/faiss/faiss/Clustering.cpp +39 -240
- data/vendor/faiss/faiss/Clustering.h +6 -0
- data/vendor/faiss/faiss/IVFlib.cpp +41 -21
- data/vendor/faiss/faiss/Index.cpp +6 -5
- data/vendor/faiss/faiss/Index.h +5 -5
- data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
- data/vendor/faiss/faiss/IndexBinary.h +4 -4
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +88 -97
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
- data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +89 -417
- data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
- data/vendor/faiss/faiss/IndexFastScan.h +25 -23
- data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
- data/vendor/faiss/faiss/IndexFlat.h +21 -18
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
- data/vendor/faiss/faiss/IndexHNSW.cpp +374 -206
- data/vendor/faiss/faiss/IndexHNSW.h +16 -2
- data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
- data/vendor/faiss/faiss/IndexIDMap.h +9 -7
- data/vendor/faiss/faiss/IndexIVF.cpp +467 -364
- data/vendor/faiss/faiss/IndexIVF.h +33 -12
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +79 -76
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
- data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +39 -69
- data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +56 -33
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +73 -846
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +23 -20
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +30 -52
- data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
- data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
- data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
- data/vendor/faiss/faiss/IndexNSG.cpp +38 -23
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
- data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
- data/vendor/faiss/faiss/IndexPQ.h +3 -2
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
- data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
- data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
- data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
- data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
- data/vendor/faiss/faiss/IndexRefine.h +4 -4
- data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +150 -20
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
- data/vendor/faiss/faiss/IndexShards.cpp +10 -9
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
- data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
- data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
- data/vendor/faiss/faiss/MetaIndexes.h +1 -1
- data/vendor/faiss/faiss/MetricType.h +14 -7
- data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
- data/vendor/faiss/faiss/SuperKMeans.h +97 -0
- data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
- data/vendor/faiss/faiss/VectorTransform.h +16 -16
- data/vendor/faiss/faiss/build.cpp +23 -0
- data/vendor/faiss/faiss/build.h +15 -0
- data/vendor/faiss/faiss/clone_index.cpp +48 -47
- data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
- data/vendor/faiss/faiss/factory_tools.cpp +9 -0
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
- data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +15 -16
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +5 -4
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
- data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
- data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
- data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
- data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
- data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
- data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +58 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +111 -0
- data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
- data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
- data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
- data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
- data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
- data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
- data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
- data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
- data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
- data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
- data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
- data/vendor/faiss/faiss/impl/FaissException.h +50 -3
- data/vendor/faiss/faiss/impl/HNSW.cpp +639 -507
- data/vendor/faiss/faiss/impl/HNSW.h +61 -44
- data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
- data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
- data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
- data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
- data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
- data/vendor/faiss/faiss/impl/NSG.cpp +53 -32
- data/vendor/faiss/faiss/impl/NSG.h +4 -4
- data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
- data/vendor/faiss/faiss/impl/Panorama.h +269 -87
- data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
- data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +55 -25
- data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
- data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +302 -283
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +100 -75
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +318 -7
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +77 -1
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
- data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
- data/vendor/faiss/faiss/impl/VisitedTable.h +70 -28
- data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
- data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
- data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
- data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
- data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
- data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
- data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
- data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
- data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +270 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
- data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
- data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
- data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
- data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
- data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +83 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +113 -0
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +150 -0
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +142 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +1227 -79
- data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
- data/vendor/faiss/faiss/impl/index_write.cpp +96 -13
- data/vendor/faiss/faiss/impl/io.cpp +6 -6
- data/vendor/faiss/faiss/impl/io_macros.h +58 -16
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
- data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
- data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
- data/vendor/faiss/faiss/impl/platform_macros.h +15 -4
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +23 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +23 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +23 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +45 -107
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +274 -5
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +10 -7
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +70 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +9 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +419 -19
- data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +387 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +341 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +425 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +290 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +337 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +157 -66
- data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
- data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
- data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
- data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
- data/vendor/faiss/faiss/impl/svs_io.h +8 -2
- data/vendor/faiss/faiss/index_factory.cpp +90 -18
- data/vendor/faiss/faiss/index_io.h +40 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +28 -15
- data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +170 -86
- data/vendor/faiss/faiss/invlists/InvertedLists.h +88 -25
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +142 -21
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +33 -7
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +77 -27
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +10 -4
- data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
- data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
- data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
- data/vendor/faiss/faiss/utils/bf16.h +34 -0
- data/vendor/faiss/faiss/utils/distances.cpp +390 -560
- data/vendor/faiss/faiss/utils/distances.h +20 -1
- data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -178
- data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
- data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
- data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
- data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
- data/vendor/faiss/faiss/utils/hamming.h +92 -2
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +16 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +210 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +66 -989
- data/vendor/faiss/faiss/utils/partitioning.h +31 -0
- data/vendor/faiss/faiss/utils/popcount.h +29 -0
- data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
- data/vendor/faiss/faiss/utils/prefetch.h +2 -2
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
- data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
- data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
- data/vendor/faiss/faiss/utils/random.cpp +6 -6
- data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
- data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
- data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
- data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1031 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +29 -7
- data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
- data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
- data/vendor/faiss/faiss/utils/utils.cpp +5 -5
- data/vendor/faiss/faiss/utils/utils.h +3 -3
- metadata +129 -34
- data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
- data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
- data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
- data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
- data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
- data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
- data/vendor/faiss/faiss/utils/simdlib.h +0 -42
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
- /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* @file rabitq_avx512_spr.cpp
|
|
10
|
+
*
|
|
11
|
+
* RaBitQ SIMD kernels specialized for SIMDLevel::AVX512_SPR.
|
|
12
|
+
*
|
|
13
|
+
* Sapphire Rapids (SPR) and later Intel microarchitectures expose
|
|
14
|
+
* AVX-512 VPOPCNTDQ (vpopcntq), which performs a per-lane 64-bit
|
|
15
|
+
* popcount in a single instruction. This is used here to replace the
|
|
16
|
+
* multi-step shuffle/pshufb-based popcount used by the generic AVX-512
|
|
17
|
+
* specialization in rabitq_avx512.cpp. The popcount-heavy kernels
|
|
18
|
+
* (bitwise_and_dot_product, bitwise_xor_dot_product, popcount) become
|
|
19
|
+
* substantially shorter and faster on SPR+ as a result.
|
|
20
|
+
*
|
|
21
|
+
* Build / dispatch behavior:
|
|
22
|
+
* - faiss_avx512 (AVX-512 only, no SPR features): NOT compiled.
|
|
23
|
+
* The existing AVX512 specialization in rabitq_avx512.cpp is used.
|
|
24
|
+
* - faiss_avx512_spr (statically built for SPR+): compiled. The
|
|
25
|
+
* SINGLE_SIMD_LEVEL is AVX512_SPR, so this specialization is
|
|
26
|
+
* selected by static dispatch.
|
|
27
|
+
* - faiss with FAISS_OPT_LEVEL=dd (dynamic dispatch): compiled with
|
|
28
|
+
* -mavx512vpopcntdq as a per-file flag. Selected at runtime when
|
|
29
|
+
* SIMDConfig::level == SIMDLevel::AVX512_SPR.
|
|
30
|
+
*
|
|
31
|
+
* The floating-point multi-bit inner-product kernel does not benefit
|
|
32
|
+
* from VPOPCNTDQ, so this TU forwards compute_inner_product<SPR> to
|
|
33
|
+
* the AVX512 implementation to avoid duplicating that code path.
|
|
34
|
+
*/
|
|
35
|
+
|
|
36
|
+
#ifdef COMPILE_SIMD_AVX512_SPR
|
|
37
|
+
|
|
38
|
+
#include <faiss/utils/popcount.h>
#include <faiss/utils/rabitq_simd.h>
#include <immintrin.h>
#include <cstdint>
#include <cstring>

#if defined(_MSC_VER)
#include <intrin.h>
#endif
|
|
46
|
+
|
|
47
|
+
namespace faiss::rabitq {
|
|
48
|
+
|
|
49
|
+
// Forward declarations for the AVX512 specializations defined in
|
|
50
|
+
// rabitq_avx512.cpp. They live in the same TU group on SPR builds, so
|
|
51
|
+
// we can reuse them as a tail handler / fallback. Declaring rather
|
|
52
|
+
// than redefining avoids ODR risk and keeps a single source of truth
|
|
53
|
+
// for the floating-point kernel.
|
|
54
|
+
template <>
|
|
55
|
+
uint64_t bitwise_and_dot_product<SIMDLevel::AVX512>(
|
|
56
|
+
const uint8_t* query,
|
|
57
|
+
const uint8_t* data,
|
|
58
|
+
size_t size,
|
|
59
|
+
size_t qb);
|
|
60
|
+
template <>
|
|
61
|
+
uint64_t bitwise_xor_dot_product<SIMDLevel::AVX512>(
|
|
62
|
+
const uint8_t* query,
|
|
63
|
+
const uint8_t* data,
|
|
64
|
+
size_t size,
|
|
65
|
+
size_t qb);
|
|
66
|
+
template <>
|
|
67
|
+
uint64_t popcount<SIMDLevel::AVX512>(const uint8_t* data, size_t size);
|
|
68
|
+
|
|
69
|
+
namespace {
|
|
70
|
+
|
|
71
|
+
/// Per-lane population count of a 512-bit register.
///
/// Thin wrapper around _mm512_popcnt_epi64 (vpopcntq): every one of the
/// eight 64-bit lanes of the result holds the number of set bits of the
/// matching lane of `v`.
inline __m512i popcount_512_vpopcntdq(__m512i v) {
    const __m512i bits_per_lane = _mm512_popcnt_epi64(v);
    return bits_per_lane;
}
|
|
76
|
+
|
|
77
|
+
/// Per-lane population count of a 256-bit register.
///
/// Thin wrapper around _mm256_popcnt_epi64, the 256-bit form of vpopcntq
/// (needs AVX512VL on top of VPOPCNTDQ): every one of the four 64-bit
/// lanes of the result holds the number of set bits of the matching lane
/// of `v`.
inline __m256i popcount_256_vpopcntdq(__m256i v) {
    const __m256i bits_per_lane = _mm256_popcnt_epi64(v);
    return bits_per_lane;
}
|
|
83
|
+
|
|
84
|
+
/// Per-lane population count of a 128-bit register.
///
/// Thin wrapper around _mm_popcnt_epi64, the 128-bit form of vpopcntq:
/// each of the two 64-bit lanes of the result holds the number of set
/// bits of the matching lane of `v`.
inline __m128i popcount_128_vpopcntdq(__m128i v) {
    const __m128i bits_per_lane = _mm_popcnt_epi64(v);
    return bits_per_lane;
}
|
|
88
|
+
|
|
89
|
+
/// Horizontal sum of the four 64-bit lanes of `v`.
///
/// The register is spilled to a 32-byte aligned stack buffer and the lanes
/// are added with ordinary unsigned arithmetic (wrapping modulo 2^64).
inline uint64_t reduce_add_256(__m256i v) {
    alignas(32) uint64_t spilled[4];
    _mm256_store_si256(reinterpret_cast<__m256i*>(spilled), v);

    uint64_t total = 0;
    for (const uint64_t lane : spilled) {
        total += lane;
    }
    return total;
}
|
|
94
|
+
|
|
95
|
+
/// Horizontal sum of the two 64-bit lanes of `v`.
///
/// The register is spilled to a 16-byte aligned stack buffer and the lanes
/// are added with ordinary unsigned arithmetic (wrapping modulo 2^64).
inline uint64_t reduce_add_128(__m128i v) {
    alignas(16) uint64_t spilled[2];
    _mm_store_si128(reinterpret_cast<__m128i*>(spilled), v);

    uint64_t total = 0;
    for (const uint64_t lane : spilled) {
        total += lane;
    }
    return total;
}
|
|
100
|
+
|
|
101
|
+
} // namespace
|
|
102
|
+
|
|
103
|
+
template <>
|
|
104
|
+
uint64_t bitwise_and_dot_product<SIMDLevel::AVX512_SPR>(
|
|
105
|
+
const uint8_t* query,
|
|
106
|
+
const uint8_t* data,
|
|
107
|
+
size_t size,
|
|
108
|
+
size_t qb) {
|
|
109
|
+
uint64_t sum = 0;
|
|
110
|
+
size_t offset = 0;
|
|
111
|
+
|
|
112
|
+
// 512-bit main loop: vpopcntq replaces the shuffle-based popcount,
|
|
113
|
+
// halving the instruction count per iteration relative to AVX512.
|
|
114
|
+
if (size_t step = 512 / 8; offset + step <= size) {
|
|
115
|
+
__m512i sum_512 = _mm512_setzero_si512();
|
|
116
|
+
for (; offset + step <= size; offset += step) {
|
|
117
|
+
__m512i v_x = _mm512_loadu_si512(
|
|
118
|
+
reinterpret_cast<const __m512i*>(data + offset));
|
|
119
|
+
for (size_t j = 0; j < qb; j++) {
|
|
120
|
+
__m512i v_q = _mm512_loadu_si512(
|
|
121
|
+
reinterpret_cast<const __m512i*>(
|
|
122
|
+
query + j * size + offset));
|
|
123
|
+
__m512i v_and = _mm512_and_si512(v_q, v_x);
|
|
124
|
+
__m512i v_popcnt = popcount_512_vpopcntdq(v_and);
|
|
125
|
+
__m512i v_shifted = _mm512_slli_epi64(v_popcnt, j);
|
|
126
|
+
sum_512 = _mm512_add_epi64(sum_512, v_shifted);
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
sum += _mm512_reduce_add_epi64(sum_512);
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
// 256-bit tail.
|
|
133
|
+
if (size_t step = 256 / 8; offset + step <= size) {
|
|
134
|
+
__m256i sum_256 = _mm256_setzero_si256();
|
|
135
|
+
for (; offset + step <= size; offset += step) {
|
|
136
|
+
__m256i v_x = _mm256_loadu_si256(
|
|
137
|
+
reinterpret_cast<const __m256i*>(data + offset));
|
|
138
|
+
for (size_t j = 0; j < qb; j++) {
|
|
139
|
+
__m256i v_q = _mm256_loadu_si256(
|
|
140
|
+
reinterpret_cast<const __m256i*>(
|
|
141
|
+
query + j * size + offset));
|
|
142
|
+
__m256i v_and = _mm256_and_si256(v_q, v_x);
|
|
143
|
+
__m256i v_popcnt = popcount_256_vpopcntdq(v_and);
|
|
144
|
+
__m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
|
|
145
|
+
sum_256 = _mm256_add_epi64(sum_256, v_shifted);
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
sum += reduce_add_256(sum_256);
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
// 128-bit tail.
|
|
152
|
+
__m128i sum_128 = _mm_setzero_si128();
|
|
153
|
+
for (size_t step = 128 / 8; offset + step <= size; offset += step) {
|
|
154
|
+
__m128i v_x = _mm_loadu_si128(
|
|
155
|
+
reinterpret_cast<const __m128i*>(data + offset));
|
|
156
|
+
for (size_t j = 0; j < qb; j++) {
|
|
157
|
+
__m128i v_q = _mm_loadu_si128(
|
|
158
|
+
reinterpret_cast<const __m128i*>(
|
|
159
|
+
query + j * size + offset));
|
|
160
|
+
__m128i v_and = _mm_and_si128(v_q, v_x);
|
|
161
|
+
__m128i v_popcnt = popcount_128_vpopcntdq(v_and);
|
|
162
|
+
__m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
|
|
163
|
+
sum_128 = _mm_add_epi64(sum_128, v_shifted);
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
sum += reduce_add_128(sum_128);
|
|
167
|
+
|
|
168
|
+
// 64-bit scalar tail.
|
|
169
|
+
for (size_t step = 64 / 8; offset + step <= size; offset += step) {
|
|
170
|
+
const auto yv = *reinterpret_cast<const uint64_t*>(data + offset);
|
|
171
|
+
for (size_t j = 0; j < qb; j++) {
|
|
172
|
+
const auto qv = *reinterpret_cast<const uint64_t*>(
|
|
173
|
+
query + j * size + offset);
|
|
174
|
+
sum += static_cast<uint64_t>(popcount64(qv & yv)) << j;
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
// Byte tail.
|
|
178
|
+
for (; offset < size; ++offset) {
|
|
179
|
+
const auto yv = *(data + offset);
|
|
180
|
+
for (size_t j = 0; j < qb; j++) {
|
|
181
|
+
const auto qv = *(query + j * size + offset);
|
|
182
|
+
sum += static_cast<uint64_t>(popcount32(qv & yv)) << j;
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
return sum;
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
template <>
|
|
189
|
+
uint64_t bitwise_xor_dot_product<SIMDLevel::AVX512_SPR>(
|
|
190
|
+
const uint8_t* query,
|
|
191
|
+
const uint8_t* data,
|
|
192
|
+
size_t size,
|
|
193
|
+
size_t qb) {
|
|
194
|
+
uint64_t sum = 0;
|
|
195
|
+
size_t offset = 0;
|
|
196
|
+
|
|
197
|
+
if (size_t step = 512 / 8; offset + step <= size) {
|
|
198
|
+
__m512i sum_512 = _mm512_setzero_si512();
|
|
199
|
+
for (; offset + step <= size; offset += step) {
|
|
200
|
+
__m512i v_x = _mm512_loadu_si512(
|
|
201
|
+
reinterpret_cast<const __m512i*>(data + offset));
|
|
202
|
+
for (size_t j = 0; j < qb; j++) {
|
|
203
|
+
__m512i v_q = _mm512_loadu_si512(
|
|
204
|
+
reinterpret_cast<const __m512i*>(
|
|
205
|
+
query + j * size + offset));
|
|
206
|
+
__m512i v_xor = _mm512_xor_si512(v_q, v_x);
|
|
207
|
+
__m512i v_popcnt = popcount_512_vpopcntdq(v_xor);
|
|
208
|
+
__m512i v_shifted = _mm512_slli_epi64(v_popcnt, j);
|
|
209
|
+
sum_512 = _mm512_add_epi64(sum_512, v_shifted);
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
sum += _mm512_reduce_add_epi64(sum_512);
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
if (size_t step = 256 / 8; offset + step <= size) {
|
|
216
|
+
__m256i sum_256 = _mm256_setzero_si256();
|
|
217
|
+
for (; offset + step <= size; offset += step) {
|
|
218
|
+
__m256i v_x = _mm256_loadu_si256(
|
|
219
|
+
reinterpret_cast<const __m256i*>(data + offset));
|
|
220
|
+
for (size_t j = 0; j < qb; j++) {
|
|
221
|
+
__m256i v_q = _mm256_loadu_si256(
|
|
222
|
+
reinterpret_cast<const __m256i*>(
|
|
223
|
+
query + j * size + offset));
|
|
224
|
+
__m256i v_xor = _mm256_xor_si256(v_q, v_x);
|
|
225
|
+
__m256i v_popcnt = popcount_256_vpopcntdq(v_xor);
|
|
226
|
+
__m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
|
|
227
|
+
sum_256 = _mm256_add_epi64(sum_256, v_shifted);
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
sum += reduce_add_256(sum_256);
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
__m128i sum_128 = _mm_setzero_si128();
|
|
234
|
+
for (size_t step = 128 / 8; offset + step <= size; offset += step) {
|
|
235
|
+
__m128i v_x = _mm_loadu_si128(
|
|
236
|
+
reinterpret_cast<const __m128i*>(data + offset));
|
|
237
|
+
for (size_t j = 0; j < qb; j++) {
|
|
238
|
+
__m128i v_q = _mm_loadu_si128(
|
|
239
|
+
reinterpret_cast<const __m128i*>(
|
|
240
|
+
query + j * size + offset));
|
|
241
|
+
__m128i v_xor = _mm_xor_si128(v_q, v_x);
|
|
242
|
+
__m128i v_popcnt = popcount_128_vpopcntdq(v_xor);
|
|
243
|
+
__m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
|
|
244
|
+
sum_128 = _mm_add_epi64(sum_128, v_shifted);
|
|
245
|
+
}
|
|
246
|
+
}
|
|
247
|
+
sum += reduce_add_128(sum_128);
|
|
248
|
+
|
|
249
|
+
for (size_t step = 64 / 8; offset + step <= size; offset += step) {
|
|
250
|
+
const auto yv = *reinterpret_cast<const uint64_t*>(data + offset);
|
|
251
|
+
for (size_t j = 0; j < qb; j++) {
|
|
252
|
+
const auto qv = *reinterpret_cast<const uint64_t*>(
|
|
253
|
+
query + j * size + offset);
|
|
254
|
+
sum += static_cast<uint64_t>(popcount64(qv ^ yv)) << j;
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
for (; offset < size; ++offset) {
|
|
258
|
+
const auto yv = *(data + offset);
|
|
259
|
+
for (size_t j = 0; j < qb; j++) {
|
|
260
|
+
const auto qv = *(query + j * size + offset);
|
|
261
|
+
sum += static_cast<uint64_t>(popcount32(qv ^ yv)) << j;
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
return sum;
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
template <>
|
|
268
|
+
uint64_t popcount<SIMDLevel::AVX512_SPR>(const uint8_t* data, size_t size) {
|
|
269
|
+
uint64_t sum = 0;
|
|
270
|
+
size_t offset = 0;
|
|
271
|
+
|
|
272
|
+
if (offset + 512 / 8 <= size) {
|
|
273
|
+
__m512i sum_512 = _mm512_setzero_si512();
|
|
274
|
+
for (size_t end; (end = offset + 512 / 8) <= size; offset = end) {
|
|
275
|
+
__m512i v_x = _mm512_loadu_si512(
|
|
276
|
+
reinterpret_cast<const __m512i*>(data + offset));
|
|
277
|
+
__m512i v_popcnt = popcount_512_vpopcntdq(v_x);
|
|
278
|
+
sum_512 = _mm512_add_epi64(sum_512, v_popcnt);
|
|
279
|
+
}
|
|
280
|
+
sum += _mm512_reduce_add_epi64(sum_512);
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
if (offset + 256 / 8 <= size) {
|
|
284
|
+
__m256i sum_256 = _mm256_setzero_si256();
|
|
285
|
+
for (size_t end; (end = offset + 256 / 8) <= size; offset = end) {
|
|
286
|
+
__m256i v_x = _mm256_loadu_si256(
|
|
287
|
+
reinterpret_cast<const __m256i*>(data + offset));
|
|
288
|
+
__m256i v_popcnt = popcount_256_vpopcntdq(v_x);
|
|
289
|
+
sum_256 = _mm256_add_epi64(sum_256, v_popcnt);
|
|
290
|
+
}
|
|
291
|
+
sum += reduce_add_256(sum_256);
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
__m128i sum_128 = _mm_setzero_si128();
|
|
295
|
+
for (size_t step = 128 / 8; offset + step <= size; offset += step) {
|
|
296
|
+
__m128i v_x = _mm_loadu_si128(
|
|
297
|
+
reinterpret_cast<const __m128i*>(data + offset));
|
|
298
|
+
sum_128 = _mm_add_epi64(sum_128, popcount_128_vpopcntdq(v_x));
|
|
299
|
+
}
|
|
300
|
+
sum += reduce_add_128(sum_128);
|
|
301
|
+
|
|
302
|
+
for (size_t step = 64 / 8; offset + step <= size; offset += step) {
|
|
303
|
+
const auto yv = *reinterpret_cast<const uint64_t*>(data + offset);
|
|
304
|
+
sum += popcount64(yv);
|
|
305
|
+
}
|
|
306
|
+
for (; offset < size; ++offset) {
|
|
307
|
+
const auto yv = *(data + offset);
|
|
308
|
+
sum += popcount32(yv);
|
|
309
|
+
}
|
|
310
|
+
return sum;
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
} // namespace faiss::rabitq
|
|
314
|
+
|
|
315
|
+
namespace faiss::rabitq::multibit {
|
|
316
|
+
|
|
317
|
+
// Forward-declare the AVX512 floating-point inner-product kernel.
|
|
318
|
+
// VPOPCNTDQ does not help this kernel (it operates on FP32), so we
|
|
319
|
+
// reuse the AVX512 implementation rather than duplicate it.
|
|
320
|
+
template <>
|
|
321
|
+
float compute_inner_product<SIMDLevel::AVX512>(
|
|
322
|
+
const uint8_t* __restrict sign_bits,
|
|
323
|
+
const uint8_t* __restrict ex_code,
|
|
324
|
+
const float* __restrict rotated_q,
|
|
325
|
+
size_t d,
|
|
326
|
+
size_t ex_bits,
|
|
327
|
+
float cb);
|
|
328
|
+
|
|
329
|
+
template <>
|
|
330
|
+
float compute_inner_product<SIMDLevel::AVX512_SPR>(
|
|
331
|
+
const uint8_t* __restrict sign_bits,
|
|
332
|
+
const uint8_t* __restrict ex_code,
|
|
333
|
+
const float* __restrict rotated_q,
|
|
334
|
+
size_t d,
|
|
335
|
+
size_t ex_bits,
|
|
336
|
+
float cb) {
|
|
337
|
+
return compute_inner_product<SIMDLevel::AVX512>(
|
|
338
|
+
sign_bits, ex_code, rotated_q, d, ex_bits, cb);
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
} // namespace faiss::rabitq::multibit
|
|
342
|
+
|
|
343
|
+
#endif // COMPILE_SIMD_AVX512_SPR
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#include <faiss/utils/rabitq_simd.h>
|
|
9
|
+
|
|
10
|
+
#ifdef COMPILE_SIMD_ARM_NEON
|
|
11
|
+
|
|
12
|
+
namespace faiss::rabitq {
|
|
13
|
+
|
|
14
|
+
template <>
|
|
15
|
+
uint64_t bitwise_and_dot_product<SIMDLevel::ARM_NEON>(
|
|
16
|
+
const uint8_t* query,
|
|
17
|
+
const uint8_t* data,
|
|
18
|
+
size_t size,
|
|
19
|
+
size_t qb) {
|
|
20
|
+
return bitwise_and_dot_product<SIMDLevel::NONE>(query, data, size, qb);
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
template <>
|
|
24
|
+
uint64_t bitwise_xor_dot_product<SIMDLevel::ARM_NEON>(
|
|
25
|
+
const uint8_t* query,
|
|
26
|
+
const uint8_t* data,
|
|
27
|
+
size_t size,
|
|
28
|
+
size_t qb) {
|
|
29
|
+
return bitwise_xor_dot_product<SIMDLevel::NONE>(query, data, size, qb);
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
template <>
|
|
33
|
+
uint64_t popcount<SIMDLevel::ARM_NEON>(const uint8_t* data, size_t size) {
|
|
34
|
+
return popcount<SIMDLevel::NONE>(data, size);
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
} // namespace faiss::rabitq
|
|
38
|
+
|
|
39
|
+
namespace faiss::rabitq::multibit {
|
|
40
|
+
|
|
41
|
+
template <>
|
|
42
|
+
float compute_inner_product<SIMDLevel::ARM_NEON>(
|
|
43
|
+
const uint8_t* __restrict sign_bits,
|
|
44
|
+
const uint8_t* __restrict ex_code,
|
|
45
|
+
const float* __restrict rotated_q,
|
|
46
|
+
size_t d,
|
|
47
|
+
size_t ex_bits,
|
|
48
|
+
float cb) {
|
|
49
|
+
return compute_inner_product<SIMDLevel::NONE>(
|
|
50
|
+
sign_bits, ex_code, rotated_q, d, ex_bits, cb);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
} // namespace faiss::rabitq::multibit
|
|
54
|
+
|
|
55
|
+
#endif // COMPILE_SIMD_ARM_NEON
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#include <faiss/utils/rabitq_simd.h>
|
|
9
|
+
|
|
10
|
+
#ifdef COMPILE_SIMD_RISCV_RVV
|
|
11
|
+
|
|
12
|
+
namespace faiss::rabitq {
|
|
13
|
+
|
|
14
|
+
template <>
|
|
15
|
+
uint64_t bitwise_and_dot_product<SIMDLevel::RISCV_RVV>(
|
|
16
|
+
const uint8_t* query,
|
|
17
|
+
const uint8_t* data,
|
|
18
|
+
size_t size,
|
|
19
|
+
size_t qb) {
|
|
20
|
+
return bitwise_and_dot_product<SIMDLevel::NONE>(query, data, size, qb);
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
template <>
|
|
24
|
+
uint64_t bitwise_xor_dot_product<SIMDLevel::RISCV_RVV>(
|
|
25
|
+
const uint8_t* query,
|
|
26
|
+
const uint8_t* data,
|
|
27
|
+
size_t size,
|
|
28
|
+
size_t qb) {
|
|
29
|
+
return bitwise_xor_dot_product<SIMDLevel::NONE>(query, data, size, qb);
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
template <>
|
|
33
|
+
uint64_t popcount<SIMDLevel::RISCV_RVV>(const uint8_t* data, size_t size) {
|
|
34
|
+
return popcount<SIMDLevel::NONE>(data, size);
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
} // namespace faiss::rabitq
|
|
38
|
+
|
|
39
|
+
namespace faiss::rabitq::multibit {
|
|
40
|
+
|
|
41
|
+
template <>
|
|
42
|
+
float compute_inner_product<SIMDLevel::RISCV_RVV>(
|
|
43
|
+
const uint8_t* __restrict sign_bits,
|
|
44
|
+
const uint8_t* __restrict ex_code,
|
|
45
|
+
const float* __restrict rotated_q,
|
|
46
|
+
size_t d,
|
|
47
|
+
size_t ex_bits,
|
|
48
|
+
float cb) {
|
|
49
|
+
return compute_inner_product<SIMDLevel::NONE>(
|
|
50
|
+
sign_bits, ex_code, rotated_q, d, ex_bits, cb);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
} // namespace faiss::rabitq::multibit
|
|
54
|
+
|
|
55
|
+
#endif // COMPILE_SIMD_RISCV_RVV
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#pragma once
|
|
9
|
+
|
|
10
|
+
// Private dispatch wrapper for SuperKMeans's block_l2. Routes to the
// highest available SIMD specialization at runtime (DD mode) or to the
// compiled-in level (static mode).
//
// Known perf gap: aarch64 (NEON/SVE) specializations are not implemented
// yet, so aarch64 currently falls through to the scalar primary template.
// Adding NEON/SVE only requires a new specialization file alongside the
// AVX ones. Validating SVE requires a Graviton-class host, so that work is
// deferred to a focused follow-up.
|
|
19
|
+
|
|
20
|
+
#include <faiss/impl/simd_dispatch.h>
|
|
21
|
+
#include <faiss/utils/simd_impl/super_kmeans_kernels.h>
|
|
22
|
+
|
|
23
|
+
namespace faiss {
|
|
24
|
+
namespace detail {
|
|
25
|
+
|
|
26
|
+
inline float block_l2_dispatch(const float* x, const float* y, int n) {
|
|
27
|
+
return with_selected_simd_levels<AVAILABLE_SIMD_LEVELS_A0>(
|
|
28
|
+
[&]<SIMDLevel SL>() { return block_l2<SL>(x, y, n); });
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
} // namespace detail
|
|
32
|
+
} // namespace faiss
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#pragma once
|
|
9
|
+
|
|
10
|
+
#include <cstddef>
|
|
11
|
+
|
|
12
|
+
#include <faiss/utils/simd_levels.h>
|
|
13
|
+
|
|
14
|
+
namespace faiss {
|
|
15
|
+
namespace detail {
|
|
16
|
+
|
|
17
|
+
// Squared L2 over `n` dimensions; n in [1, pdx_block_size].
|
|
18
|
+
// Primary template is the scalar fallback; SIMDLevels without a dedicated
|
|
19
|
+
// specialization (ARM_NEON, ARM_SVE, NONE, ...) use it directly.
|
|
20
|
+
template <SIMDLevel Level>
|
|
21
|
+
inline float block_l2(const float* x, const float* y, int n) {
|
|
22
|
+
float s = 0.0f;
|
|
23
|
+
for (int m = 0; m < n; ++m) {
|
|
24
|
+
const float d = x[m] - y[m];
|
|
25
|
+
s += d * d;
|
|
26
|
+
}
|
|
27
|
+
return s;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
// COMPILE_SIMD_* is a build-system define (link-time promise that the
|
|
31
|
+
// specialization will be available). Mirrors the impl-file guards.
|
|
32
|
+
#ifdef COMPILE_SIMD_AVX2
|
|
33
|
+
template <>
|
|
34
|
+
float block_l2<SIMDLevel::AVX2>(const float* x, const float* y, int n);
|
|
35
|
+
#endif
|
|
36
|
+
|
|
37
|
+
#ifdef COMPILE_SIMD_AVX512
|
|
38
|
+
template <>
|
|
39
|
+
float block_l2<SIMDLevel::AVX512>(const float* x, const float* y, int n);
|
|
40
|
+
#endif
|
|
41
|
+
|
|
42
|
+
} // namespace detail
|
|
43
|
+
} // namespace faiss
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#ifdef COMPILE_SIMD_AVX2
|
|
9
|
+
|
|
10
|
+
#include <faiss/utils/simd_impl/super_kmeans_kernels.h>
|
|
11
|
+
|
|
12
|
+
#include <immintrin.h>
|
|
13
|
+
|
|
14
|
+
namespace faiss {
|
|
15
|
+
namespace detail {
|
|
16
|
+
|
|
17
|
+
namespace {
|
|
18
|
+
|
|
19
|
+
// Sums the 8 float lanes of `v` into one scalar.
// Implemented as a shuffle+add tree (movehdup / movehl / add_ss) rather than
// a pair of _mm_hadd_ps calls; per the original author's note, hadd is a
// 3-cycle, 2-uop instruction on Skylake-class cores while these shuffles and
// adds are single-uop, single-cycle.
inline float horizontal_sum_avx2(__m256 v) {
    // Fold the upper 128-bit half onto the lower one: 4 partial sums s0..s3.
    const __m128 quad = _mm_add_ps(
            _mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
    const __m128 odd = _mm_movehdup_ps(quad); // [s1, s1, s3, s3]
    const __m128 pairs = _mm_add_ps(quad, odd); // [s0+s1, _, s2+s3, _]
    const __m128 upper = _mm_movehl_ps(odd, pairs); // [s2+s3, _, _, _]
    // Lane 0 now holds (s0+s1) + (s2+s3).
    return _mm_cvtss_f32(_mm_add_ss(pairs, upper));
}
|
|
33
|
+
|
|
34
|
+
} // namespace
|
|
35
|
+
|
|
36
|
+
template <>
|
|
37
|
+
float block_l2<SIMDLevel::AVX2>(const float* x, const float* y, int n) {
|
|
38
|
+
__m256 acc = _mm256_setzero_ps();
|
|
39
|
+
int m = 0;
|
|
40
|
+
for (; m + 8 <= n; m += 8) {
|
|
41
|
+
__m256 xv = _mm256_loadu_ps(x + m);
|
|
42
|
+
__m256 yv = _mm256_loadu_ps(y + m);
|
|
43
|
+
__m256 diff = _mm256_sub_ps(xv, yv);
|
|
44
|
+
acc = _mm256_fmadd_ps(diff, diff, acc);
|
|
45
|
+
}
|
|
46
|
+
float result = horizontal_sum_avx2(acc);
|
|
47
|
+
for (; m < n; ++m) {
|
|
48
|
+
const float d = x[m] - y[m];
|
|
49
|
+
result += d * d;
|
|
50
|
+
}
|
|
51
|
+
return result;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
} // namespace detail
|
|
55
|
+
} // namespace faiss
|
|
56
|
+
|
|
57
|
+
#endif // COMPILE_SIMD_AVX2
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#ifdef COMPILE_SIMD_AVX512
|
|
9
|
+
|
|
10
|
+
#include <faiss/utils/simd_impl/super_kmeans_kernels.h>
|
|
11
|
+
|
|
12
|
+
#include <immintrin.h>
|
|
13
|
+
|
|
14
|
+
namespace faiss {
|
|
15
|
+
namespace detail {
|
|
16
|
+
|
|
17
|
+
template <>
|
|
18
|
+
float block_l2<SIMDLevel::AVX512>(const float* x, const float* y, int n) {
|
|
19
|
+
__m512 acc = _mm512_setzero_ps();
|
|
20
|
+
int m = 0;
|
|
21
|
+
for (; m + 16 <= n; m += 16) {
|
|
22
|
+
__m512 xv = _mm512_loadu_ps(x + m);
|
|
23
|
+
__m512 yv = _mm512_loadu_ps(y + m);
|
|
24
|
+
__m512 diff = _mm512_sub_ps(xv, yv);
|
|
25
|
+
acc = _mm512_fmadd_ps(diff, diff, acc);
|
|
26
|
+
}
|
|
27
|
+
// _mm512_reduce_add_ps: on modern AVX-512 SKUs (Cascade Lake+, Sapphire
|
|
28
|
+
// Rapids) GCC/Clang lower this to a shuffle+add tree, ~5-cycle latency.
|
|
29
|
+
// On older AVX-512 SKUs (Skylake-X, Ice Lake) the cross-lane reduction
|
|
30
|
+
// can be ~20 cycles. Acceptable here because n ~ pdx_block_size = 64
|
|
31
|
+
// (4 iterations of 16-wide accumulation), so per-block work dominates
|
|
32
|
+
// the reduction cost. AVX2 uses a manual shuffle+add tree explicitly
|
|
33
|
+
// to avoid `_mm_hadd_ps` overhead, where the ratio is reversed.
|
|
34
|
+
float result = _mm512_reduce_add_ps(acc);
|
|
35
|
+
for (; m < n; ++m) {
|
|
36
|
+
const float d = x[m] - y[m];
|
|
37
|
+
result += d * d;
|
|
38
|
+
}
|
|
39
|
+
return result;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
} // namespace detail
|
|
43
|
+
} // namespace faiss
|
|
44
|
+
|
|
45
|
+
#endif // COMPILE_SIMD_AVX512
|