faiss 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/faiss/extconf.rb +2 -1
- data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
- data/ext/faiss/index_binary.cpp +1 -1
- data/ext/faiss/kmeans.cpp +1 -1
- data/ext/faiss/pca_matrix.cpp +1 -1
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +93 -80
- data/vendor/faiss/faiss/Clustering.cpp +39 -240
- data/vendor/faiss/faiss/Clustering.h +6 -0
- data/vendor/faiss/faiss/IVFlib.cpp +41 -21
- data/vendor/faiss/faiss/Index.cpp +6 -5
- data/vendor/faiss/faiss/Index.h +5 -5
- data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
- data/vendor/faiss/faiss/IndexBinary.h +4 -4
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +84 -92
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
- data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +87 -415
- data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
- data/vendor/faiss/faiss/IndexFastScan.h +25 -23
- data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
- data/vendor/faiss/faiss/IndexFlat.h +21 -18
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
- data/vendor/faiss/faiss/IndexHNSW.cpp +283 -145
- data/vendor/faiss/faiss/IndexHNSW.h +16 -2
- data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
- data/vendor/faiss/faiss/IndexIDMap.h +9 -7
- data/vendor/faiss/faiss/IndexIVF.cpp +465 -362
- data/vendor/faiss/faiss/IndexIVF.h +33 -12
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
- data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +36 -68
- data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +53 -30
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +71 -843
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +21 -17
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +26 -39
- data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
- data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
- data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
- data/vendor/faiss/faiss/IndexNSG.cpp +39 -23
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
- data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
- data/vendor/faiss/faiss/IndexPQ.h +3 -2
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
- data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
- data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
- data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
- data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
- data/vendor/faiss/faiss/IndexRefine.h +4 -4
- data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
- data/vendor/faiss/faiss/IndexShards.cpp +10 -9
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
- data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
- data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
- data/vendor/faiss/faiss/MetaIndexes.h +1 -1
- data/vendor/faiss/faiss/MetricType.h +14 -7
- data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
- data/vendor/faiss/faiss/SuperKMeans.h +97 -0
- data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
- data/vendor/faiss/faiss/VectorTransform.h +16 -16
- data/vendor/faiss/faiss/build.cpp +23 -0
- data/vendor/faiss/faiss/build.h +15 -0
- data/vendor/faiss/faiss/clone_index.cpp +48 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
- data/vendor/faiss/faiss/factory_tools.cpp +5 -0
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
- data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
- data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
- data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
- data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
- data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
- data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
- data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
- data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
- data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
- data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
- data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
- data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
- data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
- data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
- data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
- data/vendor/faiss/faiss/impl/FaissException.h +50 -3
- data/vendor/faiss/faiss/impl/HNSW.cpp +92 -317
- data/vendor/faiss/faiss/impl/HNSW.h +13 -34
- data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
- data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
- data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
- data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
- data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
- data/vendor/faiss/faiss/impl/NSG.cpp +38 -21
- data/vendor/faiss/faiss/impl/NSG.h +4 -4
- data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
- data/vendor/faiss/faiss/impl/Panorama.h +258 -87
- data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
- data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +30 -23
- data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
- data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +296 -283
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +99 -75
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +52 -4
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -1
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
- data/vendor/faiss/faiss/impl/VisitedTable.h +7 -0
- data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
- data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
- data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
- data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
- data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
- data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
- data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
- data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
- data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
- data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
- data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
- data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
- data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
- data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +1132 -45
- data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
- data/vendor/faiss/faiss/impl/index_write.cpp +95 -13
- data/vendor/faiss/faiss/impl/io.cpp +6 -6
- data/vendor/faiss/faiss/impl/io_macros.h +33 -16
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
- data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
- data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
- data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +29 -111
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +238 -5
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -7
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +3 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +102 -11
- data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +148 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +167 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +59 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +163 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +100 -66
- data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
- data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
- data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
- data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
- data/vendor/faiss/faiss/impl/svs_io.h +8 -2
- data/vendor/faiss/faiss/index_factory.cpp +86 -18
- data/vendor/faiss/faiss/index_io.h +24 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
- data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
- data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +18 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +12 -3
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +7 -2
- data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
- data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
- data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
- data/vendor/faiss/faiss/utils/distances.cpp +390 -560
- data/vendor/faiss/faiss/utils/distances.h +20 -1
- data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -177
- data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
- data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
- data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
- data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
- data/vendor/faiss/faiss/utils/hamming.h +92 -2
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
- data/vendor/faiss/faiss/utils/partitioning.h +31 -0
- data/vendor/faiss/faiss/utils/popcount.h +29 -0
- data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
- data/vendor/faiss/faiss/utils/prefetch.h +2 -2
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
- data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
- data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
- data/vendor/faiss/faiss/utils/random.cpp +6 -6
- data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
- data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
- data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
- data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +17 -5
- data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
- data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
- data/vendor/faiss/faiss/utils/utils.cpp +5 -5
- data/vendor/faiss/faiss/utils/utils.h +3 -3
- metadata +119 -34
- data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
- data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
- data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
- data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
- data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
- data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
- data/vendor/faiss/faiss/utils/simdlib.h +0 -42
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
- /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
|
@@ -11,414 +11,116 @@
|
|
|
11
11
|
#include <cstdint>
|
|
12
12
|
#include <cstring>
|
|
13
13
|
|
|
14
|
-
|
|
15
|
-
#
|
|
16
|
-
defined(_M_IX86)
|
|
17
|
-
#include <immintrin.h>
|
|
18
|
-
#endif // defined(__x86_64__) || defined(_M_X64) || defined(__i386__) ||
|
|
14
|
+
#include <faiss/utils/popcount.h>
|
|
15
|
+
#include <faiss/utils/simd_levels.h>
|
|
19
16
|
|
|
20
17
|
namespace faiss::rabitq {
|
|
21
18
|
|
|
22
|
-
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
|
|
23
|
-
defined(_M_IX86)
|
|
24
19
|
/**
|
|
25
|
-
*
|
|
26
|
-
* This table is used for lookup-based popcount implementation.
|
|
20
|
+
* Compute dot product between query and binary data using popcount on AND.
|
|
27
21
|
*
|
|
28
|
-
*
|
|
29
|
-
*
|
|
30
|
-
* @
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
inline __m512i get_lookup_512() {
|
|
34
|
-
return _mm512_set_epi8(
|
|
35
|
-
/* f */ 4,
|
|
36
|
-
/* e */ 3,
|
|
37
|
-
/* d */ 3,
|
|
38
|
-
/* c */ 2,
|
|
39
|
-
/* b */ 3,
|
|
40
|
-
/* a */ 2,
|
|
41
|
-
/* 9 */ 2,
|
|
42
|
-
/* 8 */ 1,
|
|
43
|
-
/* 7 */ 3,
|
|
44
|
-
/* 6 */ 2,
|
|
45
|
-
/* 5 */ 2,
|
|
46
|
-
/* 4 */ 1,
|
|
47
|
-
/* 3 */ 2,
|
|
48
|
-
/* 2 */ 1,
|
|
49
|
-
/* 1 */ 1,
|
|
50
|
-
/* 0 */ 0,
|
|
51
|
-
/* f */ 4,
|
|
52
|
-
/* e */ 3,
|
|
53
|
-
/* d */ 3,
|
|
54
|
-
/* c */ 2,
|
|
55
|
-
/* b */ 3,
|
|
56
|
-
/* a */ 2,
|
|
57
|
-
/* 9 */ 2,
|
|
58
|
-
/* 8 */ 1,
|
|
59
|
-
/* 7 */ 3,
|
|
60
|
-
/* 6 */ 2,
|
|
61
|
-
/* 5 */ 2,
|
|
62
|
-
/* 4 */ 1,
|
|
63
|
-
/* 3 */ 2,
|
|
64
|
-
/* 2 */ 1,
|
|
65
|
-
/* 1 */ 1,
|
|
66
|
-
/* 0 */ 0,
|
|
67
|
-
/* f */ 4,
|
|
68
|
-
/* e */ 3,
|
|
69
|
-
/* d */ 3,
|
|
70
|
-
/* c */ 2,
|
|
71
|
-
/* b */ 3,
|
|
72
|
-
/* a */ 2,
|
|
73
|
-
/* 9 */ 2,
|
|
74
|
-
/* 8 */ 1,
|
|
75
|
-
/* 7 */ 3,
|
|
76
|
-
/* 6 */ 2,
|
|
77
|
-
/* 5 */ 2,
|
|
78
|
-
/* 4 */ 1,
|
|
79
|
-
/* 3 */ 2,
|
|
80
|
-
/* 2 */ 1,
|
|
81
|
-
/* 1 */ 1,
|
|
82
|
-
/* 0 */ 0,
|
|
83
|
-
/* f */ 4,
|
|
84
|
-
/* e */ 3,
|
|
85
|
-
/* d */ 3,
|
|
86
|
-
/* c */ 2,
|
|
87
|
-
/* b */ 3,
|
|
88
|
-
/* a */ 2,
|
|
89
|
-
/* 9 */ 2,
|
|
90
|
-
/* 8 */ 1,
|
|
91
|
-
/* 7 */ 3,
|
|
92
|
-
/* 6 */ 2,
|
|
93
|
-
/* 5 */ 2,
|
|
94
|
-
/* 4 */ 1,
|
|
95
|
-
/* 3 */ 2,
|
|
96
|
-
/* 2 */ 1,
|
|
97
|
-
/* 1 */ 1,
|
|
98
|
-
/* 0 */ 0);
|
|
99
|
-
}
|
|
100
|
-
#endif // defined(__AVX512F__)
|
|
101
|
-
#if defined(__AVX2__)
|
|
102
|
-
/**
|
|
103
|
-
* Returns the lookup table for AVX2 popcount operations.
|
|
104
|
-
* This table is used for lookup-based popcount implementation.
|
|
105
|
-
*
|
|
106
|
-
* @return Lookup table as __m256i register
|
|
22
|
+
* @param query Pointer to rearranged rotated query data
|
|
23
|
+
* @param data Pointer to binary data
|
|
24
|
+
* @param size Size in bytes
|
|
25
|
+
* @param qb Number of quantization bits
|
|
26
|
+
* @return Unsigned integer dot product
|
|
107
27
|
*/
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
/* 4 */ 1,
|
|
115
|
-
/* 5 */ 2,
|
|
116
|
-
/* 6 */ 2,
|
|
117
|
-
/* 7 */ 3,
|
|
118
|
-
/* 8 */ 1,
|
|
119
|
-
/* 9 */ 2,
|
|
120
|
-
/* a */ 2,
|
|
121
|
-
/* b */ 3,
|
|
122
|
-
/* c */ 2,
|
|
123
|
-
/* d */ 3,
|
|
124
|
-
/* e */ 3,
|
|
125
|
-
/* f */ 4,
|
|
126
|
-
/* 0 */ 0,
|
|
127
|
-
/* 1 */ 1,
|
|
128
|
-
/* 2 */ 1,
|
|
129
|
-
/* 3 */ 2,
|
|
130
|
-
/* 4 */ 1,
|
|
131
|
-
/* 5 */ 2,
|
|
132
|
-
/* 6 */ 2,
|
|
133
|
-
/* 7 */ 3,
|
|
134
|
-
/* 8 */ 1,
|
|
135
|
-
/* 9 */ 2,
|
|
136
|
-
/* a */ 2,
|
|
137
|
-
/* b */ 3,
|
|
138
|
-
/* c */ 2,
|
|
139
|
-
/* d */ 3,
|
|
140
|
-
/* e */ 3,
|
|
141
|
-
/* f */ 4);
|
|
142
|
-
}
|
|
143
|
-
#endif // defined(__AVX2__)
|
|
28
|
+
template <SIMDLevel SL = SINGLE_SIMD_LEVEL>
|
|
29
|
+
uint64_t bitwise_and_dot_product(
|
|
30
|
+
const uint8_t* query,
|
|
31
|
+
const uint8_t* data,
|
|
32
|
+
size_t size,
|
|
33
|
+
size_t qb);
|
|
144
34
|
|
|
145
|
-
#if defined(__AVX512F__)
|
|
146
35
|
/**
|
|
147
|
-
*
|
|
36
|
+
* Compute dot product between query and binary data using popcount on XOR.
|
|
148
37
|
*
|
|
149
|
-
* @param
|
|
150
|
-
* @
|
|
38
|
+
* @param query Pointer to rearranged rotated query data
|
|
39
|
+
* @param data Pointer to binary data
|
|
40
|
+
* @param size Size in bytes
|
|
41
|
+
* @param qb Number of quantization bits
|
|
42
|
+
* @return Unsigned integer dot product
|
|
151
43
|
*/
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
const __m512i lo = _mm512_and_si512(v, low_mask);
|
|
160
|
-
const __m512i hi = _mm512_and_si512(_mm512_srli_epi16(v, 4), low_mask);
|
|
161
|
-
const __m512i popcnt_lo = _mm512_shuffle_epi8(lookup, lo);
|
|
162
|
-
const __m512i popcnt_hi = _mm512_shuffle_epi8(lookup, hi);
|
|
163
|
-
const __m512i popcnt = _mm512_add_epi8(popcnt_lo, popcnt_hi);
|
|
164
|
-
return _mm512_sad_epu8(_mm512_setzero_si512(), popcnt);
|
|
165
|
-
#endif // defined(__AVX512VPOPCNTDQ__)
|
|
166
|
-
}
|
|
167
|
-
#endif // defined(__AVX512F__)
|
|
44
|
+
template <SIMDLevel SL = SINGLE_SIMD_LEVEL>
|
|
45
|
+
uint64_t bitwise_xor_dot_product(
|
|
46
|
+
const uint8_t* query,
|
|
47
|
+
const uint8_t* data,
|
|
48
|
+
size_t size,
|
|
49
|
+
size_t qb);
|
|
168
50
|
|
|
169
|
-
#if defined(__AVX2__)
|
|
170
51
|
/**
|
|
171
|
-
*
|
|
52
|
+
* Count total set bits in data.
|
|
172
53
|
*
|
|
173
|
-
* @param
|
|
174
|
-
* @
|
|
54
|
+
* @param data Pointer to binary data
|
|
55
|
+
* @param size Size in bytes
|
|
56
|
+
* @return Total popcount
|
|
175
57
|
*/
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
const __m256i low_mask = _mm256_set1_epi8(0x0f);
|
|
179
|
-
|
|
180
|
-
const __m256i lo = _mm256_and_si256(v, low_mask);
|
|
181
|
-
const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
|
|
182
|
-
const __m256i popcnt_lo = _mm256_shuffle_epi8(lookup, lo);
|
|
183
|
-
const __m256i popcnt_hi = _mm256_shuffle_epi8(lookup, hi);
|
|
184
|
-
const __m256i popcnt = _mm256_add_epi8(popcnt_lo, popcnt_hi);
|
|
185
|
-
// Reduce uint8_t[32] into uint64_t[4] by addition.
|
|
186
|
-
return _mm256_sad_epu8(_mm256_setzero_si256(), popcnt);
|
|
187
|
-
}
|
|
188
|
-
|
|
189
|
-
inline uint64_t reduce_add_256(__m256i v) {
|
|
190
|
-
alignas(32) uint64_t lanes[4];
|
|
191
|
-
_mm256_store_si256((__m256i*)lanes, v);
|
|
192
|
-
return lanes[0] + lanes[1] + lanes[2] + lanes[3];
|
|
193
|
-
}
|
|
194
|
-
#endif // defined(__AVX2__)
|
|
195
|
-
|
|
196
|
-
#if defined(__SSE4_1__)
|
|
197
|
-
inline __m128i popcount_128(__m128i v) {
|
|
198
|
-
// Scalar popcount for each 64-bit lane
|
|
199
|
-
uint64_t lane0 = _mm_extract_epi64(v, 0);
|
|
200
|
-
uint64_t lane1 = _mm_extract_epi64(v, 1);
|
|
201
|
-
uint64_t pop0 = __builtin_popcountll(lane0);
|
|
202
|
-
uint64_t pop1 = __builtin_popcountll(lane1);
|
|
203
|
-
return _mm_set_epi64x(pop1, pop0);
|
|
204
|
-
}
|
|
58
|
+
template <SIMDLevel SL = SINGLE_SIMD_LEVEL>
|
|
59
|
+
uint64_t popcount(const uint8_t* data, size_t size);
|
|
205
60
|
|
|
206
|
-
|
|
207
|
-
alignas(16) uint64_t lanes[2];
|
|
208
|
-
_mm_store_si128((__m128i*)lanes, v);
|
|
209
|
-
return lanes[0] + lanes[1];
|
|
210
|
-
}
|
|
211
|
-
#endif // defined(__SSE4_1__)
|
|
212
|
-
#endif // defined(__x86_64__) || defined(_M_X64) || defined(__i386__) ||
|
|
61
|
+
// NONE specializations — scalar fallbacks
|
|
213
62
|
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
*
|
|
217
|
-
* @param query Pointer to rearranged rotated query data
|
|
218
|
-
* @param data Pointer to binary data
|
|
219
|
-
* @param d Dimension
|
|
220
|
-
* @param qb Number of quantization bits
|
|
221
|
-
* @return Unsigned integer dot product
|
|
222
|
-
*/
|
|
223
|
-
inline uint64_t bitwise_and_dot_product(
|
|
63
|
+
template <>
|
|
64
|
+
inline uint64_t bitwise_and_dot_product<SIMDLevel::NONE>(
|
|
224
65
|
const uint8_t* query,
|
|
225
66
|
const uint8_t* data,
|
|
226
67
|
size_t size,
|
|
227
68
|
size_t qb) {
|
|
228
69
|
uint64_t sum = 0;
|
|
229
70
|
size_t offset = 0;
|
|
230
|
-
#if defined(__AVX512F__)
|
|
231
|
-
// Handle 512-bit chunks.
|
|
232
|
-
if (size_t step = 512 / 8; offset + step <= size) {
|
|
233
|
-
__m512i sum_512 = _mm512_setzero_si512();
|
|
234
|
-
for (; offset + step <= size; offset += step) {
|
|
235
|
-
__m512i v_x = _mm512_loadu_si512((const __m512i*)(data + offset));
|
|
236
|
-
for (int j = 0; j < qb; j++) {
|
|
237
|
-
__m512i v_q = _mm512_loadu_si512(
|
|
238
|
-
(const __m512i*)(query + j * size + offset));
|
|
239
|
-
__m512i v_and = _mm512_and_si512(v_q, v_x);
|
|
240
|
-
__m512i v_popcnt = popcount_512(v_and);
|
|
241
|
-
__m512i v_shifted = _mm512_slli_epi64(v_popcnt, j);
|
|
242
|
-
sum_512 = _mm512_add_epi64(sum_512, v_shifted);
|
|
243
|
-
}
|
|
244
|
-
}
|
|
245
|
-
sum += _mm512_reduce_add_epi64(sum_512);
|
|
246
|
-
}
|
|
247
|
-
#endif // defined(__AVX512F__)
|
|
248
|
-
#if defined(__AVX2__)
|
|
249
|
-
if (size_t step = 256 / 8; offset + step <= size) {
|
|
250
|
-
__m256i sum_256 = _mm256_setzero_si256();
|
|
251
|
-
for (; offset + step <= size; offset += step) {
|
|
252
|
-
__m256i v_x = _mm256_loadu_si256((const __m256i*)(data + offset));
|
|
253
|
-
for (int j = 0; j < qb; j++) {
|
|
254
|
-
__m256i v_q = _mm256_loadu_si256(
|
|
255
|
-
(const __m256i*)(query + j * size + offset));
|
|
256
|
-
__m256i v_and = _mm256_and_si256(v_q, v_x);
|
|
257
|
-
__m256i v_popcnt = popcount_256(v_and);
|
|
258
|
-
__m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
|
|
259
|
-
sum_256 = _mm256_add_epi64(sum_256, v_shifted);
|
|
260
|
-
}
|
|
261
|
-
}
|
|
262
|
-
sum += reduce_add_256(sum_256);
|
|
263
|
-
}
|
|
264
|
-
#endif // defined(__AVX2__)
|
|
265
|
-
#if defined(__SSE4_1__)
|
|
266
|
-
__m128i sum_128 = _mm_setzero_si128();
|
|
267
|
-
for (size_t step = 128 / 8; offset + step <= size; offset += step) {
|
|
268
|
-
__m128i v_x = _mm_loadu_si128((const __m128i*)(data + offset));
|
|
269
|
-
for (int j = 0; j < qb; j++) {
|
|
270
|
-
__m128i v_q = _mm_loadu_si128(
|
|
271
|
-
(const __m128i*)(query + j * size + offset));
|
|
272
|
-
__m128i v_and = _mm_and_si128(v_q, v_x);
|
|
273
|
-
__m128i v_popcnt = popcount_128(v_and);
|
|
274
|
-
__m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
|
|
275
|
-
sum_128 = _mm_add_epi64(sum_128, v_shifted);
|
|
276
|
-
}
|
|
277
|
-
}
|
|
278
|
-
sum += reduce_add_128(sum_128);
|
|
279
|
-
#endif // defined(__SSE4_1__)
|
|
280
71
|
for (size_t step = 64 / 8; offset + step <= size; offset += step) {
|
|
281
72
|
const auto yv = *(const uint64_t*)(data + offset);
|
|
282
73
|
for (int j = 0; j < qb; j++) {
|
|
283
74
|
const auto qv = *(const uint64_t*)(query + j * size + offset);
|
|
284
|
-
sum +=
|
|
75
|
+
sum += popcount64(qv & yv) << j;
|
|
285
76
|
}
|
|
286
77
|
}
|
|
287
78
|
for (; offset < size; ++offset) {
|
|
288
79
|
const auto yv = *(data + offset);
|
|
289
80
|
for (int j = 0; j < qb; j++) {
|
|
290
81
|
const auto qv = *(query + j * size + offset);
|
|
291
|
-
sum +=
|
|
82
|
+
sum += popcount32(qv & yv) << j;
|
|
292
83
|
}
|
|
293
84
|
}
|
|
294
85
|
return sum;
|
|
295
86
|
}
|
|
296
87
|
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
*
|
|
300
|
-
* @param query Pointer to rearranged rotated query data
|
|
301
|
-
* @param data Pointer to binary data
|
|
302
|
-
* @param d Dimension
|
|
303
|
-
* @param qb Number of quantization bits
|
|
304
|
-
* @return Unsigned integer dot product
|
|
305
|
-
*/
|
|
306
|
-
inline uint64_t bitwise_xor_dot_product(
|
|
88
|
+
template <>
|
|
89
|
+
inline uint64_t bitwise_xor_dot_product<SIMDLevel::NONE>(
|
|
307
90
|
const uint8_t* query,
|
|
308
91
|
const uint8_t* data,
|
|
309
92
|
size_t size,
|
|
310
93
|
size_t qb) {
|
|
311
94
|
uint64_t sum = 0;
|
|
312
95
|
size_t offset = 0;
|
|
313
|
-
#if defined(__AVX512F__)
|
|
314
|
-
// Handle 512-bit chunks.
|
|
315
|
-
if (size_t step = 512 / 8; offset + step <= size) {
|
|
316
|
-
__m512i sum_512 = _mm512_setzero_si512();
|
|
317
|
-
for (; offset + step <= size; offset += step) {
|
|
318
|
-
__m512i v_x = _mm512_loadu_si512((const __m512i*)(data + offset));
|
|
319
|
-
for (int j = 0; j < qb; j++) {
|
|
320
|
-
__m512i v_q = _mm512_loadu_si512(
|
|
321
|
-
(const __m512i*)(query + j * size + offset));
|
|
322
|
-
__m512i v_xor = _mm512_xor_si512(v_q, v_x);
|
|
323
|
-
__m512i v_popcnt = popcount_512(v_xor);
|
|
324
|
-
__m512i v_shifted = _mm512_slli_epi64(v_popcnt, j);
|
|
325
|
-
sum_512 = _mm512_add_epi64(sum_512, v_shifted);
|
|
326
|
-
}
|
|
327
|
-
}
|
|
328
|
-
sum += _mm512_reduce_add_epi64(sum_512);
|
|
329
|
-
}
|
|
330
|
-
#endif
|
|
331
|
-
#if defined(__AVX2__)
|
|
332
|
-
if (size_t step = 256 / 8; offset + step <= size) {
|
|
333
|
-
__m256i sum_256 = _mm256_setzero_si256();
|
|
334
|
-
for (; offset + step <= size; offset += step) {
|
|
335
|
-
__m256i v_x = _mm256_loadu_si256((const __m256i*)(data + offset));
|
|
336
|
-
for (int j = 0; j < qb; j++) {
|
|
337
|
-
__m256i v_q = _mm256_loadu_si256(
|
|
338
|
-
(const __m256i*)(query + j * size + offset));
|
|
339
|
-
__m256i v_xor = _mm256_xor_si256(v_q, v_x);
|
|
340
|
-
__m256i v_popcnt = popcount_256(v_xor);
|
|
341
|
-
__m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
|
|
342
|
-
sum_256 = _mm256_add_epi64(sum_256, v_shifted);
|
|
343
|
-
}
|
|
344
|
-
}
|
|
345
|
-
sum += reduce_add_256(sum_256);
|
|
346
|
-
}
|
|
347
|
-
#endif
|
|
348
|
-
#if defined(__SSE4_1__)
|
|
349
|
-
__m128i sum_128 = _mm_setzero_si128();
|
|
350
|
-
for (size_t step = 128 / 8; offset + step <= size; offset += step) {
|
|
351
|
-
__m128i v_x = _mm_loadu_si128((const __m128i*)(data + offset));
|
|
352
|
-
for (int j = 0; j < qb; j++) {
|
|
353
|
-
__m128i v_q = _mm_loadu_si128(
|
|
354
|
-
(const __m128i*)(query + j * size + offset));
|
|
355
|
-
__m128i v_xor = _mm_xor_si128(v_q, v_x);
|
|
356
|
-
__m128i v_popcnt = popcount_128(v_xor);
|
|
357
|
-
__m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
|
|
358
|
-
sum_128 = _mm_add_epi64(sum_128, v_shifted);
|
|
359
|
-
}
|
|
360
|
-
}
|
|
361
|
-
sum += reduce_add_128(sum_128);
|
|
362
|
-
#endif
|
|
363
96
|
for (size_t step = 64 / 8; offset + step <= size; offset += step) {
|
|
364
97
|
const auto yv = *(const uint64_t*)(data + offset);
|
|
365
98
|
for (int j = 0; j < qb; j++) {
|
|
366
99
|
const auto qv = *(const uint64_t*)(query + j * size + offset);
|
|
367
|
-
sum +=
|
|
100
|
+
sum += popcount64(qv ^ yv) << j;
|
|
368
101
|
}
|
|
369
102
|
}
|
|
370
103
|
for (; offset < size; ++offset) {
|
|
371
104
|
const auto yv = *(data + offset);
|
|
372
105
|
for (int j = 0; j < qb; j++) {
|
|
373
106
|
const auto qv = *(query + j * size + offset);
|
|
374
|
-
sum +=
|
|
107
|
+
sum += popcount32(qv ^ yv) << j;
|
|
375
108
|
}
|
|
376
109
|
}
|
|
377
110
|
return sum;
|
|
378
111
|
}
|
|
379
112
|
|
|
380
|
-
|
|
113
|
+
template <>
|
|
114
|
+
inline uint64_t popcount<SIMDLevel::NONE>(const uint8_t* data, size_t size) {
|
|
381
115
|
uint64_t sum = 0;
|
|
382
116
|
size_t offset = 0;
|
|
383
|
-
#if defined(__AVX512F__)
|
|
384
|
-
// Handle 512-bit chunks.
|
|
385
|
-
if (offset + 512 / 8 <= size) {
|
|
386
|
-
__m512i sum_512 = _mm512_setzero_si512();
|
|
387
|
-
for (size_t end; (end = offset + 512 / 8) <= size; offset = end) {
|
|
388
|
-
__m512i v_x = _mm512_loadu_si512((const __m512i*)(data + offset));
|
|
389
|
-
__m512i v_popcnt = popcount_512(v_x);
|
|
390
|
-
sum_512 = _mm512_add_epi64(sum_512, v_popcnt);
|
|
391
|
-
}
|
|
392
|
-
sum += _mm512_reduce_add_epi64(sum_512);
|
|
393
|
-
}
|
|
394
|
-
#endif // defined(__AVX512F__)
|
|
395
|
-
#if defined(__AVX2__)
|
|
396
|
-
if (offset + 256 / 8 <= size) {
|
|
397
|
-
__m256i sum_256 = _mm256_setzero_si256();
|
|
398
|
-
for (size_t end; (end = offset + 256 / 8) <= size; offset = end) {
|
|
399
|
-
__m256i v_x = _mm256_loadu_si256((const __m256i*)(data + offset));
|
|
400
|
-
__m256i v_popcnt = popcount_256(v_x);
|
|
401
|
-
sum_256 = _mm256_add_epi64(sum_256, v_popcnt);
|
|
402
|
-
}
|
|
403
|
-
sum += reduce_add_256(sum_256);
|
|
404
|
-
}
|
|
405
|
-
#endif // defined(__AVX2__)
|
|
406
|
-
#if defined(__SSE4_1__)
|
|
407
|
-
__m128i sum_128 = _mm_setzero_si128();
|
|
408
|
-
for (size_t step = 128 / 8; offset + step <= size; offset += step) {
|
|
409
|
-
__m128i v_x = _mm_loadu_si128((const __m128i*)(data + offset));
|
|
410
|
-
sum_128 = _mm_add_epi64(sum_128, popcount_128(v_x));
|
|
411
|
-
}
|
|
412
|
-
sum += reduce_add_128(sum_128);
|
|
413
|
-
#endif // defined(__SSE4_1__)
|
|
414
|
-
|
|
415
117
|
for (size_t step = 64 / 8; offset + step <= size; offset += step) {
|
|
416
118
|
const auto yv = *(const uint64_t*)(data + offset);
|
|
417
|
-
sum +=
|
|
119
|
+
sum += popcount64(yv);
|
|
418
120
|
}
|
|
419
121
|
for (; offset < size; ++offset) {
|
|
420
122
|
const auto yv = *(data + offset);
|
|
421
|
-
sum +=
|
|
123
|
+
sum += popcount32(yv);
|
|
422
124
|
}
|
|
423
125
|
return sum;
|
|
424
126
|
}
|
|
@@ -469,186 +171,9 @@ inline float ip_scalar(
|
|
|
469
171
|
return result;
|
|
470
172
|
}
|
|
471
173
|
|
|
472
|
-
#if defined(__x86_64__) || defined(_M_X64)
|
|
473
|
-
|
|
474
|
-
#if defined(__AVX2__)
|
|
475
|
-
/// Horizontal sum of 8 floats in a __m256 register.
|
|
476
|
-
inline float hsum_avx2(__m256 v) {
|
|
477
|
-
__m128 hi = _mm256_extractf128_ps(v, 1);
|
|
478
|
-
__m128 lo = _mm256_castps256_ps128(v);
|
|
479
|
-
lo = _mm_add_ps(lo, hi);
|
|
480
|
-
__m128 shuf = _mm_movehdup_ps(lo);
|
|
481
|
-
lo = _mm_add_ps(lo, shuf);
|
|
482
|
-
shuf = _mm_movehl_ps(shuf, lo);
|
|
483
|
-
return _mm_cvtss_f32(_mm_add_ss(lo, shuf));
|
|
484
|
-
}
|
|
485
|
-
#endif // __AVX2__
|
|
486
|
-
|
|
487
|
-
/*********************************************************
|
|
488
|
-
* Specialized 1-bit kernels (ex_bits == 1).
|
|
489
|
-
*
|
|
490
|
-
* For 1 extra bit, both sign_bits and ex_code are 1-bit-per-dim packed,
|
|
491
|
-
* so we convert bits to floats directly — no extraction loops needed.
|
|
492
|
-
*********************************************************/
|
|
493
|
-
|
|
494
|
-
#if defined(__AVX512F__)
|
|
495
|
-
/// AVX-512: 16 dims/iter, ex_bits == 1.
|
|
496
|
-
inline float ip_1exbit_avx512(
|
|
497
|
-
const uint8_t* __restrict sign_bits,
|
|
498
|
-
const uint8_t* __restrict ex_code,
|
|
499
|
-
const float* __restrict rotated_q,
|
|
500
|
-
size_t d,
|
|
501
|
-
float cb) {
|
|
502
|
-
__m512 acc = _mm512_setzero_ps();
|
|
503
|
-
const __m512 v_cb = _mm512_set1_ps(cb);
|
|
504
|
-
const __m512 v_two = _mm512_set1_ps(2.0f);
|
|
505
|
-
const __m512 v_one = _mm512_set1_ps(1.0f);
|
|
506
|
-
|
|
507
|
-
size_t i = 0;
|
|
508
|
-
for (; i + 16 <= d; i += 16) {
|
|
509
|
-
uint16_t sb16;
|
|
510
|
-
memcpy(&sb16, sign_bits + i / 8, sizeof(uint16_t));
|
|
511
|
-
uint16_t eb16;
|
|
512
|
-
memcpy(&eb16, ex_code + i / 8, sizeof(uint16_t));
|
|
513
|
-
|
|
514
|
-
__m512 sb_f = _mm512_maskz_mov_ps(_cvtu32_mask16(sb16), v_one);
|
|
515
|
-
__m512 eb_f = _mm512_maskz_mov_ps(_cvtu32_mask16(eb16), v_one);
|
|
516
|
-
|
|
517
|
-
__m512 recon = _mm512_add_ps(_mm512_fmadd_ps(sb_f, v_two, eb_f), v_cb);
|
|
518
|
-
__m512 rq = _mm512_loadu_ps(rotated_q + i);
|
|
519
|
-
acc = _mm512_fmadd_ps(rq, recon, acc);
|
|
520
|
-
}
|
|
521
|
-
|
|
522
|
-
float result = _mm512_reduce_add_ps(acc);
|
|
523
|
-
result += ip_scalar(sign_bits, ex_code, rotated_q, i, d, 1, cb);
|
|
524
|
-
return result;
|
|
525
|
-
}
|
|
526
|
-
#endif // __AVX512F__
|
|
527
|
-
|
|
528
|
-
#if defined(__AVX2__)
|
|
529
|
-
/// AVX2: 8 dims/iter, ex_bits == 1.
|
|
530
|
-
inline float ip_1exbit_avx2(
|
|
531
|
-
const uint8_t* __restrict sign_bits,
|
|
532
|
-
const uint8_t* __restrict ex_code,
|
|
533
|
-
const float* __restrict rotated_q,
|
|
534
|
-
size_t d,
|
|
535
|
-
float cb) {
|
|
536
|
-
__m256 acc = _mm256_setzero_ps();
|
|
537
|
-
const __m256 v_cb = _mm256_set1_ps(cb);
|
|
538
|
-
const __m256 v_two = _mm256_set1_ps(2.0f);
|
|
539
|
-
const __m256 v_one = _mm256_set1_ps(1.0f);
|
|
540
|
-
const __m256i bit_pos = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
|
|
541
|
-
const __m256i zero = _mm256_setzero_si256();
|
|
542
|
-
|
|
543
|
-
size_t i = 0;
|
|
544
|
-
for (; i + 8 <= d; i += 8) {
|
|
545
|
-
uint8_t sb = sign_bits[i / 8];
|
|
546
|
-
uint8_t eb = ex_code[i / 8];
|
|
547
|
-
|
|
548
|
-
__m256i sb_cmp = _mm256_cmpgt_epi32(
|
|
549
|
-
_mm256_and_si256(_mm256_set1_epi32(sb), bit_pos), zero);
|
|
550
|
-
__m256 sb_f = _mm256_and_ps(_mm256_castsi256_ps(sb_cmp), v_one);
|
|
551
|
-
|
|
552
|
-
__m256i eb_cmp = _mm256_cmpgt_epi32(
|
|
553
|
-
_mm256_and_si256(_mm256_set1_epi32(eb), bit_pos), zero);
|
|
554
|
-
__m256 eb_f = _mm256_and_ps(_mm256_castsi256_ps(eb_cmp), v_one);
|
|
555
|
-
|
|
556
|
-
__m256 recon = _mm256_add_ps(_mm256_fmadd_ps(sb_f, v_two, eb_f), v_cb);
|
|
557
|
-
__m256 rq = _mm256_loadu_ps(rotated_q + i);
|
|
558
|
-
acc = _mm256_fmadd_ps(rq, recon, acc);
|
|
559
|
-
}
|
|
560
|
-
|
|
561
|
-
float result = hsum_avx2(acc);
|
|
562
|
-
result += ip_scalar(sign_bits, ex_code, rotated_q, i, d, 1, cb);
|
|
563
|
-
return result;
|
|
564
|
-
}
|
|
565
|
-
#endif // __AVX2__
|
|
566
|
-
|
|
567
|
-
/*********************************************************
|
|
568
|
-
* Bit-plane decomposition kernels (ex_bits >= 2, BMI2 required).
|
|
569
|
-
*
|
|
570
|
-
* Decomposes the inner product as:
|
|
571
|
-
* ex_ip = (1 << ex_bits) * sign_dot
|
|
572
|
-
* + Σ_{b=0}^{ex_bits-1} (1 << b) * plane_dot_b
|
|
573
|
-
* + cb * total_q
|
|
574
|
-
*
|
|
575
|
-
* Each plane_dot_b is a float × bit-vector dot product, computed using
|
|
576
|
-
* the same bit→mask→float conversion as the 1-bit kernel. PEXT
|
|
577
|
-
* extracts each bit plane from the packed ex_code in one instruction
|
|
578
|
-
* per 8 dimensions.
|
|
579
|
-
*********************************************************/
|
|
580
|
-
|
|
581
|
-
#if defined(__AVX2__) && defined(__BMI2__)
|
|
582
|
-
/// AVX2 + BMI2 bit-plane decomposition: 8 dims/iter, ex_bits in [2, 7].
|
|
583
|
-
/// Caller must ensure ex_bits <= 7 (pext_masks[7] / v_weights[8]).
|
|
584
|
-
inline float ip_bitplane_avx2(
|
|
585
|
-
const uint8_t* __restrict sign_bits,
|
|
586
|
-
const uint8_t* __restrict ex_code,
|
|
587
|
-
const float* __restrict rotated_q,
|
|
588
|
-
size_t d,
|
|
589
|
-
size_t ex_bits,
|
|
590
|
-
float cb) {
|
|
591
|
-
__m256 acc = _mm256_setzero_ps();
|
|
592
|
-
const __m256 v_one = _mm256_set1_ps(1.0f);
|
|
593
|
-
const __m256i bit_pos = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
|
|
594
|
-
const __m256i zero = _mm256_setzero_si256();
|
|
595
|
-
const __m256 v_cb = _mm256_set1_ps(cb);
|
|
596
|
-
|
|
597
|
-
// Precompute PEXT masks and plane weights
|
|
598
|
-
uint64_t pext_masks[7];
|
|
599
|
-
__m256 v_weights[8];
|
|
600
|
-
for (size_t b = 0; b < ex_bits; b++) {
|
|
601
|
-
uint64_t m = 0;
|
|
602
|
-
for (int j = 0; j < 8; j++) {
|
|
603
|
-
m |= (1ULL << (b + j * ex_bits));
|
|
604
|
-
}
|
|
605
|
-
pext_masks[b] = m;
|
|
606
|
-
v_weights[b] = _mm256_set1_ps(static_cast<float>(1u << b));
|
|
607
|
-
}
|
|
608
|
-
v_weights[ex_bits] = _mm256_set1_ps(static_cast<float>(1u << ex_bits));
|
|
609
|
-
|
|
610
|
-
size_t i = 0;
|
|
611
|
-
for (; i + 8 <= d; i += 8) {
|
|
612
|
-
// Sign bit → float via bit mask comparison
|
|
613
|
-
__m256i sb_cmp = _mm256_cmpgt_epi32(
|
|
614
|
-
_mm256_and_si256(_mm256_set1_epi32(sign_bits[i / 8]), bit_pos),
|
|
615
|
-
zero);
|
|
616
|
-
__m256 recon = _mm256_mul_ps(
|
|
617
|
-
_mm256_and_ps(_mm256_castsi256_ps(sb_cmp), v_one),
|
|
618
|
-
v_weights[ex_bits]);
|
|
619
|
-
|
|
620
|
-
// Load packed ex_code for 8 dims (8 × ex_bits bits = ex_bits bytes)
|
|
621
|
-
uint64_t ex64 = 0;
|
|
622
|
-
memcpy(&ex64, ex_code + (i / 8) * ex_bits, sizeof(uint64_t));
|
|
623
|
-
|
|
624
|
-
// Extract each bit plane via PEXT → bit mask → float
|
|
625
|
-
for (size_t b = 0; b < ex_bits; b++) {
|
|
626
|
-
auto plane = static_cast<uint8_t>(_pext_u64(ex64, pext_masks[b]));
|
|
627
|
-
__m256i p_cmp = _mm256_cmpgt_epi32(
|
|
628
|
-
_mm256_and_si256(_mm256_set1_epi32(plane), bit_pos), zero);
|
|
629
|
-
__m256 p_f = _mm256_and_ps(_mm256_castsi256_ps(p_cmp), v_one);
|
|
630
|
-
recon = _mm256_fmadd_ps(p_f, v_weights[b], recon);
|
|
631
|
-
}
|
|
632
|
-
|
|
633
|
-
__m256 rq = _mm256_loadu_ps(rotated_q + i);
|
|
634
|
-
acc = _mm256_fmadd_ps(rq, _mm256_add_ps(recon, v_cb), acc);
|
|
635
|
-
}
|
|
636
|
-
|
|
637
|
-
float result = hsum_avx2(acc);
|
|
638
|
-
result += ip_scalar(sign_bits, ex_code, rotated_q, i, d, ex_bits, cb);
|
|
639
|
-
return result;
|
|
640
|
-
}
|
|
641
|
-
#endif // __AVX2__ && __BMI2__
|
|
642
|
-
|
|
643
|
-
#endif // x86_64
|
|
644
|
-
|
|
645
174
|
/**
|
|
646
175
|
* Dispatch to the best available kernel for the given ex_bits.
|
|
647
176
|
*
|
|
648
|
-
* Routing (compile-time):
|
|
649
|
-
* ex_bits == 1: specialized 1-bit kernel (AVX-512 > AVX2 > scalar)
|
|
650
|
-
* ex_bits >= 2: bit-plane decomposition (AVX2+BMI2 > scalar)
|
|
651
|
-
*
|
|
652
177
|
* @param sign_bits packed sign bits (1 bit/dim, standard byte packing)
|
|
653
178
|
* @param ex_code packed extra-bit codes (ex_bits bits/dim)
|
|
654
179
|
* @param rotated_q rotated query vector (float[d])
|
|
@@ -657,28 +182,24 @@ inline float ip_bitplane_avx2(
|
|
|
657
182
|
* @param cb constant bias: -(2^ex_bits - 0.5)
|
|
658
183
|
* @return inner product value
|
|
659
184
|
*/
|
|
660
|
-
|
|
185
|
+
template <SIMDLevel SL = SINGLE_SIMD_LEVEL>
|
|
186
|
+
float compute_inner_product(
|
|
661
187
|
const uint8_t* __restrict sign_bits,
|
|
662
188
|
const uint8_t* __restrict ex_code,
|
|
663
189
|
const float* __restrict rotated_q,
|
|
664
190
|
size_t d,
|
|
665
191
|
size_t ex_bits,
|
|
666
|
-
float cb)
|
|
667
|
-
if (ex_bits == 1) {
|
|
668
|
-
#if defined(__AVX512F__)
|
|
669
|
-
return ip_1exbit_avx512(sign_bits, ex_code, rotated_q, d, cb);
|
|
670
|
-
#elif defined(__AVX2__)
|
|
671
|
-
return ip_1exbit_avx2(sign_bits, ex_code, rotated_q, d, cb);
|
|
672
|
-
#else
|
|
673
|
-
return ip_scalar(sign_bits, ex_code, rotated_q, 0, d, 1, cb);
|
|
674
|
-
#endif
|
|
675
|
-
}
|
|
192
|
+
float cb);
|
|
676
193
|
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
194
|
+
// NONE specialization — pure scalar
|
|
195
|
+
template <>
|
|
196
|
+
inline float compute_inner_product<SIMDLevel::NONE>(
|
|
197
|
+
const uint8_t* __restrict sign_bits,
|
|
198
|
+
const uint8_t* __restrict ex_code,
|
|
199
|
+
const float* __restrict rotated_q,
|
|
200
|
+
size_t d,
|
|
201
|
+
size_t ex_bits,
|
|
202
|
+
float cb) {
|
|
682
203
|
return ip_scalar(sign_bits, ex_code, rotated_q, 0, d, ex_bits, cb);
|
|
683
204
|
}
|
|
684
205
|
|