faiss 0.6.0 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/faiss/extconf.rb +2 -1
- data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
- data/ext/faiss/index_binary.cpp +1 -1
- data/ext/faiss/kmeans.cpp +1 -1
- data/ext/faiss/pca_matrix.cpp +1 -1
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +93 -80
- data/vendor/faiss/faiss/Clustering.cpp +39 -240
- data/vendor/faiss/faiss/Clustering.h +6 -0
- data/vendor/faiss/faiss/IVFlib.cpp +41 -21
- data/vendor/faiss/faiss/Index.cpp +6 -5
- data/vendor/faiss/faiss/Index.h +5 -5
- data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
- data/vendor/faiss/faiss/IndexBinary.h +4 -4
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +88 -97
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
- data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +89 -417
- data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
- data/vendor/faiss/faiss/IndexFastScan.h +25 -23
- data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
- data/vendor/faiss/faiss/IndexFlat.h +21 -18
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
- data/vendor/faiss/faiss/IndexHNSW.cpp +374 -206
- data/vendor/faiss/faiss/IndexHNSW.h +16 -2
- data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
- data/vendor/faiss/faiss/IndexIDMap.h +9 -7
- data/vendor/faiss/faiss/IndexIVF.cpp +467 -364
- data/vendor/faiss/faiss/IndexIVF.h +33 -12
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +79 -76
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
- data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +39 -69
- data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +56 -33
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +73 -846
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +23 -20
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +30 -52
- data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
- data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
- data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
- data/vendor/faiss/faiss/IndexNSG.cpp +38 -23
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
- data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
- data/vendor/faiss/faiss/IndexPQ.h +3 -2
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
- data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
- data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
- data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
- data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
- data/vendor/faiss/faiss/IndexRefine.h +4 -4
- data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +150 -20
- data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
- data/vendor/faiss/faiss/IndexShards.cpp +10 -9
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
- data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
- data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
- data/vendor/faiss/faiss/MetaIndexes.h +1 -1
- data/vendor/faiss/faiss/MetricType.h +14 -7
- data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
- data/vendor/faiss/faiss/SuperKMeans.h +97 -0
- data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
- data/vendor/faiss/faiss/VectorTransform.h +16 -16
- data/vendor/faiss/faiss/build.cpp +23 -0
- data/vendor/faiss/faiss/build.h +15 -0
- data/vendor/faiss/faiss/clone_index.cpp +48 -47
- data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
- data/vendor/faiss/faiss/factory_tools.cpp +9 -0
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
- data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +15 -16
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +5 -4
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
- data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
- data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
- data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
- data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
- data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
- data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +58 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +111 -0
- data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
- data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
- data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
- data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
- data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
- data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
- data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
- data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
- data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
- data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
- data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
- data/vendor/faiss/faiss/impl/FaissException.h +50 -3
- data/vendor/faiss/faiss/impl/HNSW.cpp +639 -507
- data/vendor/faiss/faiss/impl/HNSW.h +61 -44
- data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
- data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
- data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
- data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
- data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
- data/vendor/faiss/faiss/impl/NSG.cpp +53 -32
- data/vendor/faiss/faiss/impl/NSG.h +4 -4
- data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
- data/vendor/faiss/faiss/impl/Panorama.h +269 -87
- data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
- data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +55 -25
- data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
- data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +302 -283
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +100 -75
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +318 -7
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +77 -1
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
- data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
- data/vendor/faiss/faiss/impl/VisitedTable.h +70 -28
- data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
- data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
- data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
- data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
- data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
- data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
- data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
- data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
- data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +270 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
- data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
- data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
- data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
- data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
- data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +83 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +113 -0
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +150 -0
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +142 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +1227 -79
- data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
- data/vendor/faiss/faiss/impl/index_write.cpp +96 -13
- data/vendor/faiss/faiss/impl/io.cpp +6 -6
- data/vendor/faiss/faiss/impl/io_macros.h +58 -16
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
- data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
- data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
- data/vendor/faiss/faiss/impl/platform_macros.h +15 -4
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +23 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +23 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +23 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +45 -107
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +274 -5
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +10 -7
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +70 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +9 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +419 -19
- data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +387 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +341 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +425 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +290 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +337 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +157 -66
- data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
- data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
- data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
- data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
- data/vendor/faiss/faiss/impl/svs_io.h +8 -2
- data/vendor/faiss/faiss/index_factory.cpp +90 -18
- data/vendor/faiss/faiss/index_io.h +40 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +28 -15
- data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +170 -86
- data/vendor/faiss/faiss/invlists/InvertedLists.h +88 -25
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +142 -21
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +33 -7
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +77 -27
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +10 -4
- data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
- data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
- data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
- data/vendor/faiss/faiss/utils/bf16.h +34 -0
- data/vendor/faiss/faiss/utils/distances.cpp +390 -560
- data/vendor/faiss/faiss/utils/distances.h +20 -1
- data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -178
- data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
- data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
- data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
- data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
- data/vendor/faiss/faiss/utils/hamming.h +92 -2
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +16 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +210 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +66 -989
- data/vendor/faiss/faiss/utils/partitioning.h +31 -0
- data/vendor/faiss/faiss/utils/popcount.h +29 -0
- data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
- data/vendor/faiss/faiss/utils/prefetch.h +2 -2
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
- data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
- data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
- data/vendor/faiss/faiss/utils/random.cpp +6 -6
- data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
- data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
- data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
- data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1031 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +29 -7
- data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
- data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
- data/vendor/faiss/faiss/utils/utils.cpp +5 -5
- data/vendor/faiss/faiss/utils/utils.h +3 -3
- metadata +129 -34
- data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
- data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
- data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
- data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
- data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
- data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
- data/vendor/faiss/faiss/utils/simdlib.h +0 -42
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
- /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
|
@@ -7,6 +7,11 @@
|
|
|
7
7
|
|
|
8
8
|
#ifdef COMPILE_SIMD_ARM_NEON
|
|
9
9
|
|
|
10
|
+
#include <faiss/impl/simdlib/simdlib_neon.h>
|
|
11
|
+
|
|
12
|
+
#include <algorithm>
|
|
13
|
+
#include <cstring>
|
|
14
|
+
|
|
10
15
|
#include <faiss/impl/scalar_quantizer/codecs.h>
|
|
11
16
|
#include <faiss/impl/scalar_quantizer/distance_computers.h>
|
|
12
17
|
#include <faiss/impl/scalar_quantizer/quantizers.h>
|
|
@@ -17,6 +22,81 @@ namespace faiss {
|
|
|
17
22
|
|
|
18
23
|
namespace scalar_quantizer {
|
|
19
24
|
|
|
25
|
+
using simd8float32 = faiss::simd8float32_tpl<SIMDLevel::ARM_NEON>;
|
|
26
|
+
|
|
27
|
+
namespace {
|
|
28
|
+
|
|
29
|
+
/// Read two bytes starting at `ptr` into a uint16_t, in host byte order.
/// The copy goes through memcpy, so `ptr` does not need to be aligned.
FAISS_ALWAYS_INLINE uint16_t load_u16(const uint8_t* ptr) {
    uint16_t word = 0;
    std::memcpy(&word, ptr, sizeof(uint16_t));
    return word;
}
|
|
34
|
+
|
|
35
|
+
/// Read four bytes starting at `ptr` into a uint32_t, in host byte order.
/// The copy goes through memcpy, so `ptr` does not need to be aligned.
FAISS_ALWAYS_INLINE uint32_t load_u32(const uint8_t* ptr) {
    uint32_t word = 0;
    std::memcpy(&word, ptr, sizeof(uint32_t));
    return word;
}
|
|
40
|
+
|
|
41
|
+
/// Assemble three consecutive bytes into a 24-bit value stored in a
/// uint32_t; ptr[0] becomes the least significant byte.
FAISS_ALWAYS_INLINE uint32_t load_u24(const uint8_t* ptr) {
    const uint32_t low = ptr[0];
    const uint32_t mid = ptr[1];
    const uint32_t high = ptr[2];
    return low | (mid << 8) | (high << 16);
}
|
|
46
|
+
|
|
47
|
+
/// Expand eight 1-bit codes into eight bytes.
/// @param code  packed code bytes
/// @param i     index of the first component; the byte read is code[i / 8]
/// @param out   receives the bits of that byte, least significant bit first
FAISS_ALWAYS_INLINE void unpack_8x1bit_to_u8(
        const uint8_t* code,
        int i,
        uint8_t out[8]) {
    const size_t byte_index = static_cast<size_t>(i) / 8;
    const unsigned bits = code[byte_index];
    for (unsigned lane = 0; lane < 8; ++lane) {
        out[lane] = static_cast<uint8_t>((bits >> lane) & 1u);
    }
}
|
|
56
|
+
|
|
57
|
+
/// Expand eight 2-bit codes into eight bytes.
/// @param code  packed code bytes
/// @param i     index of the first component; two bytes are read starting
///              at code[i / 4]
/// @param out   receives the eight 2-bit fields, lowest bits first
FAISS_ALWAYS_INLINE void unpack_8x2bit_to_u8(
        const uint8_t* code,
        int i,
        uint8_t out[8]) {
    const size_t byte_offset = static_cast<size_t>(i) / 4;
    // Consume the 16-bit word two bits at a time.
    unsigned remaining = load_u16(code + byte_offset);
    for (size_t lane = 0; lane < 8; ++lane) {
        out[lane] = static_cast<uint8_t>(remaining & 0x3u);
        remaining >>= 2;
    }
}
|
|
66
|
+
|
|
67
|
+
/// Expand eight 3-bit codes into eight bytes.
/// @param code  packed code bytes
/// @param i     index of the first component; three bytes are read starting
///              at code[(i / 8) * 3]
/// @param out   receives the eight 3-bit fields, lowest bits first
FAISS_ALWAYS_INLINE void unpack_8x3bit_to_u8(
        const uint8_t* code,
        int i,
        uint8_t out[8]) {
    const size_t group = static_cast<size_t>(i) / 8;
    // Each group of eight codes occupies exactly three bytes.
    uint32_t remaining = load_u24(code + group * 3);
    for (size_t lane = 0; lane < 8; ++lane) {
        out[lane] = static_cast<uint8_t>(remaining & 0x7u);
        remaining >>= 3;
    }
}
|
|
77
|
+
|
|
78
|
+
/// Expand eight 4-bit codes into eight bytes.
/// @param code  packed code bytes
/// @param i     index of the first component; four bytes are read starting
///              at code[i / 2]
/// @param out   receives the eight nibbles, lowest bits first
FAISS_ALWAYS_INLINE void unpack_8x4bit_to_u8(
        const uint8_t* code,
        int i,
        uint8_t out[8]) {
    const size_t byte_offset = static_cast<size_t>(i) / 2;
    // Consume the 32-bit word one nibble at a time.
    uint32_t remaining = load_u32(code + byte_offset);
    for (size_t lane = 0; lane < 8; ++lane) {
        out[lane] = static_cast<uint8_t>(remaining & 0xfu);
        remaining >>= 4;
    }
}
|
|
87
|
+
|
|
88
|
+
/// Look up eight codebook entries and pack them into one 8-lane float vector.
/// @param codebook  table of float values indexed by code
/// @param indices   eight table indices, one per output lane
/// @return lanes 0..3 in the first NEON register, lanes 4..7 in the second
FAISS_ALWAYS_INLINE simd8float32
gather_8_components(const float* codebook, const uint8_t indices[8]) {
    // The lookups are done one by one into a stack buffer, which is then
    // loaded as two 4-lane vectors.
    float lanes[8];
    for (size_t lane = 0; lane < 8; ++lane) {
        lanes[lane] = codebook[indices[lane]];
    }
    const float32x4_t lower = vld1q_f32(lanes);
    const float32x4_t upper = vld1q_f32(lanes + 4);
    return simd8float32(float32x4x2_t{lower, upper});
}
|
|
97
|
+
|
|
98
|
+
} // namespace
|
|
99
|
+
|
|
20
100
|
/**********************************************************
|
|
21
101
|
* Codecs
|
|
22
102
|
**********************************************************/
|
|
@@ -101,6 +181,12 @@ struct QuantizerTemplate<
|
|
|
101
181
|
xi.data.val[1],
|
|
102
182
|
this->vdiff)});
|
|
103
183
|
}
|
|
184
|
+
|
|
185
|
+
/// Raw codec decode without denormalization (for pre-decode opt)
|
|
186
|
+
FAISS_ALWAYS_INLINE simd8float32
|
|
187
|
+
decode_8_raw(const uint8_t* code, int i) const {
|
|
188
|
+
return Codec::decode_8_components(code, i);
|
|
189
|
+
}
|
|
104
190
|
};
|
|
105
191
|
|
|
106
192
|
template <class Codec>
|
|
@@ -136,6 +222,74 @@ struct QuantizerTemplate<
|
|
|
136
222
|
}
|
|
137
223
|
};
|
|
138
224
|
|
|
225
|
+
/**********************************************************
|
|
226
|
+
* TurboQuant MSE quantizer
|
|
227
|
+
**********************************************************/
|
|
228
|
+
|
|
229
|
+
// NEON TurboQuantMSE: decode via gather, encode stays scalar.
|
|
230
|
+
// NEON doesn't have movemask so 1-bit encode is also scalar.
|
|
231
|
+
#define DEFINE_TQMSE_NEON_SPECIALIZATION(NBITS, UNPACK_FN) \
|
|
232
|
+
template <> \
|
|
233
|
+
struct QuantizerTurboQuantMSE<NBITS, SIMDLevel::ARM_NEON> \
|
|
234
|
+
: QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE> { \
|
|
235
|
+
using Base = QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE>; \
|
|
236
|
+
\
|
|
237
|
+
QuantizerTurboQuantMSE(size_t d, const std::vector<float>& trained) \
|
|
238
|
+
: Base(d, trained) { \
|
|
239
|
+
assert(d % 8 == 0); \
|
|
240
|
+
} \
|
|
241
|
+
\
|
|
242
|
+
FAISS_ALWAYS_INLINE simd8float32 \
|
|
243
|
+
reconstruct_8_components(const uint8_t* code, int i) const { \
|
|
244
|
+
uint8_t indices[8]; \
|
|
245
|
+
UNPACK_FN(code, i, indices); \
|
|
246
|
+
return gather_8_components(this->centroids, indices); \
|
|
247
|
+
} \
|
|
248
|
+
\
|
|
249
|
+
void decode_vector(const uint8_t* code, float* x) const final { \
|
|
250
|
+
for (size_t i = 0; i < this->d; i += 8) { \
|
|
251
|
+
simd8float32 xi = \
|
|
252
|
+
reconstruct_8_components(code, static_cast<int>(i)); \
|
|
253
|
+
vst1q_f32(x + i, xi.data.val[0]); \
|
|
254
|
+
vst1q_f32(x + i + 4, xi.data.val[1]); \
|
|
255
|
+
} \
|
|
256
|
+
} \
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
DEFINE_TQMSE_NEON_SPECIALIZATION(1, unpack_8x1bit_to_u8);
|
|
260
|
+
DEFINE_TQMSE_NEON_SPECIALIZATION(2, unpack_8x2bit_to_u8);
|
|
261
|
+
DEFINE_TQMSE_NEON_SPECIALIZATION(3, unpack_8x3bit_to_u8);
|
|
262
|
+
DEFINE_TQMSE_NEON_SPECIALIZATION(4, unpack_8x4bit_to_u8);
|
|
263
|
+
|
|
264
|
+
#undef DEFINE_TQMSE_NEON_SPECIALIZATION
|
|
265
|
+
|
|
266
|
+
template <>
|
|
267
|
+
struct QuantizerTurboQuantMSE<8, SIMDLevel::ARM_NEON>
|
|
268
|
+
: QuantizerTurboQuantMSE<8, SIMDLevel::NONE> {
|
|
269
|
+
using Base = QuantizerTurboQuantMSE<8, SIMDLevel::NONE>;
|
|
270
|
+
|
|
271
|
+
QuantizerTurboQuantMSE(size_t d, const std::vector<float>& trained)
|
|
272
|
+
: Base(d, trained) {
|
|
273
|
+
assert(d % 8 == 0);
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
FAISS_ALWAYS_INLINE simd8float32
|
|
277
|
+
reconstruct_8_components(const uint8_t* code, int i) const {
|
|
278
|
+
uint8_t indices[8];
|
|
279
|
+
std::memcpy(indices, code + static_cast<size_t>(i), sizeof(indices));
|
|
280
|
+
return gather_8_components(this->centroids, indices);
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
void decode_vector(const uint8_t* code, float* x) const final {
|
|
284
|
+
for (size_t i = 0; i < this->d; i += 8) {
|
|
285
|
+
simd8float32 xi =
|
|
286
|
+
reconstruct_8_components(code, static_cast<int>(i));
|
|
287
|
+
vst1q_f32(x + i, xi.data.val[0]);
|
|
288
|
+
vst1q_f32(x + i + 4, xi.data.val[1]);
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
};
|
|
292
|
+
|
|
139
293
|
/**********************************************************
|
|
140
294
|
* FP16 Quantizer
|
|
141
295
|
**********************************************************/
|
|
@@ -270,6 +424,22 @@ struct SimilarityL2<SIMDLevel::ARM_NEON> {
|
|
|
270
424
|
FAISS_ALWAYS_INLINE float result_8() {
|
|
271
425
|
return horizontal_add(accu8);
|
|
272
426
|
}
|
|
427
|
+
|
|
428
|
+
/// Precompute the query-side terms that let L2 distances be evaluated on
/// raw (non-denormalized) decoded codes.
///
/// Assuming a reconstructed component equals vmin + vdiff * c for a raw
/// decoded value c (TODO confirm against the quantizer), the squared L2
/// distance factors as
///     sum (x - vmin - vdiff * c)^2 = vdiff^2 * sum ((x - vmin) / vdiff - c)^2
/// so the caller can compute `bias + scale_factor * L2(q_adj, raw codes)`.
///
/// @param x             query vector, d floats
/// @param q_adj         output buffer, d floats; receives the rescaled query
/// @param d             number of components
/// @param vmin          quantizer offset
/// @param vdiff         quantizer range
/// @param scale_factor  output: multiplier for the raw distance (vdiff^2)
/// @param bias          output: additive term; non-zero only when vdiff == 0
static void adjust_query_for_raw_decode(
        const float* x,
        float* q_adj,
        size_t d,
        float vmin,
        float vdiff,
        float& scale_factor,
        float& bias) {
    scale_factor = vdiff * vdiff;
    bias = 0;

    if (vdiff == 0) {
        // Degenerate range: every component reconstructs to vmin, so the
        // distance no longer depends on the code. scale_factor is 0 here,
        // so the whole distance has to be carried by the bias; leaving it
        // at 0 would report a distance of 0 for every code.
        float constant_distance = 0;
        for (size_t i = 0; i < d; i++) {
            const float delta = x[i] - vmin;
            constant_distance += delta * delta;
            q_adj[i] = 0;
        }
        bias = constant_distance;
        return;
    }

    const float inv_vdiff = 1.0f / vdiff;
    for (size_t i = 0; i < d; i++) {
        q_adj[i] = (x[i] - vmin) * inv_vdiff;
    }
}
|
|
273
443
|
};
|
|
274
444
|
|
|
275
445
|
template <>
|
|
@@ -304,6 +474,23 @@ struct SimilarityIP<SIMDLevel::ARM_NEON> {
|
|
|
304
474
|
FAISS_ALWAYS_INLINE float result_8() {
|
|
305
475
|
return horizontal_add(accu8);
|
|
306
476
|
}
|
|
477
|
+
|
|
478
|
+
/// Precompute the query-side terms that let inner products be evaluated on
/// raw (non-denormalized) decoded codes.
///
/// The query is copied unchanged into q_adj; the quantizer's scale and
/// offset are folded into the two outputs so the caller can compute
/// `bias + scale_factor * IP(q_adj, raw codes)`.
///
/// @param x             query vector, d floats
/// @param q_adj         output buffer, d floats; receives a copy of x
/// @param d             number of components
/// @param vmin          quantizer offset
/// @param vdiff         quantizer range
/// @param scale_factor  output: set to vdiff
/// @param bias          output: set to vmin * sum(x)
static void adjust_query_for_raw_decode(
        const float* x,
        float* q_adj,
        size_t d,
        float vmin,
        float vdiff,
        float& scale_factor,
        float& bias) {
    float query_sum = 0;
    size_t pos = 0;
    while (pos < d) {
        const float component = x[pos];
        q_adj[pos] = component;
        query_sum += component;
        ++pos;
    }
    bias = vmin * query_sum;
    scale_factor = vdiff;
}
|
|
307
494
|
};
|
|
308
495
|
|
|
309
496
|
/**********************************************************
|
|
@@ -317,8 +504,23 @@ struct DCTemplate<Quantizer, Similarity, SIMDLevel::ARM_NEON>
|
|
|
317
504
|
|
|
318
505
|
Quantizer quant;
|
|
319
506
|
|
|
507
|
+
// Pre-adjusted query buffer for uniform quantizers
|
|
508
|
+
std::vector<float> q_adj;
|
|
509
|
+
float scale_factor = 0;
|
|
510
|
+
float bias = 0;
|
|
511
|
+
|
|
512
|
+
static constexpr bool has_decode_raw() {
|
|
513
|
+
return requires(const Quantizer& q, const uint8_t* c, int i) {
|
|
514
|
+
{ q.decode_8_raw(c, i) };
|
|
515
|
+
};
|
|
516
|
+
}
|
|
517
|
+
|
|
320
518
|
DCTemplate(size_t d, const std::vector<float>& trained)
|
|
321
|
-
: quant(d, trained) {
|
|
519
|
+
: quant(d, trained) {
|
|
520
|
+
if constexpr (has_decode_raw()) {
|
|
521
|
+
q_adj.resize(d);
|
|
522
|
+
}
|
|
523
|
+
}
|
|
322
524
|
|
|
323
525
|
float compute_distance(const float* x, const uint8_t* code) const {
|
|
324
526
|
Similarity sim(x);
|
|
@@ -344,6 +546,26 @@ struct DCTemplate<Quantizer, Similarity, SIMDLevel::ARM_NEON>
|
|
|
344
546
|
|
|
345
547
|
void set_query(const float* x) final {
|
|
346
548
|
q = x;
|
|
549
|
+
if constexpr (has_decode_raw()) {
|
|
550
|
+
Sim::adjust_query_for_raw_decode(
|
|
551
|
+
x,
|
|
552
|
+
q_adj.data(),
|
|
553
|
+
quant.d,
|
|
554
|
+
quant.vmin,
|
|
555
|
+
quant.vdiff,
|
|
556
|
+
scale_factor,
|
|
557
|
+
bias);
|
|
558
|
+
}
|
|
559
|
+
}
|
|
560
|
+
|
|
561
|
+
float query_to_code_predecoded(const uint8_t* code) const {
|
|
562
|
+
Similarity sim(q_adj.data());
|
|
563
|
+
sim.begin_8();
|
|
564
|
+
for (size_t i = 0; i < quant.d; i += 8) {
|
|
565
|
+
simd8float32 xi = quant.decode_8_raw(code, i);
|
|
566
|
+
sim.add_8_components(xi);
|
|
567
|
+
}
|
|
568
|
+
return bias + scale_factor * sim.result_8();
|
|
347
569
|
}
|
|
348
570
|
|
|
349
571
|
float symmetric_dis(idx_t i, idx_t j) override {
|
|
@@ -352,7 +574,47 @@ struct DCTemplate<Quantizer, Similarity, SIMDLevel::ARM_NEON>
|
|
|
352
574
|
}
|
|
353
575
|
|
|
354
576
|
float query_to_code(const uint8_t* code) const final {
|
|
355
|
-
|
|
577
|
+
if constexpr (has_decode_raw()) {
|
|
578
|
+
return query_to_code_predecoded(code);
|
|
579
|
+
} else {
|
|
580
|
+
return compute_distance(q, code);
|
|
581
|
+
}
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
void query_to_codes_batch_4(
|
|
585
|
+
const uint8_t* code_0,
|
|
586
|
+
const uint8_t* code_1,
|
|
587
|
+
const uint8_t* code_2,
|
|
588
|
+
const uint8_t* code_3,
|
|
589
|
+
float& dis0,
|
|
590
|
+
float& dis1,
|
|
591
|
+
float& dis2,
|
|
592
|
+
float& dis3) const final {
|
|
593
|
+
Similarity sim0(q);
|
|
594
|
+
Similarity sim1(q);
|
|
595
|
+
Similarity sim2(q);
|
|
596
|
+
Similarity sim3(q);
|
|
597
|
+
|
|
598
|
+
sim0.begin_8();
|
|
599
|
+
sim1.begin_8();
|
|
600
|
+
sim2.begin_8();
|
|
601
|
+
sim3.begin_8();
|
|
602
|
+
|
|
603
|
+
for (size_t i = 0; i < quant.d; i += 8) {
|
|
604
|
+
simd8float32 xi0 = quant.reconstruct_8_components(code_0, i);
|
|
605
|
+
simd8float32 xi1 = quant.reconstruct_8_components(code_1, i);
|
|
606
|
+
simd8float32 xi2 = quant.reconstruct_8_components(code_2, i);
|
|
607
|
+
simd8float32 xi3 = quant.reconstruct_8_components(code_3, i);
|
|
608
|
+
sim0.add_8_components(xi0);
|
|
609
|
+
sim1.add_8_components(xi1);
|
|
610
|
+
sim2.add_8_components(xi2);
|
|
611
|
+
sim3.add_8_components(xi3);
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
dis0 = sim0.result_8();
|
|
615
|
+
dis1 = sim1.result_8();
|
|
616
|
+
dis2 = sim2.result_8();
|
|
617
|
+
dis3 = sim3.result_8();
|
|
356
618
|
}
|
|
357
619
|
};
|
|
358
620
|
|
|
@@ -401,6 +663,32 @@ struct DistanceComputerByte<Similarity, SIMDLevel::ARM_NEON>
|
|
|
401
663
|
}
|
|
402
664
|
};
|
|
403
665
|
|
|
666
|
+
/**********************************************************
|
|
667
|
+
* TurboQuant masked_sum NEON specialization (scalar fallback)
|
|
668
|
+
**********************************************************/
|
|
669
|
+
|
|
670
|
+
template <SIMDLevel SL0>
|
|
671
|
+
float turboq_masked_sum(const float* arr, const uint8_t* bits, size_t d);
|
|
672
|
+
|
|
673
|
+
template <>
|
|
674
|
+
float turboq_masked_sum<SIMDLevel::ARM_NEON>(
|
|
675
|
+
const float* arr,
|
|
676
|
+
const uint8_t* bits,
|
|
677
|
+
size_t d) {
|
|
678
|
+
float result = 0;
|
|
679
|
+
for (size_t byte_idx = 0; byte_idx < (d + 7) / 8; byte_idx++) {
|
|
680
|
+
uint8_t b = bits[byte_idx];
|
|
681
|
+
size_t base = byte_idx * 8;
|
|
682
|
+
size_t end = std::min(base + 8, d);
|
|
683
|
+
for (size_t j = base; j < end; j++) {
|
|
684
|
+
if (b & (1 << (j - base))) {
|
|
685
|
+
result += arr[j];
|
|
686
|
+
}
|
|
687
|
+
}
|
|
688
|
+
}
|
|
689
|
+
return result;
|
|
690
|
+
}
|
|
691
|
+
|
|
404
692
|
} // namespace scalar_quantizer
|
|
405
693
|
} // namespace faiss
|
|
406
694
|
|
|
@@ -0,0 +1,337 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#ifdef COMPILE_SIMD_RISCV_RVV
|
|
9
|
+
|
|
10
|
+
#include <faiss/impl/scalar_quantizer/codecs.h>
|
|
11
|
+
#include <faiss/impl/scalar_quantizer/distance_computers.h>
|
|
12
|
+
#include <faiss/impl/scalar_quantizer/quantizers.h>
|
|
13
|
+
#include <faiss/impl/scalar_quantizer/scanners.h>
|
|
14
|
+
#include <faiss/impl/scalar_quantizer/similarities.h>
|
|
15
|
+
|
|
16
|
+
#include <riscv_vector.h>
|
|
17
|
+
#include <cmath>
|
|
18
|
+
|
|
19
|
+
namespace faiss {
|
|
20
|
+
|
|
21
|
+
namespace scalar_quantizer {
|
|
22
|
+
|
|
23
|
+
/*************************************************************************
|
|
24
|
+
* Marker specializations.
|
|
25
|
+
*
|
|
26
|
+
* Unlike x86/NEON sq-*.cpp files that expose a fixed 8-wide / 16-wide codec
|
|
27
|
+
* interface (reconstruct_8_components / reconstruct_16_components), RVV is
|
|
28
|
+
* variable-width: the native vector length is implementation-defined and
|
|
29
|
+
* queried at runtime via __riscv_vsetvl. Forcing RVV into a fixed-width
|
|
30
|
+
* codec would leave performance on the table on wider hardware.
|
|
31
|
+
*
|
|
32
|
+
* So the strategy here is: Codec / Quantizer / Similarity classes for
|
|
33
|
+
* RISCV_RVV act as opaque TAG TYPES — they only need to be complete types
|
|
34
|
+
* so that baseline's sq-dispatch.h can form template arguments like
|
|
35
|
+
* `DCTemplate<QuantizerTemplate<Codec4bit<RISCV_RVV>, UNIFORM, RISCV_RVV>,
|
|
36
|
+
* SimilarityL2<RISCV_RVV>, RISCV_RVV>`.
|
|
37
|
+
*
|
|
38
|
+
* The real SIMD work lives in full DCTemplate specializations below.
|
|
39
|
+
* Unspecialized combinations fall through to scalar via the fallback
|
|
40
|
+
* `DCTemplate<Q, Sim, RISCV_RVV> : DCTemplate<Q, Sim, NONE>`.
|
|
41
|
+
************************************************************************/
|
|
42
|
+
|
|
43
|
+
template <>
|
|
44
|
+
struct Codec8bit<SIMDLevel::RISCV_RVV> : Codec8bit<SIMDLevel::NONE> {};
|
|
45
|
+
|
|
46
|
+
template <>
|
|
47
|
+
struct Codec4bit<SIMDLevel::RISCV_RVV> : Codec4bit<SIMDLevel::NONE> {};
|
|
48
|
+
|
|
49
|
+
template <>
|
|
50
|
+
struct Codec6bit<SIMDLevel::RISCV_RVV> : Codec6bit<SIMDLevel::NONE> {};
|
|
51
|
+
|
|
52
|
+
template <class Codec>
|
|
53
|
+
struct QuantizerTemplate<
|
|
54
|
+
Codec,
|
|
55
|
+
QuantizerTemplateScaling::UNIFORM,
|
|
56
|
+
SIMDLevel::RISCV_RVV>
|
|
57
|
+
: QuantizerTemplate<
|
|
58
|
+
Codec,
|
|
59
|
+
QuantizerTemplateScaling::UNIFORM,
|
|
60
|
+
SIMDLevel::NONE> {
|
|
61
|
+
QuantizerTemplate(size_t d, const std::vector<float>& trained)
|
|
62
|
+
: QuantizerTemplate<
|
|
63
|
+
Codec,
|
|
64
|
+
QuantizerTemplateScaling::UNIFORM,
|
|
65
|
+
SIMDLevel::NONE>(d, trained) {}
|
|
66
|
+
};
|
|
67
|
+
|
|
68
|
+
template <class Codec>
|
|
69
|
+
struct QuantizerTemplate<
|
|
70
|
+
Codec,
|
|
71
|
+
QuantizerTemplateScaling::NON_UNIFORM,
|
|
72
|
+
SIMDLevel::RISCV_RVV>
|
|
73
|
+
: QuantizerTemplate<
|
|
74
|
+
Codec,
|
|
75
|
+
QuantizerTemplateScaling::NON_UNIFORM,
|
|
76
|
+
SIMDLevel::NONE> {
|
|
77
|
+
QuantizerTemplate(size_t d, const std::vector<float>& trained)
|
|
78
|
+
: QuantizerTemplate<
|
|
79
|
+
Codec,
|
|
80
|
+
QuantizerTemplateScaling::NON_UNIFORM,
|
|
81
|
+
SIMDLevel::NONE>(d, trained) {}
|
|
82
|
+
};
|
|
83
|
+
|
|
84
|
+
template <>
|
|
85
|
+
struct QuantizerFP16<SIMDLevel::RISCV_RVV> : QuantizerFP16<SIMDLevel::NONE> {
|
|
86
|
+
QuantizerFP16(size_t d, const std::vector<float>& trained)
|
|
87
|
+
: QuantizerFP16<SIMDLevel::NONE>(d, trained) {}
|
|
88
|
+
};
|
|
89
|
+
|
|
90
|
+
template <>
|
|
91
|
+
struct QuantizerBF16<SIMDLevel::RISCV_RVV> : QuantizerBF16<SIMDLevel::NONE> {
|
|
92
|
+
QuantizerBF16(size_t d, const std::vector<float>& trained)
|
|
93
|
+
: QuantizerBF16<SIMDLevel::NONE>(d, trained) {}
|
|
94
|
+
};
|
|
95
|
+
|
|
96
|
+
template <>
|
|
97
|
+
struct Quantizer8bitDirect<SIMDLevel::RISCV_RVV>
|
|
98
|
+
: Quantizer8bitDirect<SIMDLevel::NONE> {
|
|
99
|
+
Quantizer8bitDirect(size_t d, const std::vector<float>& trained)
|
|
100
|
+
: Quantizer8bitDirect<SIMDLevel::NONE>(d, trained) {}
|
|
101
|
+
};
|
|
102
|
+
|
|
103
|
+
template <>
|
|
104
|
+
struct Quantizer8bitDirectSigned<SIMDLevel::RISCV_RVV>
|
|
105
|
+
: Quantizer8bitDirectSigned<SIMDLevel::NONE> {
|
|
106
|
+
Quantizer8bitDirectSigned(size_t d, const std::vector<float>& trained)
|
|
107
|
+
: Quantizer8bitDirectSigned<SIMDLevel::NONE>(d, trained) {}
|
|
108
|
+
};
|
|
109
|
+
|
|
110
|
+
template <>
|
|
111
|
+
struct SimilarityL2<SIMDLevel::RISCV_RVV> : SimilarityL2<SIMDLevel::NONE> {
|
|
112
|
+
using SimilarityL2<SIMDLevel::NONE>::SimilarityL2;
|
|
113
|
+
};
|
|
114
|
+
|
|
115
|
+
template <>
|
|
116
|
+
struct SimilarityIP<SIMDLevel::RISCV_RVV> : SimilarityIP<SIMDLevel::NONE> {
|
|
117
|
+
using SimilarityIP<SIMDLevel::NONE>::SimilarityIP;
|
|
118
|
+
};
|
|
119
|
+
|
|
120
|
+
/*************************************************************************
|
|
121
|
+
* Fallback DCTemplate / DistanceComputerByte for RISCV_RVV.
|
|
122
|
+
*
|
|
123
|
+
* Inheriting from the NONE specialization means every (Quantizer, Similarity)
|
|
124
|
+
* combination that does NOT have a hand-tuned RVV full specialization below
|
|
125
|
+
* falls through to scalar code. Callers and the dispatcher don't know or care.
|
|
126
|
+
************************************************************************/
|
|
127
|
+
|
|
128
|
+
template <class Quantizer, class Similarity>
|
|
129
|
+
struct DCTemplate<Quantizer, Similarity, SIMDLevel::RISCV_RVV>
|
|
130
|
+
: DCTemplate<Quantizer, Similarity, SIMDLevel::NONE> {
|
|
131
|
+
using Base = DCTemplate<Quantizer, Similarity, SIMDLevel::NONE>;
|
|
132
|
+
using Base::Base;
|
|
133
|
+
};
|
|
134
|
+
|
|
135
|
+
template <class Similarity>
|
|
136
|
+
struct DistanceComputerByte<Similarity, SIMDLevel::RISCV_RVV>
|
|
137
|
+
: DistanceComputerByte<Similarity, SIMDLevel::NONE> {
|
|
138
|
+
using Base = DistanceComputerByte<Similarity, SIMDLevel::NONE>;
|
|
139
|
+
using Base::Base;
|
|
140
|
+
};
|
|
141
|
+
|
|
142
|
+
/*************************************************************************
|
|
143
|
+
* Fast path — QT_4bit_uniform + L2
|
|
144
|
+
*
|
|
145
|
+
* 4-bit UNIFORM scaling: every component reconstructs as an affine function
|
|
146
|
+
* of the 4-bit code,
|
|
147
|
+
* recon(c) = vmin + vdiff * (c + 0.5) / 15 = final_scale * c + bias
|
|
148
|
+
* where final_scale = vdiff / 15. L2 distance between two reconstructions
|
|
149
|
+
* therefore reduces to final_scale^2 * (q_c - c_c)^2 over integer codes,
|
|
150
|
+
* so we can stay in the int domain and pay one float multiply at the end.
|
|
151
|
+
*
|
|
152
|
+
* The RVV path pre-nibbles the query into q_lo / q_hi (even / odd lanes)
|
|
153
|
+
* once at set_query time and then processes native-VL-sized chunks of code
|
|
154
|
+
* without ever decoding to float.
|
|
155
|
+
************************************************************************/
|
|
156
|
+
|
|
157
|
+
template <>
|
|
158
|
+
struct DCTemplate<
|
|
159
|
+
QuantizerTemplate<
|
|
160
|
+
Codec4bit<SIMDLevel::RISCV_RVV>,
|
|
161
|
+
QuantizerTemplateScaling::UNIFORM,
|
|
162
|
+
SIMDLevel::RISCV_RVV>,
|
|
163
|
+
SimilarityL2<SIMDLevel::RISCV_RVV>,
|
|
164
|
+
SIMDLevel::RISCV_RVV> : SQDistanceComputer {
|
|
165
|
+
using Sim = SimilarityL2<SIMDLevel::RISCV_RVV>;
|
|
166
|
+
|
|
167
|
+
size_t d;
|
|
168
|
+
float vmin;
|
|
169
|
+
float vdiff;
|
|
170
|
+
float final_scale_sq;
|
|
171
|
+
std::vector<uint8_t> q_lo;
|
|
172
|
+
std::vector<uint8_t> q_hi;
|
|
173
|
+
|
|
174
|
+
DCTemplate(size_t d_in, const std::vector<float>& trained)
|
|
175
|
+
: d(d_in),
|
|
176
|
+
vmin(trained[0]),
|
|
177
|
+
vdiff(trained[1]),
|
|
178
|
+
q_lo((d_in + 1) / 2, 0),
|
|
179
|
+
q_hi((d_in + 1) / 2, 0) {
|
|
180
|
+
const float final_scale = vdiff / 15.0f;
|
|
181
|
+
final_scale_sq = final_scale * final_scale;
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
void set_query(const float* x) final {
|
|
185
|
+
this->q = x;
|
|
186
|
+
const float inv_scale = (vdiff == 0.0f) ? 0.0f : 15.0f / vdiff;
|
|
187
|
+
for (size_t i = 0; i < d; i++) {
|
|
188
|
+
float val = (x[i] - vmin) * inv_scale;
|
|
189
|
+
int code = static_cast<int>(val);
|
|
190
|
+
if (code < 0) {
|
|
191
|
+
code = 0;
|
|
192
|
+
}
|
|
193
|
+
if (code > 15) {
|
|
194
|
+
code = 15;
|
|
195
|
+
}
|
|
196
|
+
if (i % 2 == 0) {
|
|
197
|
+
q_lo[i / 2] = static_cast<uint8_t>(code);
|
|
198
|
+
} else {
|
|
199
|
+
q_hi[i / 2] = static_cast<uint8_t>(code);
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
/// Squared integer-domain L2 between pre-nibbled q and packed 4-bit code.
|
|
205
|
+
/// Uses RVV's native VL; no fixed width assumptions. Returns the raw
|
|
206
|
+
/// integer sum — caller multiplies by final_scale_sq.
|
|
207
|
+
int64_t accumulate_int_l2(const uint8_t* code) const {
|
|
208
|
+
int64_t acc = 0;
|
|
209
|
+
size_t i = 0;
|
|
210
|
+
while (i < d) {
|
|
211
|
+
// Process up to vl codes per iteration. Each code byte packs two
|
|
212
|
+
// 4-bit codes, so we load (vl + 1) / 2 bytes; keep vl even to
|
|
213
|
+
// keep the nibble split aligned with the i % 2 split we used at
|
|
214
|
+
// set_query time.
|
|
215
|
+
size_t remaining = d - i;
|
|
216
|
+
size_t vl = __riscv_vsetvl_e8m1(remaining);
|
|
217
|
+
if (vl & 1) {
|
|
218
|
+
vl -= 1; // keep even; tail handled on next iter or scalar
|
|
219
|
+
}
|
|
220
|
+
if (vl == 0) {
|
|
221
|
+
break;
|
|
222
|
+
}
|
|
223
|
+
const size_t byte_vl = vl / 2;
|
|
224
|
+
|
|
225
|
+
vuint8m1_t packed = __riscv_vle8_v_u8m1(code + i / 2, byte_vl);
|
|
226
|
+
vuint8m1_t ql = __riscv_vle8_v_u8m1(q_lo.data() + i / 2, byte_vl);
|
|
227
|
+
vuint8m1_t qh = __riscv_vle8_v_u8m1(q_hi.data() + i / 2, byte_vl);
|
|
228
|
+
|
|
229
|
+
vuint8m1_t lo_nib = __riscv_vand_vx_u8m1(packed, 0x0F, byte_vl);
|
|
230
|
+
vuint8m1_t hi_nib = __riscv_vsrl_vx_u8m1(packed, 4, byte_vl);
|
|
231
|
+
|
|
232
|
+
// |ql - lo| and |qh - hi| fit in u8 (values are in [0, 15]).
|
|
233
|
+
vuint8m1_t d_lo = __riscv_vsub_vv_u8m1(
|
|
234
|
+
__riscv_vmaxu_vv_u8m1(ql, lo_nib, byte_vl),
|
|
235
|
+
__riscv_vminu_vv_u8m1(ql, lo_nib, byte_vl),
|
|
236
|
+
byte_vl);
|
|
237
|
+
vuint8m1_t d_hi = __riscv_vsub_vv_u8m1(
|
|
238
|
+
__riscv_vmaxu_vv_u8m1(qh, hi_nib, byte_vl),
|
|
239
|
+
__riscv_vminu_vv_u8m1(qh, hi_nib, byte_vl),
|
|
240
|
+
byte_vl);
|
|
241
|
+
|
|
242
|
+
// Square via widening multiply (each byte squared fits in u16,
|
|
243
|
+
// since max byte value is 15 -> 225).
|
|
244
|
+
vuint16m2_t sq_lo = __riscv_vwmulu_vv_u16m2(d_lo, d_lo, byte_vl);
|
|
245
|
+
vuint16m2_t sq_hi = __riscv_vwmulu_vv_u16m2(d_hi, d_hi, byte_vl);
|
|
246
|
+
vuint16m2_t sq_sum = __riscv_vadd_vv_u16m2(sq_lo, sq_hi, byte_vl);
|
|
247
|
+
|
|
248
|
+
// Reduce to a scalar u32 (safe: byte_vl * 450 fits in u32 for
|
|
249
|
+
// any realistic d).
|
|
250
|
+
vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, 1);
|
|
251
|
+
vuint32m1_t red =
|
|
252
|
+
__riscv_vwredsumu_vs_u16m2_u32m1(sq_sum, zero, byte_vl);
|
|
253
|
+
acc += __riscv_vmv_x_s_u32m1_u32(red);
|
|
254
|
+
|
|
255
|
+
i += vl;
|
|
256
|
+
}
|
|
257
|
+
// Scalar tail: cover any leftover odd lane (at most one).
|
|
258
|
+
for (; i < d; i++) {
|
|
259
|
+
uint8_t c_code =
|
|
260
|
+
(i % 2 == 0) ? (code[i / 2] & 0x0F) : (code[i / 2] >> 4);
|
|
261
|
+
uint8_t q_code = (i % 2 == 0) ? q_lo[i / 2] : q_hi[i / 2];
|
|
262
|
+
int diff = int(q_code) - int(c_code);
|
|
263
|
+
acc += diff * diff;
|
|
264
|
+
}
|
|
265
|
+
return acc;
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
float query_to_code(const uint8_t* code) const final {
|
|
269
|
+
return static_cast<float>(accumulate_int_l2(code)) * final_scale_sq;
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
float symmetric_dis(idx_t i, idx_t j) override {
|
|
273
|
+
// Not on the critical path for most workloads; reconstruct both
|
|
274
|
+
// codes into nibbles scalar-style and compute squared distance.
|
|
275
|
+
const uint8_t* c1 = codes + i * code_size;
|
|
276
|
+
const uint8_t* c2 = codes + j * code_size;
|
|
277
|
+
int64_t acc = 0;
|
|
278
|
+
for (size_t k = 0; k < d; k++) {
|
|
279
|
+
uint8_t a = (k % 2 == 0) ? (c1[k / 2] & 0x0F) : (c1[k / 2] >> 4);
|
|
280
|
+
uint8_t b = (k % 2 == 0) ? (c2[k / 2] & 0x0F) : (c2[k / 2] >> 4);
|
|
281
|
+
int diff = int(a) - int(b);
|
|
282
|
+
acc += diff * diff;
|
|
283
|
+
}
|
|
284
|
+
return static_cast<float>(acc) * final_scale_sq;
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
void query_to_codes_batch_4(
|
|
288
|
+
const uint8_t* code_0,
|
|
289
|
+
const uint8_t* code_1,
|
|
290
|
+
const uint8_t* code_2,
|
|
291
|
+
const uint8_t* code_3,
|
|
292
|
+
float& dis0,
|
|
293
|
+
float& dis1,
|
|
294
|
+
float& dis2,
|
|
295
|
+
float& dis3) const final {
|
|
296
|
+
// Simple 4x unroll of the single-code path; good enough as a first
|
|
297
|
+
// cut — gives ILP across the four independent accumulate loops.
|
|
298
|
+
dis0 = static_cast<float>(accumulate_int_l2(code_0)) * final_scale_sq;
|
|
299
|
+
dis1 = static_cast<float>(accumulate_int_l2(code_1)) * final_scale_sq;
|
|
300
|
+
dis2 = static_cast<float>(accumulate_int_l2(code_2)) * final_scale_sq;
|
|
301
|
+
dis3 = static_cast<float>(accumulate_int_l2(code_3)) * final_scale_sq;
|
|
302
|
+
}
|
|
303
|
+
};
|
|
304
|
+
|
|
305
|
+
/**********************************************************
|
|
306
|
+
* TurboQuant masked_sum RVV specialization (scalar fallback)
|
|
307
|
+
**********************************************************/
|
|
308
|
+
|
|
309
|
+
template <SIMDLevel SL0>
|
|
310
|
+
float turboq_masked_sum(const float* arr, const uint8_t* bits, size_t d);
|
|
311
|
+
|
|
312
|
+
template <>
|
|
313
|
+
float turboq_masked_sum<SIMDLevel::RISCV_RVV>(
|
|
314
|
+
const float* arr,
|
|
315
|
+
const uint8_t* bits,
|
|
316
|
+
size_t d) {
|
|
317
|
+
float result = 0;
|
|
318
|
+
for (size_t byte_idx = 0; byte_idx < (d + 7) / 8; byte_idx++) {
|
|
319
|
+
uint8_t b = bits[byte_idx];
|
|
320
|
+
size_t base = byte_idx * 8;
|
|
321
|
+
size_t end = std::min(base + 8, d);
|
|
322
|
+
for (size_t j = base; j < end; j++) {
|
|
323
|
+
if (b & (1 << (j - base))) {
|
|
324
|
+
result += arr[j];
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
}
|
|
328
|
+
return result;
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
} // namespace scalar_quantizer
|
|
332
|
+
} // namespace faiss
|
|
333
|
+
|
|
334
|
+
#define THE_LEVEL_TO_DISPATCH SIMDLevel::RISCV_RVV
|
|
335
|
+
#include <faiss/impl/scalar_quantizer/sq-dispatch.h>
|
|
336
|
+
|
|
337
|
+
#endif // COMPILE_SIMD_RISCV_RVV
|