faiss 0.5.3 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/faiss/ext.cpp +1 -1
- data/ext/faiss/extconf.rb +4 -4
- data/ext/faiss/index.cpp +63 -45
- data/ext/faiss/index_binary.cpp +37 -27
- data/ext/faiss/kmeans.cpp +9 -8
- data/ext/faiss/pca_matrix.cpp +9 -7
- data/ext/faiss/product_quantizer.cpp +13 -11
- data/ext/faiss/utils.cpp +4 -2
- data/ext/faiss/utils.h +4 -0
- data/lib/faiss/version.rb +1 -1
- data/lib/faiss.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +214 -82
- data/vendor/faiss/faiss/AutoTune.h +14 -1
- data/vendor/faiss/faiss/Clustering.cpp +97 -249
- data/vendor/faiss/faiss/Clustering.h +18 -0
- data/vendor/faiss/faiss/IVFlib.cpp +67 -44
- data/vendor/faiss/faiss/Index.cpp +25 -12
- data/vendor/faiss/faiss/Index.h +26 -4
- data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +68 -61
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexBinary.cpp +6 -3
- data/vendor/faiss/faiss/IndexBinary.h +4 -4
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +92 -95
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
- data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +120 -414
- data/vendor/faiss/faiss/IndexFastScan.cpp +105 -129
- data/vendor/faiss/faiss/IndexFastScan.h +35 -24
- data/vendor/faiss/faiss/IndexFlat.cpp +216 -152
- data/vendor/faiss/faiss/IndexFlat.h +32 -14
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +88 -41
- data/vendor/faiss/faiss/IndexFlatCodes.h +7 -1
- data/vendor/faiss/faiss/IndexHNSW.cpp +299 -187
- data/vendor/faiss/faiss/IndexHNSW.h +30 -14
- data/vendor/faiss/faiss/IndexIDMap.cpp +26 -22
- data/vendor/faiss/faiss/IndexIDMap.h +9 -7
- data/vendor/faiss/faiss/IndexIVF.cpp +535 -405
- data/vendor/faiss/faiss/IndexIVF.h +47 -16
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +105 -99
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +6 -3
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +379 -249
- data/vendor/faiss/faiss/IndexIVFFastScan.h +65 -60
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +41 -124
- data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +89 -138
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +77 -907
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +184 -122
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +23 -18
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +59 -60
- data/vendor/faiss/faiss/IndexIVFRaBitQ.h +4 -3
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +564 -416
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +269 -111
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
- data/vendor/faiss/faiss/IndexLSH.cpp +44 -25
- data/vendor/faiss/faiss/IndexLattice.cpp +41 -36
- data/vendor/faiss/faiss/IndexNNDescent.cpp +37 -21
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
- data/vendor/faiss/faiss/IndexNSG.cpp +40 -23
- data/vendor/faiss/faiss/IndexNSG.h +0 -2
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +32 -12
- data/vendor/faiss/faiss/IndexPQ.cpp +129 -213
- data/vendor/faiss/faiss/IndexPQ.h +3 -2
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
- data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRaBitQ.cpp +31 -43
- data/vendor/faiss/faiss/IndexRaBitQ.h +4 -3
- data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +135 -317
- data/vendor/faiss/faiss/IndexRaBitQFastScan.h +192 -34
- data/vendor/faiss/faiss/IndexRefine.cpp +30 -55
- data/vendor/faiss/faiss/IndexRefine.h +4 -4
- data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
- data/vendor/faiss/faiss/IndexShards.cpp +13 -13
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
- data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
- data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
- data/vendor/faiss/faiss/MetaIndexes.h +1 -1
- data/vendor/faiss/faiss/MetricType.h +29 -6
- data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
- data/vendor/faiss/faiss/SuperKMeans.h +97 -0
- data/vendor/faiss/faiss/VectorTransform.cpp +349 -141
- data/vendor/faiss/faiss/VectorTransform.h +39 -16
- data/vendor/faiss/faiss/build.cpp +23 -0
- data/vendor/faiss/faiss/build.h +15 -0
- data/vendor/faiss/faiss/clone_index.cpp +55 -51
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
- data/vendor/faiss/faiss/{cppcontrib/factory_tools.cpp → factory_tools.cpp} +6 -1
- data/vendor/faiss/faiss/gpu/GpuCloner.cpp +1 -1
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
- data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
- data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
- data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
- data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
- data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
- data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
- data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
- data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
- data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
- data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +64 -34
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -28
- data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
- data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +367 -0
- data/vendor/faiss/faiss/impl/ClusteringInitialization.h +107 -0
- data/vendor/faiss/faiss/impl/CodePacker.cpp +7 -3
- data/vendor/faiss/faiss/impl/CodePacker.h +11 -3
- data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +83 -0
- data/vendor/faiss/faiss/impl/CodePackerRaBitQ.h +47 -0
- data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
- data/vendor/faiss/faiss/impl/FaissAssert.h +64 -3
- data/vendor/faiss/faiss/impl/FaissException.h +50 -3
- data/vendor/faiss/faiss/impl/HNSW.cpp +117 -351
- data/vendor/faiss/faiss/impl/HNSW.h +21 -40
- data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
- data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
- data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +114 -102
- data/vendor/faiss/faiss/impl/NNDescent.cpp +63 -26
- data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
- data/vendor/faiss/faiss/impl/NSG.cpp +44 -26
- data/vendor/faiss/faiss/impl/NSG.h +20 -10
- data/vendor/faiss/faiss/impl/Panorama.cpp +76 -52
- data/vendor/faiss/faiss/impl/Panorama.h +265 -78
- data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
- data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +62 -37
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +99 -80
- data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +135 -37
- data/vendor/faiss/faiss/impl/RaBitQUtils.h +148 -21
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +298 -301
- data/vendor/faiss/faiss/impl/RaBitQuantizer.h +3 -10
- data/vendor/faiss/faiss/impl/RaBitQuantizerMultiBit.cpp +15 -41
- data/vendor/faiss/faiss/impl/RaBitQuantizerMultiBit.h +0 -4
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +40 -32
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +218 -113
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +119 -2362
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -3
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
- data/vendor/faiss/faiss/impl/VisitedTable.cpp +42 -0
- data/vendor/faiss/faiss/impl/VisitedTable.h +76 -0
- data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
- data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
- data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
- data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
- data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
- data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/expanded_scanners.h +163 -0
- data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
- data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
- data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +176 -4
- data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
- data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -348
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
- data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +290 -142
- data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
- data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +1950 -505
- data/vendor/faiss/faiss/impl/index_read_utils.h +1 -2
- data/vendor/faiss/faiss/impl/index_write.cpp +112 -21
- data/vendor/faiss/faiss/impl/io.cpp +6 -6
- data/vendor/faiss/faiss/impl/io_macros.h +33 -16
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +81 -40
- data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
- data/vendor/faiss/faiss/impl/mapped_io.cpp +15 -8
- data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
- data/vendor/faiss/faiss/impl/{code_distance/code_distance-avx2.h → pq_code_distance/pq_code_distance-avx2.h} +43 -220
- data/vendor/faiss/faiss/impl/{code_distance/code_distance-avx512.h → pq_code_distance/pq_code_distance-avx512.h} +25 -112
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +59 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +256 -0
- data/vendor/faiss/faiss/impl/{code_distance/code_distance-sve.h → pq_code_distance/pq_code_distance-sve.cpp} +57 -146
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +320 -483
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +121 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +137 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +371 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +190 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +94 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +603 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +597 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +388 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +630 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +387 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +54 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +173 -0
- data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +274 -171
- data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
- data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +275 -217
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
- data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
- data/vendor/faiss/faiss/impl/svs_io.h +8 -2
- data/vendor/faiss/faiss/index_factory.cpp +115 -28
- data/vendor/faiss/faiss/index_io.h +53 -3
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +73 -20
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
- data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
- data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +14 -14
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +9 -19
- data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSFlat.h +2 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +19 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +19 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +14 -0
- data/vendor/faiss/faiss/utils/Heap.cpp +56 -10
- data/vendor/faiss/faiss/utils/Heap.h +21 -0
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +54 -40
- data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
- data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
- data/vendor/faiss/faiss/utils/distances.cpp +507 -559
- data/vendor/faiss/faiss/utils/distances.h +118 -1
- data/vendor/faiss/faiss/utils/distances_dispatch.h +250 -0
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +72 -3681
- data/vendor/faiss/faiss/utils/extra_distances.cpp +60 -102
- data/vendor/faiss/faiss/utils/extra_distances.h +79 -7
- data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
- data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
- data/vendor/faiss/faiss/utils/hamming.h +92 -2
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
- data/vendor/faiss/faiss/utils/partitioning.h +31 -0
- data/vendor/faiss/faiss/utils/popcount.h +29 -0
- data/vendor/faiss/faiss/utils/pq_code_distance.h +251 -0
- data/vendor/faiss/faiss/utils/prefetch.h +2 -2
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
- data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
- data/vendor/faiss/faiss/utils/rabitq_simd.h +124 -343
- data/vendor/faiss/faiss/utils/random.cpp +6 -6
- data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +154 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +777 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +306 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +1431 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +1095 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +392 -0
- data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
- data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +334 -0
- data/vendor/faiss/faiss/utils/simd_levels.h +183 -0
- data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
- data/vendor/faiss/faiss/utils/utils.cpp +21 -14
- data/vendor/faiss/faiss/utils/utils.h +3 -3
- metadata +156 -42
- data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
- data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
- data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +0 -81
- data/vendor/faiss/faiss/impl/code_distance/code_distance.h +0 -186
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -216
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -224
- data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
- data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
- data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -228
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
- data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -450
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
- data/vendor/faiss/faiss/utils/simdlib.h +0 -42
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -296
- /data/vendor/faiss/faiss/{cppcontrib/factory_tools.h → factory_tools.h} +0 -0
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#pragma once
|
|
9
|
+
|
|
10
|
+
#include <faiss/impl/simdlib/simdlib_dispatch.h>
|
|
11
|
+
|
|
12
|
+
namespace faiss {
|
|
13
|
+
|
|
14
|
+
/*
|
|
15
|
+
* Multi-BB variant: accumulates NQ queries x BB*32 database elements.
|
|
16
|
+
* Used by the search_1 path (bbs > 32).
|
|
17
|
+
*
|
|
18
|
+
* KernelSL selects the SIMD type width used for the inner accumulation
|
|
19
|
+
* loop. In DD mode the caller passes THE_LEVEL_TO_DISPATCH so the
|
|
20
|
+
* kernel uses real AVX2/AVX512 types rather than the emulated-scalar
|
|
21
|
+
* fallback that SINGLE_SIMD_LEVEL_256 would give.
|
|
22
|
+
*/
|
|
23
|
+
template <
|
|
24
|
+
int NQ,
|
|
25
|
+
int BB,
|
|
26
|
+
SIMDLevel KernelSL = SINGLE_SIMD_LEVEL,
|
|
27
|
+
class ResultHandler,
|
|
28
|
+
class Scaler>
|
|
29
|
+
void kernel_accumulate_block(
|
|
30
|
+
int nsq,
|
|
31
|
+
const uint8_t* codes,
|
|
32
|
+
const uint8_t* LUT,
|
|
33
|
+
ResultHandler& res,
|
|
34
|
+
const Scaler& scaler) {
|
|
35
|
+
constexpr SIMDLevel SL256 = simd256_level_selector<KernelSL>::value;
|
|
36
|
+
using simd16uint16 = simd16uint16_tpl<SL256>;
|
|
37
|
+
using simd32uint8 = simd32uint8_tpl<SL256>;
|
|
38
|
+
|
|
39
|
+
// distance accumulators
|
|
40
|
+
simd16uint16 accu[NQ][BB][4];
|
|
41
|
+
|
|
42
|
+
for (int q = 0; q < NQ; q++) {
|
|
43
|
+
for (int b = 0; b < BB; b++) {
|
|
44
|
+
accu[q][b][0].clear();
|
|
45
|
+
accu[q][b][1].clear();
|
|
46
|
+
accu[q][b][2].clear();
|
|
47
|
+
accu[q][b][3].clear();
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
for (int sq = 0; sq < nsq - scaler.nscale; sq += 2) {
|
|
52
|
+
simd32uint8 lut_cache[NQ];
|
|
53
|
+
for (int q = 0; q < NQ; q++) {
|
|
54
|
+
lut_cache[q] = simd32uint8(LUT);
|
|
55
|
+
LUT += 32;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
for (int b = 0; b < BB; b++) {
|
|
59
|
+
simd32uint8 c = simd32uint8(codes);
|
|
60
|
+
codes += 32;
|
|
61
|
+
simd32uint8 mask(15);
|
|
62
|
+
simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask;
|
|
63
|
+
simd32uint8 clo = c & mask;
|
|
64
|
+
|
|
65
|
+
for (int q = 0; q < NQ; q++) {
|
|
66
|
+
simd32uint8 lut = lut_cache[q];
|
|
67
|
+
simd32uint8 res0 = lut.lookup_2_lanes(clo);
|
|
68
|
+
simd32uint8 res1 = lut.lookup_2_lanes(chi);
|
|
69
|
+
|
|
70
|
+
accu[q][b][0] += simd16uint16(res0);
|
|
71
|
+
accu[q][b][1] += simd16uint16(res0) >> 8;
|
|
72
|
+
|
|
73
|
+
accu[q][b][2] += simd16uint16(res1);
|
|
74
|
+
accu[q][b][3] += simd16uint16(res1) >> 8;
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
for (int sq = 0; sq < scaler.nscale; sq += 2) {
|
|
80
|
+
simd32uint8 lut_cache[NQ];
|
|
81
|
+
for (int q = 0; q < NQ; q++) {
|
|
82
|
+
lut_cache[q] = simd32uint8(LUT);
|
|
83
|
+
LUT += 32;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
for (int b = 0; b < BB; b++) {
|
|
87
|
+
simd32uint8 c = simd32uint8(codes);
|
|
88
|
+
codes += 32;
|
|
89
|
+
simd32uint8 mask(15);
|
|
90
|
+
simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask;
|
|
91
|
+
simd32uint8 clo = c & mask;
|
|
92
|
+
|
|
93
|
+
for (int q = 0; q < NQ; q++) {
|
|
94
|
+
simd32uint8 lut = lut_cache[q];
|
|
95
|
+
|
|
96
|
+
simd32uint8 res0 = scaler.lookup(lut, clo);
|
|
97
|
+
accu[q][b][0] += scaler.scale_lo(res0); // handle vectors 0..7
|
|
98
|
+
accu[q][b][1] += scaler.scale_hi(res0); // handle vectors 8..15
|
|
99
|
+
|
|
100
|
+
simd32uint8 res1 = scaler.lookup(lut, chi);
|
|
101
|
+
accu[q][b][2] += scaler.scale_lo(res1); // handle vectors 16..23
|
|
102
|
+
accu[q][b][3] +=
|
|
103
|
+
scaler.scale_hi(res1); // handle vectors 24..31
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
for (int q = 0; q < NQ; q++) {
|
|
109
|
+
for (int b = 0; b < BB; b++) {
|
|
110
|
+
accu[q][b][0] -= accu[q][b][1] << 8;
|
|
111
|
+
simd16uint16 dis0 = combine2x2(accu[q][b][0], accu[q][b][1]);
|
|
112
|
+
|
|
113
|
+
accu[q][b][2] -= accu[q][b][3] << 8;
|
|
114
|
+
simd16uint16 dis1 = combine2x2(accu[q][b][2], accu[q][b][3]);
|
|
115
|
+
|
|
116
|
+
res.handle(q, b, dis0, dis1);
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
/*
|
|
122
|
+
* Single-BB QBS variant: accumulates NQ queries x 32 db elements (BB=1).
|
|
123
|
+
* Used by the decompose_qbs layer for non-AVX512 paths.
|
|
124
|
+
*/
|
|
125
|
+
template <
|
|
126
|
+
int NQ,
|
|
127
|
+
SIMDLevel KernelSL = SINGLE_SIMD_LEVEL,
|
|
128
|
+
class ResultHandler,
|
|
129
|
+
class Scaler>
|
|
130
|
+
void pq4_kernel_qbs_256(
|
|
131
|
+
int nsq,
|
|
132
|
+
const uint8_t* codes,
|
|
133
|
+
const uint8_t* LUT,
|
|
134
|
+
ResultHandler& res,
|
|
135
|
+
const Scaler& scaler) {
|
|
136
|
+
constexpr SIMDLevel SL256 = simd256_level_selector<KernelSL>::value;
|
|
137
|
+
using simd16uint16 = simd16uint16_tpl<SL256>;
|
|
138
|
+
using simd32uint8 = simd32uint8_tpl<SL256>;
|
|
139
|
+
|
|
140
|
+
// dummy alloc to keep the windows compiler happy
|
|
141
|
+
constexpr int NQA = NQ > 0 ? NQ : 1;
|
|
142
|
+
// distance accumulators
|
|
143
|
+
// layout: accu[q][b]: distance accumulator for vectors 8*b..8*b+7
|
|
144
|
+
simd16uint16 accu[NQA][4];
|
|
145
|
+
|
|
146
|
+
for (int q = 0; q < NQ; q++) {
|
|
147
|
+
for (int b = 0; b < 4; b++) {
|
|
148
|
+
accu[q][b].clear();
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
// _mm_prefetch(codes + 768, 0);
|
|
153
|
+
for (int sq = 0; sq < nsq - scaler.nscale; sq += 2) {
|
|
154
|
+
simd32uint8 c;
|
|
155
|
+
c.loadu(codes);
|
|
156
|
+
codes += 32;
|
|
157
|
+
|
|
158
|
+
simd32uint8 mask(0xf);
|
|
159
|
+
// shift op does not exist for int8...
|
|
160
|
+
simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask;
|
|
161
|
+
simd32uint8 clo = c & mask;
|
|
162
|
+
|
|
163
|
+
for (int q = 0; q < NQ; q++) {
|
|
164
|
+
// load LUTs for 2 quantizers
|
|
165
|
+
simd32uint8 lut(LUT);
|
|
166
|
+
LUT += 32;
|
|
167
|
+
|
|
168
|
+
simd32uint8 res0 = lut.lookup_2_lanes(clo);
|
|
169
|
+
simd32uint8 res1 = lut.lookup_2_lanes(chi);
|
|
170
|
+
|
|
171
|
+
accu[q][0] += simd16uint16(res0);
|
|
172
|
+
accu[q][1] += simd16uint16(res0) >> 8;
|
|
173
|
+
|
|
174
|
+
accu[q][2] += simd16uint16(res1);
|
|
175
|
+
accu[q][3] += simd16uint16(res1) >> 8;
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
for (int sq = 0; sq < scaler.nscale; sq += 2) {
|
|
180
|
+
simd32uint8 c;
|
|
181
|
+
c.loadu(codes);
|
|
182
|
+
codes += 32;
|
|
183
|
+
|
|
184
|
+
simd32uint8 mask(0xf);
|
|
185
|
+
// shift op does not exist for int8...
|
|
186
|
+
simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask;
|
|
187
|
+
simd32uint8 clo = c & mask;
|
|
188
|
+
|
|
189
|
+
for (int q = 0; q < NQ; q++) {
|
|
190
|
+
// load LUTs for 2 quantizers
|
|
191
|
+
simd32uint8 lut(LUT);
|
|
192
|
+
LUT += 32;
|
|
193
|
+
|
|
194
|
+
simd32uint8 res0 = scaler.lookup(lut, clo);
|
|
195
|
+
accu[q][0] += scaler.scale_lo(res0); // handle vectors 0..7
|
|
196
|
+
accu[q][1] += scaler.scale_hi(res0); // handle vectors 8..15
|
|
197
|
+
|
|
198
|
+
simd32uint8 res1 = scaler.lookup(lut, chi);
|
|
199
|
+
accu[q][2] += scaler.scale_lo(res1); // handle vectors 16..23
|
|
200
|
+
accu[q][3] += scaler.scale_hi(res1); // handle vectors 24..31
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
for (int q = 0; q < NQ; q++) {
|
|
205
|
+
accu[q][0] -= accu[q][1] << 8;
|
|
206
|
+
simd16uint16 dis0 = combine2x2(accu[q][0], accu[q][1]);
|
|
207
|
+
accu[q][2] -= accu[q][3] << 8;
|
|
208
|
+
simd16uint16 dis1 = combine2x2(accu[q][2], accu[q][3]);
|
|
209
|
+
res.handle(q, 0, dis0, dis1);
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
} // namespace faiss
|
|
@@ -5,115 +5,20 @@
|
|
|
5
5
|
* LICENSE file in the root directory of this source tree.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
-
#
|
|
8
|
+
#pragma once
|
|
9
9
|
|
|
10
|
-
#include <faiss/impl/
|
|
11
|
-
#include <faiss/impl/
|
|
12
|
-
#include <faiss/impl/simd_result_handlers.h>
|
|
10
|
+
#include <faiss/impl/platform_macros.h>
|
|
11
|
+
#include <faiss/impl/simdlib/simdlib_dispatch.h>
|
|
13
12
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
// declared in simd_result_handlers.h
|
|
17
|
-
bool simd_result_handlers_accept_virtual = true;
|
|
18
|
-
|
|
19
|
-
using namespace simd_result_handlers;
|
|
20
|
-
|
|
21
|
-
/************************************************************
|
|
22
|
-
* Accumulation functions
|
|
23
|
-
************************************************************/
|
|
24
|
-
|
|
25
|
-
namespace {
|
|
26
|
-
|
|
27
|
-
/*
|
|
28
|
-
* The computation kernel
|
|
29
|
-
* It accumulates results for NQ queries and 2 * 16 database elements
|
|
30
|
-
* writes results in a ResultHandler
|
|
31
|
-
*/
|
|
32
|
-
|
|
33
|
-
#ifndef __AVX512F__
|
|
34
|
-
|
|
35
|
-
template <int NQ, class ResultHandler, class Scaler>
|
|
36
|
-
void kernel_accumulate_block(
|
|
37
|
-
int nsq,
|
|
38
|
-
const uint8_t* codes,
|
|
39
|
-
const uint8_t* LUT,
|
|
40
|
-
ResultHandler& res,
|
|
41
|
-
const Scaler& scaler) {
|
|
42
|
-
// dummy alloc to keep the windows compiler happy
|
|
43
|
-
constexpr int NQA = NQ > 0 ? NQ : 1;
|
|
44
|
-
// distance accumulators
|
|
45
|
-
// layout: accu[q][b]: distance accumulator for vectors 8*b..8*b+7
|
|
46
|
-
simd16uint16 accu[NQA][4];
|
|
47
|
-
|
|
48
|
-
for (int q = 0; q < NQ; q++) {
|
|
49
|
-
for (int b = 0; b < 4; b++) {
|
|
50
|
-
accu[q][b].clear();
|
|
51
|
-
}
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
// _mm_prefetch(codes + 768, 0);
|
|
55
|
-
for (int sq = 0; sq < nsq - scaler.nscale; sq += 2) {
|
|
56
|
-
simd32uint8 c;
|
|
57
|
-
c.loadu(codes);
|
|
58
|
-
codes += 32;
|
|
59
|
-
|
|
60
|
-
simd32uint8 mask(0xf);
|
|
61
|
-
// shift op does not exist for int8...
|
|
62
|
-
simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask;
|
|
63
|
-
simd32uint8 clo = c & mask;
|
|
64
|
-
|
|
65
|
-
for (int q = 0; q < NQ; q++) {
|
|
66
|
-
// load LUTs for 2 quantizers
|
|
67
|
-
simd32uint8 lut(LUT);
|
|
68
|
-
LUT += 32;
|
|
69
|
-
|
|
70
|
-
simd32uint8 res0 = lut.lookup_2_lanes(clo);
|
|
71
|
-
simd32uint8 res1 = lut.lookup_2_lanes(chi);
|
|
72
|
-
|
|
73
|
-
accu[q][0] += simd16uint16(res0);
|
|
74
|
-
accu[q][1] += simd16uint16(res0) >> 8;
|
|
75
|
-
|
|
76
|
-
accu[q][2] += simd16uint16(res1);
|
|
77
|
-
accu[q][3] += simd16uint16(res1) >> 8;
|
|
78
|
-
}
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
for (int sq = 0; sq < scaler.nscale; sq += 2) {
|
|
82
|
-
simd32uint8 c;
|
|
83
|
-
c.loadu(codes);
|
|
84
|
-
codes += 32;
|
|
85
|
-
|
|
86
|
-
simd32uint8 mask(0xf);
|
|
87
|
-
// shift op does not exist for int8...
|
|
88
|
-
simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask;
|
|
89
|
-
simd32uint8 clo = c & mask;
|
|
13
|
+
#ifdef __AVX512F__
|
|
90
14
|
|
|
91
|
-
|
|
92
|
-
// load LUTs for 2 quantizers
|
|
93
|
-
simd32uint8 lut(LUT);
|
|
94
|
-
LUT += 32;
|
|
95
|
-
|
|
96
|
-
simd32uint8 res0 = scaler.lookup(lut, clo);
|
|
97
|
-
accu[q][0] += scaler.scale_lo(res0); // handle vectors 0..7
|
|
98
|
-
accu[q][1] += scaler.scale_hi(res0); // handle vectors 8..15
|
|
99
|
-
|
|
100
|
-
simd32uint8 res1 = scaler.lookup(lut, chi);
|
|
101
|
-
accu[q][2] += scaler.scale_lo(res1); // handle vectors 16..23
|
|
102
|
-
accu[q][3] += scaler.scale_hi(res1); // handle vectors 24..31
|
|
103
|
-
}
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
for (int q = 0; q < NQ; q++) {
|
|
107
|
-
accu[q][0] -= accu[q][1] << 8;
|
|
108
|
-
simd16uint16 dis0 = combine2x2(accu[q][0], accu[q][1]);
|
|
109
|
-
accu[q][2] -= accu[q][3] << 8;
|
|
110
|
-
simd16uint16 dis1 = combine2x2(accu[q][2], accu[q][3]);
|
|
111
|
-
res.handle(q, 0, dis0, dis1);
|
|
112
|
-
}
|
|
113
|
-
}
|
|
15
|
+
namespace faiss {
|
|
114
16
|
|
|
115
|
-
|
|
17
|
+
// Explicit SIMD-level aliases for this file (no global bare aliases).
|
|
18
|
+
using simd32uint16 = simd32uint16_tpl<SINGLE_SIMD_LEVEL_512>;
|
|
19
|
+
using simd64uint8 = simd64uint8_tpl<SINGLE_SIMD_LEVEL_512>;
|
|
116
20
|
|
|
21
|
+
// NQ=1 specialization: processes 512-bit chunks aggressively.
|
|
117
22
|
// a special version for NQ=1.
|
|
118
23
|
// Despite the function being large in the text form, it compiles to a very
|
|
119
24
|
// compact assembler code.
|
|
@@ -125,6 +30,13 @@ void kernel_accumulate_block_avx512_nq1(
|
|
|
125
30
|
const uint8_t* LUT,
|
|
126
31
|
ResultHandler& res,
|
|
127
32
|
const Scaler& scaler) {
|
|
33
|
+
// Explicit SIMD levels for DD mode where bare aliases resolve to NONE
|
|
34
|
+
// (512-bit NONE types don't exist — empty primary templates).
|
|
35
|
+
using simd32uint16 = simd32uint16_tpl<SIMDLevel::AVX512>;
|
|
36
|
+
using simd64uint8 = simd64uint8_tpl<SIMDLevel::AVX512>;
|
|
37
|
+
using simd16uint16 = simd16uint16_tpl<SIMDLevel::AVX2>;
|
|
38
|
+
using simd32uint8 = simd32uint8_tpl<SIMDLevel::AVX2>;
|
|
39
|
+
|
|
128
40
|
// NQ is kept in order to match the similarity to baseline function
|
|
129
41
|
constexpr int NQ = 1;
|
|
130
42
|
// distance accumulators. We can accept more for NQ=1
|
|
@@ -386,6 +298,12 @@ void kernel_accumulate_block_avx512_nqx(
|
|
|
386
298
|
const uint8_t* LUT,
|
|
387
299
|
ResultHandler& res,
|
|
388
300
|
const Scaler& scaler) {
|
|
301
|
+
// Explicit SIMD levels for DD mode (see nq1 variant for explanation).
|
|
302
|
+
using simd32uint16 = simd32uint16_tpl<SIMDLevel::AVX512>;
|
|
303
|
+
using simd64uint8 = simd64uint8_tpl<SIMDLevel::AVX512>;
|
|
304
|
+
using simd16uint16 = simd16uint16_tpl<SIMDLevel::AVX2>;
|
|
305
|
+
using simd32uint8 = simd32uint8_tpl<SIMDLevel::AVX2>;
|
|
306
|
+
|
|
389
307
|
// dummy alloc to keep the windows compiler happy
|
|
390
308
|
constexpr int NQA = NQ > 0 ? NQ : 1;
|
|
391
309
|
// distance accumulators
|
|
@@ -539,8 +457,9 @@ void kernel_accumulate_block_avx512_nqx(
|
|
|
539
457
|
}
|
|
540
458
|
}
|
|
541
459
|
|
|
460
|
+
// Dispatcher: selects NQ=1 vs general case.
|
|
542
461
|
template <int NQ, class ResultHandler, class Scaler>
|
|
543
|
-
void kernel_accumulate_block(
|
|
462
|
+
void pq4_kernel_qbs_512(
|
|
544
463
|
int nsq,
|
|
545
464
|
const uint8_t* codes,
|
|
546
465
|
const uint8_t* LUT,
|
|
@@ -555,247 +474,6 @@ void kernel_accumulate_block(
|
|
|
555
474
|
}
|
|
556
475
|
}
|
|
557
476
|
|
|
558
|
-
#endif
|
|
559
|
-
|
|
560
|
-
// handle at most 4 blocks of queries
|
|
561
|
-
template <int QBS, class ResultHandler, class Scaler>
|
|
562
|
-
void accumulate_q_4step(
|
|
563
|
-
size_t ntotal2,
|
|
564
|
-
int nsq,
|
|
565
|
-
const uint8_t* codes,
|
|
566
|
-
const uint8_t* LUT0,
|
|
567
|
-
ResultHandler& res,
|
|
568
|
-
const Scaler& scaler) {
|
|
569
|
-
constexpr int Q1 = QBS & 15;
|
|
570
|
-
constexpr int Q2 = (QBS >> 4) & 15;
|
|
571
|
-
constexpr int Q3 = (QBS >> 8) & 15;
|
|
572
|
-
constexpr int Q4 = (QBS >> 12) & 15;
|
|
573
|
-
constexpr int SQ = Q1 + Q2 + Q3 + Q4;
|
|
574
|
-
|
|
575
|
-
for (size_t j0 = 0; j0 < ntotal2; j0 += 32) {
|
|
576
|
-
FixedStorageHandler<SQ, 2> res2;
|
|
577
|
-
const uint8_t* LUT = LUT0;
|
|
578
|
-
kernel_accumulate_block<Q1>(nsq, codes, LUT, res2, scaler);
|
|
579
|
-
LUT += Q1 * nsq * 16;
|
|
580
|
-
if (Q2 > 0) {
|
|
581
|
-
res2.set_block_origin(Q1, 0);
|
|
582
|
-
kernel_accumulate_block<Q2>(nsq, codes, LUT, res2, scaler);
|
|
583
|
-
LUT += Q2 * nsq * 16;
|
|
584
|
-
}
|
|
585
|
-
if (Q3 > 0) {
|
|
586
|
-
res2.set_block_origin(Q1 + Q2, 0);
|
|
587
|
-
kernel_accumulate_block<Q3>(nsq, codes, LUT, res2, scaler);
|
|
588
|
-
LUT += Q3 * nsq * 16;
|
|
589
|
-
}
|
|
590
|
-
if (Q4 > 0) {
|
|
591
|
-
res2.set_block_origin(Q1 + Q2 + Q3, 0);
|
|
592
|
-
kernel_accumulate_block<Q4>(nsq, codes, LUT, res2, scaler);
|
|
593
|
-
}
|
|
594
|
-
res.set_block_origin(0, j0);
|
|
595
|
-
res2.to_other_handler(res);
|
|
596
|
-
codes += 32 * nsq / 2;
|
|
597
|
-
}
|
|
598
|
-
}
|
|
599
|
-
|
|
600
|
-
template <int NQ, class ResultHandler, class Scaler>
|
|
601
|
-
void kernel_accumulate_block_loop(
|
|
602
|
-
size_t ntotal2,
|
|
603
|
-
int nsq,
|
|
604
|
-
const uint8_t* codes,
|
|
605
|
-
const uint8_t* LUT,
|
|
606
|
-
ResultHandler& res,
|
|
607
|
-
const Scaler& scaler) {
|
|
608
|
-
for (size_t j0 = 0; j0 < ntotal2; j0 += 32) {
|
|
609
|
-
res.set_block_origin(0, j0);
|
|
610
|
-
kernel_accumulate_block<NQ, ResultHandler>(
|
|
611
|
-
nsq, codes + j0 * nsq / 2, LUT, res, scaler);
|
|
612
|
-
}
|
|
613
|
-
}
|
|
614
|
-
|
|
615
|
-
// non-template version of accumulate kernel -- dispatches dynamically
|
|
616
|
-
template <class ResultHandler, class Scaler>
|
|
617
|
-
void accumulate(
|
|
618
|
-
int nq,
|
|
619
|
-
size_t ntotal2,
|
|
620
|
-
int nsq,
|
|
621
|
-
const uint8_t* codes,
|
|
622
|
-
const uint8_t* LUT,
|
|
623
|
-
ResultHandler& res,
|
|
624
|
-
const Scaler& scaler) {
|
|
625
|
-
assert(nsq % 2 == 0);
|
|
626
|
-
assert(is_aligned_pointer(LUT));
|
|
627
|
-
|
|
628
|
-
#define DISPATCH(NQ) \
|
|
629
|
-
case NQ: \
|
|
630
|
-
kernel_accumulate_block_loop<NQ, ResultHandler>( \
|
|
631
|
-
ntotal2, nsq, codes, LUT, res, scaler); \
|
|
632
|
-
return
|
|
633
|
-
|
|
634
|
-
switch (nq) {
|
|
635
|
-
DISPATCH(1);
|
|
636
|
-
DISPATCH(2);
|
|
637
|
-
DISPATCH(3);
|
|
638
|
-
DISPATCH(4);
|
|
639
|
-
}
|
|
640
|
-
FAISS_THROW_FMT("accumulate nq=%d not instantiated", nq);
|
|
641
|
-
|
|
642
|
-
#undef DISPATCH
|
|
643
|
-
}
|
|
644
|
-
|
|
645
|
-
template <class ResultHandler, class Scaler>
|
|
646
|
-
void pq4_accumulate_loop_qbs_fixed_scaler(
|
|
647
|
-
int qbs,
|
|
648
|
-
size_t ntotal2,
|
|
649
|
-
int nsq,
|
|
650
|
-
const uint8_t* codes,
|
|
651
|
-
const uint8_t* LUT0,
|
|
652
|
-
ResultHandler& res,
|
|
653
|
-
const Scaler& scaler) {
|
|
654
|
-
assert(nsq % 2 == 0);
|
|
655
|
-
assert(is_aligned_pointer(codes));
|
|
656
|
-
assert(is_aligned_pointer(LUT0));
|
|
657
|
-
|
|
658
|
-
// try out optimized versions
|
|
659
|
-
switch (qbs) {
|
|
660
|
-
#define DISPATCH(QBS) \
|
|
661
|
-
case QBS: \
|
|
662
|
-
accumulate_q_4step<QBS>(ntotal2, nsq, codes, LUT0, res, scaler); \
|
|
663
|
-
return;
|
|
664
|
-
DISPATCH(0x3333); // 12
|
|
665
|
-
DISPATCH(0x2333); // 11
|
|
666
|
-
DISPATCH(0x2233); // 10
|
|
667
|
-
DISPATCH(0x333); // 9
|
|
668
|
-
DISPATCH(0x2223); // 9
|
|
669
|
-
DISPATCH(0x233); // 8
|
|
670
|
-
DISPATCH(0x1223); // 8
|
|
671
|
-
DISPATCH(0x223); // 7
|
|
672
|
-
DISPATCH(0x34); // 7
|
|
673
|
-
DISPATCH(0x133); // 7
|
|
674
|
-
DISPATCH(0x6); // 6
|
|
675
|
-
DISPATCH(0x33); // 6
|
|
676
|
-
DISPATCH(0x123); // 6
|
|
677
|
-
DISPATCH(0x222); // 6
|
|
678
|
-
DISPATCH(0x23); // 5
|
|
679
|
-
DISPATCH(0x5); // 5
|
|
680
|
-
DISPATCH(0x13); // 4
|
|
681
|
-
DISPATCH(0x22); // 4
|
|
682
|
-
DISPATCH(0x4); // 4
|
|
683
|
-
DISPATCH(0x3); // 3
|
|
684
|
-
DISPATCH(0x21); // 3
|
|
685
|
-
DISPATCH(0x2); // 2
|
|
686
|
-
DISPATCH(0x1); // 1
|
|
687
|
-
#undef DISPATCH
|
|
688
|
-
}
|
|
689
|
-
|
|
690
|
-
// default implementation where qbs is not known at compile time
|
|
691
|
-
|
|
692
|
-
for (size_t j0 = 0; j0 < ntotal2; j0 += 32) {
|
|
693
|
-
const uint8_t* LUT = LUT0;
|
|
694
|
-
int qi = qbs;
|
|
695
|
-
int i0 = 0;
|
|
696
|
-
while (qi) {
|
|
697
|
-
int nq = qi & 15;
|
|
698
|
-
qi >>= 4;
|
|
699
|
-
res.set_block_origin(i0, j0);
|
|
700
|
-
#define DISPATCH(NQ) \
|
|
701
|
-
case NQ: \
|
|
702
|
-
kernel_accumulate_block<NQ, ResultHandler>( \
|
|
703
|
-
nsq, codes, LUT, res, scaler); \
|
|
704
|
-
break
|
|
705
|
-
switch (nq) {
|
|
706
|
-
DISPATCH(1);
|
|
707
|
-
DISPATCH(2);
|
|
708
|
-
DISPATCH(3);
|
|
709
|
-
DISPATCH(4);
|
|
710
|
-
#undef DISPATCH
|
|
711
|
-
default:
|
|
712
|
-
FAISS_THROW_FMT("accumulate nq=%d not instantiated", nq);
|
|
713
|
-
}
|
|
714
|
-
i0 += nq;
|
|
715
|
-
LUT += nq * nsq * 16;
|
|
716
|
-
}
|
|
717
|
-
codes += 32 * nsq / 2;
|
|
718
|
-
}
|
|
719
|
-
}
|
|
720
|
-
|
|
721
|
-
struct Run_pq4_accumulate_loop_qbs {
|
|
722
|
-
template <class ResultHandler>
|
|
723
|
-
void f(ResultHandler& res,
|
|
724
|
-
int qbs,
|
|
725
|
-
size_t nb,
|
|
726
|
-
int nsq,
|
|
727
|
-
const uint8_t* codes,
|
|
728
|
-
const uint8_t* LUT,
|
|
729
|
-
const NormTableScaler* scaler) {
|
|
730
|
-
if (scaler) {
|
|
731
|
-
pq4_accumulate_loop_qbs_fixed_scaler(
|
|
732
|
-
qbs, nb, nsq, codes, LUT, res, *scaler);
|
|
733
|
-
} else {
|
|
734
|
-
DummyScaler dummy;
|
|
735
|
-
pq4_accumulate_loop_qbs_fixed_scaler(
|
|
736
|
-
qbs, nb, nsq, codes, LUT, res, dummy);
|
|
737
|
-
}
|
|
738
|
-
}
|
|
739
|
-
};
|
|
740
|
-
|
|
741
|
-
} // namespace
|
|
742
|
-
|
|
743
|
-
void pq4_accumulate_loop_qbs(
|
|
744
|
-
int qbs,
|
|
745
|
-
size_t nb,
|
|
746
|
-
int nsq,
|
|
747
|
-
const uint8_t* codes,
|
|
748
|
-
const uint8_t* LUT,
|
|
749
|
-
SIMDResultHandler& res,
|
|
750
|
-
const NormTableScaler* scaler) {
|
|
751
|
-
Run_pq4_accumulate_loop_qbs consumer;
|
|
752
|
-
dispatch_SIMDResultHandler(res, consumer, qbs, nb, nsq, codes, LUT, scaler);
|
|
753
|
-
}
|
|
754
|
-
|
|
755
|
-
/***************************************************************
|
|
756
|
-
* Packing functions
|
|
757
|
-
***************************************************************/
|
|
758
|
-
|
|
759
|
-
int pq4_qbs_to_nq(int qbs) {
|
|
760
|
-
int i0 = 0;
|
|
761
|
-
int qi = qbs;
|
|
762
|
-
while (qi) {
|
|
763
|
-
int nq = qi & 15;
|
|
764
|
-
qi >>= 4;
|
|
765
|
-
i0 += nq;
|
|
766
|
-
}
|
|
767
|
-
return i0;
|
|
768
|
-
}
|
|
769
|
-
|
|
770
|
-
void accumulate_to_mem(
|
|
771
|
-
int nq,
|
|
772
|
-
size_t ntotal2,
|
|
773
|
-
int nsq,
|
|
774
|
-
const uint8_t* codes,
|
|
775
|
-
const uint8_t* LUT,
|
|
776
|
-
uint16_t* accu) {
|
|
777
|
-
FAISS_THROW_IF_NOT(ntotal2 % 32 == 0);
|
|
778
|
-
StoreResultHandler handler(accu, ntotal2);
|
|
779
|
-
DummyScaler scaler;
|
|
780
|
-
accumulate(nq, ntotal2, nsq, codes, LUT, handler, scaler);
|
|
781
|
-
}
|
|
782
|
-
|
|
783
|
-
int pq4_preferred_qbs(int n) {
|
|
784
|
-
// from timings in P141901742, P141902828
|
|
785
|
-
static int map[12] = {
|
|
786
|
-
0, 1, 2, 3, 0x13, 0x23, 0x33, 0x223, 0x233, 0x333, 0x2233, 0x2333};
|
|
787
|
-
if (n <= 11) {
|
|
788
|
-
return map[n];
|
|
789
|
-
} else if (n <= 24) {
|
|
790
|
-
// override qbs: all first stages with 3 steps
|
|
791
|
-
// then 1 stage with the rest
|
|
792
|
-
int nbit = 4 * (n / 3); // nbits with only 3s
|
|
793
|
-
int qbs = 0x33333333 & ((1 << nbit) - 1);
|
|
794
|
-
qbs |= (n % 3) << nbit;
|
|
795
|
-
return qbs;
|
|
796
|
-
} else {
|
|
797
|
-
FAISS_THROW_FMT("number of queries %d too large", n);
|
|
798
|
-
}
|
|
799
|
-
}
|
|
800
|
-
|
|
801
477
|
} // namespace faiss
|
|
478
|
+
|
|
479
|
+
#endif // __AVX512F__
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#pragma once
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* @file rabitq_dispatching.h
|
|
12
|
+
* @brief Per-SIMD TU dispatch for RaBitQ flat scanner.
|
|
13
|
+
*
|
|
14
|
+
* Included after dispatching.h in each per-SIMD TU, so that
|
|
15
|
+
* ScannerMixIn from dispatching.h is visible (same TU).
|
|
16
|
+
*
|
|
17
|
+
* Provides the rabitq_make_knn_scanner_impl<SL> specialization
|
|
18
|
+
* that wraps RaBitQHeapHandler in ScannerMixIn.
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
#ifndef THE_LEVEL_TO_DISPATCH
|
|
22
|
+
#error "Define THE_LEVEL_TO_DISPATCH before including this header"
|
|
23
|
+
#endif
|
|
24
|
+
|
|
25
|
+
#include <faiss/IndexIVFRaBitQFastScan.h>
|
|
26
|
+
#include <faiss/IndexRaBitQFastScan.h>
|
|
27
|
+
#include <faiss/impl/fast_scan/fast_scan.h>
|
|
28
|
+
|
|
29
|
+
// ScannerMixIn is visible from dispatching.h (same TU)
|
|
30
|
+
|
|
31
|
+
namespace faiss {
|
|
32
|
+
|
|
33
|
+
template <>
|
|
34
|
+
std::unique_ptr<FastScanCodeScanner> rabitq_make_knn_scanner_impl<
|
|
35
|
+
THE_LEVEL_TO_DISPATCH>(
|
|
36
|
+
const IndexRaBitQFastScan* index,
|
|
37
|
+
bool is_max,
|
|
38
|
+
size_t nq,
|
|
39
|
+
int64_t k,
|
|
40
|
+
float* distances,
|
|
41
|
+
int64_t* ids,
|
|
42
|
+
const IDSelector* sel,
|
|
43
|
+
const FastScanDistancePostProcessing& context,
|
|
44
|
+
bool is_multi_bit) {
|
|
45
|
+
if (is_max) {
|
|
46
|
+
using H = RaBitQHeapHandler<
|
|
47
|
+
CMax<uint16_t, int>,
|
|
48
|
+
false,
|
|
49
|
+
THE_LEVEL_TO_DISPATCH>;
|
|
50
|
+
return std::make_unique<ScannerMixIn<H>>(
|
|
51
|
+
index, nq, k, distances, ids, sel, &context, is_multi_bit);
|
|
52
|
+
} else {
|
|
53
|
+
using H = RaBitQHeapHandler<
|
|
54
|
+
CMin<uint16_t, int>,
|
|
55
|
+
false,
|
|
56
|
+
THE_LEVEL_TO_DISPATCH>;
|
|
57
|
+
return std::make_unique<ScannerMixIn<H>>(
|
|
58
|
+
index, nq, k, distances, ids, sel, &context, is_multi_bit);
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
// IVF RaBitQ scanner factory
|
|
63
|
+
template <>
|
|
64
|
+
std::unique_ptr<FastScanCodeScanner> rabitq_ivf_make_knn_scanner_impl<
|
|
65
|
+
THE_LEVEL_TO_DISPATCH>(
|
|
66
|
+
bool is_max,
|
|
67
|
+
const IndexIVFRaBitQFastScan* index,
|
|
68
|
+
size_t nq,
|
|
69
|
+
size_t k,
|
|
70
|
+
float* distances,
|
|
71
|
+
int64_t* ids,
|
|
72
|
+
const IDSelector* sel,
|
|
73
|
+
const FastScanDistancePostProcessing* context,
|
|
74
|
+
bool multi_bit) {
|
|
75
|
+
if (is_max) {
|
|
76
|
+
using C = CMax<uint16_t, int64_t>;
|
|
77
|
+
using H = simd_result_handlers::
|
|
78
|
+
IVFRaBitQHeapHandler<C, THE_LEVEL_TO_DISPATCH>;
|
|
79
|
+
return std::make_unique<ScannerMixIn<H>>(
|
|
80
|
+
index, nq, k, distances, ids, sel, context, multi_bit);
|
|
81
|
+
} else {
|
|
82
|
+
using C = CMin<uint16_t, int64_t>;
|
|
83
|
+
using H = simd_result_handlers::
|
|
84
|
+
IVFRaBitQHeapHandler<C, THE_LEVEL_TO_DISPATCH>;
|
|
85
|
+
return std::make_unique<ScannerMixIn<H>>(
|
|
86
|
+
index, nq, k, distances, ids, sel, context, multi_bit);
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
} // namespace faiss
|