faiss 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/faiss/extconf.rb +2 -1
- data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
- data/ext/faiss/index_binary.cpp +1 -1
- data/ext/faiss/kmeans.cpp +1 -1
- data/ext/faiss/pca_matrix.cpp +1 -1
- data/ext/faiss/product_quantizer.cpp +1 -1
- data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
- data/lib/faiss/version.rb +1 -1
- data/vendor/faiss/faiss/AutoTune.cpp +93 -80
- data/vendor/faiss/faiss/Clustering.cpp +39 -240
- data/vendor/faiss/faiss/Clustering.h +6 -0
- data/vendor/faiss/faiss/IVFlib.cpp +41 -21
- data/vendor/faiss/faiss/Index.cpp +6 -5
- data/vendor/faiss/faiss/Index.h +5 -5
- data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
- data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
- data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
- data/vendor/faiss/faiss/IndexBinary.h +4 -4
- data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
- data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
- data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
- data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +84 -92
- data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
- data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
- data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
- data/vendor/faiss/faiss/IndexBinaryIVF.cpp +87 -415
- data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
- data/vendor/faiss/faiss/IndexFastScan.h +25 -23
- data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
- data/vendor/faiss/faiss/IndexFlat.h +21 -18
- data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
- data/vendor/faiss/faiss/IndexHNSW.cpp +283 -145
- data/vendor/faiss/faiss/IndexHNSW.h +16 -2
- data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
- data/vendor/faiss/faiss/IndexIDMap.h +9 -7
- data/vendor/faiss/faiss/IndexIVF.cpp +465 -362
- data/vendor/faiss/faiss/IndexIVF.h +33 -12
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
- data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
- data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
- data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
- data/vendor/faiss/faiss/IndexIVFFlat.cpp +36 -68
- data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +53 -30
- data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
- data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
- data/vendor/faiss/faiss/IndexIVFPQ.cpp +71 -843
- data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
- data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexIVFPQR.cpp +21 -17
- data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +26 -39
- data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
- data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
- data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
- data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
- data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
- data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
- data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
- data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
- data/vendor/faiss/faiss/IndexNSG.cpp +39 -23
- data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
- data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
- data/vendor/faiss/faiss/IndexPQ.h +3 -2
- data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
- data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
- data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
- data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
- data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
- data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
- data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
- data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
- data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
- data/vendor/faiss/faiss/IndexRefine.h +4 -4
- data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
- data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
- data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
- data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
- data/vendor/faiss/faiss/IndexShards.cpp +10 -9
- data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
- data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
- data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
- data/vendor/faiss/faiss/MetaIndexes.h +1 -1
- data/vendor/faiss/faiss/MetricType.h +14 -7
- data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
- data/vendor/faiss/faiss/SuperKMeans.h +97 -0
- data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
- data/vendor/faiss/faiss/VectorTransform.h +16 -16
- data/vendor/faiss/faiss/build.cpp +23 -0
- data/vendor/faiss/faiss/build.h +15 -0
- data/vendor/faiss/faiss/clone_index.cpp +48 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
- data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
- data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
- data/vendor/faiss/faiss/factory_tools.cpp +5 -0
- data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
- data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
- data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
- data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
- data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
- data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
- data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
- data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
- data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
- data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
- data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
- data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
- data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
- data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
- data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
- data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
- data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
- data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
- data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
- data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
- data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
- data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
- data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
- data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
- data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
- data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
- data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
- data/vendor/faiss/faiss/impl/FaissException.h +50 -3
- data/vendor/faiss/faiss/impl/HNSW.cpp +92 -317
- data/vendor/faiss/faiss/impl/HNSW.h +13 -34
- data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
- data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
- data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
- data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
- data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
- data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
- data/vendor/faiss/faiss/impl/NSG.cpp +38 -21
- data/vendor/faiss/faiss/impl/NSG.h +4 -4
- data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
- data/vendor/faiss/faiss/impl/Panorama.h +258 -87
- data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
- data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
- data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
- data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
- data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
- data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
- data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +30 -23
- data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
- data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
- data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
- data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +296 -283
- data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
- data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
- data/vendor/faiss/faiss/impl/ResultHandler.h +99 -75
- data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +52 -4
- data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -1
- data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
- data/vendor/faiss/faiss/impl/VisitedTable.h +7 -0
- data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
- data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
- data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
- data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
- data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
- data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
- data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
- data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
- data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
- data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
- data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
- data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
- data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
- data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
- data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
- data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
- data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
- data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
- data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
- data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
- data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
- data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
- data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
- data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
- data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
- data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
- data/vendor/faiss/faiss/impl/index_read.cpp +1132 -45
- data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
- data/vendor/faiss/faiss/impl/index_write.cpp +95 -13
- data/vendor/faiss/faiss/impl/io.cpp +6 -6
- data/vendor/faiss/faiss/impl/io_macros.h +33 -16
- data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
- data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
- data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
- data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
- data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
- data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +29 -111
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +238 -5
- data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -7
- data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
- data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +3 -2
- data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +102 -11
- data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
- data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +148 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +167 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +59 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +163 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
- data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
- data/vendor/faiss/faiss/impl/simd_dispatch.h +100 -66
- data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
- data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
- data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
- data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
- data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
- data/vendor/faiss/faiss/impl/svs_io.h +8 -2
- data/vendor/faiss/faiss/index_factory.cpp +86 -18
- data/vendor/faiss/faiss/index_io.h +24 -0
- data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
- data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
- data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
- data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
- data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
- data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
- data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
- data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
- data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
- data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamana.h +18 -2
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +12 -3
- data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +7 -2
- data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
- data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
- data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
- data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
- data/vendor/faiss/faiss/utils/distances.cpp +390 -560
- data/vendor/faiss/faiss/utils/distances.h +20 -1
- data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
- data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
- data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
- data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
- data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -177
- data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
- data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
- data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
- data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
- data/vendor/faiss/faiss/utils/hamming.h +92 -2
- data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
- data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
- data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
- data/vendor/faiss/faiss/utils/partitioning.h +31 -0
- data/vendor/faiss/faiss/utils/popcount.h +29 -0
- data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
- data/vendor/faiss/faiss/utils/prefetch.h +2 -2
- data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
- data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
- data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
- data/vendor/faiss/faiss/utils/random.cpp +6 -6
- data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
- data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
- data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
- data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
- data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
- data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
- data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
- data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
- data/vendor/faiss/faiss/utils/simd_levels.cpp +17 -5
- data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
- data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
- data/vendor/faiss/faiss/utils/utils.cpp +5 -5
- data/vendor/faiss/faiss/utils/utils.h +3 -3
- metadata +119 -34
- data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
- data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
- data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
- data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
- data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
- data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
- data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
- data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
- data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
- data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
- data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
- data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
- data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
- data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
- data/vendor/faiss/faiss/utils/simdlib.h +0 -42
- data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
- /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#pragma once
|
|
9
|
+
|
|
10
|
+
#include <cstdint>
|
|
11
|
+
#include <string>
|
|
12
|
+
|
|
13
|
+
#include <immintrin.h>
|
|
14
|
+
|
|
15
|
+
#include <faiss/impl/platform_macros.h>
|
|
16
|
+
#include <faiss/impl/simdlib/simdlib.h>
|
|
17
|
+
|
|
18
|
+
#include <faiss/impl/simdlib/simdlib_avx2.h>
|
|
19
|
+
|
|
20
|
+
namespace faiss {
|
|
21
|
+
|
|
22
|
+
/** Simple wrapper around the AVX 512-bit registers
|
|
23
|
+
*
|
|
24
|
+
* The objective is to separate the different interpretations of the same
|
|
25
|
+
* registers (as a vector of uint8, uint16 or uint32), to provide printing
|
|
26
|
+
* functions, and to give more readable names to the AVX intrinsics. It does not
|
|
27
|
+
* pretend to be exhaustive, functions are added as needed.
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
/// 512-bit representation without interpretation as a vector
|
|
31
|
+
template <>
|
|
32
|
+
struct simd512bit_tpl<SIMDLevel::AVX512> {
|
|
33
|
+
union {
|
|
34
|
+
__m512i i;
|
|
35
|
+
__m512 f;
|
|
36
|
+
};
|
|
37
|
+
|
|
38
|
+
simd512bit_tpl() {}
|
|
39
|
+
|
|
40
|
+
explicit simd512bit_tpl(__m512i i) : i(i) {}
|
|
41
|
+
|
|
42
|
+
explicit simd512bit_tpl(__m512 f) : f(f) {}
|
|
43
|
+
|
|
44
|
+
explicit simd512bit_tpl(const void* x)
|
|
45
|
+
: i(_mm512_loadu_si512((__m512i const*)x)) {}
|
|
46
|
+
|
|
47
|
+
// sets up a lower half of the register while keeping upper one as zero
|
|
48
|
+
explicit simd512bit_tpl(simd256bit_tpl<SIMDLevel::AVX2> lo)
|
|
49
|
+
: simd512bit_tpl(_mm512_inserti32x8(
|
|
50
|
+
_mm512_castsi256_si512(lo.i),
|
|
51
|
+
_mm256_setzero_si256(),
|
|
52
|
+
1)) {}
|
|
53
|
+
|
|
54
|
+
// constructs from lower and upper halves
|
|
55
|
+
explicit simd512bit_tpl(
|
|
56
|
+
simd256bit_tpl<SIMDLevel::AVX2> lo,
|
|
57
|
+
simd256bit_tpl<SIMDLevel::AVX2> hi)
|
|
58
|
+
: simd512bit_tpl(_mm512_inserti32x8(
|
|
59
|
+
_mm512_castsi256_si512(lo.i),
|
|
60
|
+
hi.i,
|
|
61
|
+
1)) {}
|
|
62
|
+
|
|
63
|
+
void clear() {
|
|
64
|
+
i = _mm512_setzero_si512();
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
void storeu(void* ptr) const {
|
|
68
|
+
_mm512_storeu_si512((__m512i*)ptr, i);
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
void loadu(const void* ptr) {
|
|
72
|
+
i = _mm512_loadu_si512((__m512i*)ptr);
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
void store(void* ptr) const {
|
|
76
|
+
_mm512_storeu_si512((__m512i*)ptr, i);
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
void bin(char bits[513]) const {
|
|
80
|
+
char bytes[64];
|
|
81
|
+
storeu((void*)bytes);
|
|
82
|
+
for (int i = 0; i < 512; i++) {
|
|
83
|
+
bits[i] = '0' + ((bytes[i / 8] >> (i % 8)) & 1);
|
|
84
|
+
}
|
|
85
|
+
bits[512] = 0;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
std::string bin() const {
|
|
89
|
+
char bits[513];
|
|
90
|
+
bin(bits);
|
|
91
|
+
return std::string(bits);
|
|
92
|
+
}
|
|
93
|
+
};
|
|
94
|
+
|
|
95
|
+
/// vector of 32 elements in uint16
|
|
96
|
+
template <>
|
|
97
|
+
struct simd32uint16_tpl<SIMDLevel::AVX512> : simd512bit_tpl<SIMDLevel::AVX512> {
|
|
98
|
+
simd32uint16_tpl() {}
|
|
99
|
+
|
|
100
|
+
explicit simd32uint16_tpl(__m512i i)
|
|
101
|
+
: simd512bit_tpl<SIMDLevel::AVX512>(i) {}
|
|
102
|
+
|
|
103
|
+
explicit simd32uint16_tpl(int x)
|
|
104
|
+
: simd512bit_tpl<SIMDLevel::AVX512>(_mm512_set1_epi16(x)) {}
|
|
105
|
+
|
|
106
|
+
explicit simd32uint16_tpl(uint16_t x)
|
|
107
|
+
: simd512bit_tpl<SIMDLevel::AVX512>(_mm512_set1_epi16(x)) {}
|
|
108
|
+
|
|
109
|
+
explicit simd32uint16_tpl(simd512bit_tpl<SIMDLevel::AVX512> x)
|
|
110
|
+
: simd512bit_tpl<SIMDLevel::AVX512>(x) {}
|
|
111
|
+
|
|
112
|
+
explicit simd32uint16_tpl(const uint16_t* x)
|
|
113
|
+
: simd512bit_tpl<SIMDLevel::AVX512>((const void*)x) {}
|
|
114
|
+
|
|
115
|
+
// sets up a lower half of the register
|
|
116
|
+
explicit simd32uint16_tpl(simd256bit_tpl<SIMDLevel::AVX2> lo)
|
|
117
|
+
: simd512bit_tpl<SIMDLevel::AVX512>(lo) {}
|
|
118
|
+
|
|
119
|
+
// constructs from lower and upper halves
|
|
120
|
+
explicit simd32uint16_tpl(
|
|
121
|
+
simd256bit_tpl<SIMDLevel::AVX2> lo,
|
|
122
|
+
simd256bit_tpl<SIMDLevel::AVX2> hi)
|
|
123
|
+
: simd512bit_tpl<SIMDLevel::AVX512>(lo, hi) {}
|
|
124
|
+
|
|
125
|
+
std::string elements_to_string(const char* fmt) const {
|
|
126
|
+
uint16_t bytes[32];
|
|
127
|
+
storeu((void*)bytes);
|
|
128
|
+
char res[2000];
|
|
129
|
+
char* ptr = res;
|
|
130
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
131
|
+
#pragma GCC diagnostic push
|
|
132
|
+
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
|
|
133
|
+
#endif
|
|
134
|
+
for (int i = 0; i < 32; i++) {
|
|
135
|
+
ptr += snprintf(
|
|
136
|
+
ptr, (size_t)(res + sizeof(res) - ptr), fmt, bytes[i]);
|
|
137
|
+
}
|
|
138
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
139
|
+
#pragma GCC diagnostic pop
|
|
140
|
+
#endif
|
|
141
|
+
// strip last ,
|
|
142
|
+
ptr[-1] = 0;
|
|
143
|
+
return std::string(res);
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
std::string hex() const {
|
|
147
|
+
return elements_to_string("%02x,");
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
std::string dec() const {
|
|
151
|
+
return elements_to_string("%3d,");
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
void set1(uint16_t x) {
|
|
155
|
+
i = _mm512_set1_epi16((short)x);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
simd32uint16_tpl operator*(const simd32uint16_tpl& other) const {
|
|
159
|
+
return simd32uint16_tpl(_mm512_mullo_epi16(i, other.i));
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
// shift must be known at compile time
|
|
163
|
+
simd32uint16_tpl operator>>(const int shift) const {
|
|
164
|
+
return simd32uint16_tpl(_mm512_srli_epi16(i, shift));
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
// shift must be known at compile time
|
|
168
|
+
simd32uint16_tpl operator<<(const int shift) const {
|
|
169
|
+
return simd32uint16_tpl(_mm512_slli_epi16(i, shift));
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
simd32uint16_tpl& operator+=(simd32uint16_tpl other) {
|
|
173
|
+
i = _mm512_add_epi16(i, other.i);
|
|
174
|
+
return *this;
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
simd32uint16_tpl& operator-=(simd32uint16_tpl other) {
|
|
178
|
+
i = _mm512_sub_epi16(i, other.i);
|
|
179
|
+
return *this;
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
simd32uint16_tpl operator+(simd32uint16_tpl other) const {
|
|
183
|
+
return simd32uint16_tpl(_mm512_add_epi16(i, other.i));
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
simd32uint16_tpl operator-(simd32uint16_tpl other) const {
|
|
187
|
+
return simd32uint16_tpl(_mm512_sub_epi16(i, other.i));
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
simd32uint16_tpl operator&(simd512bit_tpl<SIMDLevel::AVX512> other) const {
|
|
191
|
+
return simd32uint16_tpl(_mm512_and_si512(i, other.i));
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
simd32uint16_tpl operator|(simd512bit_tpl<SIMDLevel::AVX512> other) const {
|
|
195
|
+
return simd32uint16_tpl(_mm512_or_si512(i, other.i));
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
simd32uint16_tpl operator^(simd512bit_tpl<SIMDLevel::AVX512> other) const {
|
|
199
|
+
return simd32uint16_tpl(_mm512_xor_si512(i, other.i));
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
simd32uint16_tpl operator~() const {
|
|
203
|
+
return simd32uint16_tpl(_mm512_xor_si512(i, _mm512_set1_epi32(-1)));
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
simd16uint16_tpl<SIMDLevel::AVX2> low() const {
|
|
207
|
+
return simd16uint16_tpl<SIMDLevel::AVX2>(_mm512_castsi512_si256(i));
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
simd16uint16_tpl<SIMDLevel::AVX2> high() const {
|
|
211
|
+
return simd16uint16_tpl<SIMDLevel::AVX2>(
|
|
212
|
+
_mm512_extracti32x8_epi32(i, 1));
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
// for debugging only
|
|
216
|
+
uint16_t operator[](int i) const {
|
|
217
|
+
ALIGNED(64) uint16_t tab[32];
|
|
218
|
+
store(tab);
|
|
219
|
+
return tab[i];
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
void accu_min(simd32uint16_tpl incoming) {
|
|
223
|
+
i = _mm512_min_epu16(i, incoming.i);
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
void accu_max(simd32uint16_tpl incoming) {
|
|
227
|
+
i = _mm512_max_epu16(i, incoming.i);
|
|
228
|
+
}
|
|
229
|
+
};
|
|
230
|
+
|
|
231
|
+
// decompose in 128-lanes: a = (a0, a1, a2, a3), b = (b0, b1, b2, b3)
|
|
232
|
+
// return (a0 + a1 + a2 + a3, b0 + b1 + b2 + b3)
|
|
233
|
+
inline simd16uint16_tpl<SIMDLevel::AVX2> combine4x2(
        simd32uint16_tpl<SIMDLevel::AVX512> a,
        simd32uint16_tpl<SIMDLevel::AVX512> b) {
    // Reduce the lower and upper 256-bit halves separately with combine2x2,
    // then add the two partial results.
    auto lower_part = combine2x2(a.low(), b.low());
    auto upper_part = combine2x2(a.high(), b.high());
    return lower_part + upper_part;
}
|
|
238
|
+
|
|
239
|
+
// vector of 64 unsigned 8-bit integers
|
|
240
|
+
template <>
|
|
241
|
+
struct simd64uint8_tpl<SIMDLevel::AVX512> : simd512bit_tpl<SIMDLevel::AVX512> {
|
|
242
|
+
simd64uint8_tpl() {}
|
|
243
|
+
|
|
244
|
+
explicit simd64uint8_tpl(__m512i i)
|
|
245
|
+
: simd512bit_tpl<SIMDLevel::AVX512>(i) {}
|
|
246
|
+
|
|
247
|
+
explicit simd64uint8_tpl(int x)
|
|
248
|
+
: simd512bit_tpl<SIMDLevel::AVX512>(_mm512_set1_epi8(x)) {}
|
|
249
|
+
|
|
250
|
+
explicit simd64uint8_tpl(uint8_t x)
|
|
251
|
+
: simd512bit_tpl<SIMDLevel::AVX512>(_mm512_set1_epi8(x)) {}
|
|
252
|
+
|
|
253
|
+
// sets up a lower half of the register
|
|
254
|
+
explicit simd64uint8_tpl(simd256bit_tpl<SIMDLevel::AVX2> lo)
|
|
255
|
+
: simd512bit_tpl<SIMDLevel::AVX512>(lo) {}
|
|
256
|
+
|
|
257
|
+
// constructs from lower and upper halves
|
|
258
|
+
explicit simd64uint8_tpl(
|
|
259
|
+
simd256bit_tpl<SIMDLevel::AVX2> lo,
|
|
260
|
+
simd256bit_tpl<SIMDLevel::AVX2> hi)
|
|
261
|
+
: simd512bit_tpl<SIMDLevel::AVX512>(lo, hi) {}
|
|
262
|
+
|
|
263
|
+
explicit simd64uint8_tpl(simd512bit_tpl<SIMDLevel::AVX512> x)
|
|
264
|
+
: simd512bit_tpl<SIMDLevel::AVX512>(x) {}
|
|
265
|
+
|
|
266
|
+
explicit simd64uint8_tpl(const uint8_t* x)
|
|
267
|
+
: simd512bit_tpl<SIMDLevel::AVX512>((const void*)x) {}
|
|
268
|
+
|
|
269
|
+
std::string elements_to_string(const char* fmt) const {
|
|
270
|
+
uint8_t bytes[64];
|
|
271
|
+
storeu((void*)bytes);
|
|
272
|
+
char res[2000];
|
|
273
|
+
char* ptr = res;
|
|
274
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
275
|
+
#pragma GCC diagnostic push
|
|
276
|
+
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
|
|
277
|
+
#endif
|
|
278
|
+
for (int i = 0; i < 64; i++) {
|
|
279
|
+
ptr += snprintf(
|
|
280
|
+
ptr, (size_t)(res + sizeof(res) - ptr), fmt, bytes[i]);
|
|
281
|
+
}
|
|
282
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
283
|
+
#pragma GCC diagnostic pop
|
|
284
|
+
#endif
|
|
285
|
+
// strip last ,
|
|
286
|
+
ptr[-1] = 0;
|
|
287
|
+
return std::string(res);
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
std::string hex() const {
|
|
291
|
+
return elements_to_string("%02x,");
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
std::string dec() const {
|
|
295
|
+
return elements_to_string("%3d,");
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
void set1(uint8_t x) {
|
|
299
|
+
i = _mm512_set1_epi8((char)x);
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
simd64uint8_tpl operator&(simd512bit_tpl<SIMDLevel::AVX512> other) const {
|
|
303
|
+
return simd64uint8_tpl(_mm512_and_si512(i, other.i));
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
simd64uint8_tpl operator+(simd64uint8_tpl other) const {
|
|
307
|
+
return simd64uint8_tpl(_mm512_add_epi8(i, other.i));
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
simd64uint8_tpl lookup_4_lanes(simd64uint8_tpl idx) const {
|
|
311
|
+
return simd64uint8_tpl(_mm512_shuffle_epi8(i, idx.i));
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
// extract + 0-extend lane
|
|
315
|
+
// this operation is slow (3 cycles)
|
|
316
|
+
simd32uint16_tpl<SIMDLevel::AVX512> lane0_as_uint16() const {
|
|
317
|
+
__m256i x = _mm512_extracti32x8_epi32(i, 0);
|
|
318
|
+
return simd32uint16_tpl<SIMDLevel::AVX512>(_mm512_cvtepu8_epi16(x));
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
simd32uint16_tpl<SIMDLevel::AVX512> lane1_as_uint16() const {
|
|
322
|
+
__m256i x = _mm512_extracti32x8_epi32(i, 1);
|
|
323
|
+
return simd32uint16_tpl<SIMDLevel::AVX512>(_mm512_cvtepu8_epi16(x));
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
simd64uint8_tpl& operator+=(simd64uint8_tpl other) {
|
|
327
|
+
i = _mm512_add_epi8(i, other.i);
|
|
328
|
+
return *this;
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
// for debugging only
|
|
332
|
+
uint8_t operator[](int i) const {
|
|
333
|
+
ALIGNED(64) uint8_t tab[64];
|
|
334
|
+
store(tab);
|
|
335
|
+
return tab[i];
|
|
336
|
+
}
|
|
337
|
+
};
|
|
338
|
+
|
|
339
|
+
/// vector of 16 32-bit floats
|
|
340
|
+
template <>
|
|
341
|
+
struct simd16float32_tpl<SIMDLevel::AVX512>
|
|
342
|
+
: simd512bit_tpl<SIMDLevel::AVX512> {
|
|
343
|
+
simd16float32_tpl() {}
|
|
344
|
+
|
|
345
|
+
explicit simd16float32_tpl(__m512 f)
|
|
346
|
+
: simd512bit_tpl<SIMDLevel::AVX512>(f) {}
|
|
347
|
+
|
|
348
|
+
explicit simd16float32_tpl(float x)
|
|
349
|
+
: simd512bit_tpl<SIMDLevel::AVX512>(_mm512_set1_ps(x)) {}
|
|
350
|
+
|
|
351
|
+
explicit simd16float32_tpl(const float* x)
|
|
352
|
+
: simd512bit_tpl<SIMDLevel::AVX512>(_mm512_loadu_ps(x)) {}
|
|
353
|
+
|
|
354
|
+
void clear() {
|
|
355
|
+
f = _mm512_setzero_ps();
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
void storeu(float* ptr) const {
|
|
359
|
+
_mm512_storeu_ps(ptr, f);
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
void loadu(const float* ptr) {
|
|
363
|
+
f = _mm512_loadu_ps(ptr);
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
void store(float* ptr) const {
|
|
367
|
+
_mm512_storeu_ps(ptr, f);
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
simd16float32_tpl operator*(const simd16float32_tpl& other) const {
|
|
371
|
+
return simd16float32_tpl(_mm512_mul_ps(f, other.f));
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
simd16float32_tpl operator+(const simd16float32_tpl& other) const {
|
|
375
|
+
return simd16float32_tpl(_mm512_add_ps(f, other.f));
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
simd16float32_tpl operator-(const simd16float32_tpl& other) const {
|
|
379
|
+
return simd16float32_tpl(_mm512_sub_ps(f, other.f));
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
simd16float32_tpl& operator+=(const simd16float32_tpl& other) {
|
|
383
|
+
f = _mm512_add_ps(f, other.f);
|
|
384
|
+
return *this;
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
std::string tostring() const {
|
|
388
|
+
float tab[16];
|
|
389
|
+
storeu(tab);
|
|
390
|
+
char res[1000];
|
|
391
|
+
char* ptr = res;
|
|
392
|
+
for (int i = 0; i < 16; i++) {
|
|
393
|
+
ptr += snprintf(
|
|
394
|
+
ptr, (size_t)(res + sizeof(res) - ptr), "%g,", tab[i]);
|
|
395
|
+
}
|
|
396
|
+
ptr[-1] = 0;
|
|
397
|
+
return std::string(res);
|
|
398
|
+
}
|
|
399
|
+
};
|
|
400
|
+
|
|
401
|
+
// compute a * b + c
|
|
402
|
+
inline simd16float32_tpl<SIMDLevel::AVX512> fmadd(
|
|
403
|
+
const simd16float32_tpl<SIMDLevel::AVX512>& a,
|
|
404
|
+
const simd16float32_tpl<SIMDLevel::AVX512>& b,
|
|
405
|
+
const simd16float32_tpl<SIMDLevel::AVX512>& c) {
|
|
406
|
+
return simd16float32_tpl<SIMDLevel::AVX512>(_mm512_fmadd_ps(a.f, b.f, c.f));
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
// horizontal add: sum all 16 floats in the register
|
|
410
|
+
inline float horizontal_add(const simd16float32_tpl<SIMDLevel::AVX512>& a) {
|
|
411
|
+
return _mm512_reduce_add_ps(a.f);
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
} // namespace faiss
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#pragma once
|
|
9
|
+
|
|
10
|
+
/** Includes simdlib.h (primary templates + NONE/emulated specialization)
|
|
11
|
+
* plus the platform specialization for the current compilation context.
|
|
12
|
+
*
|
|
13
|
+
* Generic code should include this header.
|
|
14
|
+
* Per-SIMD TUs should include the concrete header directly
|
|
15
|
+
* (e.g., simdlib_avx2.h).
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
#include <faiss/impl/simdlib/simdlib.h>
|
|
19
|
+
|
|
20
|
+
// Platform specializations — guarded by COMPILE_SIMD_* AND compiler macros.
|
|
21
|
+
// In DD (dynamic dispatch) mode: COMPILE_SIMD_* are target-wide, compiler macros are per-file.
|
|
22
|
+
// Only per-SIMD TUs (compiled with -mavx2 etc.) see the platform
|
|
23
|
+
// specializations. In static mode: only the compiled-in level is available.
|
|
24
|
+
|
|
25
|
+
#if (defined(COMPILE_SIMD_AVX512) || defined(COMPILE_SIMD_AVX512_SPR)) && \
|
|
26
|
+
defined(__AVX512F__)
|
|
27
|
+
|
|
28
|
+
// AVX512 includes AVX2 (simdlib_avx512.h includes simdlib_avx2.h)
|
|
29
|
+
#include <faiss/impl/simdlib/simdlib_avx512.h>
|
|
30
|
+
|
|
31
|
+
#elif defined(COMPILE_SIMD_AVX2) && defined(__AVX2__)
|
|
32
|
+
|
|
33
|
+
#include <faiss/impl/simdlib/simdlib_avx2.h>
|
|
34
|
+
|
|
35
|
+
#elif defined(COMPILE_SIMD_ARM_NEON) && defined(__aarch64__)
|
|
36
|
+
|
|
37
|
+
#include <faiss/impl/simdlib/simdlib_neon.h>
|
|
38
|
+
|
|
39
|
+
#endif
|
|
40
|
+
|
|
41
|
+
// No global bare-name aliases (simd16uint16, simd32uint8, etc.) — each file
|
|
42
|
+
// that needs them must declare its own `using` with an explicit SIMD level.
|
|
43
|
+
// This prevents per-ISA TUs from accidentally picking up SINGLE_SIMD_LEVEL
|
|
44
|
+
// (= NONE in DD mode) when they should use THE_SIMD_LEVEL.
|