faiss 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (361)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/faiss/extconf.rb +2 -1
  4. data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
  5. data/ext/faiss/index_binary.cpp +1 -1
  6. data/ext/faiss/kmeans.cpp +1 -1
  7. data/ext/faiss/pca_matrix.cpp +1 -1
  8. data/ext/faiss/product_quantizer.cpp +1 -1
  9. data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
  10. data/lib/faiss/version.rb +1 -1
  11. data/vendor/faiss/faiss/AutoTune.cpp +93 -80
  12. data/vendor/faiss/faiss/Clustering.cpp +39 -240
  13. data/vendor/faiss/faiss/Clustering.h +6 -0
  14. data/vendor/faiss/faiss/IVFlib.cpp +41 -21
  15. data/vendor/faiss/faiss/Index.cpp +6 -5
  16. data/vendor/faiss/faiss/Index.h +5 -5
  17. data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
  20. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
  21. data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
  22. data/vendor/faiss/faiss/IndexBinary.h +4 -4
  23. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
  24. data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
  26. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +84 -92
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
  28. data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
  29. data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
  30. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +87 -415
  31. data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
  32. data/vendor/faiss/faiss/IndexFastScan.h +25 -23
  33. data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
  34. data/vendor/faiss/faiss/IndexFlat.h +21 -18
  35. data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
  36. data/vendor/faiss/faiss/IndexHNSW.cpp +283 -145
  37. data/vendor/faiss/faiss/IndexHNSW.h +16 -2
  38. data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
  39. data/vendor/faiss/faiss/IndexIDMap.h +9 -7
  40. data/vendor/faiss/faiss/IndexIVF.cpp +465 -362
  41. data/vendor/faiss/faiss/IndexIVF.h +33 -12
  42. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
  43. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
  44. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
  45. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
  46. data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
  47. data/vendor/faiss/faiss/IndexIVFFlat.cpp +36 -68
  48. data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
  49. data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +53 -30
  50. data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
  51. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
  52. data/vendor/faiss/faiss/IndexIVFPQ.cpp +71 -843
  53. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
  54. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
  55. data/vendor/faiss/faiss/IndexIVFPQR.cpp +21 -17
  56. data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +26 -39
  57. data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
  58. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
  59. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
  60. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
  61. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
  62. data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
  63. data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
  64. data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
  65. data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
  66. data/vendor/faiss/faiss/IndexNSG.cpp +39 -23
  67. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
  68. data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
  69. data/vendor/faiss/faiss/IndexPQ.h +3 -2
  70. data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
  71. data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
  72. data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
  73. data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
  74. data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
  75. data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
  76. data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
  77. data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
  78. data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
  79. data/vendor/faiss/faiss/IndexRefine.h +4 -4
  80. data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
  81. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
  82. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
  83. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
  84. data/vendor/faiss/faiss/IndexShards.cpp +10 -9
  85. data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
  86. data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
  87. data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
  88. data/vendor/faiss/faiss/MetaIndexes.h +1 -1
  89. data/vendor/faiss/faiss/MetricType.h +14 -7
  90. data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
  91. data/vendor/faiss/faiss/SuperKMeans.h +97 -0
  92. data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
  93. data/vendor/faiss/faiss/VectorTransform.h +16 -16
  94. data/vendor/faiss/faiss/build.cpp +23 -0
  95. data/vendor/faiss/faiss/build.h +15 -0
  96. data/vendor/faiss/faiss/clone_index.cpp +48 -47
  97. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
  98. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
  101. data/vendor/faiss/faiss/factory_tools.cpp +5 -0
  102. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
  103. data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
  104. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
  105. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
  106. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
  107. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
  108. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
  109. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
  110. data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
  111. data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
  112. data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
  113. data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
  114. data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
  115. data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
  116. data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
  117. data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
  118. data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
  119. data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
  120. data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
  121. data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
  122. data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
  123. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
  124. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
  125. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
  126. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
  127. data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
  128. data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
  129. data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
  130. data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
  131. data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
  132. data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
  133. data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
  134. data/vendor/faiss/faiss/impl/FaissException.h +50 -3
  135. data/vendor/faiss/faiss/impl/HNSW.cpp +92 -317
  136. data/vendor/faiss/faiss/impl/HNSW.h +13 -34
  137. data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
  138. data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
  139. data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
  140. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
  141. data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
  142. data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
  143. data/vendor/faiss/faiss/impl/NSG.cpp +38 -21
  144. data/vendor/faiss/faiss/impl/NSG.h +4 -4
  145. data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
  146. data/vendor/faiss/faiss/impl/Panorama.h +258 -87
  147. data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
  148. data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
  149. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
  150. data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
  151. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
  152. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
  153. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +30 -23
  154. data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
  155. data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
  156. data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
  157. data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +296 -283
  158. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
  159. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
  160. data/vendor/faiss/faiss/impl/ResultHandler.h +99 -75
  161. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +52 -4
  162. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -1
  163. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
  164. data/vendor/faiss/faiss/impl/VisitedTable.h +7 -0
  165. data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
  166. data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
  167. data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
  168. data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
  169. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
  170. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
  171. data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
  172. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
  173. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
  174. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
  175. data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
  176. data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
  177. data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
  178. data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
  179. data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
  180. data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
  181. data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
  182. data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
  183. data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
  184. data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
  185. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
  186. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
  187. data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
  188. data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
  189. data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
  190. data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
  191. data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
  192. data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
  193. data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
  194. data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
  195. data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
  196. data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
  197. data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
  198. data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
  199. data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
  200. data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
  201. data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
  202. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
  203. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
  204. data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
  205. data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
  206. data/vendor/faiss/faiss/impl/index_read.cpp +1132 -45
  207. data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
  208. data/vendor/faiss/faiss/impl/index_write.cpp +95 -13
  209. data/vendor/faiss/faiss/impl/io.cpp +6 -6
  210. data/vendor/faiss/faiss/impl/io_macros.h +33 -16
  211. data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
  212. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
  213. data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
  214. data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
  215. data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
  216. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
  217. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
  218. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
  219. data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
  220. data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
  221. data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
  222. data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
  223. data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
  224. data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
  225. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +29 -111
  226. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
  227. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +238 -5
  228. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -7
  229. data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
  230. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
  231. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
  232. data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
  233. data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +3 -2
  234. data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +102 -11
  235. data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
  236. data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
  237. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +148 -0
  238. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +167 -0
  239. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +59 -0
  240. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +163 -0
  241. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
  242. data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
  243. data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
  244. data/vendor/faiss/faiss/impl/simd_dispatch.h +100 -66
  245. data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
  246. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
  247. data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
  248. data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
  249. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
  250. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
  251. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
  252. data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
  253. data/vendor/faiss/faiss/impl/svs_io.h +8 -2
  254. data/vendor/faiss/faiss/index_factory.cpp +86 -18
  255. data/vendor/faiss/faiss/index_io.h +24 -0
  256. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
  257. data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
  258. data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
  259. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
  260. data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
  261. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
  262. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
  263. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
  264. data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
  265. data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
  266. data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
  267. data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
  268. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
  269. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
  270. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
  271. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
  272. data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
  273. data/vendor/faiss/faiss/svs/IndexSVSVamana.h +18 -2
  274. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
  275. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +12 -3
  276. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +7 -2
  277. data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
  278. data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
  279. data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
  280. data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
  281. data/vendor/faiss/faiss/utils/distances.cpp +390 -560
  282. data/vendor/faiss/faiss/utils/distances.h +20 -1
  283. data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
  284. data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
  285. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
  286. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
  287. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
  288. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
  289. data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
  290. data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -177
  291. data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
  292. data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
  293. data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
  294. data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
  295. data/vendor/faiss/faiss/utils/hamming.h +92 -2
  296. data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
  297. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
  298. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
  299. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
  300. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
  301. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
  302. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
  303. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
  304. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
  305. data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
  306. data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
  307. data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
  308. data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
  309. data/vendor/faiss/faiss/utils/partitioning.h +31 -0
  310. data/vendor/faiss/faiss/utils/popcount.h +29 -0
  311. data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
  312. data/vendor/faiss/faiss/utils/prefetch.h +2 -2
  313. data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
  314. data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
  315. data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
  316. data/vendor/faiss/faiss/utils/random.cpp +6 -6
  317. data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
  318. data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
  319. data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
  320. data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
  321. data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
  322. data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
  323. data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
  324. data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
  325. data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
  326. data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
  327. data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
  328. data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
  329. data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
  330. data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
  331. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
  332. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
  333. data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
  334. data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
  335. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
  336. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
  337. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
  338. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
  339. data/vendor/faiss/faiss/utils/simd_levels.cpp +17 -5
  340. data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
  341. data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
  342. data/vendor/faiss/faiss/utils/utils.cpp +5 -5
  343. data/vendor/faiss/faiss/utils/utils.h +3 -3
  344. metadata +119 -34
  345. data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
  346. data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
  347. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
  348. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
  349. data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
  350. data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
  351. data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
  352. data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
  353. data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
  354. data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
  355. data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
  356. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
  357. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
  358. data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
  359. data/vendor/faiss/faiss/utils/simdlib.h +0 -42
  360. data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
  361. /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
@@ -5,280 +5,18 @@
5
5
  * LICENSE file in the root directory of this source tree.
6
6
  */
7
7
 
8
- // -*- c++ -*-
8
+ // AVX2 compilation unit for the simdlib-based fused distance kernel.
9
9
 
10
- #include <faiss/utils/distances_fused/simdlib_based.h>
10
+ #ifdef COMPILE_SIMD_AVX2
11
11
 
12
- #if defined(__AVX2__) || defined(__aarch64__)
13
-
14
- #include <faiss/utils/simdlib.h>
15
-
16
- #if defined(__AVX2__)
17
- #include <immintrin.h>
18
- #endif
12
+ #include <faiss/impl/simdlib/simdlib_avx2.h>
13
+ // NOLINTNEXTLINE(facebook-hte-InlineHeader)
14
+ #include <faiss/utils/distances_fused/simdlib_kernel-inl.h>
19
15
 
20
16
  namespace faiss {
21
17
 
22
- namespace {
23
-
24
- // It makes sense to like to overload certain cases because the further
25
- // kernels are in need of registers. So, let's tell compiler
26
- // not to waste registers on a bit faster code, if needed.
27
- template <size_t DIM>
28
- float l2_sqr(const float* const x) {
29
- // compiler should be smart enough to handle that
30
- float output = x[0] * x[0];
31
- for (size_t i = 1; i < DIM; i++) {
32
- output += x[i] * x[i];
33
- }
34
-
35
- return output;
36
- }
37
-
38
- template <size_t DIM>
39
- float dot_product(
40
- const float* const __restrict x,
41
- const float* const __restrict y) {
42
- // compiler should be smart enough to handle that
43
- float output = x[0] * y[0];
44
- for (size_t i = 1; i < DIM; i++) {
45
- output += x[i] * y[i];
46
- }
47
-
48
- return output;
49
- }
50
-
51
- // The kernel for low dimensionality vectors.
52
- // Finds the closest one from y for every given NX_POINTS_PER_LOOP points from x
53
- //
54
- // DIM is the dimensionality of the data
55
- // NX_POINTS_PER_LOOP is the number of x points that get processed
56
- // simultaneously.
57
- // NY_POINTS_PER_LOOP is the number of y points that get processed
58
- // simultaneously.
59
- template <size_t DIM, size_t NX_POINTS_PER_LOOP, size_t NY_POINTS_PER_LOOP>
60
- void kernel(
61
- const float* const __restrict x,
62
- const float* const __restrict y,
63
- const float* const __restrict y_transposed,
64
- const size_t ny,
65
- Top1BlockResultHandler<CMax<float, int64_t>>& res,
66
- const float* __restrict y_norms,
67
- const size_t i) {
68
- const size_t ny_p =
69
- (ny / (8 * NY_POINTS_PER_LOOP)) * (8 * NY_POINTS_PER_LOOP);
70
-
71
- // compute
72
- const float* const __restrict xd_0 = x + i * DIM;
73
-
74
- // prefetch the next point
75
- #if defined(__AVX2__)
76
- _mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA);
77
- #endif
78
-
79
- // load a single point from x
80
- // load -2 * value
81
- simd8float32 x_i[NX_POINTS_PER_LOOP][DIM];
82
- for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
83
- for (size_t dd = 0; dd < DIM; dd++) {
84
- x_i[nx_k][dd] = simd8float32(-2 * *(xd_0 + nx_k * DIM + dd));
85
- }
86
- }
87
-
88
- // compute x_norm
89
- float x_norm_i[NX_POINTS_PER_LOOP];
90
- for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
91
- x_norm_i[nx_k] = l2_sqr<DIM>(xd_0 + nx_k * DIM);
92
- }
93
-
94
- // distances and indices
95
- simd8float32 min_distances_i[NX_POINTS_PER_LOOP];
96
- for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
97
- min_distances_i[nx_k] =
98
- simd8float32(res.dis_tab[i + nx_k] - x_norm_i[nx_k]);
99
- }
100
-
101
- simd8uint32 min_indices_i[NX_POINTS_PER_LOOP];
102
- for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
103
- min_indices_i[nx_k] = simd8uint32((uint32_t)0);
104
- }
105
-
106
- //
107
- simd8uint32 current_indices = simd8uint32(0, 1, 2, 3, 4, 5, 6, 7);
108
- const simd8uint32 indices_delta = simd8uint32(8);
109
-
110
- // main loop
111
- size_t j = 0;
112
- for (; j < ny_p; j += NY_POINTS_PER_LOOP * 8) {
113
- // compute dot products for NX_POINTS from x and NY_POINTS from y
114
- // technically, we're multiplying -2x and y
115
- simd8float32 dp_i[NX_POINTS_PER_LOOP][NY_POINTS_PER_LOOP];
116
-
117
- // DIM 0 that uses MUL
118
- for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
119
- simd8float32 y_i =
120
- simd8float32(y_transposed + j + ny_k * 8 + ny * 0);
121
- for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
122
- dp_i[nx_k][ny_k] = x_i[nx_k][0] * y_i;
123
- }
124
- }
125
-
126
- // other DIMs that use FMA
127
- for (size_t dd = 1; dd < DIM; dd++) {
128
- for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
129
- simd8float32 y_i =
130
- simd8float32(y_transposed + j + ny_k * 8 + ny * dd);
131
-
132
- for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
133
- dp_i[nx_k][ny_k] =
134
- fmadd(x_i[nx_k][dd], y_i, dp_i[nx_k][ny_k]);
135
- }
136
- }
137
- }
138
-
139
- // compute y^2 + (-2x,y)
140
- for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
141
- simd8float32 y_l2_sqr = simd8float32(y_norms + j + ny_k * 8);
142
-
143
- for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
144
- dp_i[nx_k][ny_k] = dp_i[nx_k][ny_k] + y_l2_sqr;
145
- }
146
- }
147
-
148
- // do the comparisons and alter the min indices
149
- for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
150
- for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
151
- // cmpps
152
- cmplt_and_blend_inplace(
153
- dp_i[nx_k][ny_k],
154
- current_indices,
155
- min_distances_i[nx_k],
156
- min_indices_i[nx_k]);
157
- }
158
-
159
- current_indices = current_indices + indices_delta;
160
- }
161
- }
162
-
163
- // dump values and find the minimum distance / minimum index
164
- for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
165
- float min_distances_scalar[8];
166
- uint32_t min_indices_scalar[8];
167
-
168
- min_distances_i[nx_k].storeu(min_distances_scalar);
169
- min_indices_i[nx_k].storeu(min_indices_scalar);
170
-
171
- float current_min_distance = res.dis_tab[i + nx_k];
172
- uint32_t current_min_index = res.ids_tab[i + nx_k];
173
-
174
- // This unusual comparison is needed to maintain the behavior
175
- // of the original implementation: if two indices are
176
- // represented with equal distance values, then
177
- // the index with the min value is returned.
178
- for (size_t jv = 0; jv < 8; jv++) {
179
- // add missing x_norms[i]
180
- float distance_candidate =
181
- min_distances_scalar[jv] + x_norm_i[nx_k];
182
-
183
- // negative values can occur for identical vectors
184
- // due to roundoff errors.
185
- if (distance_candidate < 0) {
186
- distance_candidate = 0;
187
- }
188
-
189
- const int64_t index_candidate = min_indices_scalar[jv];
190
-
191
- if (current_min_distance > distance_candidate) {
192
- current_min_distance = distance_candidate;
193
- current_min_index = index_candidate;
194
- } else if (
195
- current_min_distance == distance_candidate &&
196
- current_min_index > index_candidate) {
197
- current_min_index = index_candidate;
198
- }
199
- }
200
-
201
- // process leftovers
202
- for (size_t j0 = j; j0 < ny; j0++) {
203
- const float dp =
204
- dot_product<DIM>(x + (i + nx_k) * DIM, y + j0 * DIM);
205
- float dis = x_norm_i[nx_k] + y_norms[j0] - 2 * dp;
206
- // negative values can occur for identical vectors
207
- // due to roundoff errors.
208
- if (dis < 0) {
209
- dis = 0;
210
- }
211
-
212
- if (current_min_distance > dis) {
213
- current_min_distance = dis;
214
- current_min_index = j0;
215
- }
216
- }
217
-
218
- // done
219
- res.add_result(i + nx_k, current_min_distance, current_min_index);
220
- }
221
- }
222
-
223
- template <size_t DIM, size_t NX_POINTS_PER_LOOP, size_t NY_POINTS_PER_LOOP>
224
- void exhaustive_L2sqr_fused_cmax(
225
- const float* const __restrict x,
226
- const float* const __restrict y,
227
- size_t nx,
228
- size_t ny,
229
- Top1BlockResultHandler<CMax<float, int64_t>>& res,
230
- const float* __restrict y_norms) {
231
- // BLAS does not like empty matrices
232
- if (nx == 0 || ny == 0) {
233
- return;
234
- }
235
-
236
- // compute norms for y
237
- std::unique_ptr<float[]> del2;
238
- if (!y_norms) {
239
- float* y_norms2 = new float[ny];
240
- del2.reset(y_norms2);
241
-
242
- for (size_t i = 0; i < ny; i++) {
243
- y_norms2[i] = l2_sqr<DIM>(y + i * DIM);
244
- }
245
-
246
- y_norms = y_norms2;
247
- }
248
-
249
- // initialize res
250
- res.begin_multiple(0, nx);
251
-
252
- // transpose y
253
- std::vector<float> y_transposed(DIM * ny);
254
- for (size_t j = 0; j < DIM; j++) {
255
- for (size_t i = 0; i < ny; i++) {
256
- y_transposed[j * ny + i] = y[j + i * DIM];
257
- }
258
- }
259
-
260
- const size_t nx_p = (nx / NX_POINTS_PER_LOOP) * NX_POINTS_PER_LOOP;
261
- // the main loop.
262
- #pragma omp parallel for schedule(dynamic)
263
- for (int64_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) {
264
- kernel<DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP>(
265
- x, y, y_transposed.data(), ny, res, y_norms, i);
266
- }
267
-
268
- for (size_t i = nx_p; i < nx; i++) {
269
- kernel<DIM, 1, NY_POINTS_PER_LOOP>(
270
- x, y, y_transposed.data(), ny, res, y_norms, i);
271
- }
272
-
273
- // Does nothing for Top1BlockResultHandler, but
274
- // keeping the call for the consistency.
275
- res.end_multiple();
276
- InterruptCallback::check();
277
- }
278
-
279
- } // namespace
280
-
281
- bool exhaustive_L2sqr_fused_cmax_simdlib(
18
+ template <>
19
+ bool exhaustive_L2sqr_fused_cmax<SIMDLevel::AVX2>(
282
20
  const float* x,
283
21
  const float* y,
284
22
  size_t d,
@@ -290,13 +28,14 @@ bool exhaustive_L2sqr_fused_cmax_simdlib(
290
28
  // An acceptable dimensionality value is limited by the number of
291
29
  // available registers.
292
30
 
293
- #define DISPATCH(DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP) \
294
- case DIM: { \
295
- exhaustive_L2sqr_fused_cmax< \
296
- DIM, \
297
- NX_POINTS_PER_LOOP, \
298
- NY_POINTS_PER_LOOP>(x, y, nx, ny, res, y_norms); \
299
- return true; \
31
+ #define DISPATCH(DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP) \
32
+ case DIM: { \
33
+ exhaustive_L2sqr_fused_cmax< \
34
+ DIM, \
35
+ NX_POINTS_PER_LOOP, \
36
+ NY_POINTS_PER_LOOP, \
37
+ SIMDLevel::AVX2>(x, y, nx, ny, res, y_norms); \
38
+ return true; \
300
39
  }
301
40
 
302
41
  // faiss/benchs/bench_quantizer.py was used for benchmarking
@@ -307,7 +46,6 @@ bool exhaustive_L2sqr_fused_cmax_simdlib(
307
46
  // code might operate with more registers than available
308
47
  // because of concurrent ports operations for ALU and LOAD/STORE.
309
48
 
310
- #if defined(__AVX2__)
311
49
  // It was possible to tweak these parameters on x64 machine.
312
50
  switch (d) {
313
51
  DISPATCH(1, 6, 1)
@@ -327,21 +65,6 @@ bool exhaustive_L2sqr_fused_cmax_simdlib(
327
65
  DISPATCH(15, 6, 1)
328
66
  DISPATCH(16, 6, 1)
329
67
  }
330
- #else
331
- // Please feel free to alter 2nd and 3rd parameters if you have access
332
- // to ARM-based machine so that you are able to benchmark this code.
333
- // Or to enable other dimensions.
334
- switch (d) {
335
- DISPATCH(1, 4, 2)
336
- DISPATCH(2, 2, 2)
337
- DISPATCH(3, 2, 2)
338
- DISPATCH(4, 2, 1)
339
- DISPATCH(5, 1, 1)
340
- DISPATCH(6, 1, 1)
341
- DISPATCH(7, 1, 1)
342
- DISPATCH(8, 1, 1)
343
- }
344
- #endif
345
68
 
346
69
  return false;
347
70
  #undef DISPATCH
@@ -349,4 +72,4 @@ bool exhaustive_L2sqr_fused_cmax_simdlib(
349
72
 
350
73
  } // namespace faiss
351
74
 
352
- #endif
75
+ #endif // COMPILE_SIMD_AVX2
@@ -0,0 +1,57 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // ARM NEON compilation unit for the simdlib-based fused distance kernel.
9
+
10
+ #ifdef COMPILE_SIMD_ARM_NEON
11
+
12
+ #include <faiss/impl/simdlib/simdlib_neon.h>
13
+ // NOLINTNEXTLINE(facebook-hte-InlineHeader)
14
+ #include <faiss/utils/distances_fused/simdlib_kernel-inl.h>
15
+
16
+ namespace faiss {
17
+
18
+ template <>
19
+ bool exhaustive_L2sqr_fused_cmax<SIMDLevel::ARM_NEON>(
20
+ const float* x,
21
+ const float* y,
22
+ size_t d,
23
+ size_t nx,
24
+ size_t ny,
25
+ Top1BlockResultHandler<CMax<float, int64_t>>& res,
26
+ const float* y_norms) {
27
+ #define DISPATCH(DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP) \
28
+ case DIM: { \
29
+ exhaustive_L2sqr_fused_cmax< \
30
+ DIM, \
31
+ NX_POINTS_PER_LOOP, \
32
+ NY_POINTS_PER_LOOP, \
33
+ SIMDLevel::ARM_NEON>(x, y, nx, ny, res, y_norms); \
34
+ return true; \
35
+ }
36
+
37
+ // Please feel free to alter the 2nd and 3rd parameters, or to enable
38
+ // other dimensions, if you have access to an ARM-based machine
39
+ // and are able to benchmark this code.
40
+ switch (d) {
41
+ DISPATCH(1, 4, 2)
42
+ DISPATCH(2, 2, 2)
43
+ DISPATCH(3, 2, 2)
44
+ DISPATCH(4, 2, 1)
45
+ DISPATCH(5, 1, 1)
46
+ DISPATCH(6, 1, 1)
47
+ DISPATCH(7, 1, 1)
48
+ DISPATCH(8, 1, 1)
49
+ }
50
+
51
+ return false;
52
+ #undef DISPATCH
53
+ }
54
+
55
+ } // namespace faiss
56
+
57
+ #endif // COMPILE_SIMD_ARM_NEON
@@ -0,0 +1,290 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ // Shared kernel for the simdlib-based fused L2+top1 distance computation.
9
+ // Templatized on SIMDLevel so simd8float32_tpl<SL>/simd8uint32_tpl<SL>
10
+ // resolve to the correct ISA-specific specialization.
11
+ //
12
+ // Only included by per-ISA .cpp files (simdlib_based.cpp,
13
+ // simdlib_based_neon.cpp) which first include the ISA-specific simdlib header.
14
+
15
+ #pragma once
16
+
17
+ #include <faiss/utils/distances_fused/distances_fused.h>
18
+
19
+ namespace faiss {
20
+
21
+ namespace {
22
+
23
+ // It makes sense to overload certain cases because the kernels further
24
+ // below are in need of registers. So, let's tell the compiler
25
+ // not to waste registers on slightly faster code, if needed.
26
+ template <size_t DIM>
27
+ float l2_sqr(const float* const x) {
28
+ // compiler should be smart enough to handle that
29
+ float output = x[0] * x[0];
30
+ for (size_t i = 1; i < DIM; i++) {
31
+ output += x[i] * x[i];
32
+ }
33
+
34
+ return output;
35
+ }
36
+
37
+ template <size_t DIM>
38
+ float dot_product(
39
+ const float* const __restrict x,
40
+ const float* const __restrict y) {
41
+ // compiler should be smart enough to handle that
42
+ float output = x[0] * y[0];
43
+ for (size_t i = 1; i < DIM; i++) {
44
+ output += x[i] * y[i];
45
+ }
46
+
47
+ return output;
48
+ }
49
+
50
+ // The kernel for low dimensionality vectors.
51
+ // Finds the closest one from y for every given NX_POINTS_PER_LOOP points from x
52
+ //
53
+ // DIM is the dimensionality of the data
54
+ // NX_POINTS_PER_LOOP is the number of x points that get processed
55
+ // simultaneously.
56
+ // NY_POINTS_PER_LOOP is the number of y points that get processed
57
+ // simultaneously.
58
+ //
59
+ // Templatized on SIMDLevel so the simdlib types (simd8float32_tpl<SL>,
60
+ // simd8uint32_tpl<SL>) resolve to the correct ISA-specific specialization.
61
+ // Only instantiated with concrete SL values (AVX2 or ARM_NEON) from the
62
+ // per-ISA specializations in the .cpp files that include this header.
63
+ template <
64
+ size_t DIM,
65
+ size_t NX_POINTS_PER_LOOP,
66
+ size_t NY_POINTS_PER_LOOP,
67
+ SIMDLevel SL>
68
+ void kernel(
69
+ const float* const __restrict x,
70
+ const float* const __restrict y,
71
+ const float* const __restrict y_transposed,
72
+ const size_t ny,
73
+ Top1BlockResultHandler<CMax<float, int64_t>>& res,
74
+ const float* __restrict y_norms,
75
+ const size_t i) {
76
+ using simd_float = simd8float32_tpl<SL>;
77
+ using simd_uint = simd8uint32_tpl<SL>;
78
+ const size_t ny_p =
79
+ (ny / (8 * NY_POINTS_PER_LOOP)) * (8 * NY_POINTS_PER_LOOP);
80
+
81
+ // compute
82
+ const float* const __restrict xd_0 = x + i * DIM;
83
+
84
+ // load a single point from x
85
+ // load -2 * value
86
+ simd_float x_i[NX_POINTS_PER_LOOP][DIM];
87
+ for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
88
+ for (size_t dd = 0; dd < DIM; dd++) {
89
+ x_i[nx_k][dd] = simd_float(-2 * *(xd_0 + nx_k * DIM + dd));
90
+ }
91
+ }
92
+
93
+ // compute x_norm
94
+ float x_norm_i[NX_POINTS_PER_LOOP];
95
+ for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
96
+ x_norm_i[nx_k] = l2_sqr<DIM>(xd_0 + nx_k * DIM);
97
+ }
98
+
99
+ // distances and indices
100
+ simd_float min_distances_i[NX_POINTS_PER_LOOP];
101
+ for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
102
+ min_distances_i[nx_k] =
103
+ simd_float(res.dis_tab[i + nx_k] - x_norm_i[nx_k]);
104
+ }
105
+
106
+ simd_uint min_indices_i[NX_POINTS_PER_LOOP];
107
+ for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
108
+ min_indices_i[nx_k] = simd_uint((uint32_t)0);
109
+ }
110
+
111
+ // running y indices for the 8 SIMD lanes; advanced by 8 after each block
112
+ simd_uint current_indices = simd_uint(0, 1, 2, 3, 4, 5, 6, 7);
113
+ const simd_uint indices_delta = simd_uint(8);
114
+
115
+ // main loop
116
+ size_t j = 0;
117
+ for (; j < ny_p; j += NY_POINTS_PER_LOOP * 8) {
118
+ // compute dot products for NX_POINTS from x and NY_POINTS from y
119
+ // technically, we're multiplying -2x and y
120
+ simd_float dp_i[NX_POINTS_PER_LOOP][NY_POINTS_PER_LOOP];
121
+
122
+ // DIM 0 that uses MUL
123
+ for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
124
+ simd_float y_i = simd_float(y_transposed + j + ny_k * 8 + ny * 0);
125
+ for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
126
+ dp_i[nx_k][ny_k] = x_i[nx_k][0] * y_i;
127
+ }
128
+ }
129
+
130
+ // other DIMs that use FMA
131
+ for (size_t dd = 1; dd < DIM; dd++) {
132
+ for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
133
+ simd_float y_i =
134
+ simd_float(y_transposed + j + ny_k * 8 + ny * dd);
135
+
136
+ for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
137
+ dp_i[nx_k][ny_k] =
138
+ fmadd(x_i[nx_k][dd], y_i, dp_i[nx_k][ny_k]);
139
+ }
140
+ }
141
+ }
142
+
143
+ // compute y^2 + (-2x,y)
144
+ for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
145
+ simd_float y_l2_sqr = simd_float(y_norms + j + ny_k * 8);
146
+
147
+ for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
148
+ dp_i[nx_k][ny_k] = dp_i[nx_k][ny_k] + y_l2_sqr;
149
+ }
150
+ }
151
+
152
+ // do the comparisons and alter the min indices
153
+ for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) {
154
+ for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
155
+ // cmpps
156
+ cmplt_and_blend_inplace(
157
+ dp_i[nx_k][ny_k],
158
+ current_indices,
159
+ min_distances_i[nx_k],
160
+ min_indices_i[nx_k]);
161
+ }
162
+
163
+ current_indices = current_indices + indices_delta;
164
+ }
165
+ }
166
+
167
+ // dump values and find the minimum distance / minimum index
168
+ for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) {
169
+ float min_distances_scalar[8];
170
+ uint32_t min_indices_scalar[8];
171
+
172
+ min_distances_i[nx_k].storeu(min_distances_scalar);
173
+ min_indices_i[nx_k].storeu(min_indices_scalar);
174
+
175
+ float current_min_distance = res.dis_tab[i + nx_k];
176
+ uint32_t current_min_index = res.ids_tab[i + nx_k];
177
+
178
+ // This unusual comparison is needed to maintain the behavior
179
+ // of the original implementation: if two indices are
180
+ // represented with equal distance values, then
181
+ // the index with the min value is returned.
182
+ for (size_t jv = 0; jv < 8; jv++) {
183
+ // add missing x_norms[i]
184
+ float distance_candidate =
185
+ min_distances_scalar[jv] + x_norm_i[nx_k];
186
+
187
+ // negative values can occur for identical vectors
188
+ // due to roundoff errors.
189
+ if (distance_candidate < 0) {
190
+ distance_candidate = 0;
191
+ }
192
+
193
+ const int64_t index_candidate = min_indices_scalar[jv];
194
+
195
+ if (current_min_distance > distance_candidate) {
196
+ current_min_distance = distance_candidate;
197
+ current_min_index = index_candidate;
198
+ } else if (
199
+ current_min_distance == distance_candidate &&
200
+ current_min_index > index_candidate) {
201
+ current_min_index = index_candidate;
202
+ }
203
+ }
204
+
205
+ // process leftovers
206
+ for (size_t j0 = j; j0 < ny; j0++) {
207
+ const float dp =
208
+ dot_product<DIM>(x + (i + nx_k) * DIM, y + j0 * DIM);
209
+ float dis = x_norm_i[nx_k] + y_norms[j0] - 2 * dp;
210
+ // negative values can occur for identical vectors
211
+ // due to roundoff errors.
212
+ if (dis < 0) {
213
+ dis = 0;
214
+ }
215
+
216
+ if (current_min_distance > dis) {
217
+ current_min_distance = dis;
218
+ current_min_index = j0;
219
+ }
220
+ }
221
+
222
+ // done
223
+ res.add_result(i + nx_k, current_min_distance, current_min_index);
224
+ }
225
+ }
226
+
227
+ template <
228
+ size_t DIM,
229
+ size_t NX_POINTS_PER_LOOP,
230
+ size_t NY_POINTS_PER_LOOP,
231
+ SIMDLevel SL>
232
+ void exhaustive_L2sqr_fused_cmax(
233
+ const float* const __restrict x,
234
+ const float* const __restrict y,
235
+ size_t nx,
236
+ size_t ny,
237
+ Top1BlockResultHandler<CMax<float, int64_t>>& res,
238
+ const float* __restrict y_norms) {
239
+ // BLAS does not like empty matrices
240
+ if (nx == 0 || ny == 0) {
241
+ return;
242
+ }
243
+
244
+ // compute norms for y
245
+ std::unique_ptr<float[]> del2;
246
+ if (!y_norms) {
247
+ float* y_norms2 = new float[ny];
248
+ del2.reset(y_norms2);
249
+
250
+ for (size_t i = 0; i < ny; i++) {
251
+ y_norms2[i] = l2_sqr<DIM>(y + i * DIM);
252
+ }
253
+
254
+ y_norms = y_norms2;
255
+ }
256
+
257
+ // initialize res
258
+ res.begin_multiple(0, nx);
259
+
260
+ // transpose y
261
+ std::vector<float> y_transposed(DIM * ny);
262
+ for (size_t j = 0; j < DIM; j++) {
263
+ for (size_t i = 0; i < ny; i++) {
264
+ y_transposed[j * ny + i] = y[j + i * DIM];
265
+ }
266
+ }
267
+
268
+ const size_t nx_p = (nx / NX_POINTS_PER_LOOP) * NX_POINTS_PER_LOOP;
269
+ // the main loop.
270
+ #pragma omp parallel for schedule(dynamic)
271
+ for (int64_t i = 0; i < static_cast<int64_t>(nx_p);
272
+ i += NX_POINTS_PER_LOOP) {
273
+ kernel<DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP, SL>(
274
+ x, y, y_transposed.data(), ny, res, y_norms, i);
275
+ }
276
+
277
+ for (size_t i = nx_p; i < nx; i++) {
278
+ kernel<DIM, 1, NY_POINTS_PER_LOOP, SL>(
279
+ x, y, y_transposed.data(), ny, res, y_norms, i);
280
+ }
281
+
282
+ // Does nothing for Top1BlockResultHandler, but
283
+ // the call is kept for consistency.
284
+ res.end_multiple();
285
+ InterruptCallback::check();
286
+ }
287
+
288
+ } // namespace
289
+
290
+ } // namespace faiss