faiss 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (361)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/faiss/extconf.rb +2 -1
  4. data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
  5. data/ext/faiss/index_binary.cpp +1 -1
  6. data/ext/faiss/kmeans.cpp +1 -1
  7. data/ext/faiss/pca_matrix.cpp +1 -1
  8. data/ext/faiss/product_quantizer.cpp +1 -1
  9. data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
  10. data/lib/faiss/version.rb +1 -1
  11. data/vendor/faiss/faiss/AutoTune.cpp +93 -80
  12. data/vendor/faiss/faiss/Clustering.cpp +39 -240
  13. data/vendor/faiss/faiss/Clustering.h +6 -0
  14. data/vendor/faiss/faiss/IVFlib.cpp +41 -21
  15. data/vendor/faiss/faiss/Index.cpp +6 -5
  16. data/vendor/faiss/faiss/Index.h +5 -5
  17. data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
  20. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
  21. data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
  22. data/vendor/faiss/faiss/IndexBinary.h +4 -4
  23. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
  24. data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
  26. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +84 -92
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
  28. data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
  29. data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
  30. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +87 -415
  31. data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
  32. data/vendor/faiss/faiss/IndexFastScan.h +25 -23
  33. data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
  34. data/vendor/faiss/faiss/IndexFlat.h +21 -18
  35. data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
  36. data/vendor/faiss/faiss/IndexHNSW.cpp +283 -145
  37. data/vendor/faiss/faiss/IndexHNSW.h +16 -2
  38. data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
  39. data/vendor/faiss/faiss/IndexIDMap.h +9 -7
  40. data/vendor/faiss/faiss/IndexIVF.cpp +465 -362
  41. data/vendor/faiss/faiss/IndexIVF.h +33 -12
  42. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
  43. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
  44. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
  45. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
  46. data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
  47. data/vendor/faiss/faiss/IndexIVFFlat.cpp +36 -68
  48. data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
  49. data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +53 -30
  50. data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
  51. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
  52. data/vendor/faiss/faiss/IndexIVFPQ.cpp +71 -843
  53. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
  54. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
  55. data/vendor/faiss/faiss/IndexIVFPQR.cpp +21 -17
  56. data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +26 -39
  57. data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
  58. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
  59. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
  60. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
  61. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
  62. data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
  63. data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
  64. data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
  65. data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
  66. data/vendor/faiss/faiss/IndexNSG.cpp +39 -23
  67. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
  68. data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
  69. data/vendor/faiss/faiss/IndexPQ.h +3 -2
  70. data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
  71. data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
  72. data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
  73. data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
  74. data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
  75. data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
  76. data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
  77. data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
  78. data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
  79. data/vendor/faiss/faiss/IndexRefine.h +4 -4
  80. data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
  81. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
  82. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
  83. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
  84. data/vendor/faiss/faiss/IndexShards.cpp +10 -9
  85. data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
  86. data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
  87. data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
  88. data/vendor/faiss/faiss/MetaIndexes.h +1 -1
  89. data/vendor/faiss/faiss/MetricType.h +14 -7
  90. data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
  91. data/vendor/faiss/faiss/SuperKMeans.h +97 -0
  92. data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
  93. data/vendor/faiss/faiss/VectorTransform.h +16 -16
  94. data/vendor/faiss/faiss/build.cpp +23 -0
  95. data/vendor/faiss/faiss/build.h +15 -0
  96. data/vendor/faiss/faiss/clone_index.cpp +48 -47
  97. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
  98. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
  101. data/vendor/faiss/faiss/factory_tools.cpp +5 -0
  102. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
  103. data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
  104. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
  105. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
  106. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
  107. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
  108. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
  109. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
  110. data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
  111. data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
  112. data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
  113. data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
  114. data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
  115. data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
  116. data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
  117. data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
  118. data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
  119. data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
  120. data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
  121. data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
  122. data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
  123. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
  124. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
  125. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
  126. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
  127. data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
  128. data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
  129. data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
  130. data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
  131. data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
  132. data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
  133. data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
  134. data/vendor/faiss/faiss/impl/FaissException.h +50 -3
  135. data/vendor/faiss/faiss/impl/HNSW.cpp +92 -317
  136. data/vendor/faiss/faiss/impl/HNSW.h +13 -34
  137. data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
  138. data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
  139. data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
  140. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
  141. data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
  142. data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
  143. data/vendor/faiss/faiss/impl/NSG.cpp +38 -21
  144. data/vendor/faiss/faiss/impl/NSG.h +4 -4
  145. data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
  146. data/vendor/faiss/faiss/impl/Panorama.h +258 -87
  147. data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
  148. data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
  149. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
  150. data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
  151. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
  152. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
  153. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +30 -23
  154. data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
  155. data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
  156. data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
  157. data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +296 -283
  158. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
  159. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
  160. data/vendor/faiss/faiss/impl/ResultHandler.h +99 -75
  161. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +52 -4
  162. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -1
  163. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
  164. data/vendor/faiss/faiss/impl/VisitedTable.h +7 -0
  165. data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
  166. data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
  167. data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
  168. data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
  169. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
  170. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
  171. data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
  172. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
  173. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
  174. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
  175. data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
  176. data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
  177. data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
  178. data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
  179. data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
  180. data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
  181. data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
  182. data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
  183. data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
  184. data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
  185. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
  186. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
  187. data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
  188. data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
  189. data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
  190. data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
  191. data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
  192. data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
  193. data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
  194. data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
  195. data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
  196. data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
  197. data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
  198. data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
  199. data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
  200. data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
  201. data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
  202. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
  203. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
  204. data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
  205. data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
  206. data/vendor/faiss/faiss/impl/index_read.cpp +1132 -45
  207. data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
  208. data/vendor/faiss/faiss/impl/index_write.cpp +95 -13
  209. data/vendor/faiss/faiss/impl/io.cpp +6 -6
  210. data/vendor/faiss/faiss/impl/io_macros.h +33 -16
  211. data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
  212. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
  213. data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
  214. data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
  215. data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
  216. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
  217. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
  218. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
  219. data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
  220. data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
  221. data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
  222. data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
  223. data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
  224. data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
  225. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +29 -111
  226. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
  227. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +238 -5
  228. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -7
  229. data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
  230. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
  231. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
  232. data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
  233. data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +3 -2
  234. data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +102 -11
  235. data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
  236. data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
  237. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +148 -0
  238. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +167 -0
  239. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +59 -0
  240. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +163 -0
  241. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
  242. data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
  243. data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
  244. data/vendor/faiss/faiss/impl/simd_dispatch.h +100 -66
  245. data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
  246. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
  247. data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
  248. data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
  249. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
  250. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
  251. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
  252. data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
  253. data/vendor/faiss/faiss/impl/svs_io.h +8 -2
  254. data/vendor/faiss/faiss/index_factory.cpp +86 -18
  255. data/vendor/faiss/faiss/index_io.h +24 -0
  256. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
  257. data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
  258. data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
  259. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
  260. data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
  261. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
  262. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
  263. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
  264. data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
  265. data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
  266. data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
  267. data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
  268. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
  269. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
  270. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
  271. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
  272. data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
  273. data/vendor/faiss/faiss/svs/IndexSVSVamana.h +18 -2
  274. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
  275. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +12 -3
  276. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +7 -2
  277. data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
  278. data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
  279. data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
  280. data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
  281. data/vendor/faiss/faiss/utils/distances.cpp +390 -560
  282. data/vendor/faiss/faiss/utils/distances.h +20 -1
  283. data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
  284. data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
  285. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
  286. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
  287. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
  288. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
  289. data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
  290. data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -177
  291. data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
  292. data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
  293. data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
  294. data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
  295. data/vendor/faiss/faiss/utils/hamming.h +92 -2
  296. data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
  297. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
  298. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
  299. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
  300. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
  301. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
  302. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
  303. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
  304. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
  305. data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
  306. data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
  307. data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
  308. data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
  309. data/vendor/faiss/faiss/utils/partitioning.h +31 -0
  310. data/vendor/faiss/faiss/utils/popcount.h +29 -0
  311. data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
  312. data/vendor/faiss/faiss/utils/prefetch.h +2 -2
  313. data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
  314. data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
  315. data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
  316. data/vendor/faiss/faiss/utils/random.cpp +6 -6
  317. data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
  318. data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
  319. data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
  320. data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
  321. data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
  322. data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
  323. data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
  324. data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
  325. data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
  326. data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
  327. data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
  328. data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
  329. data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
  330. data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
  331. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
  332. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
  333. data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
  334. data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
  335. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
  336. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
  337. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
  338. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
  339. data/vendor/faiss/faiss/utils/simd_levels.cpp +17 -5
  340. data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
  341. data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
  342. data/vendor/faiss/faiss/utils/utils.cpp +5 -5
  343. data/vendor/faiss/faiss/utils/utils.h +3 -3
  344. metadata +119 -34
  345. data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
  346. data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
  347. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
  348. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
  349. data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
  350. data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
  351. data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
  352. data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
  353. data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
  354. data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
  355. data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
  356. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
  357. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
  358. data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
  359. data/vendor/faiss/faiss/utils/simdlib.h +0 -42
  360. data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
  361. data/ext/faiss/{utils_rb.h → utils.h} +0 -0
@@ -1,230 +0,0 @@
1
- /*
2
- * Copyright (c) Meta Platforms, Inc. and affiliates.
3
- *
4
- * This source code is licensed under the MIT license found in the
5
- * LICENSE file in the root directory of this source tree.
6
- */
7
-
8
- #include <faiss/impl/pq4_fast_scan.h>
9
-
10
- #include <faiss/impl/FaissAssert.h>
11
- #include <faiss/impl/LookupTableScaler.h>
12
- #include <faiss/impl/simd_result_handlers.h>
13
-
14
- namespace faiss {
15
-
16
- using namespace simd_result_handlers;
17
-
18
- /***************************************************************
19
- * accumulation functions
20
- ***************************************************************/
21
-
22
- namespace {
23
-
24
- /*
25
- * The computation kernel
26
- * It accumulates results for NQ queries and BB * 32 database elements
27
- * writes results in a ResultHandler
28
- */
29
-
30
- template <int NQ, int BB, class ResultHandler, class Scaler>
31
- void kernel_accumulate_block(
32
- int nsq,
33
- const uint8_t* codes,
34
- const uint8_t* LUT,
35
- ResultHandler& res,
36
- const Scaler& scaler) {
37
- // distance accumulators
38
- simd16uint16 accu[NQ][BB][4];
39
-
40
- for (int q = 0; q < NQ; q++) {
41
- for (int b = 0; b < BB; b++) {
42
- accu[q][b][0].clear();
43
- accu[q][b][1].clear();
44
- accu[q][b][2].clear();
45
- accu[q][b][3].clear();
46
- }
47
- }
48
-
49
- for (int sq = 0; sq < nsq - scaler.nscale; sq += 2) {
50
- simd32uint8 lut_cache[NQ];
51
- for (int q = 0; q < NQ; q++) {
52
- lut_cache[q] = simd32uint8(LUT);
53
- LUT += 32;
54
- }
55
-
56
- for (int b = 0; b < BB; b++) {
57
- simd32uint8 c = simd32uint8(codes);
58
- codes += 32;
59
- simd32uint8 mask(15);
60
- simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask;
61
- simd32uint8 clo = c & mask;
62
-
63
- for (int q = 0; q < NQ; q++) {
64
- simd32uint8 lut = lut_cache[q];
65
- simd32uint8 res0 = lut.lookup_2_lanes(clo);
66
- simd32uint8 res1 = lut.lookup_2_lanes(chi);
67
-
68
- accu[q][b][0] += simd16uint16(res0);
69
- accu[q][b][1] += simd16uint16(res0) >> 8;
70
-
71
- accu[q][b][2] += simd16uint16(res1);
72
- accu[q][b][3] += simd16uint16(res1) >> 8;
73
- }
74
- }
75
- }
76
-
77
- for (int sq = 0; sq < scaler.nscale; sq += 2) {
78
- simd32uint8 lut_cache[NQ];
79
- for (int q = 0; q < NQ; q++) {
80
- lut_cache[q] = simd32uint8(LUT);
81
- LUT += 32;
82
- }
83
-
84
- for (int b = 0; b < BB; b++) {
85
- simd32uint8 c = simd32uint8(codes);
86
- codes += 32;
87
- simd32uint8 mask(15);
88
- simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask;
89
- simd32uint8 clo = c & mask;
90
-
91
- for (int q = 0; q < NQ; q++) {
92
- simd32uint8 lut = lut_cache[q];
93
-
94
- simd32uint8 res0 = scaler.lookup(lut, clo);
95
- accu[q][b][0] += scaler.scale_lo(res0); // handle vectors 0..7
96
- accu[q][b][1] += scaler.scale_hi(res0); // handle vectors 8..15
97
-
98
- simd32uint8 res1 = scaler.lookup(lut, chi);
99
- accu[q][b][2] += scaler.scale_lo(res1); // handle vectors 16..23
100
- accu[q][b][3] +=
101
- scaler.scale_hi(res1); // handle vectors 24..31
102
- }
103
- }
104
- }
105
-
106
- for (int q = 0; q < NQ; q++) {
107
- for (int b = 0; b < BB; b++) {
108
- accu[q][b][0] -= accu[q][b][1] << 8;
109
- simd16uint16 dis0 = combine2x2(accu[q][b][0], accu[q][b][1]);
110
-
111
- accu[q][b][2] -= accu[q][b][3] << 8;
112
- simd16uint16 dis1 = combine2x2(accu[q][b][2], accu[q][b][3]);
113
-
114
- res.handle(q, b, dis0, dis1);
115
- }
116
- }
117
- }
118
-
119
- template <int NQ, int BB, class ResultHandler, class Scaler>
120
- void accumulate_fixed_blocks(
121
- size_t nb,
122
- int nsq,
123
- const uint8_t* codes,
124
- const uint8_t* LUT,
125
- ResultHandler& res,
126
- const Scaler& scaler,
127
- size_t block_stride) {
128
- constexpr int bbs = 32 * BB;
129
- for (size_t j0 = 0; j0 < nb; j0 += bbs) {
130
- FixedStorageHandler<NQ, 2 * BB> res2;
131
- kernel_accumulate_block<NQ, BB>(nsq, codes, LUT, res2, scaler);
132
- res.set_block_origin(0, j0);
133
- res2.to_other_handler(res);
134
- codes += block_stride;
135
- }
136
- }
137
-
138
- template <class ResultHandler, class Scaler>
139
- void pq4_accumulate_loop_fixed_scaler(
140
- int nq,
141
- size_t nb,
142
- int bbs,
143
- int nsq,
144
- const uint8_t* codes,
145
- const uint8_t* LUT,
146
- ResultHandler& res,
147
- const Scaler& scaler,
148
- size_t block_stride) {
149
- FAISS_THROW_IF_NOT(is_aligned_pointer(codes));
150
- FAISS_THROW_IF_NOT(is_aligned_pointer(LUT));
151
- FAISS_THROW_IF_NOT(bbs % 32 == 0);
152
- FAISS_THROW_IF_NOT(nb % bbs == 0);
153
-
154
- #define DISPATCH(NQ, BB) \
155
- case NQ * 1000 + BB: \
156
- accumulate_fixed_blocks<NQ, BB>( \
157
- nb, nsq, codes, LUT, res, scaler, block_stride); \
158
- break
159
-
160
- switch (nq * 1000 + bbs / 32) {
161
- DISPATCH(1, 1);
162
- DISPATCH(1, 2);
163
- DISPATCH(1, 3);
164
- DISPATCH(1, 4);
165
- DISPATCH(1, 5);
166
- DISPATCH(2, 1);
167
- DISPATCH(2, 2);
168
- DISPATCH(3, 1);
169
- DISPATCH(4, 1);
170
- default:
171
- FAISS_THROW_FMT("nq=%d bbs=%d not instantiated", nq, bbs);
172
- }
173
- #undef DISPATCH
174
- }
175
-
176
- template <class ResultHandler>
177
- void pq4_accumulate_loop_fixed_handler(
178
- int nq,
179
- size_t nb,
180
- int bbs,
181
- int nsq,
182
- const uint8_t* codes,
183
- const uint8_t* LUT,
184
- ResultHandler& res,
185
- const NormTableScaler* scaler,
186
- size_t block_stride) {
187
- if (scaler) {
188
- pq4_accumulate_loop_fixed_scaler(
189
- nq, nb, bbs, nsq, codes, LUT, res, *scaler, block_stride);
190
- } else {
191
- DummyScaler dscaler;
192
- pq4_accumulate_loop_fixed_scaler(
193
- nq, nb, bbs, nsq, codes, LUT, res, dscaler, block_stride);
194
- }
195
- }
196
-
197
- struct Run_pq4_accumulate_loop {
198
- template <class ResultHandler>
199
- void f(ResultHandler& res,
200
- int nq,
201
- size_t nb,
202
- int bbs,
203
- int nsq,
204
- const uint8_t* codes,
205
- const uint8_t* LUT,
206
- const NormTableScaler* scaler,
207
- size_t block_stride) {
208
- pq4_accumulate_loop_fixed_handler(
209
- nq, nb, bbs, nsq, codes, LUT, res, scaler, block_stride);
210
- }
211
- };
212
-
213
- } // anonymous namespace
214
-
215
- void pq4_accumulate_loop(
216
- int nq,
217
- size_t nb,
218
- int bbs,
219
- int nsq,
220
- const uint8_t* codes,
221
- const uint8_t* LUT,
222
- SIMDResultHandler& res,
223
- const NormTableScaler* scaler,
224
- size_t block_stride) {
225
- Run_pq4_accumulate_loop consumer;
226
- dispatch_SIMDResultHandler(
227
- res, consumer, nq, nb, bbs, nsq, codes, LUT, scaler, block_stride);
228
- }
229
-
230
- } // namespace faiss
@@ -1,84 +0,0 @@
1
- /*
2
- * Copyright (c) Meta Platforms, Inc. and affiliates.
3
- *
4
- * This source code is licensed under the MIT license found in the
5
- * LICENSE file in the root directory of this source tree.
6
- */
7
-
8
- // This file contains an implementation of approximate top-k search
9
- // using heap. It was initially created for a beam search.
10
- //
11
- // The core idea is the following.
12
- // Say we need to find beam_size indices with the minimal distance
13
- // values. It is done via heap (priority_queue) using the following
14
- // pseudocode:
15
- //
16
- // def baseline():
17
- // distances = np.empty([beam_size * n], dtype=float)
18
- // indices = np.empty([beam_size * n], dtype=int)
19
- //
20
- // heap = Heap(max_heap_size=beam_size)
21
- //
22
- // for i in range(0, beam_size * n):
23
- // heap.push(distances[i], indices[i])
24
- //
25
- // Basically, this is what heap_addn() function from utils/Heap.h does.
26
- //
27
- // The following scheme can be used for approximate beam search.
28
- // Say, we need to find elements with min distance.
29
- // Basically, we split n elements of every beam into NBUCKETS buckets
30
- // and track the index with the minimal distance for every bucket.
31
- // This can be effectively SIMD-ed and significantly lowers the number
32
- // of operations, but yields approximate results for beam_size >= 2.
33
- //
34
- // def approximate_v1():
35
- // distances = np.empty([beam_size * n], dtype=float)
36
- // indices = np.empty([beam_size * n], dtype=int)
37
- //
38
- // heap = Heap(max_heap_size=beam_size)
39
- //
40
- // for beam in range(0, beam_size):
41
- // # The value of 32 is just an example.
42
- // # The value may be varied: the larger the value is,
43
- // # the slower and the more precise vs baseline beam search is
44
- // NBUCKETS = 32
45
- //
46
- // local_min_distances = [HUGE_VALF] * NBUCKETS
47
- // local_min_indices = [0] * NBUCKETS
48
- //
49
- // for i in range(0, n / NBUCKETS):
50
- // for j in range(0, NBUCKETS):
51
- // idx = beam * n + i * NBUCKETS + j
52
- // if distances[idx] < local_min_distances[j]:
53
- // local_min_distances[j] = distances[idx]
54
- // local_min_indices[j] = indices[idx]
55
- //
56
- // for j in range(0, NBUCKETS):
57
- // heap.push(local_min_distances[j], local_min_indices[j])
58
- //
59
- // The accuracy can be improved by tracking min-2 elements for every
60
- // bucket. Such a min-2 implementation with NBUCKETS buckets provides
61
- // better accuracy than top-1 implementation with 2 * NBUCKETS buckets.
62
- // Min-3 is also doable. One can use min-N approach, but I'm not sure
63
- // whether min-4 and above are practical, because of the lack of SIMD
64
- // registers (unless AVX-512 version is used).
65
- //
66
- // C++ template for top-N implementation is provided. The code
67
- // assumes that indices[idx] == idx. One can write a code that lifts
68
- // such an assumption easily.
69
- //
70
- // Currently, the code that tracks elements with min distances is implemented
71
- // (Max Heap). Min Heap option can be added easily.
72
-
73
- #pragma once
74
-
75
- #include <faiss/impl/platform_macros.h>
76
-
77
- // the list of available modes is in the following file
78
- #include <faiss/utils/approx_topk/mode.h>
79
-
80
- #ifdef __AVX2__
81
- #include <faiss/utils/approx_topk/avx2-inl.h>
82
- #else
83
- #include <faiss/utils/approx_topk/generic.h>
84
- #endif
@@ -1,196 +0,0 @@
1
- /*
2
- * Copyright (c) Meta Platforms, Inc. and affiliates.
3
- *
4
- * This source code is licensed under the MIT license found in the
5
- * LICENSE file in the root directory of this source tree.
6
- */
7
-
8
- #pragma once
9
-
10
- #include <immintrin.h>
11
-
12
- #include <limits>
13
-
14
- #include <faiss/impl/FaissAssert.h>
15
- #include <faiss/utils/Heap.h>
16
-
17
- namespace faiss {
18
-
19
- template <typename C, uint32_t NBUCKETS, uint32_t N>
20
- struct HeapWithBuckets {
21
- // this case was not implemented yet.
22
- };
23
-
24
- template <uint32_t NBUCKETS, uint32_t N>
25
- struct HeapWithBuckets<CMax<float, int>, NBUCKETS, N> {
26
- static constexpr uint32_t NBUCKETS_8 = NBUCKETS / 8;
27
- static_assert(
28
- (NBUCKETS) > 0 && ((NBUCKETS % 8) == 0),
29
- "Number of buckets needs to be 8, 16, 24, ...");
30
-
31
- static void addn(
32
- // number of elements
33
- const uint32_t n,
34
- // distances. It is assumed to have n elements.
35
- const float* const __restrict distances,
36
- // number of best elements to keep
37
- const uint32_t k,
38
- // output distances
39
- float* const __restrict bh_val,
40
- // output indices, each being within [0, n) range
41
- int32_t* const __restrict bh_ids) {
42
- // forward a call to bs_addn with 1 beam
43
- bs_addn(1, n, distances, k, bh_val, bh_ids);
44
- }
45
-
46
- static void bs_addn(
47
- // beam_size parameter of Beam Search algorithm
48
- const uint32_t beam_size,
49
- // number of elements per beam
50
- const uint32_t n_per_beam,
51
- // distances. It is assumed to have (n_per_beam * beam_size)
52
- // elements.
53
- const float* const __restrict distances,
54
- // number of best elements to keep
55
- const uint32_t k,
56
- // output distances
57
- float* const __restrict bh_val,
58
- // output indices, each being within [0, n_per_beam * beam_size)
59
- // range
60
- int32_t* const __restrict bh_ids) {
61
- // // Basically, the function runs beam_size iterations.
62
- // // Every iteration NBUCKETS * N elements are added to a regular heap.
63
- // // So, maximum number of added elements is beam_size * NBUCKETS * N.
64
- // // This number is expected to be greater than or equal to k.
65
- // FAISS_THROW_IF_NOT_FMT(
66
- // beam_size * NBUCKETS * N >= k,
67
- // "Cannot pick %d elements, only %d. "
68
- // "Check the function and template arguments values.",
69
- // k,
70
- // beam_size * NBUCKETS * N);
71
-
72
- using C = CMax<float, int>;
73
-
74
- // main loop
75
- for (uint32_t beam_index = 0; beam_index < beam_size; beam_index++) {
76
- __m256 min_distances_i[NBUCKETS_8][N];
77
- __m256i min_indices_i[NBUCKETS_8][N];
78
-
79
- for (uint32_t j = 0; j < NBUCKETS_8; j++) {
80
- for (uint32_t p = 0; p < N; p++) {
81
- min_distances_i[j][p] =
82
- _mm256_set1_ps(std::numeric_limits<float>::max());
83
- min_indices_i[j][p] =
84
- _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
85
- }
86
- }
87
-
88
- __m256i current_indices = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
89
- __m256i indices_delta = _mm256_set1_epi32(NBUCKETS);
90
-
91
- const uint32_t nb = (n_per_beam / NBUCKETS) * NBUCKETS;
92
-
93
- // put the data into buckets
94
- for (uint32_t ip = 0; ip < nb; ip += NBUCKETS) {
95
- for (uint32_t j = 0; j < NBUCKETS_8; j++) {
96
- const __m256 distances_reg = _mm256_loadu_ps(
97
- distances + j * 8 + ip + n_per_beam * beam_index);
98
-
99
- // loop. Compiler should get rid of unneeded ops
100
- __m256 distance_candidate = distances_reg;
101
- __m256i indices_candidate = current_indices;
102
-
103
- for (uint32_t p = 0; p < N; p++) {
104
- const __m256 comparison = _mm256_cmp_ps(
105
- min_distances_i[j][p],
106
- distance_candidate,
107
- _CMP_LE_OS);
108
-
109
- // // blend seems to be slower than min
110
- // const __m256 min_distances_new = _mm256_blendv_ps(
111
- // distance_candidate,
112
- // min_distances_i[j][p],
113
- // comparison);
114
- const __m256 min_distances_new = _mm256_min_ps(
115
- distance_candidate, min_distances_i[j][p]);
116
- const __m256i min_indices_new =
117
- _mm256_castps_si256(_mm256_blendv_ps(
118
- _mm256_castsi256_ps(indices_candidate),
119
- _mm256_castsi256_ps(
120
- min_indices_i[j][p]),
121
- comparison));
122
-
123
- // // blend seems to be slower than min
124
- // const __m256 max_distances_new = _mm256_blendv_ps(
125
- // min_distances_i[j][p],
126
- // distance_candidate,
127
- // comparison);
128
- const __m256 max_distances_new = _mm256_max_ps(
129
- min_distances_i[j][p], distances_reg);
130
- const __m256i max_indices_new =
131
- _mm256_castps_si256(_mm256_blendv_ps(
132
- _mm256_castsi256_ps(
133
- min_indices_i[j][p]),
134
- _mm256_castsi256_ps(indices_candidate),
135
- comparison));
136
-
137
- distance_candidate = max_distances_new;
138
- indices_candidate = max_indices_new;
139
-
140
- min_distances_i[j][p] = min_distances_new;
141
- min_indices_i[j][p] = min_indices_new;
142
- }
143
- }
144
-
145
- current_indices =
146
- _mm256_add_epi32(current_indices, indices_delta);
147
- }
148
-
149
- // fix the indices
150
- for (uint32_t j = 0; j < NBUCKETS_8; j++) {
151
- const __m256i offset =
152
- _mm256_set1_epi32(n_per_beam * beam_index + j * 8);
153
- for (uint32_t p = 0; p < N; p++) {
154
- min_indices_i[j][p] =
155
- _mm256_add_epi32(min_indices_i[j][p], offset);
156
- }
157
- }
158
-
159
- // merge every bucket into the regular heap
160
- for (uint32_t p = 0; p < N; p++) {
161
- for (uint32_t j = 0; j < NBUCKETS_8; j++) {
162
- int32_t min_indices_scalar[8];
163
- float min_distances_scalar[8];
164
-
165
- _mm256_storeu_si256(
166
- (__m256i*)min_indices_scalar, min_indices_i[j][p]);
167
- _mm256_storeu_ps(
168
- min_distances_scalar, min_distances_i[j][p]);
169
-
170
- // this exact way is needed to maintain the order as if the
171
- // input elements were pushed to the heap sequentially
172
- for (size_t j8 = 0; j8 < 8; j8++) {
173
- const auto value = min_distances_scalar[j8];
174
- const auto index = min_indices_scalar[j8];
175
- if (C::cmp2(bh_val[0], value, bh_ids[0], index)) {
176
- heap_replace_top<C>(
177
- k, bh_val, bh_ids, value, index);
178
- }
179
- }
180
- }
181
- }
182
-
183
- // process leftovers
184
- for (uint32_t ip = nb; ip < n_per_beam; ip++) {
185
- const int32_t index = ip + n_per_beam * beam_index;
186
- const float value = distances[index];
187
-
188
- if (C::cmp(bh_val[0], value)) {
189
- heap_replace_top<C>(k, bh_val, bh_ids, value, index);
190
- }
191
- }
192
- }
193
- }
194
- };
195
-
196
- } // namespace faiss
@@ -1,34 +0,0 @@
1
- /*
2
- * Copyright (c) Meta Platforms, Inc. and affiliates.
3
- *
4
- * This source code is licensed under the MIT license found in the
5
- * LICENSE file in the root directory of this source tree.
6
- */
7
-
8
- #pragma once
9
-
10
- /// Represents the mode of use of approximate top-k computations
11
- /// that allows trading accuracy for speed. So, every option
12
- /// besides EXACT_TOPK increases the speed.
13
- ///
14
- /// B represents the number of buckets.
15
- /// D is the number of min-k elements to track within every bucket.
16
- ///
17
- /// Default option is EXACT_TOPK.
18
- /// APPROX_TOPK_BUCKETS_B16_D2 is worth starting from, if you'd like
19
- /// to experiment a bit.
20
- ///
21
- /// It seems that only a limited number of combinations are
22
- /// meaningful, because of the limited supply of SIMD registers.
23
- /// Also, certain combinations, such as B32_D1 and B16_D1, were concluded
24
- /// to be not very precise in benchmarks, so they were not introduced.
25
- ///
26
- /// TODO: Consider d-ary SIMD heap.
27
-
28
- enum ApproxTopK_mode_t : int {
29
- EXACT_TOPK = 0,
30
- APPROX_TOPK_BUCKETS_B32_D2 = 1,
31
- APPROX_TOPK_BUCKETS_B8_D3 = 2,
32
- APPROX_TOPK_BUCKETS_B16_D2 = 3,
33
- APPROX_TOPK_BUCKETS_B8_D2 = 4,
34
- };
@@ -1,36 +0,0 @@
1
- /*
2
- * Copyright (c) Meta Platforms, Inc. and affiliates.
3
- *
4
- * This source code is licensed under the MIT license found in the
5
- * LICENSE file in the root directory of this source tree.
6
- */
7
-
8
- // AVX512 might not be used, but this version provides ~2x speedup
9
- // over AVX2 kernel, say, for training PQx10 or PQx12, and speeds up
10
- // additional cases with larger dimensionalities.
11
-
12
- #pragma once
13
-
14
- #include <faiss/impl/ResultHandler.h>
15
- #include <faiss/impl/platform_macros.h>
16
-
17
- #include <faiss/utils/Heap.h>
18
-
19
- #ifdef __AVX512F__
20
-
21
- namespace faiss {
22
-
23
- // Returns true if the fused kernel is available and the data was processed.
24
- // Returns false if the fused kernel is not available.
25
- bool exhaustive_L2sqr_fused_cmax_AVX512(
26
- const float* x,
27
- const float* y,
28
- size_t d,
29
- size_t nx,
30
- size_t ny,
31
- Top1BlockResultHandler<CMax<float, int64_t>>& res,
32
- const float* y_norms);
33
-
34
- } // namespace faiss
35
-
36
- #endif