faiss 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (361) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/faiss/extconf.rb +2 -1
  4. data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
  5. data/ext/faiss/index_binary.cpp +1 -1
  6. data/ext/faiss/kmeans.cpp +1 -1
  7. data/ext/faiss/pca_matrix.cpp +1 -1
  8. data/ext/faiss/product_quantizer.cpp +1 -1
  9. data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
  10. data/lib/faiss/version.rb +1 -1
  11. data/vendor/faiss/faiss/AutoTune.cpp +93 -80
  12. data/vendor/faiss/faiss/Clustering.cpp +39 -240
  13. data/vendor/faiss/faiss/Clustering.h +6 -0
  14. data/vendor/faiss/faiss/IVFlib.cpp +41 -21
  15. data/vendor/faiss/faiss/Index.cpp +6 -5
  16. data/vendor/faiss/faiss/Index.h +5 -5
  17. data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
  20. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
  21. data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
  22. data/vendor/faiss/faiss/IndexBinary.h +4 -4
  23. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
  24. data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
  26. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +84 -92
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
  28. data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
  29. data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
  30. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +87 -415
  31. data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
  32. data/vendor/faiss/faiss/IndexFastScan.h +25 -23
  33. data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
  34. data/vendor/faiss/faiss/IndexFlat.h +21 -18
  35. data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
  36. data/vendor/faiss/faiss/IndexHNSW.cpp +283 -145
  37. data/vendor/faiss/faiss/IndexHNSW.h +16 -2
  38. data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
  39. data/vendor/faiss/faiss/IndexIDMap.h +9 -7
  40. data/vendor/faiss/faiss/IndexIVF.cpp +465 -362
  41. data/vendor/faiss/faiss/IndexIVF.h +33 -12
  42. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
  43. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
  44. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
  45. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
  46. data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
  47. data/vendor/faiss/faiss/IndexIVFFlat.cpp +36 -68
  48. data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
  49. data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +53 -30
  50. data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
  51. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
  52. data/vendor/faiss/faiss/IndexIVFPQ.cpp +71 -843
  53. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
  54. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
  55. data/vendor/faiss/faiss/IndexIVFPQR.cpp +21 -17
  56. data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +26 -39
  57. data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
  58. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
  59. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
  60. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
  61. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
  62. data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
  63. data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
  64. data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
  65. data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
  66. data/vendor/faiss/faiss/IndexNSG.cpp +39 -23
  67. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
  68. data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
  69. data/vendor/faiss/faiss/IndexPQ.h +3 -2
  70. data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
  71. data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
  72. data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
  73. data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
  74. data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
  75. data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
  76. data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
  77. data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
  78. data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
  79. data/vendor/faiss/faiss/IndexRefine.h +4 -4
  80. data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
  81. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
  82. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
  83. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
  84. data/vendor/faiss/faiss/IndexShards.cpp +10 -9
  85. data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
  86. data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
  87. data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
  88. data/vendor/faiss/faiss/MetaIndexes.h +1 -1
  89. data/vendor/faiss/faiss/MetricType.h +14 -7
  90. data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
  91. data/vendor/faiss/faiss/SuperKMeans.h +97 -0
  92. data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
  93. data/vendor/faiss/faiss/VectorTransform.h +16 -16
  94. data/vendor/faiss/faiss/build.cpp +23 -0
  95. data/vendor/faiss/faiss/build.h +15 -0
  96. data/vendor/faiss/faiss/clone_index.cpp +48 -47
  97. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
  98. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
  101. data/vendor/faiss/faiss/factory_tools.cpp +5 -0
  102. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
  103. data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
  104. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
  105. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
  106. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
  107. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
  108. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
  109. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
  110. data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
  111. data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
  112. data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
  113. data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
  114. data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
  115. data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
  116. data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
  117. data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
  118. data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
  119. data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
  120. data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
  121. data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
  122. data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
  123. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
  124. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
  125. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
  126. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
  127. data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
  128. data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
  129. data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
  130. data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
  131. data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
  132. data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
  133. data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
  134. data/vendor/faiss/faiss/impl/FaissException.h +50 -3
  135. data/vendor/faiss/faiss/impl/HNSW.cpp +92 -317
  136. data/vendor/faiss/faiss/impl/HNSW.h +13 -34
  137. data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
  138. data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
  139. data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
  140. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
  141. data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
  142. data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
  143. data/vendor/faiss/faiss/impl/NSG.cpp +38 -21
  144. data/vendor/faiss/faiss/impl/NSG.h +4 -4
  145. data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
  146. data/vendor/faiss/faiss/impl/Panorama.h +258 -87
  147. data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
  148. data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
  149. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
  150. data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
  151. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
  152. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
  153. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +30 -23
  154. data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
  155. data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
  156. data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
  157. data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +296 -283
  158. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
  159. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
  160. data/vendor/faiss/faiss/impl/ResultHandler.h +99 -75
  161. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +52 -4
  162. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -1
  163. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
  164. data/vendor/faiss/faiss/impl/VisitedTable.h +7 -0
  165. data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
  166. data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
  167. data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
  168. data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
  169. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
  170. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
  171. data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
  172. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
  173. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
  174. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
  175. data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
  176. data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
  177. data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
  178. data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
  179. data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
  180. data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
  181. data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
  182. data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
  183. data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
  184. data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
  185. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
  186. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
  187. data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
  188. data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
  189. data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
  190. data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
  191. data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
  192. data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
  193. data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
  194. data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
  195. data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
  196. data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
  197. data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
  198. data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
  199. data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
  200. data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
  201. data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
  202. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
  203. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
  204. data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
  205. data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
  206. data/vendor/faiss/faiss/impl/index_read.cpp +1132 -45
  207. data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
  208. data/vendor/faiss/faiss/impl/index_write.cpp +95 -13
  209. data/vendor/faiss/faiss/impl/io.cpp +6 -6
  210. data/vendor/faiss/faiss/impl/io_macros.h +33 -16
  211. data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
  212. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
  213. data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
  214. data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
  215. data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
  216. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
  217. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
  218. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
  219. data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
  220. data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
  221. data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
  222. data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
  223. data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
  224. data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
  225. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +29 -111
  226. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
  227. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +238 -5
  228. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -7
  229. data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
  230. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
  231. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
  232. data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
  233. data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +3 -2
  234. data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +102 -11
  235. data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
  236. data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
  237. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +148 -0
  238. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +167 -0
  239. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +59 -0
  240. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +163 -0
  241. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
  242. data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
  243. data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
  244. data/vendor/faiss/faiss/impl/simd_dispatch.h +100 -66
  245. data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
  246. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
  247. data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
  248. data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
  249. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
  250. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
  251. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
  252. data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
  253. data/vendor/faiss/faiss/impl/svs_io.h +8 -2
  254. data/vendor/faiss/faiss/index_factory.cpp +86 -18
  255. data/vendor/faiss/faiss/index_io.h +24 -0
  256. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
  257. data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
  258. data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
  259. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
  260. data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
  261. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
  262. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
  263. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
  264. data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
  265. data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
  266. data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
  267. data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
  268. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
  269. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
  270. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
  271. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
  272. data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
  273. data/vendor/faiss/faiss/svs/IndexSVSVamana.h +18 -2
  274. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
  275. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +12 -3
  276. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +7 -2
  277. data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
  278. data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
  279. data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
  280. data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
  281. data/vendor/faiss/faiss/utils/distances.cpp +390 -560
  282. data/vendor/faiss/faiss/utils/distances.h +20 -1
  283. data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
  284. data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
  285. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
  286. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
  287. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
  288. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
  289. data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
  290. data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -177
  291. data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
  292. data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
  293. data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
  294. data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
  295. data/vendor/faiss/faiss/utils/hamming.h +92 -2
  296. data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
  297. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
  298. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
  299. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
  300. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
  301. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
  302. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
  303. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
  304. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
  305. data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
  306. data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
  307. data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
  308. data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
  309. data/vendor/faiss/faiss/utils/partitioning.h +31 -0
  310. data/vendor/faiss/faiss/utils/popcount.h +29 -0
  311. data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
  312. data/vendor/faiss/faiss/utils/prefetch.h +2 -2
  313. data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
  314. data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
  315. data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
  316. data/vendor/faiss/faiss/utils/random.cpp +6 -6
  317. data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
  318. data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
  319. data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
  320. data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
  321. data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
  322. data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
  323. data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
  324. data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
  325. data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
  326. data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
  327. data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
  328. data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
  329. data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
  330. data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
  331. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
  332. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
  333. data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
  334. data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
  335. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
  336. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
  337. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
  338. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
  339. data/vendor/faiss/faiss/utils/simd_levels.cpp +17 -5
  340. data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
  341. data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
  342. data/vendor/faiss/faiss/utils/utils.cpp +5 -5
  343. data/vendor/faiss/faiss/utils/utils.h +3 -3
  344. metadata +119 -34
  345. data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
  346. data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
  347. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
  348. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
  349. data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
  350. data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
  351. data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
  352. data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
  353. data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
  354. data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
  355. data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
  356. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
  357. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
  358. data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
  359. data/vendor/faiss/faiss/utils/simdlib.h +0 -42
  360. data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
  361. /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
@@ -0,0 +1,477 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #ifdef COMPILE_SIMD_AVX512
9
+
10
+ #include <faiss/utils/rabitq_simd.h>
11
+ #include <immintrin.h>
12
+
13
+ namespace faiss::rabitq {
14
+
15
+ namespace {
16
+
17
/// Builds the nibble-popcount table consumed by the shuffle-based popcount.
///
/// Each of the four 128-bit lanes holds the same 16 entries: byte `i` of a
/// lane is the number of set bits in the 4-bit value `i` (0, 1, 1, 2, ... 4).
///
/// @return 512-bit register containing the table replicated in every lane.
inline __m512i get_lookup_512() {
    // One 16-byte lane, listed from byte 0 (nibble 0x0) up to byte 15 (0xf).
    const __m128i nibble_counts = _mm_setr_epi8(
            /* 0..3 */ 0, 1, 1, 2,
            /* 4..7 */ 1, 2, 2, 3,
            /* 8..b */ 1, 2, 2, 3,
            /* c..f */ 2, 3, 3, 4);
    // Copy that lane into all four 128-bit lanes of the 512-bit register.
    return _mm512_broadcast_i32x4(nibble_counts);
}
84
+
85
+ inline __m512i popcount_512(__m512i v) {
86
+ #if defined(__AVX512VPOPCNTDQ__)
87
+ return _mm512_popcnt_epi64(v);
88
+ #else
89
+ const __m512i lookup = get_lookup_512();
90
+ const __m512i low_mask = _mm512_set1_epi8(0x0f);
91
+
92
+ const __m512i lo = _mm512_and_si512(v, low_mask);
93
+ const __m512i hi = _mm512_and_si512(_mm512_srli_epi16(v, 4), low_mask);
94
+ const __m512i popcnt_lo = _mm512_shuffle_epi8(lookup, lo);
95
+ const __m512i popcnt_hi = _mm512_shuffle_epi8(lookup, hi);
96
+ const __m512i popcnt = _mm512_add_epi8(popcnt_lo, popcnt_hi);
97
+ return _mm512_sad_epu8(_mm512_setzero_si512(), popcnt);
98
+ #endif
99
+ }
100
+
101
+ // AVX2 helpers needed for AVX512 fallback paths (compute_inner_product)
102
/// Builds the 256-bit nibble-popcount table for the shuffle-based popcount.
///
/// Both 128-bit lanes hold the same 16 entries: byte `i` of a lane is the
/// number of set bits in the 4-bit value `i` (0, 1, 1, 2, ... 4).
///
/// @return 256-bit register containing the table replicated in both lanes.
inline __m256i get_lookup_256() {
    // One 16-byte lane, listed from byte 0 (nibble 0x0) up to byte 15 (0xf).
    const __m128i nibble_counts = _mm_setr_epi8(
            /* 0..3 */ 0, 1, 1, 2,
            /* 4..7 */ 1, 2, 2, 3,
            /* 8..b */ 1, 2, 2, 3,
            /* c..f */ 2, 3, 3, 4);
    // Duplicate the lane into the low and high halves of the register.
    return _mm256_broadcastsi128_si256(nibble_counts);
}
137
+
138
+ inline __m256i popcount_256(__m256i v) {
139
+ const __m256i lookup = get_lookup_256();
140
+ const __m256i low_mask = _mm256_set1_epi8(0x0f);
141
+
142
+ const __m256i lo = _mm256_and_si256(v, low_mask);
143
+ const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
144
+ const __m256i popcnt_lo = _mm256_shuffle_epi8(lookup, lo);
145
+ const __m256i popcnt_hi = _mm256_shuffle_epi8(lookup, hi);
146
+ const __m256i popcnt = _mm256_add_epi8(popcnt_lo, popcnt_hi);
147
+ return _mm256_sad_epu8(_mm256_setzero_si256(), popcnt);
148
+ }
149
+
150
/// Adds together the four 64-bit lanes of a 256-bit vector.
///
/// @param v Vector treated as four unsigned 64-bit lanes.
/// @return Sum of the four lanes (wrapping modulo 2^64).
inline uint64_t reduce_add_256(__m256i v) {
    // Spill the register to an aligned scratch buffer, then sum in scalar code.
    alignas(32) uint64_t parts[4];
    _mm256_store_si256(reinterpret_cast<__m256i*>(parts), v);

    uint64_t total = 0;
    for (const uint64_t part : parts) {
        total += part;
    }
    return total;
}
155
+
156
+ inline __m128i popcount_128(__m128i v) {
157
+ uint64_t lane0 = _mm_extract_epi64(v, 0);
158
+ uint64_t lane1 = _mm_extract_epi64(v, 1);
159
+ uint64_t pop0 = popcount64(lane0);
160
+ uint64_t pop1 = popcount64(lane1);
161
+ return _mm_set_epi64x(pop1, pop0);
162
+ }
163
+
164
/// Adds together the two 64-bit lanes of a 128-bit vector.
///
/// @param v Vector treated as two unsigned 64-bit lanes.
/// @return Sum of both lanes (wrapping modulo 2^64).
inline uint64_t reduce_add_128(__m128i v) {
    // Spill the register to an aligned scratch buffer, then sum in scalar code.
    alignas(16) uint64_t parts[2];
    _mm_store_si128(reinterpret_cast<__m128i*>(parts), v);
    return parts[1] + parts[0];
}
169
+
170
+ } // namespace
171
+
172
+ template <>
173
+ uint64_t bitwise_and_dot_product<SIMDLevel::AVX512>(
174
+ const uint8_t* query,
175
+ const uint8_t* data,
176
+ size_t size,
177
+ size_t qb) {
178
+ uint64_t sum = 0;
179
+ size_t offset = 0;
180
+ if (size_t step = 512 / 8; offset + step <= size) {
181
+ __m512i sum_512 = _mm512_setzero_si512();
182
+ for (; offset + step <= size; offset += step) {
183
+ __m512i v_x = _mm512_loadu_si512((const __m512i*)(data + offset));
184
+ for (int j = 0; j < qb; j++) {
185
+ __m512i v_q = _mm512_loadu_si512(
186
+ (const __m512i*)(query + j * size + offset));
187
+ __m512i v_and = _mm512_and_si512(v_q, v_x);
188
+ __m512i v_popcnt = popcount_512(v_and);
189
+ __m512i v_shifted = _mm512_slli_epi64(v_popcnt, j);
190
+ sum_512 = _mm512_add_epi64(sum_512, v_shifted);
191
+ }
192
+ }
193
+ sum += _mm512_reduce_add_epi64(sum_512);
194
+ }
195
+ if (size_t step = 256 / 8; offset + step <= size) {
196
+ __m256i sum_256 = _mm256_setzero_si256();
197
+ for (; offset + step <= size; offset += step) {
198
+ __m256i v_x = _mm256_loadu_si256((const __m256i*)(data + offset));
199
+ for (int j = 0; j < qb; j++) {
200
+ __m256i v_q = _mm256_loadu_si256(
201
+ (const __m256i*)(query + j * size + offset));
202
+ __m256i v_and = _mm256_and_si256(v_q, v_x);
203
+ __m256i v_popcnt = popcount_256(v_and);
204
+ __m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
205
+ sum_256 = _mm256_add_epi64(sum_256, v_shifted);
206
+ }
207
+ }
208
+ sum += reduce_add_256(sum_256);
209
+ }
210
+ __m128i sum_128 = _mm_setzero_si128();
211
+ for (size_t step = 128 / 8; offset + step <= size; offset += step) {
212
+ __m128i v_x = _mm_loadu_si128((const __m128i*)(data + offset));
213
+ for (int j = 0; j < qb; j++) {
214
+ __m128i v_q = _mm_loadu_si128(
215
+ (const __m128i*)(query + j * size + offset));
216
+ __m128i v_and = _mm_and_si128(v_q, v_x);
217
+ __m128i v_popcnt = popcount_128(v_and);
218
+ __m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
219
+ sum_128 = _mm_add_epi64(sum_128, v_shifted);
220
+ }
221
+ }
222
+ sum += reduce_add_128(sum_128);
223
+ for (size_t step = 64 / 8; offset + step <= size; offset += step) {
224
+ const auto yv = *(const uint64_t*)(data + offset);
225
+ for (int j = 0; j < qb; j++) {
226
+ const auto qv = *(const uint64_t*)(query + j * size + offset);
227
+ sum += popcount64(qv & yv) << j;
228
+ }
229
+ }
230
+ for (; offset < size; ++offset) {
231
+ const auto yv = *(data + offset);
232
+ for (int j = 0; j < qb; j++) {
233
+ const auto qv = *(query + j * size + offset);
234
+ sum += popcount32(qv & yv) << j;
235
+ }
236
+ }
237
+ return sum;
238
+ }
239
+
240
+ template <>
241
+ uint64_t bitwise_xor_dot_product<SIMDLevel::AVX512>(
242
+ const uint8_t* query,
243
+ const uint8_t* data,
244
+ size_t size,
245
+ size_t qb) {
246
+ uint64_t sum = 0;
247
+ size_t offset = 0;
248
+ if (size_t step = 512 / 8; offset + step <= size) {
249
+ __m512i sum_512 = _mm512_setzero_si512();
250
+ for (; offset + step <= size; offset += step) {
251
+ __m512i v_x = _mm512_loadu_si512((const __m512i*)(data + offset));
252
+ for (int j = 0; j < qb; j++) {
253
+ __m512i v_q = _mm512_loadu_si512(
254
+ (const __m512i*)(query + j * size + offset));
255
+ __m512i v_xor = _mm512_xor_si512(v_q, v_x);
256
+ __m512i v_popcnt = popcount_512(v_xor);
257
+ __m512i v_shifted = _mm512_slli_epi64(v_popcnt, j);
258
+ sum_512 = _mm512_add_epi64(sum_512, v_shifted);
259
+ }
260
+ }
261
+ sum += _mm512_reduce_add_epi64(sum_512);
262
+ }
263
+ if (size_t step = 256 / 8; offset + step <= size) {
264
+ __m256i sum_256 = _mm256_setzero_si256();
265
+ for (; offset + step <= size; offset += step) {
266
+ __m256i v_x = _mm256_loadu_si256((const __m256i*)(data + offset));
267
+ for (int j = 0; j < qb; j++) {
268
+ __m256i v_q = _mm256_loadu_si256(
269
+ (const __m256i*)(query + j * size + offset));
270
+ __m256i v_xor = _mm256_xor_si256(v_q, v_x);
271
+ __m256i v_popcnt = popcount_256(v_xor);
272
+ __m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
273
+ sum_256 = _mm256_add_epi64(sum_256, v_shifted);
274
+ }
275
+ }
276
+ sum += reduce_add_256(sum_256);
277
+ }
278
+ __m128i sum_128 = _mm_setzero_si128();
279
+ for (size_t step = 128 / 8; offset + step <= size; offset += step) {
280
+ __m128i v_x = _mm_loadu_si128((const __m128i*)(data + offset));
281
+ for (int j = 0; j < qb; j++) {
282
+ __m128i v_q = _mm_loadu_si128(
283
+ (const __m128i*)(query + j * size + offset));
284
+ __m128i v_xor = _mm_xor_si128(v_q, v_x);
285
+ __m128i v_popcnt = popcount_128(v_xor);
286
+ __m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
287
+ sum_128 = _mm_add_epi64(sum_128, v_shifted);
288
+ }
289
+ }
290
+ sum += reduce_add_128(sum_128);
291
+ for (size_t step = 64 / 8; offset + step <= size; offset += step) {
292
+ const auto yv = *(const uint64_t*)(data + offset);
293
+ for (int j = 0; j < qb; j++) {
294
+ const auto qv = *(const uint64_t*)(query + j * size + offset);
295
+ sum += popcount64(qv ^ yv) << j;
296
+ }
297
+ }
298
+ for (; offset < size; ++offset) {
299
+ const auto yv = *(data + offset);
300
+ for (int j = 0; j < qb; j++) {
301
+ const auto qv = *(query + j * size + offset);
302
+ sum += popcount32(qv ^ yv) << j;
303
+ }
304
+ }
305
+ return sum;
306
+ }
307
+
308
+ template <>
309
+ uint64_t popcount<SIMDLevel::AVX512>(const uint8_t* data, size_t size) {
310
+ uint64_t sum = 0;
311
+ size_t offset = 0;
312
+ if (offset + 512 / 8 <= size) {
313
+ __m512i sum_512 = _mm512_setzero_si512();
314
+ for (size_t end; (end = offset + 512 / 8) <= size; offset = end) {
315
+ __m512i v_x = _mm512_loadu_si512((const __m512i*)(data + offset));
316
+ __m512i v_popcnt = popcount_512(v_x);
317
+ sum_512 = _mm512_add_epi64(sum_512, v_popcnt);
318
+ }
319
+ sum += _mm512_reduce_add_epi64(sum_512);
320
+ }
321
+ if (offset + 256 / 8 <= size) {
322
+ __m256i sum_256 = _mm256_setzero_si256();
323
+ for (size_t end; (end = offset + 256 / 8) <= size; offset = end) {
324
+ __m256i v_x = _mm256_loadu_si256((const __m256i*)(data + offset));
325
+ __m256i v_popcnt = popcount_256(v_x);
326
+ sum_256 = _mm256_add_epi64(sum_256, v_popcnt);
327
+ }
328
+ sum += reduce_add_256(sum_256);
329
+ }
330
+ __m128i sum_128 = _mm_setzero_si128();
331
+ for (size_t step = 128 / 8; offset + step <= size; offset += step) {
332
+ __m128i v_x = _mm_loadu_si128((const __m128i*)(data + offset));
333
+ sum_128 = _mm_add_epi64(sum_128, popcount_128(v_x));
334
+ }
335
+ sum += reduce_add_128(sum_128);
336
+ for (size_t step = 64 / 8; offset + step <= size; offset += step) {
337
+ const auto yv = *(const uint64_t*)(data + offset);
338
+ sum += popcount64(yv);
339
+ }
340
+ for (; offset < size; ++offset) {
341
+ const auto yv = *(data + offset);
342
+ sum += popcount32(yv);
343
+ }
344
+ return sum;
345
+ }
346
+
347
+ } // namespace faiss::rabitq
348
+
349
+ namespace faiss::rabitq::multibit {
350
+
351
+ namespace {
352
+
353
// Horizontal sum: returns the sum of the eight float lanes of `v`.
inline float hsum_avx2(__m256 v) {
    // Fold the upper 128-bit half onto the lower one: 8 lanes -> 4.
    const __m128 quad = _mm_add_ps(
            _mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
    // Duplicate the odd lanes and add, so lanes 0 and 2 hold pair sums.
    const __m128 odd = _mm_movehdup_ps(quad);
    const __m128 pairs = _mm_add_ps(quad, odd);
    // Move the pair sum in lane 2 down to lane 0 and add the two pair sums.
    const __m128 upper = _mm_movehl_ps(odd, pairs);
    return _mm_cvtss_f32(_mm_add_ss(pairs, upper));
}
362
+
363
+ inline float ip_1exbit_avx512(
364
+ const uint8_t* __restrict sign_bits,
365
+ const uint8_t* __restrict ex_code,
366
+ const float* __restrict rotated_q,
367
+ size_t d,
368
+ float cb) {
369
+ __m512 acc = _mm512_setzero_ps();
370
+ const __m512 v_cb = _mm512_set1_ps(cb);
371
+ const __m512 v_two = _mm512_set1_ps(2.0f);
372
+ const __m512 v_one = _mm512_set1_ps(1.0f);
373
+
374
+ size_t i = 0;
375
+ for (; i + 16 <= d; i += 16) {
376
+ uint16_t sb16;
377
+ memcpy(&sb16, sign_bits + i / 8, sizeof(uint16_t));
378
+ uint16_t eb16;
379
+ memcpy(&eb16, ex_code + i / 8, sizeof(uint16_t));
380
+
381
+ __m512 sb_f = _mm512_maskz_mov_ps(_cvtu32_mask16(sb16), v_one);
382
+ __m512 eb_f = _mm512_maskz_mov_ps(_cvtu32_mask16(eb16), v_one);
383
+
384
+ __m512 recon = _mm512_add_ps(_mm512_fmadd_ps(sb_f, v_two, eb_f), v_cb);
385
+ __m512 rq = _mm512_loadu_ps(rotated_q + i);
386
+ acc = _mm512_fmadd_ps(rq, recon, acc);
387
+ }
388
+
389
+ float result = _mm512_reduce_add_ps(acc);
390
+ result += ip_scalar(sign_bits, ex_code, rotated_q, i, d, 1, cb);
391
+ return result;
392
+ }
393
+
394
+ // AVX2+BMI2 bitplane kernel used as fallback for ex_bits >= 2.
395
+ // AVX512 TU has AVX2 available. BMI2 guarded separately since
396
+ // VIA Eden X4 has AVX2 without BMI2.
397
+ #ifdef __BMI2__
398
+ inline float ip_bitplane_avx2(
399
+ const uint8_t* __restrict sign_bits,
400
+ const uint8_t* __restrict ex_code,
401
+ const float* __restrict rotated_q,
402
+ size_t d,
403
+ size_t ex_bits,
404
+ float cb) {
405
+ __m256 acc = _mm256_setzero_ps();
406
+ const __m256 v_one = _mm256_set1_ps(1.0f);
407
+ const __m256i bit_pos = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
408
+ const __m256i zero = _mm256_setzero_si256();
409
+ const __m256 v_cb = _mm256_set1_ps(cb);
410
+
411
+ uint64_t pext_masks[7];
412
+ __m256 v_weights[8];
413
+ for (size_t b = 0; b < ex_bits; b++) {
414
+ uint64_t m = 0;
415
+ for (int j = 0; j < 8; j++) {
416
+ m |= (1ULL << (b + j * ex_bits));
417
+ }
418
+ pext_masks[b] = m;
419
+ v_weights[b] = _mm256_set1_ps(static_cast<float>(1u << b));
420
+ }
421
+ v_weights[ex_bits] = _mm256_set1_ps(static_cast<float>(1u << ex_bits));
422
+
423
+ size_t i = 0;
424
+ for (; i + 8 <= d; i += 8) {
425
+ __m256i sb_cmp = _mm256_cmpgt_epi32(
426
+ _mm256_and_si256(_mm256_set1_epi32(sign_bits[i / 8]), bit_pos),
427
+ zero);
428
+ __m256 recon = _mm256_mul_ps(
429
+ _mm256_and_ps(_mm256_castsi256_ps(sb_cmp), v_one),
430
+ v_weights[ex_bits]);
431
+
432
+ uint64_t ex64 = 0;
433
+ memcpy(&ex64, ex_code + (i / 8) * ex_bits, sizeof(uint64_t));
434
+
435
+ for (size_t b = 0; b < ex_bits; b++) {
436
+ auto plane = static_cast<uint8_t>(_pext_u64(ex64, pext_masks[b]));
437
+ __m256i p_cmp = _mm256_cmpgt_epi32(
438
+ _mm256_and_si256(_mm256_set1_epi32(plane), bit_pos), zero);
439
+ __m256 p_f = _mm256_and_ps(_mm256_castsi256_ps(p_cmp), v_one);
440
+ recon = _mm256_fmadd_ps(p_f, v_weights[b], recon);
441
+ }
442
+
443
+ __m256 rq = _mm256_loadu_ps(rotated_q + i);
444
+ acc = _mm256_fmadd_ps(rq, _mm256_add_ps(recon, v_cb), acc);
445
+ }
446
+
447
+ float result = hsum_avx2(acc);
448
+ result += ip_scalar(sign_bits, ex_code, rotated_q, i, d, ex_bits, cb);
449
+ return result;
450
+ }
451
+ #endif // __BMI2__
452
+
453
+ } // namespace
454
+
455
+ template <>
456
+ float compute_inner_product<SIMDLevel::AVX512>(
457
+ const uint8_t* __restrict sign_bits,
458
+ const uint8_t* __restrict ex_code,
459
+ const float* __restrict rotated_q,
460
+ size_t d,
461
+ size_t ex_bits,
462
+ float cb) {
463
+ if (ex_bits == 1) {
464
+ return ip_1exbit_avx512(sign_bits, ex_code, rotated_q, d, cb);
465
+ }
466
+
467
+ #ifdef __BMI2__
468
+ if (ex_bits <= 7) {
469
+ return ip_bitplane_avx2(sign_bits, ex_code, rotated_q, d, ex_bits, cb);
470
+ }
471
+ #endif
472
+ return ip_scalar(sign_bits, ex_code, rotated_q, 0, d, ex_bits, cb);
473
+ }
474
+
475
+ } // namespace faiss::rabitq::multibit
476
+
477
+ #endif // COMPILE_SIMD_AVX512
@@ -0,0 +1,55 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #include <faiss/utils/rabitq_simd.h>
9
+
10
+ #ifdef COMPILE_SIMD_ARM_NEON
11
+
12
+ namespace faiss::rabitq {
13
+
14
+ template <>
15
+ uint64_t bitwise_and_dot_product<SIMDLevel::ARM_NEON>(
16
+ const uint8_t* query,
17
+ const uint8_t* data,
18
+ size_t size,
19
+ size_t qb) {
20
+ return bitwise_and_dot_product<SIMDLevel::NONE>(query, data, size, qb);
21
+ }
22
+
23
+ template <>
24
+ uint64_t bitwise_xor_dot_product<SIMDLevel::ARM_NEON>(
25
+ const uint8_t* query,
26
+ const uint8_t* data,
27
+ size_t size,
28
+ size_t qb) {
29
+ return bitwise_xor_dot_product<SIMDLevel::NONE>(query, data, size, qb);
30
+ }
31
+
32
+ template <>
33
+ uint64_t popcount<SIMDLevel::ARM_NEON>(const uint8_t* data, size_t size) {
34
+ return popcount<SIMDLevel::NONE>(data, size);
35
+ }
36
+
37
+ } // namespace faiss::rabitq
38
+
39
+ namespace faiss::rabitq::multibit {
40
+
41
+ template <>
42
+ float compute_inner_product<SIMDLevel::ARM_NEON>(
43
+ const uint8_t* __restrict sign_bits,
44
+ const uint8_t* __restrict ex_code,
45
+ const float* __restrict rotated_q,
46
+ size_t d,
47
+ size_t ex_bits,
48
+ float cb) {
49
+ return compute_inner_product<SIMDLevel::NONE>(
50
+ sign_bits, ex_code, rotated_q, d, ex_bits, cb);
51
+ }
52
+
53
+ } // namespace faiss::rabitq::multibit
54
+
55
+ #endif // COMPILE_SIMD_ARM_NEON
@@ -0,0 +1,55 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #include <faiss/utils/rabitq_simd.h>
9
+
10
+ #ifdef COMPILE_SIMD_RISCV_RVV
11
+
12
+ namespace faiss::rabitq {
13
+
14
+ template <>
15
+ uint64_t bitwise_and_dot_product<SIMDLevel::RISCV_RVV>(
16
+ const uint8_t* query,
17
+ const uint8_t* data,
18
+ size_t size,
19
+ size_t qb) {
20
+ return bitwise_and_dot_product<SIMDLevel::NONE>(query, data, size, qb);
21
+ }
22
+
23
+ template <>
24
+ uint64_t bitwise_xor_dot_product<SIMDLevel::RISCV_RVV>(
25
+ const uint8_t* query,
26
+ const uint8_t* data,
27
+ size_t size,
28
+ size_t qb) {
29
+ return bitwise_xor_dot_product<SIMDLevel::NONE>(query, data, size, qb);
30
+ }
31
+
32
+ template <>
33
+ uint64_t popcount<SIMDLevel::RISCV_RVV>(const uint8_t* data, size_t size) {
34
+ return popcount<SIMDLevel::NONE>(data, size);
35
+ }
36
+
37
+ } // namespace faiss::rabitq
38
+
39
+ namespace faiss::rabitq::multibit {
40
+
41
+ template <>
42
+ float compute_inner_product<SIMDLevel::RISCV_RVV>(
43
+ const uint8_t* __restrict sign_bits,
44
+ const uint8_t* __restrict ex_code,
45
+ const float* __restrict rotated_q,
46
+ size_t d,
47
+ size_t ex_bits,
48
+ float cb) {
49
+ return compute_inner_product<SIMDLevel::NONE>(
50
+ sign_bits, ex_code, rotated_q, d, ex_bits, cb);
51
+ }
52
+
53
+ } // namespace faiss::rabitq::multibit
54
+
55
+ #endif // COMPILE_SIMD_RISCV_RVV
@@ -0,0 +1,32 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #pragma once
9
+
10
+ // Private dispatch wrapper for SuperKMeans's block_l2. Routes to the
11
+ // highest available SIMD specialization at runtime (DD mode) or the
12
+ // compiled-in level (static mode). aarch64 currently falls through to the
13
+ // scalar primary template; adding NEON/SVE means just adding a new
14
+ // specialization file alongside the AVX ones.
15
+ //
16
+ // Known perf gap: aarch64 (NEON/SVE) specializations are not implemented yet.
17
+ // aarch64 falls through to the scalar primary template. Validating SVE requires
18
+ // a Graviton-class host; deferred to a focused follow-up.
19
+
20
+ #include <faiss/impl/simd_dispatch.h>
21
+ #include <faiss/utils/simd_impl/super_kmeans_kernels.h>
22
+
23
+ namespace faiss {
24
+ namespace detail {
25
+
26
+ inline float block_l2_dispatch(const float* x, const float* y, int n) {
27
+ return with_selected_simd_levels<AVAILABLE_SIMD_LEVELS_A0>(
28
+ [&]<SIMDLevel SL>() { return block_l2<SL>(x, y, n); });
29
+ }
30
+
31
+ } // namespace detail
32
+ } // namespace faiss
@@ -0,0 +1,43 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #pragma once
9
+
10
+ #include <cstddef>
11
+
12
+ #include <faiss/utils/simd_levels.h>
13
+
14
+ namespace faiss {
15
+ namespace detail {
16
+
17
+ // Squared L2 over `n` dimensions; n in [1, pdx_block_size].
18
+ // Primary template is the scalar fallback; SIMDLevels without a dedicated
19
+ // specialization (ARM_NEON, ARM_SVE, NONE, ...) use it directly.
20
+ template <SIMDLevel Level>
21
+ inline float block_l2(const float* x, const float* y, int n) {
22
+ float s = 0.0f;
23
+ for (int m = 0; m < n; ++m) {
24
+ const float d = x[m] - y[m];
25
+ s += d * d;
26
+ }
27
+ return s;
28
+ }
29
+
30
+ // COMPILE_SIMD_* is a build-system define (link-time promise that the
31
+ // specialization will be available). Mirrors the impl-file guards.
32
+ #ifdef COMPILE_SIMD_AVX2
33
+ template <>
34
+ float block_l2<SIMDLevel::AVX2>(const float* x, const float* y, int n);
35
+ #endif
36
+
37
+ #ifdef COMPILE_SIMD_AVX512
38
+ template <>
39
+ float block_l2<SIMDLevel::AVX512>(const float* x, const float* y, int n);
40
+ #endif
41
+
42
+ } // namespace detail
43
+ } // namespace faiss