faiss 0.6.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (378)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/ext/faiss/extconf.rb +2 -1
  4. data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
  5. data/ext/faiss/index_binary.cpp +1 -1
  6. data/ext/faiss/kmeans.cpp +1 -1
  7. data/ext/faiss/pca_matrix.cpp +1 -1
  8. data/ext/faiss/product_quantizer.cpp +1 -1
  9. data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
  10. data/lib/faiss/version.rb +1 -1
  11. data/vendor/faiss/faiss/AutoTune.cpp +93 -80
  12. data/vendor/faiss/faiss/Clustering.cpp +39 -240
  13. data/vendor/faiss/faiss/Clustering.h +6 -0
  14. data/vendor/faiss/faiss/IVFlib.cpp +41 -21
  15. data/vendor/faiss/faiss/Index.cpp +6 -5
  16. data/vendor/faiss/faiss/Index.h +5 -5
  17. data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
  20. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
  21. data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
  22. data/vendor/faiss/faiss/IndexBinary.h +4 -4
  23. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
  24. data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
  26. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +88 -97
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
  28. data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
  29. data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
  30. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +89 -417
  31. data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
  32. data/vendor/faiss/faiss/IndexFastScan.h +25 -23
  33. data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
  34. data/vendor/faiss/faiss/IndexFlat.h +21 -18
  35. data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
  36. data/vendor/faiss/faiss/IndexHNSW.cpp +374 -206
  37. data/vendor/faiss/faiss/IndexHNSW.h +16 -2
  38. data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
  39. data/vendor/faiss/faiss/IndexIDMap.h +9 -7
  40. data/vendor/faiss/faiss/IndexIVF.cpp +467 -364
  41. data/vendor/faiss/faiss/IndexIVF.h +33 -12
  42. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +79 -76
  43. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
  44. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
  45. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
  46. data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
  47. data/vendor/faiss/faiss/IndexIVFFlat.cpp +39 -69
  48. data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
  49. data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +56 -33
  50. data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
  51. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
  52. data/vendor/faiss/faiss/IndexIVFPQ.cpp +73 -846
  53. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
  54. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
  55. data/vendor/faiss/faiss/IndexIVFPQR.cpp +23 -20
  56. data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +30 -52
  57. data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
  58. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
  59. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
  60. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
  61. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
  62. data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
  63. data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
  64. data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
  65. data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
  66. data/vendor/faiss/faiss/IndexNSG.cpp +38 -23
  67. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
  68. data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
  69. data/vendor/faiss/faiss/IndexPQ.h +3 -2
  70. data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
  71. data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
  72. data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
  73. data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
  74. data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
  75. data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
  76. data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
  77. data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
  78. data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
  79. data/vendor/faiss/faiss/IndexRefine.h +4 -4
  80. data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
  81. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
  82. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
  83. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +150 -20
  84. data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
  85. data/vendor/faiss/faiss/IndexShards.cpp +10 -9
  86. data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
  87. data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
  88. data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
  89. data/vendor/faiss/faiss/MetaIndexes.h +1 -1
  90. data/vendor/faiss/faiss/MetricType.h +14 -7
  91. data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
  92. data/vendor/faiss/faiss/SuperKMeans.h +97 -0
  93. data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
  94. data/vendor/faiss/faiss/VectorTransform.h +16 -16
  95. data/vendor/faiss/faiss/build.cpp +23 -0
  96. data/vendor/faiss/faiss/build.h +15 -0
  97. data/vendor/faiss/faiss/clone_index.cpp +48 -47
  98. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
  101. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
  102. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
  103. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
  104. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
  105. data/vendor/faiss/faiss/factory_tools.cpp +9 -0
  106. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
  107. data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
  108. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +15 -16
  109. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +5 -4
  110. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
  111. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
  112. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
  113. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
  114. data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
  115. data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
  116. data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
  117. data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
  118. data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
  119. data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
  120. data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
  121. data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +58 -0
  122. data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
  123. data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
  124. data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +111 -0
  125. data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
  126. data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
  127. data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
  128. data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
  129. data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
  130. data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
  131. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
  132. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
  133. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
  134. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
  135. data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
  136. data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
  137. data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
  138. data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
  139. data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
  140. data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
  141. data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
  142. data/vendor/faiss/faiss/impl/FaissException.h +50 -3
  143. data/vendor/faiss/faiss/impl/HNSW.cpp +639 -507
  144. data/vendor/faiss/faiss/impl/HNSW.h +61 -44
  145. data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
  146. data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
  147. data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
  148. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
  149. data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
  150. data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
  151. data/vendor/faiss/faiss/impl/NSG.cpp +53 -32
  152. data/vendor/faiss/faiss/impl/NSG.h +4 -4
  153. data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
  154. data/vendor/faiss/faiss/impl/Panorama.h +269 -87
  155. data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
  156. data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
  157. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
  158. data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
  159. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
  160. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
  161. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +55 -25
  162. data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
  163. data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
  164. data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
  165. data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +302 -283
  166. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
  167. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
  168. data/vendor/faiss/faiss/impl/ResultHandler.h +100 -75
  169. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +318 -7
  170. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +77 -1
  171. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
  172. data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
  173. data/vendor/faiss/faiss/impl/VisitedTable.h +70 -28
  174. data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
  175. data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
  176. data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
  177. data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
  178. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
  179. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
  180. data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
  181. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
  182. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
  183. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
  184. data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
  185. data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
  186. data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
  187. data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
  188. data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
  189. data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
  190. data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
  191. data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
  192. data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
  193. data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
  194. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
  195. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
  196. data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
  197. data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +270 -0
  198. data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
  199. data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
  200. data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
  201. data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
  202. data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
  203. data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
  204. data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
  205. data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
  206. data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
  207. data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
  208. data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
  209. data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
  210. data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
  211. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +83 -0
  212. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +113 -0
  213. data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +150 -0
  214. data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +142 -0
  215. data/vendor/faiss/faiss/impl/index_read.cpp +1227 -79
  216. data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
  217. data/vendor/faiss/faiss/impl/index_write.cpp +96 -13
  218. data/vendor/faiss/faiss/impl/io.cpp +6 -6
  219. data/vendor/faiss/faiss/impl/io_macros.h +58 -16
  220. data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
  221. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
  222. data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
  223. data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
  224. data/vendor/faiss/faiss/impl/platform_macros.h +15 -4
  225. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
  226. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
  227. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
  228. data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
  229. data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +23 -0
  230. data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +23 -0
  231. data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +23 -0
  232. data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
  233. data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
  234. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +45 -107
  235. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
  236. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +274 -5
  237. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +10 -7
  238. data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
  239. data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +70 -0
  240. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
  241. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
  242. data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
  243. data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +9 -2
  244. data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +419 -19
  245. data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
  246. data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
  247. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +387 -2
  248. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
  249. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
  250. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +341 -2
  251. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +425 -3
  252. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +290 -2
  253. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +337 -0
  254. data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
  255. data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
  256. data/vendor/faiss/faiss/impl/simd_dispatch.h +157 -66
  257. data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
  258. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
  259. data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
  260. data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
  261. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
  262. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
  263. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
  264. data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
  265. data/vendor/faiss/faiss/impl/svs_io.h +8 -2
  266. data/vendor/faiss/faiss/index_factory.cpp +90 -18
  267. data/vendor/faiss/faiss/index_io.h +40 -0
  268. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
  269. data/vendor/faiss/faiss/invlists/DirectMap.cpp +28 -15
  270. data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
  271. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +170 -86
  272. data/vendor/faiss/faiss/invlists/InvertedLists.h +88 -25
  273. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
  274. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
  275. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
  276. data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
  277. data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
  278. data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
  279. data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
  280. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
  281. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
  282. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
  283. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
  284. data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +142 -21
  285. data/vendor/faiss/faiss/svs/IndexSVSVamana.h +33 -7
  286. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
  287. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
  288. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +77 -27
  289. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +10 -4
  290. data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
  291. data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
  292. data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
  293. data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
  294. data/vendor/faiss/faiss/utils/bf16.h +34 -0
  295. data/vendor/faiss/faiss/utils/distances.cpp +390 -560
  296. data/vendor/faiss/faiss/utils/distances.h +20 -1
  297. data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
  298. data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
  299. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
  300. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
  301. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
  302. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
  303. data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
  304. data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -178
  305. data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
  306. data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
  307. data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
  308. data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
  309. data/vendor/faiss/faiss/utils/hamming.h +92 -2
  310. data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
  311. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +16 -0
  312. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
  313. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
  314. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
  315. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +210 -0
  316. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
  317. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
  318. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
  319. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
  320. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
  321. data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
  322. data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
  323. data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
  324. data/vendor/faiss/faiss/utils/partitioning.cpp +66 -989
  325. data/vendor/faiss/faiss/utils/partitioning.h +31 -0
  326. data/vendor/faiss/faiss/utils/popcount.h +29 -0
  327. data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
  328. data/vendor/faiss/faiss/utils/prefetch.h +2 -2
  329. data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
  330. data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
  331. data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
  332. data/vendor/faiss/faiss/utils/random.cpp +6 -6
  333. data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
  334. data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
  335. data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
  336. data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
  337. data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
  338. data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
  339. data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
  340. data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
  341. data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
  342. data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
  343. data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
  344. data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
  345. data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
  346. data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1031 -0
  347. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
  348. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
  349. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
  350. data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
  351. data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
  352. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
  353. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
  354. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
  355. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
  356. data/vendor/faiss/faiss/utils/simd_levels.cpp +29 -7
  357. data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
  358. data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
  359. data/vendor/faiss/faiss/utils/utils.cpp +5 -5
  360. data/vendor/faiss/faiss/utils/utils.h +3 -3
  361. metadata +129 -34
  362. data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
  363. data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
  364. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
  365. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
  366. data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
  367. data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
  368. data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
  369. data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
  370. data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
  371. data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
  372. data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
  373. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
  374. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
  375. data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
  376. data/vendor/faiss/faiss/utils/simdlib.h +0 -42
  377. data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
  378. /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
@@ -0,0 +1,343 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ /**
9
+ * @file rabitq_avx512_spr.cpp
10
+ *
11
+ * RaBitQ SIMD kernels specialized for SIMDLevel::AVX512_SPR.
12
+ *
13
+ * Sapphire Rapids (SPR) and later Intel microarchitectures expose
14
+ * AVX-512 VPOPCNTDQ (vpopcntq), which performs a per-lane 64-bit
15
+ * popcount in a single instruction. This is used here to replace the
16
+ * multi-step shuffle/pshufb-based popcount used by the generic AVX-512
17
+ * specialization in rabitq_avx512.cpp. The popcount-heavy kernels
18
+ * (bitwise_and_dot_product, bitwise_xor_dot_product, popcount) become
19
+ * substantially shorter and faster on SPR+ as a result.
20
+ *
21
+ * Build / dispatch behavior:
22
+ * - faiss_avx512 (AVX-512 only, no SPR features): NOT compiled.
23
+ * The existing AVX512 specialization in rabitq_avx512.cpp is used.
24
+ * - faiss_avx512_spr (statically built for SPR+): compiled. The
25
+ * SINGLE_SIMD_LEVEL is AVX512_SPR, so this specialization is
26
+ * selected by static dispatch.
27
+ * - faiss with FAISS_OPT_LEVEL=dd (dynamic dispatch): compiled with
28
+ * -mavx512vpopcntdq as a per-file flag. Selected at runtime when
29
+ * SIMDConfig::level == SIMDLevel::AVX512_SPR.
30
+ *
31
+ * The floating-point multi-bit inner-product kernel does not benefit
32
+ * from VPOPCNTDQ, so this TU forwards compute_inner_product<SPR> to
33
+ * the AVX512 implementation to avoid duplicating that code path.
34
+ */
35
+
36
+ #ifdef COMPILE_SIMD_AVX512_SPR
37
+
38
+ #include <faiss/utils/popcount.h>
39
+ #include <faiss/utils/rabitq_simd.h>
40
+ #include <immintrin.h>
41
+ #include <cstdint>
42
+
43
+ #if defined(_MSC_VER)
44
+ #include <intrin.h>
45
+ #endif
46
+
47
+ namespace faiss::rabitq {
48
+
49
+ // Forward declarations for the AVX512 specializations defined in
50
+ // rabitq_avx512.cpp. They live in the same TU group on SPR builds, so
51
+ // we can reuse them as a tail handler / fallback. Declaring rather
52
+ // than redefining avoids ODR risk and keeps a single source of truth
53
+ // for the floating-point kernel.
54
+ template <>
55
+ uint64_t bitwise_and_dot_product<SIMDLevel::AVX512>(
56
+ const uint8_t* query,
57
+ const uint8_t* data,
58
+ size_t size,
59
+ size_t qb);
60
+ template <>
61
+ uint64_t bitwise_xor_dot_product<SIMDLevel::AVX512>(
62
+ const uint8_t* query,
63
+ const uint8_t* data,
64
+ size_t size,
65
+ size_t qb);
66
+ template <>
67
+ uint64_t popcount<SIMDLevel::AVX512>(const uint8_t* data, size_t size);
68
+
69
+ namespace {
70
+
71
// Per-lane popcount over a 512-bit register: each of the eight 64-bit lanes
// of the result holds the number of set bits in the matching lane of `v`.
// Maps to a single vpopcntq (AVX-512 VPOPCNTDQ).
inline __m512i popcount_512_vpopcntdq(__m512i v) {
    const __m512i lane_counts = _mm512_popcnt_epi64(v);
    return lane_counts;
}
76
+
77
// Per-lane popcount over a 256-bit register (four 64-bit lanes).
// Uses the 256-bit form of vpopcntq, _mm256_popcnt_epi64, which needs
// AVX512VL in addition to VPOPCNTDQ; both belong to the SPR feature set.
inline __m256i popcount_256_vpopcntdq(__m256i v) {
    const __m256i lane_counts = _mm256_popcnt_epi64(v);
    return lane_counts;
}
83
+
84
// Per-lane popcount over a 128-bit register (two 64-bit lanes), using the
// AVX-512VL form of VPOPCNTDQ.
inline __m128i popcount_128_vpopcntdq(__m128i v) {
    const __m128i lane_counts = _mm_popcnt_epi64(v);
    return lane_counts;
}
88
+
89
// Horizontal add: returns the sum of the four 64-bit lanes of `v`.
inline uint64_t reduce_add_256(__m256i v) {
    // The aligned store below requires a 32-byte-aligned destination.
    alignas(32) uint64_t spilled[4];
    _mm256_store_si256(reinterpret_cast<__m256i*>(spilled), v);

    uint64_t total = 0;
    for (const uint64_t lane : spilled) {
        total += lane;
    }
    return total;
}
94
+
95
// Horizontal add: returns the sum of the two 64-bit lanes of `v`.
inline uint64_t reduce_add_128(__m128i v) {
    // The aligned store below requires a 16-byte-aligned destination.
    alignas(16) uint64_t spilled[2];
    _mm_store_si128(reinterpret_cast<__m128i*>(spilled), v);

    uint64_t total = 0;
    for (const uint64_t lane : spilled) {
        total += lane;
    }
    return total;
}
100
+
101
+ } // namespace
102
+
103
+ template <>
104
+ uint64_t bitwise_and_dot_product<SIMDLevel::AVX512_SPR>(
105
+ const uint8_t* query,
106
+ const uint8_t* data,
107
+ size_t size,
108
+ size_t qb) {
109
+ uint64_t sum = 0;
110
+ size_t offset = 0;
111
+
112
+ // 512-bit main loop: vpopcntq replaces the shuffle-based popcount,
113
+ // halving the instruction count per iteration relative to AVX512.
114
+ if (size_t step = 512 / 8; offset + step <= size) {
115
+ __m512i sum_512 = _mm512_setzero_si512();
116
+ for (; offset + step <= size; offset += step) {
117
+ __m512i v_x = _mm512_loadu_si512(
118
+ reinterpret_cast<const __m512i*>(data + offset));
119
+ for (size_t j = 0; j < qb; j++) {
120
+ __m512i v_q = _mm512_loadu_si512(
121
+ reinterpret_cast<const __m512i*>(
122
+ query + j * size + offset));
123
+ __m512i v_and = _mm512_and_si512(v_q, v_x);
124
+ __m512i v_popcnt = popcount_512_vpopcntdq(v_and);
125
+ __m512i v_shifted = _mm512_slli_epi64(v_popcnt, j);
126
+ sum_512 = _mm512_add_epi64(sum_512, v_shifted);
127
+ }
128
+ }
129
+ sum += _mm512_reduce_add_epi64(sum_512);
130
+ }
131
+
132
+ // 256-bit tail.
133
+ if (size_t step = 256 / 8; offset + step <= size) {
134
+ __m256i sum_256 = _mm256_setzero_si256();
135
+ for (; offset + step <= size; offset += step) {
136
+ __m256i v_x = _mm256_loadu_si256(
137
+ reinterpret_cast<const __m256i*>(data + offset));
138
+ for (size_t j = 0; j < qb; j++) {
139
+ __m256i v_q = _mm256_loadu_si256(
140
+ reinterpret_cast<const __m256i*>(
141
+ query + j * size + offset));
142
+ __m256i v_and = _mm256_and_si256(v_q, v_x);
143
+ __m256i v_popcnt = popcount_256_vpopcntdq(v_and);
144
+ __m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
145
+ sum_256 = _mm256_add_epi64(sum_256, v_shifted);
146
+ }
147
+ }
148
+ sum += reduce_add_256(sum_256);
149
+ }
150
+
151
+ // 128-bit tail.
152
+ __m128i sum_128 = _mm_setzero_si128();
153
+ for (size_t step = 128 / 8; offset + step <= size; offset += step) {
154
+ __m128i v_x = _mm_loadu_si128(
155
+ reinterpret_cast<const __m128i*>(data + offset));
156
+ for (size_t j = 0; j < qb; j++) {
157
+ __m128i v_q = _mm_loadu_si128(
158
+ reinterpret_cast<const __m128i*>(
159
+ query + j * size + offset));
160
+ __m128i v_and = _mm_and_si128(v_q, v_x);
161
+ __m128i v_popcnt = popcount_128_vpopcntdq(v_and);
162
+ __m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
163
+ sum_128 = _mm_add_epi64(sum_128, v_shifted);
164
+ }
165
+ }
166
+ sum += reduce_add_128(sum_128);
167
+
168
+ // 64-bit scalar tail.
169
+ for (size_t step = 64 / 8; offset + step <= size; offset += step) {
170
+ const auto yv = *reinterpret_cast<const uint64_t*>(data + offset);
171
+ for (size_t j = 0; j < qb; j++) {
172
+ const auto qv = *reinterpret_cast<const uint64_t*>(
173
+ query + j * size + offset);
174
+ sum += static_cast<uint64_t>(popcount64(qv & yv)) << j;
175
+ }
176
+ }
177
+ // Byte tail.
178
+ for (; offset < size; ++offset) {
179
+ const auto yv = *(data + offset);
180
+ for (size_t j = 0; j < qb; j++) {
181
+ const auto qv = *(query + j * size + offset);
182
+ sum += static_cast<uint64_t>(popcount32(qv & yv)) << j;
183
+ }
184
+ }
185
+ return sum;
186
+ }
187
+
188
+ template <>
189
+ uint64_t bitwise_xor_dot_product<SIMDLevel::AVX512_SPR>(
190
+ const uint8_t* query,
191
+ const uint8_t* data,
192
+ size_t size,
193
+ size_t qb) {
194
+ uint64_t sum = 0;
195
+ size_t offset = 0;
196
+
197
+ if (size_t step = 512 / 8; offset + step <= size) {
198
+ __m512i sum_512 = _mm512_setzero_si512();
199
+ for (; offset + step <= size; offset += step) {
200
+ __m512i v_x = _mm512_loadu_si512(
201
+ reinterpret_cast<const __m512i*>(data + offset));
202
+ for (size_t j = 0; j < qb; j++) {
203
+ __m512i v_q = _mm512_loadu_si512(
204
+ reinterpret_cast<const __m512i*>(
205
+ query + j * size + offset));
206
+ __m512i v_xor = _mm512_xor_si512(v_q, v_x);
207
+ __m512i v_popcnt = popcount_512_vpopcntdq(v_xor);
208
+ __m512i v_shifted = _mm512_slli_epi64(v_popcnt, j);
209
+ sum_512 = _mm512_add_epi64(sum_512, v_shifted);
210
+ }
211
+ }
212
+ sum += _mm512_reduce_add_epi64(sum_512);
213
+ }
214
+
215
+ if (size_t step = 256 / 8; offset + step <= size) {
216
+ __m256i sum_256 = _mm256_setzero_si256();
217
+ for (; offset + step <= size; offset += step) {
218
+ __m256i v_x = _mm256_loadu_si256(
219
+ reinterpret_cast<const __m256i*>(data + offset));
220
+ for (size_t j = 0; j < qb; j++) {
221
+ __m256i v_q = _mm256_loadu_si256(
222
+ reinterpret_cast<const __m256i*>(
223
+ query + j * size + offset));
224
+ __m256i v_xor = _mm256_xor_si256(v_q, v_x);
225
+ __m256i v_popcnt = popcount_256_vpopcntdq(v_xor);
226
+ __m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
227
+ sum_256 = _mm256_add_epi64(sum_256, v_shifted);
228
+ }
229
+ }
230
+ sum += reduce_add_256(sum_256);
231
+ }
232
+
233
+ __m128i sum_128 = _mm_setzero_si128();
234
+ for (size_t step = 128 / 8; offset + step <= size; offset += step) {
235
+ __m128i v_x = _mm_loadu_si128(
236
+ reinterpret_cast<const __m128i*>(data + offset));
237
+ for (size_t j = 0; j < qb; j++) {
238
+ __m128i v_q = _mm_loadu_si128(
239
+ reinterpret_cast<const __m128i*>(
240
+ query + j * size + offset));
241
+ __m128i v_xor = _mm_xor_si128(v_q, v_x);
242
+ __m128i v_popcnt = popcount_128_vpopcntdq(v_xor);
243
+ __m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
244
+ sum_128 = _mm_add_epi64(sum_128, v_shifted);
245
+ }
246
+ }
247
+ sum += reduce_add_128(sum_128);
248
+
249
+ for (size_t step = 64 / 8; offset + step <= size; offset += step) {
250
+ const auto yv = *reinterpret_cast<const uint64_t*>(data + offset);
251
+ for (size_t j = 0; j < qb; j++) {
252
+ const auto qv = *reinterpret_cast<const uint64_t*>(
253
+ query + j * size + offset);
254
+ sum += static_cast<uint64_t>(popcount64(qv ^ yv)) << j;
255
+ }
256
+ }
257
+ for (; offset < size; ++offset) {
258
+ const auto yv = *(data + offset);
259
+ for (size_t j = 0; j < qb; j++) {
260
+ const auto qv = *(query + j * size + offset);
261
+ sum += static_cast<uint64_t>(popcount32(qv ^ yv)) << j;
262
+ }
263
+ }
264
+ return sum;
265
+ }
266
+
267
+ template <>
268
+ uint64_t popcount<SIMDLevel::AVX512_SPR>(const uint8_t* data, size_t size) {
269
+ uint64_t sum = 0;
270
+ size_t offset = 0;
271
+
272
+ if (offset + 512 / 8 <= size) {
273
+ __m512i sum_512 = _mm512_setzero_si512();
274
+ for (size_t end; (end = offset + 512 / 8) <= size; offset = end) {
275
+ __m512i v_x = _mm512_loadu_si512(
276
+ reinterpret_cast<const __m512i*>(data + offset));
277
+ __m512i v_popcnt = popcount_512_vpopcntdq(v_x);
278
+ sum_512 = _mm512_add_epi64(sum_512, v_popcnt);
279
+ }
280
+ sum += _mm512_reduce_add_epi64(sum_512);
281
+ }
282
+
283
+ if (offset + 256 / 8 <= size) {
284
+ __m256i sum_256 = _mm256_setzero_si256();
285
+ for (size_t end; (end = offset + 256 / 8) <= size; offset = end) {
286
+ __m256i v_x = _mm256_loadu_si256(
287
+ reinterpret_cast<const __m256i*>(data + offset));
288
+ __m256i v_popcnt = popcount_256_vpopcntdq(v_x);
289
+ sum_256 = _mm256_add_epi64(sum_256, v_popcnt);
290
+ }
291
+ sum += reduce_add_256(sum_256);
292
+ }
293
+
294
+ __m128i sum_128 = _mm_setzero_si128();
295
+ for (size_t step = 128 / 8; offset + step <= size; offset += step) {
296
+ __m128i v_x = _mm_loadu_si128(
297
+ reinterpret_cast<const __m128i*>(data + offset));
298
+ sum_128 = _mm_add_epi64(sum_128, popcount_128_vpopcntdq(v_x));
299
+ }
300
+ sum += reduce_add_128(sum_128);
301
+
302
+ for (size_t step = 64 / 8; offset + step <= size; offset += step) {
303
+ const auto yv = *reinterpret_cast<const uint64_t*>(data + offset);
304
+ sum += popcount64(yv);
305
+ }
306
+ for (; offset < size; ++offset) {
307
+ const auto yv = *(data + offset);
308
+ sum += popcount32(yv);
309
+ }
310
+ return sum;
311
+ }
312
+
313
+ } // namespace faiss::rabitq
314
+
315
+ namespace faiss::rabitq::multibit {
316
+
317
+ // Forward-declare the AVX512 floating-point inner-product kernel.
318
+ // VPOPCNTDQ does not help this kernel (it operates on FP32), so we
319
+ // reuse the AVX512 implementation rather than duplicate it.
320
+ template <>
321
+ float compute_inner_product<SIMDLevel::AVX512>(
322
+ const uint8_t* __restrict sign_bits,
323
+ const uint8_t* __restrict ex_code,
324
+ const float* __restrict rotated_q,
325
+ size_t d,
326
+ size_t ex_bits,
327
+ float cb);
328
+
329
+ template <>
330
+ float compute_inner_product<SIMDLevel::AVX512_SPR>(
331
+ const uint8_t* __restrict sign_bits,
332
+ const uint8_t* __restrict ex_code,
333
+ const float* __restrict rotated_q,
334
+ size_t d,
335
+ size_t ex_bits,
336
+ float cb) {
337
+ return compute_inner_product<SIMDLevel::AVX512>(
338
+ sign_bits, ex_code, rotated_q, d, ex_bits, cb);
339
+ }
340
+
341
+ } // namespace faiss::rabitq::multibit
342
+
343
+ #endif // COMPILE_SIMD_AVX512_SPR
@@ -0,0 +1,55 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #include <faiss/utils/rabitq_simd.h>
9
+
10
+ #ifdef COMPILE_SIMD_ARM_NEON
11
+
12
+ namespace faiss::rabitq {
13
+
14
+ template <>
15
+ uint64_t bitwise_and_dot_product<SIMDLevel::ARM_NEON>(
16
+ const uint8_t* query,
17
+ const uint8_t* data,
18
+ size_t size,
19
+ size_t qb) {
20
+ return bitwise_and_dot_product<SIMDLevel::NONE>(query, data, size, qb);
21
+ }
22
+
23
+ template <>
24
+ uint64_t bitwise_xor_dot_product<SIMDLevel::ARM_NEON>(
25
+ const uint8_t* query,
26
+ const uint8_t* data,
27
+ size_t size,
28
+ size_t qb) {
29
+ return bitwise_xor_dot_product<SIMDLevel::NONE>(query, data, size, qb);
30
+ }
31
+
32
+ template <>
33
+ uint64_t popcount<SIMDLevel::ARM_NEON>(const uint8_t* data, size_t size) {
34
+ return popcount<SIMDLevel::NONE>(data, size);
35
+ }
36
+
37
+ } // namespace faiss::rabitq
38
+
39
+ namespace faiss::rabitq::multibit {
40
+
41
+ template <>
42
+ float compute_inner_product<SIMDLevel::ARM_NEON>(
43
+ const uint8_t* __restrict sign_bits,
44
+ const uint8_t* __restrict ex_code,
45
+ const float* __restrict rotated_q,
46
+ size_t d,
47
+ size_t ex_bits,
48
+ float cb) {
49
+ return compute_inner_product<SIMDLevel::NONE>(
50
+ sign_bits, ex_code, rotated_q, d, ex_bits, cb);
51
+ }
52
+
53
+ } // namespace faiss::rabitq::multibit
54
+
55
+ #endif // COMPILE_SIMD_ARM_NEON
@@ -0,0 +1,55 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #include <faiss/utils/rabitq_simd.h>
9
+
10
+ #ifdef COMPILE_SIMD_RISCV_RVV
11
+
12
+ namespace faiss::rabitq {
13
+
14
+ template <>
15
+ uint64_t bitwise_and_dot_product<SIMDLevel::RISCV_RVV>(
16
+ const uint8_t* query,
17
+ const uint8_t* data,
18
+ size_t size,
19
+ size_t qb) {
20
+ return bitwise_and_dot_product<SIMDLevel::NONE>(query, data, size, qb);
21
+ }
22
+
23
+ template <>
24
+ uint64_t bitwise_xor_dot_product<SIMDLevel::RISCV_RVV>(
25
+ const uint8_t* query,
26
+ const uint8_t* data,
27
+ size_t size,
28
+ size_t qb) {
29
+ return bitwise_xor_dot_product<SIMDLevel::NONE>(query, data, size, qb);
30
+ }
31
+
32
+ template <>
33
+ uint64_t popcount<SIMDLevel::RISCV_RVV>(const uint8_t* data, size_t size) {
34
+ return popcount<SIMDLevel::NONE>(data, size);
35
+ }
36
+
37
+ } // namespace faiss::rabitq
38
+
39
+ namespace faiss::rabitq::multibit {
40
+
41
+ template <>
42
+ float compute_inner_product<SIMDLevel::RISCV_RVV>(
43
+ const uint8_t* __restrict sign_bits,
44
+ const uint8_t* __restrict ex_code,
45
+ const float* __restrict rotated_q,
46
+ size_t d,
47
+ size_t ex_bits,
48
+ float cb) {
49
+ return compute_inner_product<SIMDLevel::NONE>(
50
+ sign_bits, ex_code, rotated_q, d, ex_bits, cb);
51
+ }
52
+
53
+ } // namespace faiss::rabitq::multibit
54
+
55
+ #endif // COMPILE_SIMD_RISCV_RVV
@@ -0,0 +1,32 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #pragma once
9
+
10
// Private dispatch wrapper for SuperKMeans's block_l2. Routes to the
// highest available SIMD specialization at runtime (DD mode) or the
// compiled-in level (static mode).
//
// Known perf gap: aarch64 (NEON/SVE) specializations are not implemented
// yet, so aarch64 currently falls through to the scalar primary template.
// Adding NEON/SVE support only requires a new specialization file alongside
// the AVX ones. Validating SVE requires a Graviton-class host, so that work
// is deferred to a focused follow-up.
19
+
20
+ #include <faiss/impl/simd_dispatch.h>
21
+ #include <faiss/utils/simd_impl/super_kmeans_kernels.h>
22
+
23
+ namespace faiss {
24
+ namespace detail {
25
+
26
+ inline float block_l2_dispatch(const float* x, const float* y, int n) {
27
+ return with_selected_simd_levels<AVAILABLE_SIMD_LEVELS_A0>(
28
+ [&]<SIMDLevel SL>() { return block_l2<SL>(x, y, n); });
29
+ }
30
+
31
+ } // namespace detail
32
+ } // namespace faiss
@@ -0,0 +1,43 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #pragma once
9
+
10
+ #include <cstddef>
11
+
12
+ #include <faiss/utils/simd_levels.h>
13
+
14
+ namespace faiss {
15
+ namespace detail {
16
+
17
+ // Squared L2 over `n` dimensions; n in [1, pdx_block_size].
18
+ // Primary template is the scalar fallback; SIMDLevels without a dedicated
19
+ // specialization (ARM_NEON, ARM_SVE, NONE, ...) use it directly.
20
+ template <SIMDLevel Level>
21
+ inline float block_l2(const float* x, const float* y, int n) {
22
+ float s = 0.0f;
23
+ for (int m = 0; m < n; ++m) {
24
+ const float d = x[m] - y[m];
25
+ s += d * d;
26
+ }
27
+ return s;
28
+ }
29
+
30
+ // COMPILE_SIMD_* is a build-system define (link-time promise that the
31
+ // specialization will be available). Mirrors the impl-file guards.
32
+ #ifdef COMPILE_SIMD_AVX2
33
+ template <>
34
+ float block_l2<SIMDLevel::AVX2>(const float* x, const float* y, int n);
35
+ #endif
36
+
37
+ #ifdef COMPILE_SIMD_AVX512
38
+ template <>
39
+ float block_l2<SIMDLevel::AVX512>(const float* x, const float* y, int n);
40
+ #endif
41
+
42
+ } // namespace detail
43
+ } // namespace faiss
@@ -0,0 +1,57 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #ifdef COMPILE_SIMD_AVX2
9
+
10
+ #include <faiss/utils/simd_impl/super_kmeans_kernels.h>
11
+
12
+ #include <immintrin.h>
13
+
14
+ namespace faiss {
15
+ namespace detail {
16
+
17
+ namespace {
18
+
19
+ // Reduce 8 float lanes of an AVX2 register to a scalar sum.
20
+ // Uses a shuffle+add tree instead of two _mm_hadd_ps. On Skylake-class
21
+ // cores, hadd is 3-cycle latency / 2-uop, while movehdup/movehl/add_ss
22
+ // are single-uop, single-cycle ops.
23
+ inline float horizontal_sum_avx2(__m256 v) {
24
+ __m128 lo = _mm256_castps256_ps128(v);
25
+ __m128 hi = _mm256_extractf128_ps(v, 1);
26
+ __m128 sum128 = _mm_add_ps(lo, hi); // 4 lanes
27
+ __m128 shuf = _mm_movehdup_ps(sum128); // [s1, s1, s3, s3]
28
+ __m128 sums = _mm_add_ps(sum128, shuf); // [s0+s1, _, s2+s3, _]
29
+ shuf = _mm_movehl_ps(shuf, sums); // [s2+s3, s3, _, _]
30
+ sums = _mm_add_ss(sums, shuf); // (s0+s1) + (s2+s3)
31
+ return _mm_cvtss_f32(sums);
32
+ }
33
+
34
+ } // namespace
35
+
36
+ template <>
37
+ float block_l2<SIMDLevel::AVX2>(const float* x, const float* y, int n) {
38
+ __m256 acc = _mm256_setzero_ps();
39
+ int m = 0;
40
+ for (; m + 8 <= n; m += 8) {
41
+ __m256 xv = _mm256_loadu_ps(x + m);
42
+ __m256 yv = _mm256_loadu_ps(y + m);
43
+ __m256 diff = _mm256_sub_ps(xv, yv);
44
+ acc = _mm256_fmadd_ps(diff, diff, acc);
45
+ }
46
+ float result = horizontal_sum_avx2(acc);
47
+ for (; m < n; ++m) {
48
+ const float d = x[m] - y[m];
49
+ result += d * d;
50
+ }
51
+ return result;
52
+ }
53
+
54
+ } // namespace detail
55
+ } // namespace faiss
56
+
57
+ #endif // COMPILE_SIMD_AVX2
@@ -0,0 +1,45 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #ifdef COMPILE_SIMD_AVX512
9
+
10
+ #include <faiss/utils/simd_impl/super_kmeans_kernels.h>
11
+
12
+ #include <immintrin.h>
13
+
14
+ namespace faiss {
15
+ namespace detail {
16
+
17
+ template <>
18
+ float block_l2<SIMDLevel::AVX512>(const float* x, const float* y, int n) {
19
+ __m512 acc = _mm512_setzero_ps();
20
+ int m = 0;
21
+ for (; m + 16 <= n; m += 16) {
22
+ __m512 xv = _mm512_loadu_ps(x + m);
23
+ __m512 yv = _mm512_loadu_ps(y + m);
24
+ __m512 diff = _mm512_sub_ps(xv, yv);
25
+ acc = _mm512_fmadd_ps(diff, diff, acc);
26
+ }
27
+ // _mm512_reduce_add_ps: on modern AVX-512 SKUs (Cascade Lake+, Sapphire
28
+ // Rapids) GCC/Clang lower this to a shuffle+add tree, ~5-cycle latency.
29
+ // On older AVX-512 SKUs (Skylake-X, Ice Lake) the cross-lane reduction
30
+ // can be ~20 cycles. Acceptable here because n ~ pdx_block_size = 64
31
+ // (4 iterations of 16-wide accumulation), so per-block work dominates
32
+ // the reduction cost. AVX2 uses a manual shuffle+add tree explicitly
33
+ // to avoid `_mm_hadd_ps` overhead, where the ratio is reversed.
34
+ float result = _mm512_reduce_add_ps(acc);
35
+ for (; m < n; ++m) {
36
+ const float d = x[m] - y[m];
37
+ result += d * d;
38
+ }
39
+ return result;
40
+ }
41
+
42
+ } // namespace detail
43
+ } // namespace faiss
44
+
45
+ #endif // COMPILE_SIMD_AVX512