faiss 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (361) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/faiss/extconf.rb +2 -1
  4. data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
  5. data/ext/faiss/index_binary.cpp +1 -1
  6. data/ext/faiss/kmeans.cpp +1 -1
  7. data/ext/faiss/pca_matrix.cpp +1 -1
  8. data/ext/faiss/product_quantizer.cpp +1 -1
  9. data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
  10. data/lib/faiss/version.rb +1 -1
  11. data/vendor/faiss/faiss/AutoTune.cpp +93 -80
  12. data/vendor/faiss/faiss/Clustering.cpp +39 -240
  13. data/vendor/faiss/faiss/Clustering.h +6 -0
  14. data/vendor/faiss/faiss/IVFlib.cpp +41 -21
  15. data/vendor/faiss/faiss/Index.cpp +6 -5
  16. data/vendor/faiss/faiss/Index.h +5 -5
  17. data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
  20. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
  21. data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
  22. data/vendor/faiss/faiss/IndexBinary.h +4 -4
  23. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
  24. data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
  26. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +84 -92
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
  28. data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
  29. data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
  30. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +87 -415
  31. data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
  32. data/vendor/faiss/faiss/IndexFastScan.h +25 -23
  33. data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
  34. data/vendor/faiss/faiss/IndexFlat.h +21 -18
  35. data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
  36. data/vendor/faiss/faiss/IndexHNSW.cpp +283 -145
  37. data/vendor/faiss/faiss/IndexHNSW.h +16 -2
  38. data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
  39. data/vendor/faiss/faiss/IndexIDMap.h +9 -7
  40. data/vendor/faiss/faiss/IndexIVF.cpp +465 -362
  41. data/vendor/faiss/faiss/IndexIVF.h +33 -12
  42. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
  43. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
  44. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
  45. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
  46. data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
  47. data/vendor/faiss/faiss/IndexIVFFlat.cpp +36 -68
  48. data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
  49. data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +53 -30
  50. data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
  51. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
  52. data/vendor/faiss/faiss/IndexIVFPQ.cpp +71 -843
  53. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
  54. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
  55. data/vendor/faiss/faiss/IndexIVFPQR.cpp +21 -17
  56. data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +26 -39
  57. data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
  58. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
  59. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
  60. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
  61. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
  62. data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
  63. data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
  64. data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
  65. data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
  66. data/vendor/faiss/faiss/IndexNSG.cpp +39 -23
  67. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
  68. data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
  69. data/vendor/faiss/faiss/IndexPQ.h +3 -2
  70. data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
  71. data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
  72. data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
  73. data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
  74. data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
  75. data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
  76. data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
  77. data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
  78. data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
  79. data/vendor/faiss/faiss/IndexRefine.h +4 -4
  80. data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
  81. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
  82. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
  83. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
  84. data/vendor/faiss/faiss/IndexShards.cpp +10 -9
  85. data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
  86. data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
  87. data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
  88. data/vendor/faiss/faiss/MetaIndexes.h +1 -1
  89. data/vendor/faiss/faiss/MetricType.h +14 -7
  90. data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
  91. data/vendor/faiss/faiss/SuperKMeans.h +97 -0
  92. data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
  93. data/vendor/faiss/faiss/VectorTransform.h +16 -16
  94. data/vendor/faiss/faiss/build.cpp +23 -0
  95. data/vendor/faiss/faiss/build.h +15 -0
  96. data/vendor/faiss/faiss/clone_index.cpp +48 -47
  97. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
  98. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
  101. data/vendor/faiss/faiss/factory_tools.cpp +5 -0
  102. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
  103. data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
  104. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
  105. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
  106. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
  107. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
  108. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
  109. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
  110. data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
  111. data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
  112. data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
  113. data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
  114. data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
  115. data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
  116. data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
  117. data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
  118. data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
  119. data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
  120. data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
  121. data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
  122. data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
  123. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
  124. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
  125. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
  126. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
  127. data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
  128. data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
  129. data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
  130. data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
  131. data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
  132. data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
  133. data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
  134. data/vendor/faiss/faiss/impl/FaissException.h +50 -3
  135. data/vendor/faiss/faiss/impl/HNSW.cpp +92 -317
  136. data/vendor/faiss/faiss/impl/HNSW.h +13 -34
  137. data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
  138. data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
  139. data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
  140. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
  141. data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
  142. data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
  143. data/vendor/faiss/faiss/impl/NSG.cpp +38 -21
  144. data/vendor/faiss/faiss/impl/NSG.h +4 -4
  145. data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
  146. data/vendor/faiss/faiss/impl/Panorama.h +258 -87
  147. data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
  148. data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
  149. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
  150. data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
  151. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
  152. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
  153. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +30 -23
  154. data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
  155. data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
  156. data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
  157. data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +296 -283
  158. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
  159. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
  160. data/vendor/faiss/faiss/impl/ResultHandler.h +99 -75
  161. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +52 -4
  162. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -1
  163. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
  164. data/vendor/faiss/faiss/impl/VisitedTable.h +7 -0
  165. data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
  166. data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
  167. data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
  168. data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
  169. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
  170. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
  171. data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
  172. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
  173. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
  174. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
  175. data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
  176. data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
  177. data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
  178. data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
  179. data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
  180. data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
  181. data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
  182. data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
  183. data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
  184. data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
  185. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
  186. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
  187. data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
  188. data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
  189. data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
  190. data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
  191. data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
  192. data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
  193. data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
  194. data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
  195. data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
  196. data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
  197. data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
  198. data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
  199. data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
  200. data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
  201. data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
  202. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
  203. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
  204. data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
  205. data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
  206. data/vendor/faiss/faiss/impl/index_read.cpp +1132 -45
  207. data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
  208. data/vendor/faiss/faiss/impl/index_write.cpp +95 -13
  209. data/vendor/faiss/faiss/impl/io.cpp +6 -6
  210. data/vendor/faiss/faiss/impl/io_macros.h +33 -16
  211. data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
  212. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
  213. data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
  214. data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
  215. data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
  216. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
  217. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
  218. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
  219. data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
  220. data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
  221. data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
  222. data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
  223. data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
  224. data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
  225. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +29 -111
  226. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
  227. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +238 -5
  228. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -7
  229. data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
  230. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
  231. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
  232. data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
  233. data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +3 -2
  234. data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +102 -11
  235. data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
  236. data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
  237. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +148 -0
  238. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +167 -0
  239. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +59 -0
  240. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +163 -0
  241. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
  242. data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
  243. data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
  244. data/vendor/faiss/faiss/impl/simd_dispatch.h +100 -66
  245. data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
  246. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
  247. data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
  248. data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
  249. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
  250. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
  251. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
  252. data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
  253. data/vendor/faiss/faiss/impl/svs_io.h +8 -2
  254. data/vendor/faiss/faiss/index_factory.cpp +86 -18
  255. data/vendor/faiss/faiss/index_io.h +24 -0
  256. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
  257. data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
  258. data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
  259. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
  260. data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
  261. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
  262. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
  263. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
  264. data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
  265. data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
  266. data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
  267. data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
  268. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
  269. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
  270. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
  271. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
  272. data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
  273. data/vendor/faiss/faiss/svs/IndexSVSVamana.h +18 -2
  274. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
  275. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +12 -3
  276. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +7 -2
  277. data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
  278. data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
  279. data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
  280. data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
  281. data/vendor/faiss/faiss/utils/distances.cpp +390 -560
  282. data/vendor/faiss/faiss/utils/distances.h +20 -1
  283. data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
  284. data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
  285. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
  286. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
  287. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
  288. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
  289. data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
  290. data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -177
  291. data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
  292. data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
  293. data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
  294. data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
  295. data/vendor/faiss/faiss/utils/hamming.h +92 -2
  296. data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
  297. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
  298. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
  299. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
  300. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
  301. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
  302. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
  303. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
  304. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
  305. data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
  306. data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
  307. data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
  308. data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
  309. data/vendor/faiss/faiss/utils/partitioning.h +31 -0
  310. data/vendor/faiss/faiss/utils/popcount.h +29 -0
  311. data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
  312. data/vendor/faiss/faiss/utils/prefetch.h +2 -2
  313. data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
  314. data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
  315. data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
  316. data/vendor/faiss/faiss/utils/random.cpp +6 -6
  317. data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
  318. data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
  319. data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
  320. data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
  321. data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
  322. data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
  323. data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
  324. data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
  325. data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
  326. data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
  327. data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
  328. data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
  329. data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
  330. data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
  331. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
  332. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
  333. data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
  334. data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
  335. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
  336. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
  337. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
  338. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
  339. data/vendor/faiss/faiss/utils/simd_levels.cpp +17 -5
  340. data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
  341. data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
  342. data/vendor/faiss/faiss/utils/utils.cpp +5 -5
  343. data/vendor/faiss/faiss/utils/utils.h +3 -3
  344. metadata +119 -34
  345. data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
  346. data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
  347. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
  348. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
  349. data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
  350. data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
  351. data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
  352. data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
  353. data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
  354. data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
  355. data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
  356. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
  357. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
  358. data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
  359. data/vendor/faiss/faiss/utils/simdlib.h +0 -42
  360. data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
  361. /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
@@ -11,414 +11,116 @@
11
11
  #include <cstdint>
12
12
  #include <cstring>
13
13
 
14
- // Only include x86 SIMD intrinsics on x86/x86_64 architectures
15
- #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
16
- defined(_M_IX86)
17
- #include <immintrin.h>
18
- #endif // defined(__x86_64__) || defined(_M_X64) || defined(__i386__) ||
14
+ #include <faiss/utils/popcount.h>
15
+ #include <faiss/utils/simd_levels.h>
19
16
 
20
17
  namespace faiss::rabitq {
21
18
 
22
- #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
23
- defined(_M_IX86)
24
19
  /**
25
- * Returns the lookup table for AVX512 popcount operations.
26
- * This table is used for lookup-based popcount implementation.
20
+ * Compute dot product between query and binary data using popcount on AND.
27
21
  *
28
- * Source: https://github.com/WojciechMula/sse-popcount.
29
- *
30
- * @return Lookup table as __m512i register
31
- */
32
- #if defined(__AVX512F__)
33
- inline __m512i get_lookup_512() {
34
- return _mm512_set_epi8(
35
- /* f */ 4,
36
- /* e */ 3,
37
- /* d */ 3,
38
- /* c */ 2,
39
- /* b */ 3,
40
- /* a */ 2,
41
- /* 9 */ 2,
42
- /* 8 */ 1,
43
- /* 7 */ 3,
44
- /* 6 */ 2,
45
- /* 5 */ 2,
46
- /* 4 */ 1,
47
- /* 3 */ 2,
48
- /* 2 */ 1,
49
- /* 1 */ 1,
50
- /* 0 */ 0,
51
- /* f */ 4,
52
- /* e */ 3,
53
- /* d */ 3,
54
- /* c */ 2,
55
- /* b */ 3,
56
- /* a */ 2,
57
- /* 9 */ 2,
58
- /* 8 */ 1,
59
- /* 7 */ 3,
60
- /* 6 */ 2,
61
- /* 5 */ 2,
62
- /* 4 */ 1,
63
- /* 3 */ 2,
64
- /* 2 */ 1,
65
- /* 1 */ 1,
66
- /* 0 */ 0,
67
- /* f */ 4,
68
- /* e */ 3,
69
- /* d */ 3,
70
- /* c */ 2,
71
- /* b */ 3,
72
- /* a */ 2,
73
- /* 9 */ 2,
74
- /* 8 */ 1,
75
- /* 7 */ 3,
76
- /* 6 */ 2,
77
- /* 5 */ 2,
78
- /* 4 */ 1,
79
- /* 3 */ 2,
80
- /* 2 */ 1,
81
- /* 1 */ 1,
82
- /* 0 */ 0,
83
- /* f */ 4,
84
- /* e */ 3,
85
- /* d */ 3,
86
- /* c */ 2,
87
- /* b */ 3,
88
- /* a */ 2,
89
- /* 9 */ 2,
90
- /* 8 */ 1,
91
- /* 7 */ 3,
92
- /* 6 */ 2,
93
- /* 5 */ 2,
94
- /* 4 */ 1,
95
- /* 3 */ 2,
96
- /* 2 */ 1,
97
- /* 1 */ 1,
98
- /* 0 */ 0);
99
- }
100
- #endif // defined(__AVX512F__)
101
- #if defined(__AVX2__)
102
- /**
103
- * Returns the lookup table for AVX2 popcount operations.
104
- * This table is used for lookup-based popcount implementation.
105
- *
106
- * @return Lookup table as __m256i register
22
+ * @param query Pointer to rearranged rotated query data
23
+ * @param data Pointer to binary data
24
+ * @param size Size in bytes
25
+ * @param qb Number of quantization bits
26
+ * @return Unsigned integer dot product
107
27
  */
108
- inline __m256i get_lookup_256() {
109
- return _mm256_setr_epi8(
110
- /* 0 */ 0,
111
- /* 1 */ 1,
112
- /* 2 */ 1,
113
- /* 3 */ 2,
114
- /* 4 */ 1,
115
- /* 5 */ 2,
116
- /* 6 */ 2,
117
- /* 7 */ 3,
118
- /* 8 */ 1,
119
- /* 9 */ 2,
120
- /* a */ 2,
121
- /* b */ 3,
122
- /* c */ 2,
123
- /* d */ 3,
124
- /* e */ 3,
125
- /* f */ 4,
126
- /* 0 */ 0,
127
- /* 1 */ 1,
128
- /* 2 */ 1,
129
- /* 3 */ 2,
130
- /* 4 */ 1,
131
- /* 5 */ 2,
132
- /* 6 */ 2,
133
- /* 7 */ 3,
134
- /* 8 */ 1,
135
- /* 9 */ 2,
136
- /* a */ 2,
137
- /* b */ 3,
138
- /* c */ 2,
139
- /* d */ 3,
140
- /* e */ 3,
141
- /* f */ 4);
142
- }
143
- #endif // defined(__AVX2__)
28
+ template <SIMDLevel SL = SINGLE_SIMD_LEVEL>
29
+ uint64_t bitwise_and_dot_product(
30
+ const uint8_t* query,
31
+ const uint8_t* data,
32
+ size_t size,
33
+ size_t qb);
144
34
 
145
- #if defined(__AVX512F__)
146
35
  /**
147
- * Popcount for a 512-bit register, using lookup tables if necessary.
36
+ * Compute dot product between query and binary data using popcount on XOR.
148
37
  *
149
- * @param v Input vector to count bits in
150
- * @return Vector int32_t[16] with popcount results.
38
+ * @param query Pointer to rearranged rotated query data
39
+ * @param data Pointer to binary data
40
+ * @param size Size in bytes
41
+ * @param qb Number of quantization bits
42
+ * @return Unsigned integer dot product
151
43
  */
152
- inline __m512i popcount_512(__m512i v) {
153
- #if defined(__AVX512VPOPCNTDQ__)
154
- return _mm512_popcnt_epi64(v);
155
- #else
156
- const __m512i lookup = get_lookup_512();
157
- const __m512i low_mask = _mm512_set1_epi8(0x0f);
158
-
159
- const __m512i lo = _mm512_and_si512(v, low_mask);
160
- const __m512i hi = _mm512_and_si512(_mm512_srli_epi16(v, 4), low_mask);
161
- const __m512i popcnt_lo = _mm512_shuffle_epi8(lookup, lo);
162
- const __m512i popcnt_hi = _mm512_shuffle_epi8(lookup, hi);
163
- const __m512i popcnt = _mm512_add_epi8(popcnt_lo, popcnt_hi);
164
- return _mm512_sad_epu8(_mm512_setzero_si512(), popcnt);
165
- #endif // defined(__AVX512VPOPCNTDQ__)
166
- }
167
- #endif // defined(__AVX512F__)
44
+ template <SIMDLevel SL = SINGLE_SIMD_LEVEL>
45
+ uint64_t bitwise_xor_dot_product(
46
+ const uint8_t* query,
47
+ const uint8_t* data,
48
+ size_t size,
49
+ size_t qb);
168
50
 
169
- #if defined(__AVX2__)
170
51
  /**
171
- * Popcount for a 256-bit register, using lookup tables if necessary.
52
+ * Count total set bits in data.
172
53
  *
173
- * @param v Input vector to count bits in
174
- * @return uint64_t[4] of popcounts for each portion of the input vector.
54
+ * @param data Pointer to binary data
55
+ * @param size Size in bytes
56
+ * @return Total popcount
175
57
  */
176
- inline __m256i popcount_256(__m256i v) {
177
- const __m256i lookup = get_lookup_256();
178
- const __m256i low_mask = _mm256_set1_epi8(0x0f);
179
-
180
- const __m256i lo = _mm256_and_si256(v, low_mask);
181
- const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
182
- const __m256i popcnt_lo = _mm256_shuffle_epi8(lookup, lo);
183
- const __m256i popcnt_hi = _mm256_shuffle_epi8(lookup, hi);
184
- const __m256i popcnt = _mm256_add_epi8(popcnt_lo, popcnt_hi);
185
- // Reduce uint8_t[32] into uint64_t[4] by addition.
186
- return _mm256_sad_epu8(_mm256_setzero_si256(), popcnt);
187
- }
188
-
189
- inline uint64_t reduce_add_256(__m256i v) {
190
- alignas(32) uint64_t lanes[4];
191
- _mm256_store_si256((__m256i*)lanes, v);
192
- return lanes[0] + lanes[1] + lanes[2] + lanes[3];
193
- }
194
- #endif // defined(__AVX2__)
195
-
196
- #if defined(__SSE4_1__)
197
- inline __m128i popcount_128(__m128i v) {
198
- // Scalar popcount for each 64-bit lane
199
- uint64_t lane0 = _mm_extract_epi64(v, 0);
200
- uint64_t lane1 = _mm_extract_epi64(v, 1);
201
- uint64_t pop0 = __builtin_popcountll(lane0);
202
- uint64_t pop1 = __builtin_popcountll(lane1);
203
- return _mm_set_epi64x(pop1, pop0);
204
- }
58
+ template <SIMDLevel SL = SINGLE_SIMD_LEVEL>
59
+ uint64_t popcount(const uint8_t* data, size_t size);
205
60
 
206
- inline uint64_t reduce_add_128(__m128i v) {
207
- alignas(16) uint64_t lanes[2];
208
- _mm_store_si128((__m128i*)lanes, v);
209
- return lanes[0] + lanes[1];
210
- }
211
- #endif // defined(__SSE4_1__)
212
- #endif // defined(__x86_64__) || defined(_M_X64) || defined(__i386__) ||
61
+ // NONE specializations scalar fallbacks
213
62
 
214
- /**
215
- * Compute dot product between query and binary data using popcount operations.
216
- *
217
- * @param query Pointer to rearranged rotated query data
218
- * @param data Pointer to binary data
219
- * @param d Dimension
220
- * @param qb Number of quantization bits
221
- * @return Unsigned integer dot product
222
- */
223
- inline uint64_t bitwise_and_dot_product(
63
+ template <>
64
+ inline uint64_t bitwise_and_dot_product<SIMDLevel::NONE>(
224
65
  const uint8_t* query,
225
66
  const uint8_t* data,
226
67
  size_t size,
227
68
  size_t qb) {
228
69
  uint64_t sum = 0;
229
70
  size_t offset = 0;
230
- #if defined(__AVX512F__)
231
- // Handle 512-bit chunks.
232
- if (size_t step = 512 / 8; offset + step <= size) {
233
- __m512i sum_512 = _mm512_setzero_si512();
234
- for (; offset + step <= size; offset += step) {
235
- __m512i v_x = _mm512_loadu_si512((const __m512i*)(data + offset));
236
- for (int j = 0; j < qb; j++) {
237
- __m512i v_q = _mm512_loadu_si512(
238
- (const __m512i*)(query + j * size + offset));
239
- __m512i v_and = _mm512_and_si512(v_q, v_x);
240
- __m512i v_popcnt = popcount_512(v_and);
241
- __m512i v_shifted = _mm512_slli_epi64(v_popcnt, j);
242
- sum_512 = _mm512_add_epi64(sum_512, v_shifted);
243
- }
244
- }
245
- sum += _mm512_reduce_add_epi64(sum_512);
246
- }
247
- #endif // defined(__AVX512F__)
248
- #if defined(__AVX2__)
249
- if (size_t step = 256 / 8; offset + step <= size) {
250
- __m256i sum_256 = _mm256_setzero_si256();
251
- for (; offset + step <= size; offset += step) {
252
- __m256i v_x = _mm256_loadu_si256((const __m256i*)(data + offset));
253
- for (int j = 0; j < qb; j++) {
254
- __m256i v_q = _mm256_loadu_si256(
255
- (const __m256i*)(query + j * size + offset));
256
- __m256i v_and = _mm256_and_si256(v_q, v_x);
257
- __m256i v_popcnt = popcount_256(v_and);
258
- __m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
259
- sum_256 = _mm256_add_epi64(sum_256, v_shifted);
260
- }
261
- }
262
- sum += reduce_add_256(sum_256);
263
- }
264
- #endif // defined(__AVX2__)
265
- #if defined(__SSE4_1__)
266
- __m128i sum_128 = _mm_setzero_si128();
267
- for (size_t step = 128 / 8; offset + step <= size; offset += step) {
268
- __m128i v_x = _mm_loadu_si128((const __m128i*)(data + offset));
269
- for (int j = 0; j < qb; j++) {
270
- __m128i v_q = _mm_loadu_si128(
271
- (const __m128i*)(query + j * size + offset));
272
- __m128i v_and = _mm_and_si128(v_q, v_x);
273
- __m128i v_popcnt = popcount_128(v_and);
274
- __m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
275
- sum_128 = _mm_add_epi64(sum_128, v_shifted);
276
- }
277
- }
278
- sum += reduce_add_128(sum_128);
279
- #endif // defined(__SSE4_1__)
280
71
  for (size_t step = 64 / 8; offset + step <= size; offset += step) {
281
72
  const auto yv = *(const uint64_t*)(data + offset);
282
73
  for (int j = 0; j < qb; j++) {
283
74
  const auto qv = *(const uint64_t*)(query + j * size + offset);
284
- sum += __builtin_popcountll(qv & yv) << j;
75
+ sum += popcount64(qv & yv) << j;
285
76
  }
286
77
  }
287
78
  for (; offset < size; ++offset) {
288
79
  const auto yv = *(data + offset);
289
80
  for (int j = 0; j < qb; j++) {
290
81
  const auto qv = *(query + j * size + offset);
291
- sum += __builtin_popcount(qv & yv) << j;
82
+ sum += popcount32(qv & yv) << j;
292
83
  }
293
84
  }
294
85
  return sum;
295
86
  }
296
87
 
297
- /**
298
- * Compute dot product between query and binary data using popcount operations.
299
- *
300
- * @param query Pointer to rearranged rotated query data
301
- * @param data Pointer to binary data
302
- * @param d Dimension
303
- * @param qb Number of quantization bits
304
- * @return Unsigned integer dot product
305
- */
306
- inline uint64_t bitwise_xor_dot_product(
88
+ template <>
89
+ inline uint64_t bitwise_xor_dot_product<SIMDLevel::NONE>(
307
90
  const uint8_t* query,
308
91
  const uint8_t* data,
309
92
  size_t size,
310
93
  size_t qb) {
311
94
  uint64_t sum = 0;
312
95
  size_t offset = 0;
313
- #if defined(__AVX512F__)
314
- // Handle 512-bit chunks.
315
- if (size_t step = 512 / 8; offset + step <= size) {
316
- __m512i sum_512 = _mm512_setzero_si512();
317
- for (; offset + step <= size; offset += step) {
318
- __m512i v_x = _mm512_loadu_si512((const __m512i*)(data + offset));
319
- for (int j = 0; j < qb; j++) {
320
- __m512i v_q = _mm512_loadu_si512(
321
- (const __m512i*)(query + j * size + offset));
322
- __m512i v_xor = _mm512_xor_si512(v_q, v_x);
323
- __m512i v_popcnt = popcount_512(v_xor);
324
- __m512i v_shifted = _mm512_slli_epi64(v_popcnt, j);
325
- sum_512 = _mm512_add_epi64(sum_512, v_shifted);
326
- }
327
- }
328
- sum += _mm512_reduce_add_epi64(sum_512);
329
- }
330
- #endif
331
- #if defined(__AVX2__)
332
- if (size_t step = 256 / 8; offset + step <= size) {
333
- __m256i sum_256 = _mm256_setzero_si256();
334
- for (; offset + step <= size; offset += step) {
335
- __m256i v_x = _mm256_loadu_si256((const __m256i*)(data + offset));
336
- for (int j = 0; j < qb; j++) {
337
- __m256i v_q = _mm256_loadu_si256(
338
- (const __m256i*)(query + j * size + offset));
339
- __m256i v_xor = _mm256_xor_si256(v_q, v_x);
340
- __m256i v_popcnt = popcount_256(v_xor);
341
- __m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
342
- sum_256 = _mm256_add_epi64(sum_256, v_shifted);
343
- }
344
- }
345
- sum += reduce_add_256(sum_256);
346
- }
347
- #endif
348
- #if defined(__SSE4_1__)
349
- __m128i sum_128 = _mm_setzero_si128();
350
- for (size_t step = 128 / 8; offset + step <= size; offset += step) {
351
- __m128i v_x = _mm_loadu_si128((const __m128i*)(data + offset));
352
- for (int j = 0; j < qb; j++) {
353
- __m128i v_q = _mm_loadu_si128(
354
- (const __m128i*)(query + j * size + offset));
355
- __m128i v_xor = _mm_xor_si128(v_q, v_x);
356
- __m128i v_popcnt = popcount_128(v_xor);
357
- __m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
358
- sum_128 = _mm_add_epi64(sum_128, v_shifted);
359
- }
360
- }
361
- sum += reduce_add_128(sum_128);
362
- #endif
363
96
  for (size_t step = 64 / 8; offset + step <= size; offset += step) {
364
97
  const auto yv = *(const uint64_t*)(data + offset);
365
98
  for (int j = 0; j < qb; j++) {
366
99
  const auto qv = *(const uint64_t*)(query + j * size + offset);
367
- sum += __builtin_popcountll(qv ^ yv) << j;
100
+ sum += popcount64(qv ^ yv) << j;
368
101
  }
369
102
  }
370
103
  for (; offset < size; ++offset) {
371
104
  const auto yv = *(data + offset);
372
105
  for (int j = 0; j < qb; j++) {
373
106
  const auto qv = *(query + j * size + offset);
374
- sum += __builtin_popcount(qv ^ yv) << j;
107
+ sum += popcount32(qv ^ yv) << j;
375
108
  }
376
109
  }
377
110
  return sum;
378
111
  }
379
112
 
380
- inline uint64_t popcount(const uint8_t* data, size_t size) {
113
+ template <>
114
+ inline uint64_t popcount<SIMDLevel::NONE>(const uint8_t* data, size_t size) {
381
115
  uint64_t sum = 0;
382
116
  size_t offset = 0;
383
- #if defined(__AVX512F__)
384
- // Handle 512-bit chunks.
385
- if (offset + 512 / 8 <= size) {
386
- __m512i sum_512 = _mm512_setzero_si512();
387
- for (size_t end; (end = offset + 512 / 8) <= size; offset = end) {
388
- __m512i v_x = _mm512_loadu_si512((const __m512i*)(data + offset));
389
- __m512i v_popcnt = popcount_512(v_x);
390
- sum_512 = _mm512_add_epi64(sum_512, v_popcnt);
391
- }
392
- sum += _mm512_reduce_add_epi64(sum_512);
393
- }
394
- #endif // defined(__AVX512F__)
395
- #if defined(__AVX2__)
396
- if (offset + 256 / 8 <= size) {
397
- __m256i sum_256 = _mm256_setzero_si256();
398
- for (size_t end; (end = offset + 256 / 8) <= size; offset = end) {
399
- __m256i v_x = _mm256_loadu_si256((const __m256i*)(data + offset));
400
- __m256i v_popcnt = popcount_256(v_x);
401
- sum_256 = _mm256_add_epi64(sum_256, v_popcnt);
402
- }
403
- sum += reduce_add_256(sum_256);
404
- }
405
- #endif // defined(__AVX2__)
406
- #if defined(__SSE4_1__)
407
- __m128i sum_128 = _mm_setzero_si128();
408
- for (size_t step = 128 / 8; offset + step <= size; offset += step) {
409
- __m128i v_x = _mm_loadu_si128((const __m128i*)(data + offset));
410
- sum_128 = _mm_add_epi64(sum_128, popcount_128(v_x));
411
- }
412
- sum += reduce_add_128(sum_128);
413
- #endif // defined(__SSE4_1__)
414
-
415
117
  for (size_t step = 64 / 8; offset + step <= size; offset += step) {
416
118
  const auto yv = *(const uint64_t*)(data + offset);
417
- sum += __builtin_popcountll(yv);
119
+ sum += popcount64(yv);
418
120
  }
419
121
  for (; offset < size; ++offset) {
420
122
  const auto yv = *(data + offset);
421
- sum += __builtin_popcount(yv);
123
+ sum += popcount32(yv);
422
124
  }
423
125
  return sum;
424
126
  }
@@ -469,186 +171,9 @@ inline float ip_scalar(
469
171
  return result;
470
172
  }
471
173
 
472
- #if defined(__x86_64__) || defined(_M_X64)
473
-
474
- #if defined(__AVX2__)
475
- /// Horizontal sum of 8 floats in a __m256 register.
476
- inline float hsum_avx2(__m256 v) {
477
- __m128 hi = _mm256_extractf128_ps(v, 1);
478
- __m128 lo = _mm256_castps256_ps128(v);
479
- lo = _mm_add_ps(lo, hi);
480
- __m128 shuf = _mm_movehdup_ps(lo);
481
- lo = _mm_add_ps(lo, shuf);
482
- shuf = _mm_movehl_ps(shuf, lo);
483
- return _mm_cvtss_f32(_mm_add_ss(lo, shuf));
484
- }
485
- #endif // __AVX2__
486
-
487
- /*********************************************************
488
- * Specialized 1-bit kernels (ex_bits == 1).
489
- *
490
- * For 1 extra bit, both sign_bits and ex_code are 1-bit-per-dim packed,
491
- * so we convert bits to floats directly — no extraction loops needed.
492
- *********************************************************/
493
-
494
- #if defined(__AVX512F__)
495
- /// AVX-512: 16 dims/iter, ex_bits == 1.
496
- inline float ip_1exbit_avx512(
497
- const uint8_t* __restrict sign_bits,
498
- const uint8_t* __restrict ex_code,
499
- const float* __restrict rotated_q,
500
- size_t d,
501
- float cb) {
502
- __m512 acc = _mm512_setzero_ps();
503
- const __m512 v_cb = _mm512_set1_ps(cb);
504
- const __m512 v_two = _mm512_set1_ps(2.0f);
505
- const __m512 v_one = _mm512_set1_ps(1.0f);
506
-
507
- size_t i = 0;
508
- for (; i + 16 <= d; i += 16) {
509
- uint16_t sb16;
510
- memcpy(&sb16, sign_bits + i / 8, sizeof(uint16_t));
511
- uint16_t eb16;
512
- memcpy(&eb16, ex_code + i / 8, sizeof(uint16_t));
513
-
514
- __m512 sb_f = _mm512_maskz_mov_ps(_cvtu32_mask16(sb16), v_one);
515
- __m512 eb_f = _mm512_maskz_mov_ps(_cvtu32_mask16(eb16), v_one);
516
-
517
- __m512 recon = _mm512_add_ps(_mm512_fmadd_ps(sb_f, v_two, eb_f), v_cb);
518
- __m512 rq = _mm512_loadu_ps(rotated_q + i);
519
- acc = _mm512_fmadd_ps(rq, recon, acc);
520
- }
521
-
522
- float result = _mm512_reduce_add_ps(acc);
523
- result += ip_scalar(sign_bits, ex_code, rotated_q, i, d, 1, cb);
524
- return result;
525
- }
526
- #endif // __AVX512F__
527
-
528
- #if defined(__AVX2__)
529
- /// AVX2: 8 dims/iter, ex_bits == 1.
530
- inline float ip_1exbit_avx2(
531
- const uint8_t* __restrict sign_bits,
532
- const uint8_t* __restrict ex_code,
533
- const float* __restrict rotated_q,
534
- size_t d,
535
- float cb) {
536
- __m256 acc = _mm256_setzero_ps();
537
- const __m256 v_cb = _mm256_set1_ps(cb);
538
- const __m256 v_two = _mm256_set1_ps(2.0f);
539
- const __m256 v_one = _mm256_set1_ps(1.0f);
540
- const __m256i bit_pos = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
541
- const __m256i zero = _mm256_setzero_si256();
542
-
543
- size_t i = 0;
544
- for (; i + 8 <= d; i += 8) {
545
- uint8_t sb = sign_bits[i / 8];
546
- uint8_t eb = ex_code[i / 8];
547
-
548
- __m256i sb_cmp = _mm256_cmpgt_epi32(
549
- _mm256_and_si256(_mm256_set1_epi32(sb), bit_pos), zero);
550
- __m256 sb_f = _mm256_and_ps(_mm256_castsi256_ps(sb_cmp), v_one);
551
-
552
- __m256i eb_cmp = _mm256_cmpgt_epi32(
553
- _mm256_and_si256(_mm256_set1_epi32(eb), bit_pos), zero);
554
- __m256 eb_f = _mm256_and_ps(_mm256_castsi256_ps(eb_cmp), v_one);
555
-
556
- __m256 recon = _mm256_add_ps(_mm256_fmadd_ps(sb_f, v_two, eb_f), v_cb);
557
- __m256 rq = _mm256_loadu_ps(rotated_q + i);
558
- acc = _mm256_fmadd_ps(rq, recon, acc);
559
- }
560
-
561
- float result = hsum_avx2(acc);
562
- result += ip_scalar(sign_bits, ex_code, rotated_q, i, d, 1, cb);
563
- return result;
564
- }
565
- #endif // __AVX2__
566
-
567
- /*********************************************************
568
- * Bit-plane decomposition kernels (ex_bits >= 2, BMI2 required).
569
- *
570
- * Decomposes the inner product as:
571
- * ex_ip = (1 << ex_bits) * sign_dot
572
- * + Σ_{b=0}^{ex_bits-1} (1 << b) * plane_dot_b
573
- * + cb * total_q
574
- *
575
- * Each plane_dot_b is a float × bit-vector dot product, computed using
576
- * the same bit→mask→float conversion as the 1-bit kernel. PEXT
577
- * extracts each bit plane from the packed ex_code in one instruction
578
- * per 8 dimensions.
579
- *********************************************************/
580
-
581
- #if defined(__AVX2__) && defined(__BMI2__)
582
- /// AVX2 + BMI2 bit-plane decomposition: 8 dims/iter, ex_bits in [2, 7].
583
- /// Caller must ensure ex_bits <= 7 (pext_masks[7] / v_weights[8]).
584
- inline float ip_bitplane_avx2(
585
- const uint8_t* __restrict sign_bits,
586
- const uint8_t* __restrict ex_code,
587
- const float* __restrict rotated_q,
588
- size_t d,
589
- size_t ex_bits,
590
- float cb) {
591
- __m256 acc = _mm256_setzero_ps();
592
- const __m256 v_one = _mm256_set1_ps(1.0f);
593
- const __m256i bit_pos = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
594
- const __m256i zero = _mm256_setzero_si256();
595
- const __m256 v_cb = _mm256_set1_ps(cb);
596
-
597
- // Precompute PEXT masks and plane weights
598
- uint64_t pext_masks[7];
599
- __m256 v_weights[8];
600
- for (size_t b = 0; b < ex_bits; b++) {
601
- uint64_t m = 0;
602
- for (int j = 0; j < 8; j++) {
603
- m |= (1ULL << (b + j * ex_bits));
604
- }
605
- pext_masks[b] = m;
606
- v_weights[b] = _mm256_set1_ps(static_cast<float>(1u << b));
607
- }
608
- v_weights[ex_bits] = _mm256_set1_ps(static_cast<float>(1u << ex_bits));
609
-
610
- size_t i = 0;
611
- for (; i + 8 <= d; i += 8) {
612
- // Sign bit → float via bit mask comparison
613
- __m256i sb_cmp = _mm256_cmpgt_epi32(
614
- _mm256_and_si256(_mm256_set1_epi32(sign_bits[i / 8]), bit_pos),
615
- zero);
616
- __m256 recon = _mm256_mul_ps(
617
- _mm256_and_ps(_mm256_castsi256_ps(sb_cmp), v_one),
618
- v_weights[ex_bits]);
619
-
620
- // Load packed ex_code for 8 dims (8 × ex_bits bits = ex_bits bytes)
621
- uint64_t ex64 = 0;
622
- memcpy(&ex64, ex_code + (i / 8) * ex_bits, sizeof(uint64_t));
623
-
624
- // Extract each bit plane via PEXT → bit mask → float
625
- for (size_t b = 0; b < ex_bits; b++) {
626
- auto plane = static_cast<uint8_t>(_pext_u64(ex64, pext_masks[b]));
627
- __m256i p_cmp = _mm256_cmpgt_epi32(
628
- _mm256_and_si256(_mm256_set1_epi32(plane), bit_pos), zero);
629
- __m256 p_f = _mm256_and_ps(_mm256_castsi256_ps(p_cmp), v_one);
630
- recon = _mm256_fmadd_ps(p_f, v_weights[b], recon);
631
- }
632
-
633
- __m256 rq = _mm256_loadu_ps(rotated_q + i);
634
- acc = _mm256_fmadd_ps(rq, _mm256_add_ps(recon, v_cb), acc);
635
- }
636
-
637
- float result = hsum_avx2(acc);
638
- result += ip_scalar(sign_bits, ex_code, rotated_q, i, d, ex_bits, cb);
639
- return result;
640
- }
641
- #endif // __AVX2__ && __BMI2__
642
-
643
- #endif // x86_64
644
-
645
174
  /**
646
175
  * Dispatch to the best available kernel for the given ex_bits.
647
176
  *
648
- * Routing (compile-time):
649
- * ex_bits == 1: specialized 1-bit kernel (AVX-512 > AVX2 > scalar)
650
- * ex_bits >= 2: bit-plane decomposition (AVX2+BMI2 > scalar)
651
- *
652
177
  * @param sign_bits packed sign bits (1 bit/dim, standard byte packing)
653
178
  * @param ex_code packed extra-bit codes (ex_bits bits/dim)
654
179
  * @param rotated_q rotated query vector (float[d])
@@ -657,28 +182,24 @@ inline float ip_bitplane_avx2(
657
182
  * @param cb constant bias: -(2^ex_bits - 0.5)
658
183
  * @return inner product value
659
184
  */
660
- inline float compute_inner_product(
185
+ template <SIMDLevel SL = SINGLE_SIMD_LEVEL>
186
+ float compute_inner_product(
661
187
  const uint8_t* __restrict sign_bits,
662
188
  const uint8_t* __restrict ex_code,
663
189
  const float* __restrict rotated_q,
664
190
  size_t d,
665
191
  size_t ex_bits,
666
- float cb) {
667
- if (ex_bits == 1) {
668
- #if defined(__AVX512F__)
669
- return ip_1exbit_avx512(sign_bits, ex_code, rotated_q, d, cb);
670
- #elif defined(__AVX2__)
671
- return ip_1exbit_avx2(sign_bits, ex_code, rotated_q, d, cb);
672
- #else
673
- return ip_scalar(sign_bits, ex_code, rotated_q, 0, d, 1, cb);
674
- #endif
675
- }
192
+ float cb);
676
193
 
677
- #if defined(__AVX2__) && defined(__BMI2__)
678
- if (ex_bits <= 7) {
679
- return ip_bitplane_avx2(sign_bits, ex_code, rotated_q, d, ex_bits, cb);
680
- }
681
- #endif
194
+ // NONE specialization — pure scalar
195
+ template <>
196
+ inline float compute_inner_product<SIMDLevel::NONE>(
197
+ const uint8_t* __restrict sign_bits,
198
+ const uint8_t* __restrict ex_code,
199
+ const float* __restrict rotated_q,
200
+ size_t d,
201
+ size_t ex_bits,
202
+ float cb) {
682
203
  return ip_scalar(sign_bits, ex_code, rotated_q, 0, d, ex_bits, cb);
683
204
  }
684
205