faiss 0.5.3 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (379)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +12 -0
  3. data/ext/faiss/ext.cpp +1 -1
  4. data/ext/faiss/extconf.rb +4 -4
  5. data/ext/faiss/index.cpp +63 -45
  6. data/ext/faiss/index_binary.cpp +37 -27
  7. data/ext/faiss/kmeans.cpp +9 -8
  8. data/ext/faiss/pca_matrix.cpp +9 -7
  9. data/ext/faiss/product_quantizer.cpp +13 -11
  10. data/ext/faiss/utils.cpp +4 -2
  11. data/ext/faiss/utils.h +4 -0
  12. data/lib/faiss/version.rb +1 -1
  13. data/lib/faiss.rb +1 -1
  14. data/vendor/faiss/faiss/AutoTune.cpp +214 -82
  15. data/vendor/faiss/faiss/AutoTune.h +14 -1
  16. data/vendor/faiss/faiss/Clustering.cpp +97 -249
  17. data/vendor/faiss/faiss/Clustering.h +18 -0
  18. data/vendor/faiss/faiss/IVFlib.cpp +67 -44
  19. data/vendor/faiss/faiss/Index.cpp +25 -12
  20. data/vendor/faiss/faiss/Index.h +26 -4
  21. data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
  22. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +68 -61
  23. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
  24. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
  25. data/vendor/faiss/faiss/IndexBinary.cpp +6 -3
  26. data/vendor/faiss/faiss/IndexBinary.h +4 -4
  27. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
  28. data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
  29. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
  30. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +92 -95
  31. data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
  32. data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
  33. data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
  34. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +120 -414
  35. data/vendor/faiss/faiss/IndexFastScan.cpp +105 -129
  36. data/vendor/faiss/faiss/IndexFastScan.h +35 -24
  37. data/vendor/faiss/faiss/IndexFlat.cpp +216 -152
  38. data/vendor/faiss/faiss/IndexFlat.h +32 -14
  39. data/vendor/faiss/faiss/IndexFlatCodes.cpp +88 -41
  40. data/vendor/faiss/faiss/IndexFlatCodes.h +7 -1
  41. data/vendor/faiss/faiss/IndexHNSW.cpp +299 -187
  42. data/vendor/faiss/faiss/IndexHNSW.h +30 -14
  43. data/vendor/faiss/faiss/IndexIDMap.cpp +26 -22
  44. data/vendor/faiss/faiss/IndexIDMap.h +9 -7
  45. data/vendor/faiss/faiss/IndexIVF.cpp +535 -405
  46. data/vendor/faiss/faiss/IndexIVF.h +47 -16
  47. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
  48. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +105 -99
  49. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +6 -3
  50. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +379 -249
  51. data/vendor/faiss/faiss/IndexIVFFastScan.h +65 -60
  52. data/vendor/faiss/faiss/IndexIVFFlat.cpp +41 -124
  53. data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
  54. data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +89 -138
  55. data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
  56. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
  57. data/vendor/faiss/faiss/IndexIVFPQ.cpp +77 -907
  58. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +184 -122
  59. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
  60. data/vendor/faiss/faiss/IndexIVFPQR.cpp +23 -18
  61. data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +59 -60
  62. data/vendor/faiss/faiss/IndexIVFRaBitQ.h +4 -3
  63. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +564 -416
  64. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +269 -111
  65. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
  66. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
  67. data/vendor/faiss/faiss/IndexLSH.cpp +44 -25
  68. data/vendor/faiss/faiss/IndexLattice.cpp +41 -36
  69. data/vendor/faiss/faiss/IndexNNDescent.cpp +37 -21
  70. data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
  71. data/vendor/faiss/faiss/IndexNSG.cpp +40 -23
  72. data/vendor/faiss/faiss/IndexNSG.h +0 -2
  73. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +32 -12
  74. data/vendor/faiss/faiss/IndexPQ.cpp +129 -213
  75. data/vendor/faiss/faiss/IndexPQ.h +3 -2
  76. data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
  77. data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
  78. data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
  79. data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
  80. data/vendor/faiss/faiss/IndexRaBitQ.cpp +31 -43
  81. data/vendor/faiss/faiss/IndexRaBitQ.h +4 -3
  82. data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +135 -317
  83. data/vendor/faiss/faiss/IndexRaBitQFastScan.h +192 -34
  84. data/vendor/faiss/faiss/IndexRefine.cpp +30 -55
  85. data/vendor/faiss/faiss/IndexRefine.h +4 -4
  86. data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
  87. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
  88. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
  89. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
  90. data/vendor/faiss/faiss/IndexShards.cpp +13 -13
  91. data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
  92. data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
  93. data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
  94. data/vendor/faiss/faiss/MetaIndexes.h +1 -1
  95. data/vendor/faiss/faiss/MetricType.h +29 -6
  96. data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
  97. data/vendor/faiss/faiss/SuperKMeans.h +97 -0
  98. data/vendor/faiss/faiss/VectorTransform.cpp +349 -141
  99. data/vendor/faiss/faiss/VectorTransform.h +39 -16
  100. data/vendor/faiss/faiss/build.cpp +23 -0
  101. data/vendor/faiss/faiss/build.h +15 -0
  102. data/vendor/faiss/faiss/clone_index.cpp +55 -51
  103. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
  104. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
  105. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
  106. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
  107. data/vendor/faiss/faiss/{cppcontrib/factory_tools.cpp → factory_tools.cpp} +6 -1
  108. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +1 -1
  109. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
  110. data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
  111. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
  112. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
  113. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
  114. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
  115. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
  116. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
  117. data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
  118. data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
  119. data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
  120. data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
  121. data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
  122. data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
  123. data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
  124. data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
  125. data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
  126. data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
  127. data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
  128. data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
  129. data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
  130. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +64 -34
  131. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
  132. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
  133. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -28
  134. data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
  135. data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
  136. data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +367 -0
  137. data/vendor/faiss/faiss/impl/ClusteringInitialization.h +107 -0
  138. data/vendor/faiss/faiss/impl/CodePacker.cpp +7 -3
  139. data/vendor/faiss/faiss/impl/CodePacker.h +11 -3
  140. data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +83 -0
  141. data/vendor/faiss/faiss/impl/CodePackerRaBitQ.h +47 -0
  142. data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
  143. data/vendor/faiss/faiss/impl/FaissAssert.h +64 -3
  144. data/vendor/faiss/faiss/impl/FaissException.h +50 -3
  145. data/vendor/faiss/faiss/impl/HNSW.cpp +117 -351
  146. data/vendor/faiss/faiss/impl/HNSW.h +21 -40
  147. data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
  148. data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
  149. data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
  150. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +114 -102
  151. data/vendor/faiss/faiss/impl/NNDescent.cpp +63 -26
  152. data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
  153. data/vendor/faiss/faiss/impl/NSG.cpp +44 -26
  154. data/vendor/faiss/faiss/impl/NSG.h +20 -10
  155. data/vendor/faiss/faiss/impl/Panorama.cpp +76 -52
  156. data/vendor/faiss/faiss/impl/Panorama.h +265 -78
  157. data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
  158. data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
  159. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +62 -37
  160. data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
  161. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
  162. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
  163. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +99 -80
  164. data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
  165. data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +135 -37
  166. data/vendor/faiss/faiss/impl/RaBitQUtils.h +148 -21
  167. data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +298 -301
  168. data/vendor/faiss/faiss/impl/RaBitQuantizer.h +3 -10
  169. data/vendor/faiss/faiss/impl/RaBitQuantizerMultiBit.cpp +15 -41
  170. data/vendor/faiss/faiss/impl/RaBitQuantizerMultiBit.h +0 -4
  171. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +40 -32
  172. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
  173. data/vendor/faiss/faiss/impl/ResultHandler.h +218 -113
  174. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +119 -2362
  175. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -3
  176. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
  177. data/vendor/faiss/faiss/impl/VisitedTable.cpp +42 -0
  178. data/vendor/faiss/faiss/impl/VisitedTable.h +76 -0
  179. data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
  180. data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
  181. data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
  182. data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
  183. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
  184. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
  185. data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
  186. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
  187. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
  188. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
  189. data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
  190. data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
  191. data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
  192. data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
  193. data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
  194. data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
  195. data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
  196. data/vendor/faiss/faiss/impl/expanded_scanners.h +163 -0
  197. data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
  198. data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
  199. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
  200. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
  201. data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
  202. data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
  203. data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +176 -4
  204. data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
  205. data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
  206. data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
  207. data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
  208. data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
  209. data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
  210. data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -348
  211. data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
  212. data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
  213. data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +290 -142
  214. data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
  215. data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
  216. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
  217. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
  218. data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
  219. data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
  220. data/vendor/faiss/faiss/impl/index_read.cpp +1950 -505
  221. data/vendor/faiss/faiss/impl/index_read_utils.h +1 -2
  222. data/vendor/faiss/faiss/impl/index_write.cpp +112 -21
  223. data/vendor/faiss/faiss/impl/io.cpp +6 -6
  224. data/vendor/faiss/faiss/impl/io_macros.h +33 -16
  225. data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
  226. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +81 -40
  227. data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
  228. data/vendor/faiss/faiss/impl/mapped_io.cpp +15 -8
  229. data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
  230. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
  231. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
  232. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
  233. data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
  234. data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
  235. data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
  236. data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
  237. data/vendor/faiss/faiss/impl/{code_distance/code_distance-avx2.h → pq_code_distance/pq_code_distance-avx2.h} +43 -220
  238. data/vendor/faiss/faiss/impl/{code_distance/code_distance-avx512.h → pq_code_distance/pq_code_distance-avx512.h} +25 -112
  239. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +59 -0
  240. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
  241. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +256 -0
  242. data/vendor/faiss/faiss/impl/{code_distance/code_distance-sve.h → pq_code_distance/pq_code_distance-sve.cpp} +57 -146
  243. data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
  244. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +320 -483
  245. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
  246. data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +121 -0
  247. data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +137 -0
  248. data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +371 -0
  249. data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +190 -0
  250. data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +94 -0
  251. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +603 -0
  252. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +597 -0
  253. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +388 -0
  254. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +630 -0
  255. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
  256. data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +387 -0
  257. data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +54 -0
  258. data/vendor/faiss/faiss/impl/simd_dispatch.h +173 -0
  259. data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
  260. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +274 -171
  261. data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
  262. data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
  263. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
  264. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +275 -217
  265. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
  266. data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
  267. data/vendor/faiss/faiss/impl/svs_io.h +8 -2
  268. data/vendor/faiss/faiss/index_factory.cpp +115 -28
  269. data/vendor/faiss/faiss/index_io.h +53 -3
  270. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +73 -20
  271. data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
  272. data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
  273. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
  274. data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
  275. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
  276. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +14 -14
  277. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
  278. data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +9 -19
  279. data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
  280. data/vendor/faiss/faiss/svs/IndexSVSFlat.h +2 -0
  281. data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
  282. data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
  283. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
  284. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
  285. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
  286. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
  287. data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
  288. data/vendor/faiss/faiss/svs/IndexSVSVamana.h +19 -2
  289. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
  290. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +19 -2
  291. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +14 -0
  292. data/vendor/faiss/faiss/utils/Heap.cpp +56 -10
  293. data/vendor/faiss/faiss/utils/Heap.h +21 -0
  294. data/vendor/faiss/faiss/utils/NeuralNet.cpp +54 -40
  295. data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
  296. data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
  297. data/vendor/faiss/faiss/utils/distances.cpp +507 -559
  298. data/vendor/faiss/faiss/utils/distances.h +118 -1
  299. data/vendor/faiss/faiss/utils/distances_dispatch.h +250 -0
  300. data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
  301. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
  302. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
  303. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
  304. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
  305. data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
  306. data/vendor/faiss/faiss/utils/distances_simd.cpp +72 -3681
  307. data/vendor/faiss/faiss/utils/extra_distances.cpp +60 -102
  308. data/vendor/faiss/faiss/utils/extra_distances.h +79 -7
  309. data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
  310. data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
  311. data/vendor/faiss/faiss/utils/hamming.h +92 -2
  312. data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
  313. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
  314. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
  315. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
  316. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
  317. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
  318. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
  319. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
  320. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
  321. data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
  322. data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
  323. data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
  324. data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
  325. data/vendor/faiss/faiss/utils/partitioning.h +31 -0
  326. data/vendor/faiss/faiss/utils/popcount.h +29 -0
  327. data/vendor/faiss/faiss/utils/pq_code_distance.h +251 -0
  328. data/vendor/faiss/faiss/utils/prefetch.h +2 -2
  329. data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
  330. data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
  331. data/vendor/faiss/faiss/utils/rabitq_simd.h +124 -343
  332. data/vendor/faiss/faiss/utils/random.cpp +6 -6
  333. data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
  334. data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +154 -0
  335. data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +777 -0
  336. data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +306 -0
  337. data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +1431 -0
  338. data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +1095 -0
  339. data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
  340. data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
  341. data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +392 -0
  342. data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
  343. data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
  344. data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
  345. data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
  346. data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
  347. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
  348. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
  349. data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
  350. data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
  351. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
  352. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
  353. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
  354. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
  355. data/vendor/faiss/faiss/utils/simd_levels.cpp +334 -0
  356. data/vendor/faiss/faiss/utils/simd_levels.h +183 -0
  357. data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
  358. data/vendor/faiss/faiss/utils/utils.cpp +21 -14
  359. data/vendor/faiss/faiss/utils/utils.h +3 -3
  360. metadata +156 -42
  361. data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
  362. data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
  363. data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +0 -81
  364. data/vendor/faiss/faiss/impl/code_distance/code_distance.h +0 -186
  365. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -216
  366. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -224
  367. data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
  368. data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
  369. data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
  370. data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
  371. data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -228
  372. data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
  373. data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
  374. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -450
  375. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
  376. data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
  377. data/vendor/faiss/faiss/utils/simdlib.h +0 -42
  378. data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -296
  379. data/vendor/faiss/faiss/{cppcontrib/factory_tools.h → factory_tools.h} +0 -0
@@ -9,417 +9,198 @@
9
9
 
10
10
  #include <cstddef>
11
11
  #include <cstdint>
12
+ #include <cstring>
12
13
 
13
- // Only include x86 SIMD intrinsics on x86/x86_64 architectures
14
- #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
15
- defined(_M_IX86)
16
- #include <immintrin.h>
17
- #endif // defined(__x86_64__) || defined(_M_X64) || defined(__i386__) ||
14
+ #include <faiss/utils/popcount.h>
15
+ #include <faiss/utils/simd_levels.h>
18
16
 
19
17
  namespace faiss::rabitq {
20
18
 
21
- #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
22
- defined(_M_IX86)
23
19
  /**
24
- * Returns the lookup table for AVX512 popcount operations.
25
- * This table is used for lookup-based popcount implementation.
20
+ * Compute dot product between query and binary data using popcount on AND.
26
21
  *
27
- * Source: https://github.com/WojciechMula/sse-popcount.
28
- *
29
- * @return Lookup table as __m512i register
30
- */
31
- #if defined(__AVX512F__)
32
- inline __m512i get_lookup_512() {
33
- return _mm512_set_epi8(
34
- /* f */ 4,
35
- /* e */ 3,
36
- /* d */ 3,
37
- /* c */ 2,
38
- /* b */ 3,
39
- /* a */ 2,
40
- /* 9 */ 2,
41
- /* 8 */ 1,
42
- /* 7 */ 3,
43
- /* 6 */ 2,
44
- /* 5 */ 2,
45
- /* 4 */ 1,
46
- /* 3 */ 2,
47
- /* 2 */ 1,
48
- /* 1 */ 1,
49
- /* 0 */ 0,
50
- /* f */ 4,
51
- /* e */ 3,
52
- /* d */ 3,
53
- /* c */ 2,
54
- /* b */ 3,
55
- /* a */ 2,
56
- /* 9 */ 2,
57
- /* 8 */ 1,
58
- /* 7 */ 3,
59
- /* 6 */ 2,
60
- /* 5 */ 2,
61
- /* 4 */ 1,
62
- /* 3 */ 2,
63
- /* 2 */ 1,
64
- /* 1 */ 1,
65
- /* 0 */ 0,
66
- /* f */ 4,
67
- /* e */ 3,
68
- /* d */ 3,
69
- /* c */ 2,
70
- /* b */ 3,
71
- /* a */ 2,
72
- /* 9 */ 2,
73
- /* 8 */ 1,
74
- /* 7 */ 3,
75
- /* 6 */ 2,
76
- /* 5 */ 2,
77
- /* 4 */ 1,
78
- /* 3 */ 2,
79
- /* 2 */ 1,
80
- /* 1 */ 1,
81
- /* 0 */ 0,
82
- /* f */ 4,
83
- /* e */ 3,
84
- /* d */ 3,
85
- /* c */ 2,
86
- /* b */ 3,
87
- /* a */ 2,
88
- /* 9 */ 2,
89
- /* 8 */ 1,
90
- /* 7 */ 3,
91
- /* 6 */ 2,
92
- /* 5 */ 2,
93
- /* 4 */ 1,
94
- /* 3 */ 2,
95
- /* 2 */ 1,
96
- /* 1 */ 1,
97
- /* 0 */ 0);
98
- }
99
- #endif // defined(__AVX512F__)
100
- #if defined(__AVX2__)
101
- /**
102
- * Returns the lookup table for AVX2 popcount operations.
103
- * This table is used for lookup-based popcount implementation.
104
- *
105
- * @return Lookup table as __m256i register
22
+ * @param query Pointer to rearranged rotated query data
23
+ * @param data Pointer to binary data
24
+ * @param size Size in bytes
25
+ * @param qb Number of quantization bits
26
+ * @return Unsigned integer dot product
106
27
  */
107
- inline __m256i get_lookup_256() {
108
- return _mm256_setr_epi8(
109
- /* 0 */ 0,
110
- /* 1 */ 1,
111
- /* 2 */ 1,
112
- /* 3 */ 2,
113
- /* 4 */ 1,
114
- /* 5 */ 2,
115
- /* 6 */ 2,
116
- /* 7 */ 3,
117
- /* 8 */ 1,
118
- /* 9 */ 2,
119
- /* a */ 2,
120
- /* b */ 3,
121
- /* c */ 2,
122
- /* d */ 3,
123
- /* e */ 3,
124
- /* f */ 4,
125
- /* 0 */ 0,
126
- /* 1 */ 1,
127
- /* 2 */ 1,
128
- /* 3 */ 2,
129
- /* 4 */ 1,
130
- /* 5 */ 2,
131
- /* 6 */ 2,
132
- /* 7 */ 3,
133
- /* 8 */ 1,
134
- /* 9 */ 2,
135
- /* a */ 2,
136
- /* b */ 3,
137
- /* c */ 2,
138
- /* d */ 3,
139
- /* e */ 3,
140
- /* f */ 4);
141
- }
142
- #endif // defined(__AVX2__)
28
+ template <SIMDLevel SL = SINGLE_SIMD_LEVEL>
29
+ uint64_t bitwise_and_dot_product(
30
+ const uint8_t* query,
31
+ const uint8_t* data,
32
+ size_t size,
33
+ size_t qb);
143
34
 
144
- #if defined(__AVX512F__)
145
35
  /**
146
- * Popcount for a 512-bit register, using lookup tables if necessary.
36
+ * Compute dot product between query and binary data using popcount on XOR.
147
37
  *
148
- * @param v Input vector to count bits in
149
- * @return Vector int32_t[16] with popcount results.
38
+ * @param query Pointer to rearranged rotated query data
39
+ * @param data Pointer to binary data
40
+ * @param size Size in bytes
41
+ * @param qb Number of quantization bits
42
+ * @return Unsigned integer dot product
150
43
  */
151
- inline __m512i popcount_512(__m512i v) {
152
- #if defined(__AVX512VPOPCNTDQ__)
153
- return _mm512_popcnt_epi64(v);
154
- #else
155
- const __m512i lookup = get_lookup_512();
156
- const __m512i low_mask = _mm512_set1_epi8(0x0f);
157
-
158
- const __m512i lo = _mm512_and_si512(v, low_mask);
159
- const __m512i hi = _mm512_and_si512(_mm512_srli_epi16(v, 4), low_mask);
160
- const __m512i popcnt_lo = _mm512_shuffle_epi8(lookup, lo);
161
- const __m512i popcnt_hi = _mm512_shuffle_epi8(lookup, hi);
162
- const __m512i popcnt = _mm512_add_epi8(popcnt_lo, popcnt_hi);
163
- return _mm512_sad_epu8(_mm512_setzero_si512(), popcnt);
164
- #endif // defined(__AVX512VPOPCNTDQ__)
165
- }
166
- #endif // defined(__AVX512F__)
44
+ template <SIMDLevel SL = SINGLE_SIMD_LEVEL>
45
+ uint64_t bitwise_xor_dot_product(
46
+ const uint8_t* query,
47
+ const uint8_t* data,
48
+ size_t size,
49
+ size_t qb);
167
50
 
168
- #if defined(__AVX2__)
169
51
  /**
170
- * Popcount for a 256-bit register, using lookup tables if necessary.
52
+ * Count total set bits in data.
171
53
  *
172
- * @param v Input vector to count bits in
173
- * @return uint64_t[4] of popcounts for each portion of the input vector.
54
+ * @param data Pointer to binary data
55
+ * @param size Size in bytes
56
+ * @return Total popcount
174
57
  */
175
- inline __m256i popcount_256(__m256i v) {
176
- const __m256i lookup = get_lookup_256();
177
- const __m256i low_mask = _mm256_set1_epi8(0x0f);
178
-
179
- const __m256i lo = _mm256_and_si256(v, low_mask);
180
- const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask);
181
- const __m256i popcnt_lo = _mm256_shuffle_epi8(lookup, lo);
182
- const __m256i popcnt_hi = _mm256_shuffle_epi8(lookup, hi);
183
- const __m256i popcnt = _mm256_add_epi8(popcnt_lo, popcnt_hi);
184
- // Reduce uint8_t[32] into uint64_t[4] by addition.
185
- return _mm256_sad_epu8(_mm256_setzero_si256(), popcnt);
186
- }
187
-
188
- inline uint64_t reduce_add_256(__m256i v) {
189
- alignas(32) uint64_t lanes[4];
190
- _mm256_store_si256((__m256i*)lanes, v);
191
- return lanes[0] + lanes[1] + lanes[2] + lanes[3];
192
- }
193
- #endif // defined(__AVX2__)
194
-
195
- #if defined(__SSE4_1__)
196
- inline __m128i popcount_128(__m128i v) {
197
- // Scalar popcount for each 64-bit lane
198
- uint64_t lane0 = _mm_extract_epi64(v, 0);
199
- uint64_t lane1 = _mm_extract_epi64(v, 1);
200
- uint64_t pop0 = __builtin_popcountll(lane0);
201
- uint64_t pop1 = __builtin_popcountll(lane1);
202
- return _mm_set_epi64x(pop1, pop0);
203
- }
58
+ template <SIMDLevel SL = SINGLE_SIMD_LEVEL>
59
+ uint64_t popcount(const uint8_t* data, size_t size);
204
60
 
205
- inline uint64_t reduce_add_128(__m128i v) {
206
- alignas(16) uint64_t lanes[2];
207
- _mm_store_si128((__m128i*)lanes, v);
208
- return lanes[0] + lanes[1];
209
- }
210
- #endif // defined(__SSE4_1__)
211
- #endif // defined(__x86_64__) || defined(_M_X64) || defined(__i386__) ||
61
+ // NONE specializations scalar fallbacks
212
62
 
213
- /**
214
- * Compute dot product between query and binary data using popcount operations.
215
- *
216
- * @param query Pointer to rearranged rotated query data
217
- * @param data Pointer to binary data
218
- * @param d Dimension
219
- * @param qb Number of quantization bits
220
- * @return Unsigned integer dot product
221
- */
222
- inline uint64_t bitwise_and_dot_product(
63
+ template <>
64
+ inline uint64_t bitwise_and_dot_product<SIMDLevel::NONE>(
223
65
  const uint8_t* query,
224
66
  const uint8_t* data,
225
67
  size_t size,
226
68
  size_t qb) {
227
69
  uint64_t sum = 0;
228
70
  size_t offset = 0;
229
- #if defined(__AVX512F__)
230
- // Handle 512-bit chunks.
231
- if (size_t step = 512 / 8; offset + step <= size) {
232
- __m512i sum_512 = _mm512_setzero_si512();
233
- for (; offset + step <= size; offset += step) {
234
- __m512i v_x = _mm512_loadu_si512((const __m512i*)(data + offset));
235
- for (int j = 0; j < qb; j++) {
236
- __m512i v_q = _mm512_loadu_si512(
237
- (const __m512i*)(query + j * size + offset));
238
- __m512i v_and = _mm512_and_si512(v_q, v_x);
239
- __m512i v_popcnt = popcount_512(v_and);
240
- __m512i v_shifted = _mm512_slli_epi64(v_popcnt, j);
241
- sum_512 = _mm512_add_epi64(sum_512, v_shifted);
242
- }
243
- }
244
- sum += _mm512_reduce_add_epi64(sum_512);
245
- }
246
- #endif // defined(__AVX512F__)
247
- #if defined(__AVX2__)
248
- if (size_t step = 256 / 8; offset + step <= size) {
249
- __m256i sum_256 = _mm256_setzero_si256();
250
- for (; offset + step <= size; offset += step) {
251
- __m256i v_x = _mm256_loadu_si256((const __m256i*)(data + offset));
252
- for (int j = 0; j < qb; j++) {
253
- __m256i v_q = _mm256_loadu_si256(
254
- (const __m256i*)(query + j * size + offset));
255
- __m256i v_and = _mm256_and_si256(v_q, v_x);
256
- __m256i v_popcnt = popcount_256(v_and);
257
- __m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
258
- sum_256 = _mm256_add_epi64(sum_256, v_shifted);
259
- }
260
- }
261
- sum += reduce_add_256(sum_256);
262
- }
263
- #endif // defined(__AVX2__)
264
- #if defined(__SSE4_1__)
265
- __m128i sum_128 = _mm_setzero_si128();
266
- for (size_t step = 128 / 8; offset + step <= size; offset += step) {
267
- __m128i v_x = _mm_loadu_si128((const __m128i*)(data + offset));
268
- for (int j = 0; j < qb; j++) {
269
- __m128i v_q = _mm_loadu_si128(
270
- (const __m128i*)(query + j * size + offset));
271
- __m128i v_and = _mm_and_si128(v_q, v_x);
272
- __m128i v_popcnt = popcount_128(v_and);
273
- __m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
274
- sum_128 = _mm_add_epi64(sum_128, v_shifted);
275
- }
276
- }
277
- sum += reduce_add_128(sum_128);
278
- #endif // defined(__SSE4_1__)
279
71
  for (size_t step = 64 / 8; offset + step <= size; offset += step) {
280
72
  const auto yv = *(const uint64_t*)(data + offset);
281
73
  for (int j = 0; j < qb; j++) {
282
74
  const auto qv = *(const uint64_t*)(query + j * size + offset);
283
- sum += __builtin_popcountll(qv & yv) << j;
75
+ sum += popcount64(qv & yv) << j;
284
76
  }
285
77
  }
286
78
  for (; offset < size; ++offset) {
287
79
  const auto yv = *(data + offset);
288
80
  for (int j = 0; j < qb; j++) {
289
81
  const auto qv = *(query + j * size + offset);
290
- sum += __builtin_popcount(qv & yv) << j;
82
+ sum += popcount32(qv & yv) << j;
291
83
  }
292
84
  }
293
85
  return sum;
294
86
  }
295
87
 
296
- /**
297
- * Compute dot product between query and binary data using popcount operations.
298
- *
299
- * @param query Pointer to rearranged rotated query data
300
- * @param data Pointer to binary data
301
- * @param d Dimension
302
- * @param qb Number of quantization bits
303
- * @return Unsigned integer dot product
304
- */
305
- inline uint64_t bitwise_xor_dot_product(
88
+ template <>
89
+ inline uint64_t bitwise_xor_dot_product<SIMDLevel::NONE>(
306
90
  const uint8_t* query,
307
91
  const uint8_t* data,
308
92
  size_t size,
309
93
  size_t qb) {
310
94
  uint64_t sum = 0;
311
95
  size_t offset = 0;
312
- #if defined(__AVX512F__)
313
- // Handle 512-bit chunks.
314
- if (size_t step = 512 / 8; offset + step <= size) {
315
- __m512i sum_512 = _mm512_setzero_si512();
316
- for (; offset + step <= size; offset += step) {
317
- __m512i v_x = _mm512_loadu_si512((const __m512i*)(data + offset));
318
- for (int j = 0; j < qb; j++) {
319
- __m512i v_q = _mm512_loadu_si512(
320
- (const __m512i*)(query + j * size + offset));
321
- __m512i v_xor = _mm512_xor_si512(v_q, v_x);
322
- __m512i v_popcnt = popcount_512(v_xor);
323
- __m512i v_shifted = _mm512_slli_epi64(v_popcnt, j);
324
- sum_512 = _mm512_add_epi64(sum_512, v_shifted);
325
- }
326
- }
327
- sum += _mm512_reduce_add_epi64(sum_512);
328
- }
329
- #endif
330
- #if defined(__AVX2__)
331
- if (size_t step = 256 / 8; offset + step <= size) {
332
- __m256i sum_256 = _mm256_setzero_si256();
333
- for (; offset + step <= size; offset += step) {
334
- __m256i v_x = _mm256_loadu_si256((const __m256i*)(data + offset));
335
- for (int j = 0; j < qb; j++) {
336
- __m256i v_q = _mm256_loadu_si256(
337
- (const __m256i*)(query + j * size + offset));
338
- __m256i v_xor = _mm256_xor_si256(v_q, v_x);
339
- __m256i v_popcnt = popcount_256(v_xor);
340
- __m256i v_shifted = _mm256_slli_epi64(v_popcnt, j);
341
- sum_256 = _mm256_add_epi64(sum_256, v_shifted);
342
- }
343
- }
344
- sum += reduce_add_256(sum_256);
345
- }
346
- #endif
347
- #if defined(__SSE4_1__)
348
- __m128i sum_128 = _mm_setzero_si128();
349
- for (size_t step = 128 / 8; offset + step <= size; offset += step) {
350
- __m128i v_x = _mm_loadu_si128((const __m128i*)(data + offset));
351
- for (int j = 0; j < qb; j++) {
352
- __m128i v_q = _mm_loadu_si128(
353
- (const __m128i*)(query + j * size + offset));
354
- __m128i v_xor = _mm_xor_si128(v_q, v_x);
355
- __m128i v_popcnt = popcount_128(v_xor);
356
- __m128i v_shifted = _mm_slli_epi64(v_popcnt, j);
357
- sum_128 = _mm_add_epi64(sum_128, v_shifted);
358
- }
359
- }
360
- sum += reduce_add_128(sum_128);
361
- #endif
362
96
  for (size_t step = 64 / 8; offset + step <= size; offset += step) {
363
97
  const auto yv = *(const uint64_t*)(data + offset);
364
98
  for (int j = 0; j < qb; j++) {
365
99
  const auto qv = *(const uint64_t*)(query + j * size + offset);
366
- sum += __builtin_popcountll(qv ^ yv) << j;
100
+ sum += popcount64(qv ^ yv) << j;
367
101
  }
368
102
  }
369
103
  for (; offset < size; ++offset) {
370
104
  const auto yv = *(data + offset);
371
105
  for (int j = 0; j < qb; j++) {
372
106
  const auto qv = *(query + j * size + offset);
373
- sum += __builtin_popcount(qv ^ yv) << j;
107
+ sum += popcount32(qv ^ yv) << j;
374
108
  }
375
109
  }
376
110
  return sum;
377
111
  }
378
112
 
379
- inline uint64_t popcount(const uint8_t* data, size_t size) {
113
+ template <>
114
+ inline uint64_t popcount<SIMDLevel::NONE>(const uint8_t* data, size_t size) {
380
115
  uint64_t sum = 0;
381
116
  size_t offset = 0;
382
- #if defined(__AVX512F__)
383
- // Handle 512-bit chunks.
384
- if (offset + 512 / 8 <= size) {
385
- __m512i sum_512 = _mm512_setzero_si512();
386
- for (size_t end; (end = offset + 512 / 8) <= size; offset = end) {
387
- __m512i v_x = _mm512_loadu_si512((const __m512i*)(data + offset));
388
- __m512i v_popcnt = popcount_512(v_x);
389
- sum_512 = _mm512_add_epi64(sum_512, v_popcnt);
390
- }
391
- sum += _mm512_reduce_add_epi64(sum_512);
392
- }
393
- #endif // defined(__AVX512F__)
394
- #if defined(__AVX2__)
395
- if (offset + 256 / 8 <= size) {
396
- __m256i sum_256 = _mm256_setzero_si256();
397
- for (size_t end; (end = offset + 256 / 8) <= size; offset = end) {
398
- __m256i v_x = _mm256_loadu_si256((const __m256i*)(data + offset));
399
- __m256i v_popcnt = popcount_256(v_x);
400
- sum_256 = _mm256_add_epi64(sum_256, v_popcnt);
401
- }
402
- sum += reduce_add_256(sum_256);
403
- }
404
- #endif // defined(__AVX2__)
405
- #if defined(__SSE4_1__)
406
- __m128i sum_128 = _mm_setzero_si128();
407
- for (size_t step = 128 / 8; offset + step <= size; offset += step) {
408
- __m128i v_x = _mm_loadu_si128((const __m128i*)(data + offset));
409
- sum_128 = _mm_add_epi64(sum_128, popcount_128(v_x));
410
- }
411
- sum += reduce_add_128(sum_128);
412
- #endif // defined(__SSE4_1__)
413
-
414
117
  for (size_t step = 64 / 8; offset + step <= size; offset += step) {
415
118
  const auto yv = *(const uint64_t*)(data + offset);
416
- sum += __builtin_popcountll(yv);
119
+ sum += popcount64(yv);
417
120
  }
418
121
  for (; offset < size; ++offset) {
419
122
  const auto yv = *(data + offset);
420
- sum += __builtin_popcount(yv);
123
+ sum += popcount32(yv);
421
124
  }
422
125
  return sum;
423
126
  }
424
127
 
425
128
  } // namespace faiss::rabitq
129
+
130
+ /*********************************************************
131
+ * Multi-bit RaBitQ inner product kernels.
132
+ *
133
+ * Compute: sum_i rotated_q[i] * ((sign_bit_i << ex_bits) + ex_code_val_i + cb)
134
+ *
135
+ * Strategy:
136
+ * ex_bits == 1: Specialized kernel — both sign_bits and ex_code are
137
+ * 1-bit-per-dim packed, enabling direct bit→mask→float
138
+ * conversion with zero per-element extraction.
139
+ * ex_bits >= 2: Bit-plane decomposition (BMI2 required) — PEXT extracts
140
+ * each bit plane in one instruction, then the same
141
+ * bit→mask→float kernel computes each plane's dot product.
142
+ * Fallback: Scalar extraction via 64-bit window read + shift + mask.
143
+ *********************************************************/
144
+ namespace faiss::rabitq::multibit {
145
+
146
+ /// Scalar inner product for multi-bit RaBitQ.
147
+ /// Extracts each code value in O(1) via 64-bit window read + shift + mask.
148
+ /// Also serves as the tail handler for SIMD kernels via the @p start parameter.
149
+ inline float ip_scalar(
150
+ const uint8_t* __restrict sign_bits,
151
+ const uint8_t* __restrict ex_code,
152
+ const float* __restrict rotated_q,
153
+ size_t start,
154
+ size_t d,
155
+ size_t ex_bits,
156
+ float cb) {
157
+ float result = 0.0f;
158
+ const int sign_shift = static_cast<int>(ex_bits);
159
+ const uint64_t code_mask = (1ULL << ex_bits) - 1;
160
+ for (size_t i = start; i < d; i++) {
161
+ int sb = (sign_bits[i / 8] >> (i % 8)) & 1;
162
+ size_t bit_pos = i * ex_bits;
163
+ size_t byte_idx = bit_pos / 8;
164
+ size_t bit_offset = bit_pos % 8;
165
+ uint64_t raw = 0;
166
+ memcpy(&raw, ex_code + byte_idx, sizeof(uint64_t));
167
+ int ex_val = static_cast<int>((raw >> bit_offset) & code_mask);
168
+ result += rotated_q[i] *
169
+ (static_cast<float>((sb << sign_shift) + ex_val) + cb);
170
+ }
171
+ return result;
172
+ }
173
+
174
+ /**
175
+ * Dispatch to the best available kernel for the given ex_bits.
176
+ *
177
+ * @param sign_bits packed sign bits (1 bit/dim, standard byte packing)
178
+ * @param ex_code packed extra-bit codes (ex_bits bits/dim)
179
+ * @param rotated_q rotated query vector (float[d])
180
+ * @param d dimensionality
181
+ * @param ex_bits number of extra bits per dimension (nb_bits - 1)
182
+ * @param cb constant bias: -(2^ex_bits - 0.5)
183
+ * @return inner product value
184
+ */
185
+ template <SIMDLevel SL = SINGLE_SIMD_LEVEL>
186
+ float compute_inner_product(
187
+ const uint8_t* __restrict sign_bits,
188
+ const uint8_t* __restrict ex_code,
189
+ const float* __restrict rotated_q,
190
+ size_t d,
191
+ size_t ex_bits,
192
+ float cb);
193
+
194
+ // NONE specialization — pure scalar
195
+ template <>
196
+ inline float compute_inner_product<SIMDLevel::NONE>(
197
+ const uint8_t* __restrict sign_bits,
198
+ const uint8_t* __restrict ex_code,
199
+ const float* __restrict rotated_q,
200
+ size_t d,
201
+ size_t ex_bits,
202
+ float cb) {
203
+ return ip_scalar(sign_bits, ex_code, rotated_q, 0, d, ex_bits, cb);
204
+ }
205
+
206
+ } // namespace faiss::rabitq::multibit
@@ -100,7 +100,7 @@ void float_rand(float* x, size_t n, int64_t seed) {
100
100
  int a0 = rng0.rand_int(), b0 = rng0.rand_int();
101
101
 
102
102
  #pragma omp parallel for
103
- for (int64_t j = 0; j < nblock; j++) {
103
+ for (int64_t j = 0; j < static_cast<int64_t>(nblock); j++) {
104
104
  RandomGenerator rng(a0 + j * b0);
105
105
 
106
106
  const size_t istart = j * n / nblock;
@@ -120,7 +120,7 @@ void float_randn(float* x, size_t n, int64_t seed) {
120
120
  int a0 = rng0.rand_int(), b0 = rng0.rand_int();
121
121
 
122
122
  #pragma omp parallel for
123
- for (int64_t j = 0; j < nblock; j++) {
123
+ for (int64_t j = 0; j < static_cast<int64_t>(nblock); j++) {
124
124
  RandomGenerator rng(a0 + j * b0);
125
125
 
126
126
  double a = 0, b = 0, s = 0;
@@ -155,7 +155,7 @@ void int64_rand(int64_t* x, size_t n, int64_t seed) {
155
155
  int a0 = rng0.rand_int(), b0 = rng0.rand_int();
156
156
 
157
157
  #pragma omp parallel for
158
- for (int64_t j = 0; j < nblock; j++) {
158
+ for (int64_t j = 0; j < static_cast<int64_t>(nblock); j++) {
159
159
  RandomGenerator rng(a0 + j * b0);
160
160
 
161
161
  const size_t istart = j * n / nblock;
@@ -174,7 +174,7 @@ void int64_rand_max(int64_t* x, size_t n, uint64_t max, int64_t seed) {
174
174
  int a0 = rng0.rand_int(), b0 = rng0.rand_int();
175
175
 
176
176
  #pragma omp parallel for
177
- for (int64_t j = 0; j < nblock; j++) {
177
+ for (int64_t j = 0; j < static_cast<int64_t>(nblock); j++) {
178
178
  RandomGenerator rng(a0 + j * b0);
179
179
 
180
180
  const size_t istart = j * n / nblock;
@@ -219,7 +219,7 @@ void byte_rand(uint8_t* x, size_t n, int64_t seed) {
219
219
  int a0 = rng0.rand_int(), b0 = rng0.rand_int();
220
220
 
221
221
  #pragma omp parallel for
222
- for (int64_t j = 0; j < nblock; j++) {
222
+ for (int64_t j = 0; j < static_cast<int64_t>(nblock); j++) {
223
223
  RandomGenerator rng(a0 + j * b0);
224
224
 
225
225
  const size_t istart = j * n / nblock;
@@ -261,7 +261,7 @@ void rand_smooth_vectors(size_t n, size_t d, float* x, int64_t seed) {
261
261
  float_rand(scales.data(), d, seed + 2);
262
262
 
263
263
  #pragma omp parallel for if (n * d > 10000)
264
- for (int64_t i = 0; i < n; i++) {
264
+ for (int64_t i = 0; i < static_cast<int64_t>(n); i++) {
265
265
  for (size_t j = 0; j < d; j++) {
266
266
  x[i * d + j] = sinf(x[i * d + j] * (scales[j] * 4 + 0.1));
267
267
  }
@@ -0,0 +1,51 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #pragma once
9
+
10
+ #include <faiss/IndexIVFFlat.h>
11
+ #include <faiss/impl/expanded_scanners.h>
12
+
13
+ #ifndef THE_SIMD_LEVEL
14
+ #error "THE_SIMD_LEVEL not defined"
15
+ #endif
16
+
17
+ namespace faiss {
18
+
19
+ constexpr faiss::SIMDLevel THE_SL = THE_SIMD_LEVEL;
20
+
21
+ #define DEFINE_IVFFLAT_SCANNER_METHODS(mt) \
22
+ template <> \
23
+ float IVFFlatScanner<VectorDistance<mt, THE_SL>>::distance_to_code( \
24
+ const uint8_t* code) const { \
25
+ const float* yj = (float*)code; \
26
+ return vd(xi, yj); \
27
+ } \
28
+ template <> \
29
+ size_t IVFFlatScanner<VectorDistance<mt, THE_SL>>::scan_codes( \
30
+ size_t list_size, \
31
+ const uint8_t* codes, \
32
+ const idx_t* ids, \
33
+ ResultHandler& handler) const { \
34
+ return run_scan_codes_fix_C<C>(*this, list_size, codes, ids, handler); \
35
+ }
36
+
37
+ DEFINE_IVFFLAT_SCANNER_METHODS(METRIC_L2)
38
+ DEFINE_IVFFLAT_SCANNER_METHODS(METRIC_INNER_PRODUCT)
39
+ DEFINE_IVFFLAT_SCANNER_METHODS(METRIC_L1)
40
+ DEFINE_IVFFLAT_SCANNER_METHODS(METRIC_Linf)
41
+ DEFINE_IVFFLAT_SCANNER_METHODS(METRIC_Lp)
42
+ DEFINE_IVFFLAT_SCANNER_METHODS(METRIC_Canberra)
43
+ DEFINE_IVFFLAT_SCANNER_METHODS(METRIC_BrayCurtis)
44
+ DEFINE_IVFFLAT_SCANNER_METHODS(METRIC_JensenShannon)
45
+ DEFINE_IVFFLAT_SCANNER_METHODS(METRIC_Jaccard)
46
+ DEFINE_IVFFLAT_SCANNER_METHODS(METRIC_NaNEuclidean)
47
+ DEFINE_IVFFLAT_SCANNER_METHODS(METRIC_GOWER)
48
+
49
+ #undef DEFINE_IVFFLAT_SCANNER_METHODS
50
+
51
+ } // namespace faiss