faiss 0.5.3 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (379) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +12 -0
  3. data/ext/faiss/ext.cpp +1 -1
  4. data/ext/faiss/extconf.rb +4 -4
  5. data/ext/faiss/index.cpp +63 -45
  6. data/ext/faiss/index_binary.cpp +37 -27
  7. data/ext/faiss/kmeans.cpp +9 -8
  8. data/ext/faiss/pca_matrix.cpp +9 -7
  9. data/ext/faiss/product_quantizer.cpp +13 -11
  10. data/ext/faiss/utils.cpp +4 -2
  11. data/ext/faiss/utils.h +4 -0
  12. data/lib/faiss/version.rb +1 -1
  13. data/lib/faiss.rb +1 -1
  14. data/vendor/faiss/faiss/AutoTune.cpp +214 -82
  15. data/vendor/faiss/faiss/AutoTune.h +14 -1
  16. data/vendor/faiss/faiss/Clustering.cpp +97 -249
  17. data/vendor/faiss/faiss/Clustering.h +18 -0
  18. data/vendor/faiss/faiss/IVFlib.cpp +67 -44
  19. data/vendor/faiss/faiss/Index.cpp +25 -12
  20. data/vendor/faiss/faiss/Index.h +26 -4
  21. data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
  22. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +68 -61
  23. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
  24. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
  25. data/vendor/faiss/faiss/IndexBinary.cpp +6 -3
  26. data/vendor/faiss/faiss/IndexBinary.h +4 -4
  27. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
  28. data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
  29. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
  30. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +92 -95
  31. data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
  32. data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
  33. data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
  34. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +120 -414
  35. data/vendor/faiss/faiss/IndexFastScan.cpp +105 -129
  36. data/vendor/faiss/faiss/IndexFastScan.h +35 -24
  37. data/vendor/faiss/faiss/IndexFlat.cpp +216 -152
  38. data/vendor/faiss/faiss/IndexFlat.h +32 -14
  39. data/vendor/faiss/faiss/IndexFlatCodes.cpp +88 -41
  40. data/vendor/faiss/faiss/IndexFlatCodes.h +7 -1
  41. data/vendor/faiss/faiss/IndexHNSW.cpp +299 -187
  42. data/vendor/faiss/faiss/IndexHNSW.h +30 -14
  43. data/vendor/faiss/faiss/IndexIDMap.cpp +26 -22
  44. data/vendor/faiss/faiss/IndexIDMap.h +9 -7
  45. data/vendor/faiss/faiss/IndexIVF.cpp +535 -405
  46. data/vendor/faiss/faiss/IndexIVF.h +47 -16
  47. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
  48. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +105 -99
  49. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +6 -3
  50. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +379 -249
  51. data/vendor/faiss/faiss/IndexIVFFastScan.h +65 -60
  52. data/vendor/faiss/faiss/IndexIVFFlat.cpp +41 -124
  53. data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
  54. data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +89 -138
  55. data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
  56. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
  57. data/vendor/faiss/faiss/IndexIVFPQ.cpp +77 -907
  58. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +184 -122
  59. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
  60. data/vendor/faiss/faiss/IndexIVFPQR.cpp +23 -18
  61. data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +59 -60
  62. data/vendor/faiss/faiss/IndexIVFRaBitQ.h +4 -3
  63. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +564 -416
  64. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +269 -111
  65. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
  66. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
  67. data/vendor/faiss/faiss/IndexLSH.cpp +44 -25
  68. data/vendor/faiss/faiss/IndexLattice.cpp +41 -36
  69. data/vendor/faiss/faiss/IndexNNDescent.cpp +37 -21
  70. data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
  71. data/vendor/faiss/faiss/IndexNSG.cpp +40 -23
  72. data/vendor/faiss/faiss/IndexNSG.h +0 -2
  73. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +32 -12
  74. data/vendor/faiss/faiss/IndexPQ.cpp +129 -213
  75. data/vendor/faiss/faiss/IndexPQ.h +3 -2
  76. data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
  77. data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
  78. data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
  79. data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
  80. data/vendor/faiss/faiss/IndexRaBitQ.cpp +31 -43
  81. data/vendor/faiss/faiss/IndexRaBitQ.h +4 -3
  82. data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +135 -317
  83. data/vendor/faiss/faiss/IndexRaBitQFastScan.h +192 -34
  84. data/vendor/faiss/faiss/IndexRefine.cpp +30 -55
  85. data/vendor/faiss/faiss/IndexRefine.h +4 -4
  86. data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
  87. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
  88. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
  89. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
  90. data/vendor/faiss/faiss/IndexShards.cpp +13 -13
  91. data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
  92. data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
  93. data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
  94. data/vendor/faiss/faiss/MetaIndexes.h +1 -1
  95. data/vendor/faiss/faiss/MetricType.h +29 -6
  96. data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
  97. data/vendor/faiss/faiss/SuperKMeans.h +97 -0
  98. data/vendor/faiss/faiss/VectorTransform.cpp +349 -141
  99. data/vendor/faiss/faiss/VectorTransform.h +39 -16
  100. data/vendor/faiss/faiss/build.cpp +23 -0
  101. data/vendor/faiss/faiss/build.h +15 -0
  102. data/vendor/faiss/faiss/clone_index.cpp +55 -51
  103. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
  104. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
  105. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
  106. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
  107. data/vendor/faiss/faiss/{cppcontrib/factory_tools.cpp → factory_tools.cpp} +6 -1
  108. data/vendor/faiss/faiss/gpu/GpuCloner.cpp +1 -1
  109. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
  110. data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
  111. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
  112. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
  113. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
  114. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
  115. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
  116. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
  117. data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
  118. data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
  119. data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
  120. data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
  121. data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
  122. data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
  123. data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
  124. data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
  125. data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
  126. data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
  127. data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
  128. data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
  129. data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
  130. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +64 -34
  131. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
  132. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
  133. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -28
  134. data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
  135. data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
  136. data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +367 -0
  137. data/vendor/faiss/faiss/impl/ClusteringInitialization.h +107 -0
  138. data/vendor/faiss/faiss/impl/CodePacker.cpp +7 -3
  139. data/vendor/faiss/faiss/impl/CodePacker.h +11 -3
  140. data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +83 -0
  141. data/vendor/faiss/faiss/impl/CodePackerRaBitQ.h +47 -0
  142. data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
  143. data/vendor/faiss/faiss/impl/FaissAssert.h +64 -3
  144. data/vendor/faiss/faiss/impl/FaissException.h +50 -3
  145. data/vendor/faiss/faiss/impl/HNSW.cpp +117 -351
  146. data/vendor/faiss/faiss/impl/HNSW.h +21 -40
  147. data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
  148. data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
  149. data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
  150. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +114 -102
  151. data/vendor/faiss/faiss/impl/NNDescent.cpp +63 -26
  152. data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
  153. data/vendor/faiss/faiss/impl/NSG.cpp +44 -26
  154. data/vendor/faiss/faiss/impl/NSG.h +20 -10
  155. data/vendor/faiss/faiss/impl/Panorama.cpp +76 -52
  156. data/vendor/faiss/faiss/impl/Panorama.h +265 -78
  157. data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
  158. data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
  159. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +62 -37
  160. data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
  161. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
  162. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
  163. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +99 -80
  164. data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
  165. data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +135 -37
  166. data/vendor/faiss/faiss/impl/RaBitQUtils.h +148 -21
  167. data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +298 -301
  168. data/vendor/faiss/faiss/impl/RaBitQuantizer.h +3 -10
  169. data/vendor/faiss/faiss/impl/RaBitQuantizerMultiBit.cpp +15 -41
  170. data/vendor/faiss/faiss/impl/RaBitQuantizerMultiBit.h +0 -4
  171. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +40 -32
  172. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
  173. data/vendor/faiss/faiss/impl/ResultHandler.h +218 -113
  174. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +119 -2362
  175. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -3
  176. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
  177. data/vendor/faiss/faiss/impl/VisitedTable.cpp +42 -0
  178. data/vendor/faiss/faiss/impl/VisitedTable.h +76 -0
  179. data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
  180. data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
  181. data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
  182. data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
  183. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
  184. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
  185. data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
  186. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
  187. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
  188. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
  189. data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
  190. data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
  191. data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
  192. data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
  193. data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
  194. data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
  195. data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
  196. data/vendor/faiss/faiss/impl/expanded_scanners.h +163 -0
  197. data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
  198. data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
  199. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
  200. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
  201. data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
  202. data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
  203. data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +176 -4
  204. data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
  205. data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
  206. data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
  207. data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
  208. data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
  209. data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
  210. data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -348
  211. data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
  212. data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
  213. data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +290 -142
  214. data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
  215. data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
  216. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
  217. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
  218. data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
  219. data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
  220. data/vendor/faiss/faiss/impl/index_read.cpp +1950 -505
  221. data/vendor/faiss/faiss/impl/index_read_utils.h +1 -2
  222. data/vendor/faiss/faiss/impl/index_write.cpp +112 -21
  223. data/vendor/faiss/faiss/impl/io.cpp +6 -6
  224. data/vendor/faiss/faiss/impl/io_macros.h +33 -16
  225. data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
  226. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +81 -40
  227. data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
  228. data/vendor/faiss/faiss/impl/mapped_io.cpp +15 -8
  229. data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
  230. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
  231. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
  232. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
  233. data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
  234. data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
  235. data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
  236. data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
  237. data/vendor/faiss/faiss/impl/{code_distance/code_distance-avx2.h → pq_code_distance/pq_code_distance-avx2.h} +43 -220
  238. data/vendor/faiss/faiss/impl/{code_distance/code_distance-avx512.h → pq_code_distance/pq_code_distance-avx512.h} +25 -112
  239. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +59 -0
  240. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
  241. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +256 -0
  242. data/vendor/faiss/faiss/impl/{code_distance/code_distance-sve.h → pq_code_distance/pq_code_distance-sve.cpp} +57 -146
  243. data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
  244. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +320 -483
  245. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
  246. data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +121 -0
  247. data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +137 -0
  248. data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +371 -0
  249. data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +190 -0
  250. data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +94 -0
  251. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +603 -0
  252. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +597 -0
  253. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +388 -0
  254. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +630 -0
  255. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
  256. data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +387 -0
  257. data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +54 -0
  258. data/vendor/faiss/faiss/impl/simd_dispatch.h +173 -0
  259. data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
  260. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +274 -171
  261. data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
  262. data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
  263. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
  264. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +275 -217
  265. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
  266. data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
  267. data/vendor/faiss/faiss/impl/svs_io.h +8 -2
  268. data/vendor/faiss/faiss/index_factory.cpp +115 -28
  269. data/vendor/faiss/faiss/index_io.h +53 -3
  270. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +73 -20
  271. data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
  272. data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
  273. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
  274. data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
  275. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
  276. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +14 -14
  277. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
  278. data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +9 -19
  279. data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
  280. data/vendor/faiss/faiss/svs/IndexSVSFlat.h +2 -0
  281. data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
  282. data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
  283. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
  284. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
  285. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
  286. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
  287. data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
  288. data/vendor/faiss/faiss/svs/IndexSVSVamana.h +19 -2
  289. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
  290. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +19 -2
  291. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +14 -0
  292. data/vendor/faiss/faiss/utils/Heap.cpp +56 -10
  293. data/vendor/faiss/faiss/utils/Heap.h +21 -0
  294. data/vendor/faiss/faiss/utils/NeuralNet.cpp +54 -40
  295. data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
  296. data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
  297. data/vendor/faiss/faiss/utils/distances.cpp +507 -559
  298. data/vendor/faiss/faiss/utils/distances.h +118 -1
  299. data/vendor/faiss/faiss/utils/distances_dispatch.h +250 -0
  300. data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
  301. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
  302. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
  303. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
  304. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
  305. data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
  306. data/vendor/faiss/faiss/utils/distances_simd.cpp +72 -3681
  307. data/vendor/faiss/faiss/utils/extra_distances.cpp +60 -102
  308. data/vendor/faiss/faiss/utils/extra_distances.h +79 -7
  309. data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
  310. data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
  311. data/vendor/faiss/faiss/utils/hamming.h +92 -2
  312. data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
  313. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
  314. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
  315. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
  316. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
  317. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
  318. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
  319. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
  320. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
  321. data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
  322. data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
  323. data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
  324. data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
  325. data/vendor/faiss/faiss/utils/partitioning.h +31 -0
  326. data/vendor/faiss/faiss/utils/popcount.h +29 -0
  327. data/vendor/faiss/faiss/utils/pq_code_distance.h +251 -0
  328. data/vendor/faiss/faiss/utils/prefetch.h +2 -2
  329. data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
  330. data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
  331. data/vendor/faiss/faiss/utils/rabitq_simd.h +124 -343
  332. data/vendor/faiss/faiss/utils/random.cpp +6 -6
  333. data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
  334. data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +154 -0
  335. data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +777 -0
  336. data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +306 -0
  337. data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +1431 -0
  338. data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +1095 -0
  339. data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
  340. data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
  341. data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +392 -0
  342. data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
  343. data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
  344. data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
  345. data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
  346. data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
  347. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
  348. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
  349. data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
  350. data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
  351. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
  352. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
  353. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
  354. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
  355. data/vendor/faiss/faiss/utils/simd_levels.cpp +334 -0
  356. data/vendor/faiss/faiss/utils/simd_levels.h +183 -0
  357. data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
  358. data/vendor/faiss/faiss/utils/utils.cpp +21 -14
  359. data/vendor/faiss/faiss/utils/utils.h +3 -3
  360. metadata +156 -42
  361. data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
  362. data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
  363. data/vendor/faiss/faiss/impl/code_distance/code_distance-generic.h +0 -81
  364. data/vendor/faiss/faiss/impl/code_distance/code_distance.h +0 -186
  365. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -216
  366. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -224
  367. data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
  368. data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
  369. data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
  370. data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
  371. data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -228
  372. data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
  373. data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
  374. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -450
  375. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
  376. data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
  377. data/vendor/faiss/faiss/utils/simdlib.h +0 -42
  378. data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -296
  379. /data/vendor/faiss/faiss/{cppcontrib/factory_tools.h → factory_tools.h} +0 -0
@@ -0,0 +1,311 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #ifdef COMPILE_SIMD_RISCV_RVV
9
+
10
+ #include <faiss/impl/scalar_quantizer/codecs.h>
11
+ #include <faiss/impl/scalar_quantizer/distance_computers.h>
12
+ #include <faiss/impl/scalar_quantizer/quantizers.h>
13
+ #include <faiss/impl/scalar_quantizer/scanners.h>
14
+ #include <faiss/impl/scalar_quantizer/similarities.h>
15
+
16
+ #include <riscv_vector.h>
17
+ #include <cmath>
18
+
19
+ namespace faiss {
20
+
21
+ namespace scalar_quantizer {
22
+
23
+ /*************************************************************************
24
+ * Marker specializations.
25
+ *
26
+ * Unlike x86/NEON sq-*.cpp files that expose a fixed 8-wide / 16-wide codec
27
+ * interface (reconstruct_8_components / reconstruct_16_components), RVV is
28
+ * variable-width: the native vector length is implementation-defined and
29
+ * queried at runtime via __riscv_vsetvl. Forcing RVV into a fixed-width
30
+ * codec would leave performance on the table on wider hardware.
31
+ *
32
+ * So the strategy here is: Codec / Quantizer / Similarity classes for
33
+ * RISCV_RVV act as opaque TAG TYPES — they only need to be complete types
34
+ * so that baseline's sq-dispatch.h can form template arguments like
35
+ * `DCTemplate<QuantizerTemplate<Codec4bit<RISCV_RVV>, UNIFORM, RISCV_RVV>,
36
+ * SimilarityL2<RISCV_RVV>, RISCV_RVV>`.
37
+ *
38
+ * The real SIMD work lives in full DCTemplate specializations below.
39
+ * Unspecialized combinations fall through to scalar via the fallback
40
+ * `DCTemplate<Q, Sim, RISCV_RVV> : DCTemplate<Q, Sim, NONE>`.
41
+ ************************************************************************/
42
+
43
+ template <>  // explicit specialization of Codec8bit for SIMDLevel::RISCV_RVV
44
+ struct Codec8bit<SIMDLevel::RISCV_RVV> : Codec8bit<SIMDLevel::NONE> {};  // empty body: adds nothing, inherits everything from the NONE codec
45
+
46
+ template <>  // explicit specialization of Codec4bit for SIMDLevel::RISCV_RVV
47
+ struct Codec4bit<SIMDLevel::RISCV_RVV> : Codec4bit<SIMDLevel::NONE> {};  // empty body: adds nothing, inherits everything from the NONE codec
48
+
49
+ template <>  // explicit specialization of Codec6bit for SIMDLevel::RISCV_RVV
50
+ struct Codec6bit<SIMDLevel::RISCV_RVV> : Codec6bit<SIMDLevel::NONE> {};  // empty body: adds nothing, inherits everything from the NONE codec
51
+
52
+ template <class Codec>  // partial specialization: any Codec, UNIFORM scaling, RISCV_RVV level
53
+ struct QuantizerTemplate<
54
+ Codec,
55
+ QuantizerTemplateScaling::UNIFORM,
56
+ SIMDLevel::RISCV_RVV>
57
+ : QuantizerTemplate<  // base class: same Codec and scaling, but at SIMDLevel::NONE
58
+ Codec,
59
+ QuantizerTemplateScaling::UNIFORM,
60
+ SIMDLevel::NONE> {
61
+ QuantizerTemplate(size_t d, const std::vector<float>& trained)  // constructor only forwards (d, trained) to the NONE base
62
+ : QuantizerTemplate<
63
+ Codec,
64
+ QuantizerTemplateScaling::UNIFORM,
65
+ SIMDLevel::NONE>(d, trained) {}  // empty body: this struct declares no members of its own
66
+ };
67
+
68
+ template <class Codec>  // partial specialization: any Codec, NON_UNIFORM scaling, RISCV_RVV level
68
+ struct QuantizerTemplate<
69
+ Codec,
70
+ QuantizerTemplateScaling::NON_UNIFORM,
71
+ SIMDLevel::RISCV_RVV>
72
+ : QuantizerTemplate<  // base class: same Codec and scaling, but at SIMDLevel::NONE
73
+ Codec,
74
+ QuantizerTemplateScaling::NON_UNIFORM,
75
+ SIMDLevel::NONE> {
76
+ QuantizerTemplate(size_t d, const std::vector<float>& trained)  // constructor only forwards (d, trained) to the NONE base
77
+ : QuantizerTemplate<
78
+ Codec,
79
+ QuantizerTemplateScaling::NON_UNIFORM,
80
+ SIMDLevel::NONE>(d, trained) {}  // empty body: this struct declares no members of its own
81
+ };
83
+
84
+ template <>  // explicit specialization of QuantizerFP16 for SIMDLevel::RISCV_RVV
85
+ struct QuantizerFP16<SIMDLevel::RISCV_RVV> : QuantizerFP16<SIMDLevel::NONE> {  // derives from the NONE quantizer
86
+ QuantizerFP16(size_t d, const std::vector<float>& trained)  // constructor only forwards (d, trained) to the NONE base
87
+ : QuantizerFP16<SIMDLevel::NONE>(d, trained) {}  // empty body: no members are added here
88
+ };
89
+
90
+ template <>  // explicit specialization of QuantizerBF16 for SIMDLevel::RISCV_RVV
91
+ struct QuantizerBF16<SIMDLevel::RISCV_RVV> : QuantizerBF16<SIMDLevel::NONE> {  // derives from the NONE quantizer
92
+ QuantizerBF16(size_t d, const std::vector<float>& trained)  // constructor only forwards (d, trained) to the NONE base
93
+ : QuantizerBF16<SIMDLevel::NONE>(d, trained) {}  // empty body: no members are added here
94
+ };
95
+
96
+ template <>  // explicit specialization of Quantizer8bitDirect for SIMDLevel::RISCV_RVV
97
+ struct Quantizer8bitDirect<SIMDLevel::RISCV_RVV>
98
+ : Quantizer8bitDirect<SIMDLevel::NONE> {  // derives from the NONE quantizer
99
+ Quantizer8bitDirect(size_t d, const std::vector<float>& trained)  // constructor only forwards (d, trained) to the NONE base
100
+ : Quantizer8bitDirect<SIMDLevel::NONE>(d, trained) {}  // empty body: no members are added here
101
+ };
102
+
103
+ template <>  // explicit specialization of Quantizer8bitDirectSigned for SIMDLevel::RISCV_RVV
104
+ struct Quantizer8bitDirectSigned<SIMDLevel::RISCV_RVV>
105
+ : Quantizer8bitDirectSigned<SIMDLevel::NONE> {  // derives from the NONE quantizer
106
+ Quantizer8bitDirectSigned(size_t d, const std::vector<float>& trained)  // constructor only forwards (d, trained) to the NONE base
107
+ : Quantizer8bitDirectSigned<SIMDLevel::NONE>(d, trained) {}  // empty body: no members are added here
108
+ };
109
+
110
+ template <>  // explicit specialization of SimilarityL2 for SIMDLevel::RISCV_RVV
111
+ struct SimilarityL2<SIMDLevel::RISCV_RVV> : SimilarityL2<SIMDLevel::NONE> {  // derives from the NONE similarity
112
+ using SimilarityL2<SIMDLevel::NONE>::SimilarityL2;  // inherit the base constructors; nothing else is declared
113
+ };
114
+
115
+ template <>  // explicit specialization of SimilarityIP for SIMDLevel::RISCV_RVV
116
+ struct SimilarityIP<SIMDLevel::RISCV_RVV> : SimilarityIP<SIMDLevel::NONE> {  // derives from the NONE similarity
117
+ using SimilarityIP<SIMDLevel::NONE>::SimilarityIP;  // inherit the base constructors; nothing else is declared
118
+ };
119
+
120
+ /*************************************************************************
121
+ * Fallback DCTemplate / DistanceComputerByte for RISCV_RVV.
122
+ *
123
+ * Inheriting from the NONE specialization means every (Quantizer, Similarity)
124
+ * combination that does NOT have a hand-tuned RVV full specialization below
125
+ * falls through to scalar code. Callers and the dispatcher don't know or care.
126
+ ************************************************************************/
127
+
128
+ template <class Quantizer, class Similarity>  // partial specialization: any Quantizer/Similarity pair at RISCV_RVV
129
+ struct DCTemplate<Quantizer, Similarity, SIMDLevel::RISCV_RVV>
130
+ : DCTemplate<Quantizer, Similarity, SIMDLevel::NONE> {  // all behavior comes from the NONE distance computer
131
+ using Base = DCTemplate<Quantizer, Similarity, SIMDLevel::NONE>;  // shorthand for the base type
132
+ using Base::Base;  // inherit the base constructors; no members or overrides are added
133
+ };
134
+
135
+ template <class Similarity>  // partial specialization: any Similarity at RISCV_RVV
136
+ struct DistanceComputerByte<Similarity, SIMDLevel::RISCV_RVV>
137
+ : DistanceComputerByte<Similarity, SIMDLevel::NONE> {  // all behavior comes from the NONE byte distance computer
138
+ using Base = DistanceComputerByte<Similarity, SIMDLevel::NONE>;  // shorthand for the base type
139
+ using Base::Base;  // inherit the base constructors; no members or overrides are added
140
+ };
141
+
142
+ /*************************************************************************
143
+ * Fast path — QT_4bit_uniform + L2
144
+ *
145
+ * 4-bit UNIFORM scaling: every component reconstructs as an affine function
146
+ * of the 4-bit code,
147
+ * recon(c) = vmin + vdiff * (c + 0.5) / 15 = final_scale * c + bias
148
+ * where final_scale = vdiff / 15. L2 distance between two reconstructions
149
+ * therefore reduces to final_scale^2 * (q_c - c_c)^2 over integer codes,
150
+ * so we can stay in the int domain and pay one float multiply at the end.
151
+ *
152
+ * The RVV path pre-nibbles the query into q_lo / q_hi (even / odd lanes)
153
+ * once at set_query time and then processes native-VL-sized chunks of code
154
+ * without ever decoding to float.
155
+ ************************************************************************/
156
+
157
+ template <>
158
+ struct DCTemplate<
159
+ QuantizerTemplate<
160
+ Codec4bit<SIMDLevel::RISCV_RVV>,
161
+ QuantizerTemplateScaling::UNIFORM,
162
+ SIMDLevel::RISCV_RVV>,
163
+ SimilarityL2<SIMDLevel::RISCV_RVV>,
164
+ SIMDLevel::RISCV_RVV> : SQDistanceComputer {
165
+ using Sim = SimilarityL2<SIMDLevel::RISCV_RVV>;
166
+
167
+ size_t d;
168
+ float vmin;
169
+ float vdiff;
170
+ float final_scale_sq;
171
+ std::vector<uint8_t> q_lo;
172
+ std::vector<uint8_t> q_hi;
173
+
174
+ DCTemplate(size_t d_in, const std::vector<float>& trained)
175
+ : d(d_in),
176
+ vmin(trained[0]),
177
+ vdiff(trained[1]),
178
+ q_lo((d_in + 1) / 2, 0),
179
+ q_hi((d_in + 1) / 2, 0) {
180
+ const float final_scale = vdiff / 15.0f;
181
+ final_scale_sq = final_scale * final_scale;
182
+ }
183
+
184
+ void set_query(const float* x) final {
185
+ this->q = x;
186
+ const float inv_scale = (vdiff == 0.0f) ? 0.0f : 15.0f / vdiff;
187
+ for (size_t i = 0; i < d; i++) {
188
+ float val = (x[i] - vmin) * inv_scale;
189
+ int code = static_cast<int>(val);
190
+ if (code < 0) {
191
+ code = 0;
192
+ }
193
+ if (code > 15) {
194
+ code = 15;
195
+ }
196
+ if (i % 2 == 0) {
197
+ q_lo[i / 2] = static_cast<uint8_t>(code);
198
+ } else {
199
+ q_hi[i / 2] = static_cast<uint8_t>(code);
200
+ }
201
+ }
202
+ }
203
+
204
+ /// Squared integer-domain L2 between pre-nibbled q and packed 4-bit code.
205
+ /// Uses RVV's native VL; no fixed width assumptions. Returns the raw
206
+ /// integer sum — caller multiplies by final_scale_sq.
207
+ int64_t accumulate_int_l2(const uint8_t* code) const {
208
+ int64_t acc = 0;
209
+ size_t i = 0;
210
+ while (i < d) {
211
+ // Process up to vl codes per iteration. Each code byte packs two
212
+ // 4-bit codes, so we load (vl + 1) / 2 bytes; keep vl even to
213
+ // keep the nibble split aligned with the i % 2 split we used at
214
+ // set_query time.
215
+ size_t remaining = d - i;
216
+ size_t vl = __riscv_vsetvl_e8m1(remaining);
217
+ if (vl & 1) {
218
+ vl -= 1; // keep even; tail handled on next iter or scalar
219
+ }
220
+ if (vl == 0) {
221
+ break;
222
+ }
223
+ const size_t byte_vl = vl / 2;
224
+
225
+ vuint8m1_t packed = __riscv_vle8_v_u8m1(code + i / 2, byte_vl);
226
+ vuint8m1_t ql = __riscv_vle8_v_u8m1(q_lo.data() + i / 2, byte_vl);
227
+ vuint8m1_t qh = __riscv_vle8_v_u8m1(q_hi.data() + i / 2, byte_vl);
228
+
229
+ vuint8m1_t lo_nib = __riscv_vand_vx_u8m1(packed, 0x0F, byte_vl);
230
+ vuint8m1_t hi_nib = __riscv_vsrl_vx_u8m1(packed, 4, byte_vl);
231
+
232
+ // |ql - lo| and |qh - hi| fit in u8 (values are in [0, 15]).
233
+ vuint8m1_t d_lo = __riscv_vsub_vv_u8m1(
234
+ __riscv_vmaxu_vv_u8m1(ql, lo_nib, byte_vl),
235
+ __riscv_vminu_vv_u8m1(ql, lo_nib, byte_vl),
236
+ byte_vl);
237
+ vuint8m1_t d_hi = __riscv_vsub_vv_u8m1(
238
+ __riscv_vmaxu_vv_u8m1(qh, hi_nib, byte_vl),
239
+ __riscv_vminu_vv_u8m1(qh, hi_nib, byte_vl),
240
+ byte_vl);
241
+
242
+ // Square via widening multiply (each byte squared fits in u16,
243
+ // since max byte value is 15 -> 225).
244
+ vuint16m2_t sq_lo = __riscv_vwmulu_vv_u16m2(d_lo, d_lo, byte_vl);
245
+ vuint16m2_t sq_hi = __riscv_vwmulu_vv_u16m2(d_hi, d_hi, byte_vl);
246
+ vuint16m2_t sq_sum = __riscv_vadd_vv_u16m2(sq_lo, sq_hi, byte_vl);
247
+
248
+ // Reduce to a scalar u32 (safe: byte_vl * 450 fits in u32 for
249
+ // any realistic d).
250
+ vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, 1);
251
+ vuint32m1_t red =
252
+ __riscv_vwredsumu_vs_u16m2_u32m1(sq_sum, zero, byte_vl);
253
+ acc += __riscv_vmv_x_s_u32m1_u32(red);
254
+
255
+ i += vl;
256
+ }
257
+ // Scalar tail: cover any leftover odd lane (at most one).
258
+ for (; i < d; i++) {
259
+ uint8_t c_code =
260
+ (i % 2 == 0) ? (code[i / 2] & 0x0F) : (code[i / 2] >> 4);
261
+ uint8_t q_code = (i % 2 == 0) ? q_lo[i / 2] : q_hi[i / 2];
262
+ int diff = int(q_code) - int(c_code);
263
+ acc += diff * diff;
264
+ }
265
+ return acc;
266
+ }
267
+
268
+ float query_to_code(const uint8_t* code) const final {
269
+ return static_cast<float>(accumulate_int_l2(code)) * final_scale_sq;
270
+ }
271
+
272
+ float symmetric_dis(idx_t i, idx_t j) override {
273
+ // Not on the critical path for most workloads; reconstruct both
274
+ // codes into nibbles scalar-style and compute squared distance.
275
+ const uint8_t* c1 = codes + i * code_size;
276
+ const uint8_t* c2 = codes + j * code_size;
277
+ int64_t acc = 0;
278
+ for (size_t k = 0; k < d; k++) {
279
+ uint8_t a = (k % 2 == 0) ? (c1[k / 2] & 0x0F) : (c1[k / 2] >> 4);
280
+ uint8_t b = (k % 2 == 0) ? (c2[k / 2] & 0x0F) : (c2[k / 2] >> 4);
281
+ int diff = int(a) - int(b);
282
+ acc += diff * diff;
283
+ }
284
+ return static_cast<float>(acc) * final_scale_sq;
285
+ }
286
+
287
+ void query_to_codes_batch_4(
288
+ const uint8_t* code_0,
289
+ const uint8_t* code_1,
290
+ const uint8_t* code_2,
291
+ const uint8_t* code_3,
292
+ float& dis0,
293
+ float& dis1,
294
+ float& dis2,
295
+ float& dis3) const final {
296
+ // Simple 4x unroll of the single-code path; good enough as a first
297
+ // cut — gives ILP across the four independent accumulate loops.
298
+ dis0 = static_cast<float>(accumulate_int_l2(code_0)) * final_scale_sq;
299
+ dis1 = static_cast<float>(accumulate_int_l2(code_1)) * final_scale_sq;
300
+ dis2 = static_cast<float>(accumulate_int_l2(code_2)) * final_scale_sq;
301
+ dis3 = static_cast<float>(accumulate_int_l2(code_3)) * final_scale_sq;
302
+ }
303
+ };
304
+
305
+ } // namespace scalar_quantizer
306
+ } // namespace faiss
307
+
308
+ #define THE_LEVEL_TO_DISPATCH SIMDLevel::RISCV_RVV
309
+ #include <faiss/impl/scalar_quantizer/sq-dispatch.h>
310
+
311
+ #endif // COMPILE_SIMD_RISCV_RVV
@@ -0,0 +1,387 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #include <faiss/impl/scalar_quantizer/training.h>
9
+
10
+ #include <faiss/impl/FaissAssert.h>
11
+ #include <algorithm>
12
+ #include <cmath>
13
+
14
+ namespace faiss {
15
+
16
+ namespace scalar_quantizer {
17
+ /*******************************************************************
18
+ * Quantizer range training
19
+ */
20
+
21
/// Square of a single-precision value.
static float sqr(float x) {
    const float squared = x * x;
    return squared;
}
24
+
25
// Tuning constants for the TurboQuant codebook construction.
//
// TurboQuant builds a 1-D optimal scalar quantizer analytically. The target
// density is approximated on a uniform grid over [-1, 1]; the grid is kept
// dense enough both in absolute terms (kTurboQuantGridMin) and per output
// centroid (kTurboQuantGridPerCentroid).

// Largest accepted number of bits per component.
constexpr size_t kTurboQuantMaxBits{8};
// Lower bound on the number of grid points (2^15).
constexpr size_t kTurboQuantGridMin{size_t{1} << 15};
// Grid points allotted per centroid.
constexpr size_t kTurboQuantGridPerCentroid{512};
// Upper bound on the number of Lloyd refinement iterations.
constexpr int kTurboQuantMaxIter{100};
// Refinement stops once the largest centroid move drops below this.
constexpr double kTurboQuantTol{1e-8};
33
+
34
+ void build_TurboQuantMSECodebook(
35
+ size_t d,
36
+ size_t nbits,
37
+ std::vector<float>& centroids,
38
+ std::vector<float>& boundaries) {
39
+ FAISS_THROW_IF_NOT_FMT(
40
+ nbits <= kTurboQuantMaxBits,
41
+ "invalid TurboQuant nbits %zu (must be in [0, %zu])",
42
+ nbits,
43
+ kTurboQuantMaxBits);
44
+
45
+ if (nbits == 0) {
46
+ centroids.clear();
47
+ boundaries.clear();
48
+ return;
49
+ }
50
+
51
+ const size_t k = size_t(1) << nbits;
52
+
53
+ if (d == 1) {
54
+ // In 1-D, a unit vector can only be -1 or +1, so the marginal
55
+ // distribution collapses to two atoms. The TurboQuant codebook is
56
+ // therefore a repeated pair of endpoint centroids.
57
+ centroids.resize(k);
58
+ for (size_t i = 0; i < k; i++) {
59
+ centroids[i] = i < k / 2 ? -1.0f : 1.0f;
60
+ }
61
+ boundaries.resize(k - 1);
62
+ for (size_t i = 0; i + 1 < k; i++) {
63
+ boundaries[i] = 0.5f * (centroids[i] + centroids[i + 1]);
64
+ }
65
+ return;
66
+ }
67
+
68
+ // For d > 1, TurboQuant uses the marginal distribution of one coordinate of
69
+ // a random unit vector in R^d. On [-1, 1], this density is proportional to
70
+ // (1 - x^2)^((d - 3) / 2), which is a symmetric beta-law after a change of
71
+ // variables. The code below discretizes that density.
72
+ const size_t ngrid =
73
+ std::max(kTurboQuantGridMin, k * kTurboQuantGridPerCentroid);
74
+ const double step = 2.0 / ngrid;
75
+ const double alpha = 0.5 * (double(d) - 3.0);
76
+
77
+ std::vector<double> xs(ngrid);
78
+ // prefix_w stores the cumulative mass of the discretized density and
79
+ // prefix_wx stores its cumulative first moment, so interval means can be
80
+ // recovered in O(1).
81
+ std::vector<double> prefix_w(ngrid + 1, 0.0);
82
+ std::vector<double> prefix_wx(ngrid + 1, 0.0);
83
+
84
+ for (size_t i = 0; i < ngrid; i++) {
85
+ const double x = -1.0 + (i + 0.5) * step;
86
+ const double one_minus_x2 = std::max(0.0, 1.0 - x * x);
87
+ double w;
88
+ if (alpha == 0.0) { // when d == 3
89
+ w = 1.0;
90
+ } else {
91
+ // (1-x^2)^((d-3)/2)
92
+ w = std::pow(one_minus_x2, alpha);
93
+ }
94
+ if (!std::isfinite(w) || w < 0.0) {
95
+ w = 0.0;
96
+ }
97
+ xs[i] = x;
98
+ prefix_w[i + 1] = prefix_w[i] + w;
99
+ prefix_wx[i + 1] = prefix_wx[i] + w * x;
100
+ }
101
+
102
+ auto range_mean = [&](size_t i0, size_t i1, double fallback) {
103
+ const double w = prefix_w[i1] - prefix_w[i0];
104
+ if (w <= 0.0) {
105
+ return fallback;
106
+ }
107
+ return (prefix_wx[i1] - prefix_wx[i0]) / w;
108
+ };
109
+
110
+ const double total_w = prefix_w.back();
111
+ std::vector<size_t> cuts(k + 1, 0);
112
+ cuts[k] = ngrid;
113
+
114
+ // Initialize with k equal-mass cells under the target density. This gives
115
+ // a stable starting point before the Lloyd refinements below.
116
+ for (size_t i = 1; i < k; i++) {
117
+ const double target = total_w * i / k;
118
+ cuts[i] = std::lower_bound(prefix_w.begin(), prefix_w.end(), target) -
119
+ prefix_w.begin();
120
+ cuts[i] = std::min(cuts[i], ngrid);
121
+ }
122
+
123
+ std::vector<double> centroids_d(k);
124
+ for (size_t i = 0; i < k; i++) {
125
+ const double left = -1.0 + 2.0 * i / k;
126
+ const double right = -1.0 + 2.0 * (i + 1) / k;
127
+ // First estimate of each centroid: the conditional mean of its initial
128
+ // equal-mass cell, with a uniform-cell midpoint as a fallback.
129
+ centroids_d[i] = range_mean(cuts[i], cuts[i + 1], 0.5 * (left + right));
130
+ }
131
+
132
+ std::vector<double> boundaries_d(k > 0 ? k - 1 : 0);
133
+
134
+ // Refine the 1-D codebook with a weighted Lloyd iteration over the
135
+ // discretized marginal density on [-1, 1]:
136
+ // 1. boundaries_d are the Voronoi separators implied by neighboring
137
+ // centroids.
138
+ // 2. cuts map each boundary interval back to a contiguous range of the
139
+ // integration grid xs[].
140
+ // 3. each centroid becomes the weighted mean of the samples currently in
141
+ // its cell, clipped to stay within its neighboring boundaries.
142
+ //
143
+ // The loop stops once the largest centroid update is below kTurboQuantTol.
144
+ for (int iter = 0; iter < kTurboQuantMaxIter; iter++) {
145
+ // Midpoints between adjacent centroids define the current Voronoi
146
+ // partition of [-1, 1].
147
+ for (size_t i = 0; i + 1 < k; i++) {
148
+ boundaries_d[i] = 0.5 * (centroids_d[i] + centroids_d[i + 1]);
149
+ }
150
+
151
+ cuts[0] = 0;
152
+ cuts[k] = ngrid;
153
+ // Reassign the discretized density samples to the Voronoi cell induced
154
+ // by each boundary. Because xs is sorted, the reassignment reduces to
155
+ // finding the first grid point strictly greater than each boundary.
156
+ for (size_t i = 1; i < k; i++) {
157
+ cuts[i] = std::upper_bound(
158
+ xs.begin(), xs.end(), boundaries_d[i - 1]) -
159
+ xs.begin();
160
+ }
161
+
162
+ double max_delta = 0.0;
163
+ for (size_t i = 0; i < k; i++) {
164
+ const double left = i == 0 ? -1.0 : boundaries_d[i - 1];
165
+ const double right = i + 1 == k ? 1.0 : boundaries_d[i];
166
+ // Lloyd update: replace the centroid with the weighted average of
167
+ // the mass assigned to its cell. Empty cells fall back to the cell
168
+ // midpoint, and we clamp to [left, right] to preserve ordering.
169
+ double c = range_mean(cuts[i], cuts[i + 1], 0.5 * (left + right));
170
+ c = std::min(std::max(c, left), right);
171
+ max_delta = std::max(max_delta, std::abs(c - centroids_d[i]));
172
+ centroids_d[i] = c;
173
+ }
174
+
175
+ if (max_delta < kTurboQuantTol) {
176
+ break;
177
+ }
178
+ }
179
+
180
+ std::sort(centroids_d.begin(), centroids_d.end());
181
+
182
+ centroids.resize(k);
183
+ boundaries.resize(k - 1);
184
+ for (size_t i = 0; i < k; i++) {
185
+ centroids[i] = centroids_d[i];
186
+ }
187
+ for (size_t i = 0; i + 1 < k; i++) {
188
+ boundaries[i] = 0.5f * (centroids[i] + centroids[i + 1]);
189
+ }
190
+ }
191
+
192
+ void train_TurboQuantMSE(size_t d, size_t nbits, std::vector<float>& trained) {
193
+ FAISS_THROW_IF_NOT_FMT(
194
+ nbits > 0, "invalid TurboQuant SQ nbits %zu (must be > 0)", nbits);
195
+ std::vector<float> centroids;
196
+ std::vector<float> boundaries;
197
+ build_TurboQuantMSECodebook(d, nbits, centroids, boundaries);
198
+ const size_t k = centroids.size();
199
+
200
+ trained.resize(k + (k - 1));
201
+ for (size_t i = 0; i < k; i++) {
202
+ trained[i] = centroids[i];
203
+ }
204
+ for (size_t i = 0; i + 1 < k; i++) {
205
+ trained[k + i] = boundaries[i];
206
+ }
207
+ }
208
+
209
+ void train_Uniform(
210
+ RangeStat rs,
211
+ float rs_arg,
212
+ idx_t n,
213
+ int k,
214
+ const float* x,
215
+ std::vector<float>& trained) {
216
+ FAISS_THROW_IF_NOT(n > 0);
217
+ trained.resize(2);
218
+ float& vmin = trained[0];
219
+ float& vmax = trained[1];
220
+
221
+ if (rs == ScalarQuantizer::RS_minmax) {
222
+ vmin = HUGE_VAL;
223
+ vmax = -HUGE_VAL;
224
+ for (idx_t i = 0; i < n; i++) {
225
+ if (x[i] < vmin) {
226
+ vmin = x[i];
227
+ }
228
+ if (x[i] > vmax) {
229
+ vmax = x[i];
230
+ }
231
+ }
232
+ float vexp = (vmax - vmin) * rs_arg;
233
+ vmin -= vexp;
234
+ vmax += vexp;
235
+ } else if (rs == ScalarQuantizer::RS_meanstd) {
236
+ double sum = 0, sum2 = 0;
237
+ for (idx_t i = 0; i < n; i++) {
238
+ sum += x[i];
239
+ sum2 += x[i] * x[i];
240
+ }
241
+ float mean = sum / n;
242
+ float var = sum2 / n - mean * mean;
243
+ float std = var <= 0 ? 1.0 : std::sqrt(var);
244
+
245
+ vmin = mean - std * rs_arg;
246
+ vmax = mean + std * rs_arg;
247
+ } else if (rs == ScalarQuantizer::RS_quantiles) {
248
+ std::vector<float> x_copy(n);
249
+ memcpy(x_copy.data(), x, n * sizeof(*x));
250
+ idx_t o = static_cast<idx_t>(rs_arg * n);
251
+ if (o < 0) {
252
+ o = 0;
253
+ }
254
+ if (o > n - o) {
255
+ o = n / 2;
256
+ }
257
+ std::nth_element(x_copy.begin(), x_copy.begin() + o, x_copy.end());
258
+ vmin = x_copy[o];
259
+ std::nth_element(
260
+ x_copy.begin(), x_copy.begin() + (n - 1 - o), x_copy.end());
261
+ vmax = x_copy[n - 1 - o];
262
+
263
+ } else if (rs == ScalarQuantizer::RS_optim) {
264
+ float a, b;
265
+ float sx = 0;
266
+ {
267
+ vmin = HUGE_VAL, vmax = -HUGE_VAL;
268
+ for (idx_t i = 0; i < n; i++) {
269
+ if (x[i] < vmin) {
270
+ vmin = x[i];
271
+ }
272
+ if (x[i] > vmax) {
273
+ vmax = x[i];
274
+ }
275
+ sx += x[i];
276
+ }
277
+ b = vmin;
278
+ a = (vmax - vmin) / (k - 1);
279
+ }
280
+ int verbose = false;
281
+ int niter = 2000;
282
+ float last_err = -1;
283
+ int iter_last_err = 0;
284
+ for (int it = 0; it < niter; it++) {
285
+ float sn = 0, sn2 = 0, sxn = 0, err1 = 0;
286
+
287
+ for (idx_t i = 0; i < n; i++) {
288
+ float xi = x[i];
289
+ float ni = floor((xi - b) / a + 0.5);
290
+ if (ni < 0) {
291
+ ni = 0;
292
+ }
293
+ if (ni >= k) {
294
+ ni = k - 1;
295
+ }
296
+ err1 += sqr(xi - (ni * a + b));
297
+ sn += ni;
298
+ sn2 += ni * ni;
299
+ sxn += ni * xi;
300
+ }
301
+
302
+ if (err1 == last_err) {
303
+ iter_last_err++;
304
+ if (iter_last_err == 16) {
305
+ break;
306
+ }
307
+ } else {
308
+ last_err = err1;
309
+ iter_last_err = 0;
310
+ }
311
+
312
+ float det = sqr(sn) - sn2 * n;
313
+
314
+ b = (sn * sxn - sn2 * sx) / det;
315
+ a = (sn * sx - n * sxn) / det;
316
+ if (verbose) {
317
+ printf("it %d, err1=%g \r", it, err1);
318
+ fflush(stdout);
319
+ }
320
+ }
321
+ if (verbose) {
322
+ printf("\n");
323
+ }
324
+
325
+ vmin = b;
326
+ vmax = b + a * (k - 1);
327
+
328
+ } else {
329
+ FAISS_THROW_MSG("Invalid qtype");
330
+ }
331
+ vmax -= vmin;
332
+ }
333
+
334
+ void train_NonUniform(
335
+ RangeStat rs,
336
+ float rs_arg,
337
+ idx_t n,
338
+ int d,
339
+ int k,
340
+ const float* x,
341
+ std::vector<float>& trained) {
342
+ trained.resize(static_cast<size_t>(2) * d);
343
+ float* vmin = trained.data();
344
+ float* vmax = trained.data() + d;
345
+ if (rs == ScalarQuantizer::RS_minmax) {
346
+ memcpy(vmin, x, sizeof(*x) * d);
347
+ memcpy(vmax, x, sizeof(*x) * d);
348
+ for (idx_t i = 1; i < n; i++) {
349
+ const float* xi = x + i * d;
350
+ for (int j = 0; j < d; j++) {
351
+ if (xi[j] < vmin[j]) {
352
+ vmin[j] = xi[j];
353
+ }
354
+ if (xi[j] > vmax[j]) {
355
+ vmax[j] = xi[j];
356
+ }
357
+ }
358
+ }
359
+ float* vdiff = vmax;
360
+ for (int j = 0; j < d; j++) {
361
+ float vexp = (vmax[j] - vmin[j]) * rs_arg;
362
+ vmin[j] -= vexp;
363
+ vmax[j] += vexp;
364
+ vdiff[j] = vmax[j] - vmin[j];
365
+ }
366
+ } else {
367
+ // transpose
368
+ std::vector<float> xt(n * d);
369
+ for (idx_t i = 1; i < n; i++) {
370
+ const float* xi = x + i * d;
371
+ for (int j = 0; j < d; j++) {
372
+ xt[j * n + i] = xi[j];
373
+ }
374
+ }
375
+ std::vector<float> trained_d(2);
376
+ #pragma omp parallel for
377
+ for (int j = 0; j < d; j++) {
378
+ train_Uniform(rs, rs_arg, n, k, xt.data() + j * n, trained_d);
379
+ vmin[j] = trained_d[0];
380
+ vmax[j] = trained_d[1];
381
+ }
382
+ }
383
+ }
384
+
385
+ } // namespace scalar_quantizer
386
+
387
+ } // namespace faiss