faiss 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (361)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/faiss/extconf.rb +2 -1
  4. data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
  5. data/ext/faiss/index_binary.cpp +1 -1
  6. data/ext/faiss/kmeans.cpp +1 -1
  7. data/ext/faiss/pca_matrix.cpp +1 -1
  8. data/ext/faiss/product_quantizer.cpp +1 -1
  9. data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
  10. data/lib/faiss/version.rb +1 -1
  11. data/vendor/faiss/faiss/AutoTune.cpp +93 -80
  12. data/vendor/faiss/faiss/Clustering.cpp +39 -240
  13. data/vendor/faiss/faiss/Clustering.h +6 -0
  14. data/vendor/faiss/faiss/IVFlib.cpp +41 -21
  15. data/vendor/faiss/faiss/Index.cpp +6 -5
  16. data/vendor/faiss/faiss/Index.h +5 -5
  17. data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
  20. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
  21. data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
  22. data/vendor/faiss/faiss/IndexBinary.h +4 -4
  23. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
  24. data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
  26. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +84 -92
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
  28. data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
  29. data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
  30. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +87 -415
  31. data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
  32. data/vendor/faiss/faiss/IndexFastScan.h +25 -23
  33. data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
  34. data/vendor/faiss/faiss/IndexFlat.h +21 -18
  35. data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
  36. data/vendor/faiss/faiss/IndexHNSW.cpp +283 -145
  37. data/vendor/faiss/faiss/IndexHNSW.h +16 -2
  38. data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
  39. data/vendor/faiss/faiss/IndexIDMap.h +9 -7
  40. data/vendor/faiss/faiss/IndexIVF.cpp +465 -362
  41. data/vendor/faiss/faiss/IndexIVF.h +33 -12
  42. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +77 -74
  43. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
  44. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
  45. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
  46. data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
  47. data/vendor/faiss/faiss/IndexIVFFlat.cpp +36 -68
  48. data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
  49. data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +53 -30
  50. data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
  51. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
  52. data/vendor/faiss/faiss/IndexIVFPQ.cpp +71 -843
  53. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
  54. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
  55. data/vendor/faiss/faiss/IndexIVFPQR.cpp +21 -17
  56. data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +26 -39
  57. data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
  58. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
  59. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
  60. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
  61. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
  62. data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
  63. data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
  64. data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
  65. data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
  66. data/vendor/faiss/faiss/IndexNSG.cpp +39 -23
  67. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
  68. data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
  69. data/vendor/faiss/faiss/IndexPQ.h +3 -2
  70. data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
  71. data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
  72. data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
  73. data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
  74. data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
  75. data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
  76. data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
  77. data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
  78. data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
  79. data/vendor/faiss/faiss/IndexRefine.h +4 -4
  80. data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
  81. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
  82. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
  83. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +82 -14
  84. data/vendor/faiss/faiss/IndexShards.cpp +10 -9
  85. data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
  86. data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
  87. data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
  88. data/vendor/faiss/faiss/MetaIndexes.h +1 -1
  89. data/vendor/faiss/faiss/MetricType.h +14 -7
  90. data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
  91. data/vendor/faiss/faiss/SuperKMeans.h +97 -0
  92. data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
  93. data/vendor/faiss/faiss/VectorTransform.h +16 -16
  94. data/vendor/faiss/faiss/build.cpp +23 -0
  95. data/vendor/faiss/faiss/build.h +15 -0
  96. data/vendor/faiss/faiss/clone_index.cpp +48 -47
  97. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
  98. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
  101. data/vendor/faiss/faiss/factory_tools.cpp +5 -0
  102. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
  103. data/vendor/faiss/faiss/gpu/GpuResources.h +1 -1
  104. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +9 -9
  105. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +4 -3
  106. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
  107. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
  108. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
  109. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
  110. data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
  111. data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
  112. data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
  113. data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
  114. data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
  115. data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
  116. data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +51 -0
  117. data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
  118. data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +66 -0
  119. data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
  120. data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
  121. data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
  122. data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
  123. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
  124. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
  125. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
  126. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
  127. data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
  128. data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
  129. data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
  130. data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
  131. data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
  132. data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
  133. data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
  134. data/vendor/faiss/faiss/impl/FaissException.h +50 -3
  135. data/vendor/faiss/faiss/impl/HNSW.cpp +92 -317
  136. data/vendor/faiss/faiss/impl/HNSW.h +13 -34
  137. data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
  138. data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
  139. data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
  140. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
  141. data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
  142. data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
  143. data/vendor/faiss/faiss/impl/NSG.cpp +38 -21
  144. data/vendor/faiss/faiss/impl/NSG.h +4 -4
  145. data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
  146. data/vendor/faiss/faiss/impl/Panorama.h +258 -87
  147. data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
  148. data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
  149. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
  150. data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
  151. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
  152. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
  153. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +30 -23
  154. data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
  155. data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
  156. data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
  157. data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +296 -283
  158. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
  159. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
  160. data/vendor/faiss/faiss/impl/ResultHandler.h +99 -75
  161. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +52 -4
  162. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +27 -1
  163. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
  164. data/vendor/faiss/faiss/impl/VisitedTable.h +7 -0
  165. data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
  166. data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
  167. data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
  168. data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
  169. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
  170. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
  171. data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
  172. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
  173. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
  174. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
  175. data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
  176. data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
  177. data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
  178. data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
  179. data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
  180. data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
  181. data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
  182. data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
  183. data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
  184. data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
  185. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
  186. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
  187. data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
  188. data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +268 -0
  189. data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
  190. data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
  191. data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
  192. data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
  193. data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
  194. data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
  195. data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
  196. data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
  197. data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
  198. data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
  199. data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
  200. data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
  201. data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
  202. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +91 -0
  203. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +64 -0
  204. data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +104 -0
  205. data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +111 -0
  206. data/vendor/faiss/faiss/impl/index_read.cpp +1132 -45
  207. data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
  208. data/vendor/faiss/faiss/impl/index_write.cpp +95 -13
  209. data/vendor/faiss/faiss/impl/io.cpp +6 -6
  210. data/vendor/faiss/faiss/impl/io_macros.h +33 -16
  211. data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
  212. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
  213. data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
  214. data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
  215. data/vendor/faiss/faiss/impl/platform_macros.h +11 -4
  216. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
  217. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
  218. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
  219. data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
  220. data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +21 -0
  221. data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +21 -0
  222. data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +21 -0
  223. data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
  224. data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
  225. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +29 -111
  226. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
  227. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +238 -5
  228. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +5 -7
  229. data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +68 -0
  230. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
  231. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
  232. data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
  233. data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +3 -2
  234. data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +102 -11
  235. data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
  236. data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
  237. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +148 -0
  238. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +167 -0
  239. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +59 -0
  240. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +163 -0
  241. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +311 -0
  242. data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
  243. data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
  244. data/vendor/faiss/faiss/impl/simd_dispatch.h +100 -66
  245. data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
  246. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
  247. data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
  248. data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
  249. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
  250. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
  251. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
  252. data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
  253. data/vendor/faiss/faiss/impl/svs_io.h +8 -2
  254. data/vendor/faiss/faiss/index_factory.cpp +86 -18
  255. data/vendor/faiss/faiss/index_io.h +24 -0
  256. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
  257. data/vendor/faiss/faiss/invlists/DirectMap.cpp +24 -14
  258. data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
  259. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +157 -73
  260. data/vendor/faiss/faiss/invlists/InvertedLists.h +86 -23
  261. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
  262. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
  263. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
  264. data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
  265. data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
  266. data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
  267. data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
  268. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
  269. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
  270. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
  271. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
  272. data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +25 -1
  273. data/vendor/faiss/faiss/svs/IndexSVSVamana.h +18 -2
  274. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +1 -1
  275. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +12 -3
  276. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +7 -2
  277. data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
  278. data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
  279. data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
  280. data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
  281. data/vendor/faiss/faiss/utils/distances.cpp +390 -560
  282. data/vendor/faiss/faiss/utils/distances.h +20 -1
  283. data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
  284. data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
  285. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
  286. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
  287. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
  288. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
  289. data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
  290. data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -177
  291. data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
  292. data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
  293. data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
  294. data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
  295. data/vendor/faiss/faiss/utils/hamming.h +92 -2
  296. data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
  297. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +15 -0
  298. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
  299. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
  300. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +234 -0
  301. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
  302. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
  303. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
  304. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
  305. data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
  306. data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
  307. data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
  308. data/vendor/faiss/faiss/utils/partitioning.cpp +66 -987
  309. data/vendor/faiss/faiss/utils/partitioning.h +31 -0
  310. data/vendor/faiss/faiss/utils/popcount.h +29 -0
  311. data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
  312. data/vendor/faiss/faiss/utils/prefetch.h +2 -2
  313. data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
  314. data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
  315. data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
  316. data/vendor/faiss/faiss/utils/random.cpp +6 -6
  317. data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
  318. data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
  319. data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
  320. data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
  321. data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
  322. data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
  323. data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
  324. data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
  325. data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
  326. data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
  327. data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
  328. data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
  329. data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
  330. data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1085 -0
  331. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
  332. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
  333. data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
  334. data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
  335. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
  336. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
  337. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
  338. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
  339. data/vendor/faiss/faiss/utils/simd_levels.cpp +17 -5
  340. data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
  341. data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
  342. data/vendor/faiss/faiss/utils/utils.cpp +5 -5
  343. data/vendor/faiss/faiss/utils/utils.h +3 -3
  344. metadata +119 -34
  345. data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
  346. data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
  347. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
  348. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
  349. data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
  350. data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
  351. data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
  352. data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
  353. data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
  354. data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
  355. data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
  356. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
  357. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
  358. data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
  359. data/vendor/faiss/faiss/utils/simdlib.h +0 -42
  360. data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
  361. /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
@@ -8,12 +8,15 @@
8
8
  #include <faiss/utils/partitioning.h>
9
9
 
10
10
  #include <cassert>
11
+ #include <cinttypes>
11
12
  #include <cmath>
13
+ #include <cstring>
14
+ #include <type_traits>
12
15
 
13
16
  #include <faiss/impl/FaissAssert.h>
17
+ #include <faiss/impl/simd_dispatch.h>
14
18
  #include <faiss/utils/AlignedTable.h>
15
19
  #include <faiss/utils/ordered_key_value.h>
16
- #include <faiss/utils/simdlib.h>
17
20
 
18
21
  #include <faiss/impl/platform_macros.h>
19
22
 
@@ -50,8 +53,8 @@ typename C::T sample_threshold_median3(
50
53
  T val3[3];
51
54
  int vi = 0;
52
55
 
53
- for (size_t i = 0; i < n; i++) {
54
- T v = vals[(i * big_prime) % n];
56
+ for (size_t i = 0; i < static_cast<size_t>(n); i++) {
57
+ T v = vals[(i * big_prime) % static_cast<size_t>(n)];
55
58
  // thresh_inf < v < thresh_sup (for CMax)
56
59
  if (C::cmp(v, thresh_inf) && C::cmp(thresh_sup, v)) {
57
60
  val3[vi++] = v;
@@ -217,527 +220,9 @@ typename C::T partition_fuzzy_median3(
217
220
  return thresh;
218
221
  }
219
222
 
220
- } // namespace partitioning
221
-
222
- /******************************************************************
223
- * SIMD routines when vals is an aligned array of uint16_t
224
- ******************************************************************/
225
-
226
- namespace simd_partitioning {
227
-
228
- void find_minimax(
229
- const uint16_t* vals,
230
- size_t n,
231
- uint16_t& smin,
232
- uint16_t& smax) {
233
- simd16uint16 vmin(0xffff), vmax(0);
234
- for (size_t i = 0; i + 15 < n; i += 16) {
235
- simd16uint16 v(vals + i);
236
- vmin.accu_min(v);
237
- vmax.accu_max(v);
238
- }
239
-
240
- ALIGNED(32) uint16_t tab32[32];
241
- vmin.store(tab32);
242
- vmax.store(tab32 + 16);
243
-
244
- smin = tab32[0], smax = tab32[16];
245
-
246
- for (int i = 1; i < 16; i++) {
247
- smin = std::min(smin, tab32[i]);
248
- smax = std::max(smax, tab32[i + 16]);
249
- }
250
-
251
- // missing values
252
- for (size_t i = (n & ~15); i < n; i++) {
253
- smin = std::min(smin, vals[i]);
254
- smax = std::max(smax, vals[i]);
255
- }
256
- }
257
-
258
- // max func differentiates between CMin and CMax (keep lowest or largest)
259
- template <class C>
260
- simd16uint16 max_func(simd16uint16 v, simd16uint16 thr16) {
261
- constexpr bool is_max = C::is_max;
262
- if (is_max) {
263
- return max(v, thr16);
264
- } else {
265
- return min(v, thr16);
266
- }
267
- }
268
-
269
- template <class C>
270
- void count_lt_and_eq(
271
- const uint16_t* vals,
272
- int n,
273
- uint16_t thresh,
274
- size_t& n_lt,
275
- size_t& n_eq) {
276
- n_lt = n_eq = 0;
277
- simd16uint16 thr16(thresh);
278
-
279
- size_t n1 = n / 16;
280
-
281
- for (size_t i = 0; i < n1; i++) {
282
- simd16uint16 v(vals);
283
- vals += 16;
284
- simd16uint16 eqmask = (v == thr16);
285
- simd16uint16 max2 = max_func<C>(v, thr16);
286
- simd16uint16 gemask = (v == max2);
287
- uint32_t bits = get_MSBs(uint16_to_uint8_saturate(eqmask, gemask));
288
- int i_eq = __builtin_popcount(bits & 0x00ff00ff);
289
- int i_ge = __builtin_popcount(bits) - i_eq;
290
- n_eq += i_eq;
291
- n_lt += 16 - i_ge;
292
- }
293
-
294
- for (size_t i = n1 * 16; i < n; i++) {
295
- uint16_t v = *vals++;
296
- if (C::cmp(thresh, v)) {
297
- n_lt++;
298
- } else if (v == thresh) {
299
- n_eq++;
300
- }
301
- }
302
- }
303
-
304
- /* compress separated values and ids table, keeping all values < thresh and at
305
- * most n_eq equal values */
306
- template <class C>
307
- int simd_compress_array(
308
- uint16_t* vals,
309
- typename C::TI* ids,
310
- size_t n,
311
- uint16_t thresh,
312
- int n_eq) {
313
- simd16uint16 thr16(thresh);
314
- simd16uint16 mixmask(0xff00);
315
-
316
- int wp = 0;
317
- size_t i0;
318
-
319
- // loop while there are eqs to collect
320
- for (i0 = 0; i0 + 15 < n && n_eq > 0; i0 += 16) {
321
- simd16uint16 v(vals + i0);
322
- simd16uint16 max2 = max_func<C>(v, thr16);
323
- simd16uint16 gemask = (v == max2);
324
- simd16uint16 eqmask = (v == thr16);
325
- uint32_t bits = get_MSBs(
326
- blendv(simd32uint8(eqmask),
327
- simd32uint8(gemask),
328
- simd32uint8(mixmask)));
329
- bits ^= 0xAAAAAAAA;
330
- // bit 2*i : eq
331
- // bit 2*i + 1 : lt
332
-
333
- while (bits) {
334
- int j = __builtin_ctz(bits) & (~1);
335
- bool is_eq = (bits >> j) & 1;
336
- bool is_lt = (bits >> j) & 2;
337
- bits &= ~(3 << j);
338
- j >>= 1;
339
-
340
- if (is_lt) {
341
- vals[wp] = vals[i0 + j];
342
- ids[wp] = ids[i0 + j];
343
- wp++;
344
- } else if (is_eq && n_eq > 0) {
345
- vals[wp] = vals[i0 + j];
346
- ids[wp] = ids[i0 + j];
347
- wp++;
348
- n_eq--;
349
- }
350
- }
351
- }
352
-
353
- // handle remaining, only strictly lt ones.
354
- for (; i0 + 15 < n; i0 += 16) {
355
- simd16uint16 v(vals + i0);
356
- simd16uint16 max2 = max_func<C>(v, thr16);
357
- simd16uint16 gemask = (v == max2);
358
- uint32_t bits = ~get_MSBs(simd32uint8(gemask));
359
-
360
- while (bits) {
361
- int j = __builtin_ctz(bits);
362
- bits &= ~(3 << j);
363
- j >>= 1;
364
-
365
- vals[wp] = vals[i0 + j];
366
- ids[wp] = ids[i0 + j];
367
- wp++;
368
- }
369
- }
370
-
371
- // end with scalar
372
- for (int i = (n & ~15); i < n; i++) {
373
- if (C::cmp(thresh, vals[i])) {
374
- vals[wp] = vals[i];
375
- ids[wp] = ids[i];
376
- wp++;
377
- } else if (vals[i] == thresh && n_eq > 0) {
378
- vals[wp] = vals[i];
379
- ids[wp] = ids[i];
380
- wp++;
381
- n_eq--;
382
- }
383
- }
384
- assert(n_eq == 0);
385
- return wp;
386
- }
387
-
388
- // #define MICRO_BENCHMARK
389
-
390
- static uint64_t get_cy() {
391
- #ifdef MICRO_BENCHMARK
392
- uint32_t high, low;
393
- asm volatile("rdtsc \n\t" : "=a"(low), "=d"(high));
394
- return ((uint64_t)high << 32) | (low);
395
- #else
396
- return 0;
397
- #endif
398
- }
399
-
400
- #define IFV if (false)
401
-
402
- template <class C>
403
- uint16_t simd_partition_fuzzy_with_bounds(
404
- uint16_t* vals,
405
- typename C::TI* ids,
406
- size_t n,
407
- size_t q_min,
408
- size_t q_max,
409
- size_t* q_out,
410
- uint16_t s0i,
411
- uint16_t s1i) {
412
- if (q_min == 0) {
413
- if (q_out) {
414
- *q_out = 0;
415
- }
416
- return 0;
417
- }
418
- if (q_max >= n) {
419
- if (q_out) {
420
- *q_out = q_max;
421
- }
422
- return 0xffff;
423
- }
424
- if (s0i == s1i) {
425
- if (q_out) {
426
- *q_out = q_min;
427
- }
428
- return s0i;
429
- }
430
- uint64_t t0 = get_cy();
431
-
432
- // lower bound inclusive, upper exclusive
433
- size_t s0 = s0i, s1 = s1i + 1;
434
-
435
- IFV printf("bounds: %ld %ld\n", s0, s1 - 1);
436
-
437
- int thresh;
438
- size_t n_eq = 0, n_lt = 0;
439
- size_t q = 0;
440
-
441
- for (int it = 0; it < 200; it++) {
442
- // while(s0 + 1 < s1) {
443
- thresh = (s0 + s1) / 2;
444
- count_lt_and_eq<C>(vals, n, thresh, n_lt, n_eq);
445
-
446
- IFV printf(
447
- " [%ld %ld] thresh=%d n_lt=%ld n_eq=%ld, q=%ld:%ld/%ld\n",
448
- s0,
449
- s1,
450
- thresh,
451
- n_lt,
452
- n_eq,
453
- q_min,
454
- q_max,
455
- n);
456
- if (n_lt <= q_min) {
457
- if (n_lt + n_eq >= q_min) {
458
- q = q_min;
459
- break;
460
- } else {
461
- if (C::is_max) {
462
- s0 = thresh;
463
- } else {
464
- s1 = thresh;
465
- }
466
- }
467
- } else if (n_lt <= q_max) {
468
- q = n_lt;
469
- break;
470
- } else {
471
- if (C::is_max) {
472
- s1 = thresh;
473
- } else {
474
- s0 = thresh;
475
- }
476
- }
477
- }
478
-
479
- uint64_t t1 = get_cy();
480
-
481
- // number of equal values to keep
482
- int64_t n_eq_1 = q - n_lt;
483
-
484
- IFV printf("shrink: thresh=%d q=%ld n_eq_1=%ld\n", thresh, q, n_eq_1);
485
- if (n_eq_1 < 0) { // happens when > q elements are at lower bound
486
- assert(s0 + 1 == s1);
487
- q = q_min;
488
- if (C::is_max) {
489
- thresh--;
490
- } else {
491
- thresh++;
492
- }
493
- n_eq_1 = q;
494
- IFV printf(" override: thresh=%d n_eq_1=%ld\n", thresh, n_eq_1);
495
- } else {
496
- assert(n_eq_1 <= n_eq);
497
- }
498
-
499
- size_t wp = simd_compress_array<C>(vals, ids, n, thresh, n_eq_1);
500
-
501
- IFV printf("wp=%ld\n", wp);
502
- assert(wp == q);
503
- if (q_out) {
504
- *q_out = q;
505
- }
506
-
507
- uint64_t t2 = get_cy();
508
-
509
- partition_stats.bisect_cycles += t1 - t0;
510
- partition_stats.compress_cycles += t2 - t1;
511
-
512
- return thresh;
513
- }
514
-
515
- template <class C>
516
- uint16_t simd_partition_fuzzy_with_bounds_histogram(
517
- uint16_t* vals,
518
- typename C::TI* ids,
519
- size_t n,
520
- size_t q_min,
521
- size_t q_max,
522
- size_t* q_out,
523
- uint16_t s0i,
524
- uint16_t s1i) {
525
- if (q_min == 0) {
526
- if (q_out) {
527
- *q_out = 0;
528
- }
529
- return 0;
530
- }
531
- if (q_max >= n) {
532
- if (q_out) {
533
- *q_out = q_max;
534
- }
535
- return 0xffff;
536
- }
537
- if (s0i == s1i) {
538
- if (q_out) {
539
- *q_out = q_min;
540
- }
541
- return s0i;
542
- }
543
-
544
- IFV printf(
545
- "partition fuzzy, q=%ld:%ld / %ld, bounds=%d %d\n",
546
- q_min,
547
- q_max,
548
- n,
549
- s0i,
550
- s1i);
551
-
552
- if (!C::is_max) {
553
- IFV printf(
554
- "revert due to CMin, q_min:q_max -> %ld:%ld\n", q_min, q_max);
555
- q_min = n - q_min;
556
- q_max = n - q_max;
557
- }
558
-
559
- // lower and upper bound of range, inclusive
560
- int s0 = s0i, s1 = s1i;
561
- // number of values < s0 and > s1
562
- size_t n_lt = 0, n_gt = 0;
563
-
564
- // output of loop:
565
- int thresh; // final threshold
566
- uint64_t tot_eq = 0; // total nb of equal values
567
- uint64_t n_eq = 0; // nb of equal values to keep
568
- size_t q; // final quantile
569
-
570
- // buffer for the histograms
571
- int hist[16];
572
-
573
- for (int it = 0; it < 20; it++) {
574
- // otherwise we would be done already
575
-
576
- int shift = 0;
577
-
578
- IFV printf(
579
- " it %d bounds: %d %d n_lt=%ld n_gt=%ld\n",
580
- it,
581
- s0,
582
- s1,
583
- n_lt,
584
- n_gt);
585
-
586
- int maxval = s1 - s0;
587
-
588
- while (maxval > 15) {
589
- shift++;
590
- maxval >>= 1;
591
- }
592
-
593
- IFV printf(
594
- " histogram shift %d maxval %d ?= %d\n",
595
- shift,
596
- maxval,
597
- int((s1 - s0) >> shift));
598
-
599
- if (maxval > 7) {
600
- simd_histogram_16(vals, n, s0, shift, hist);
601
- } else {
602
- simd_histogram_8(vals, n, s0, shift, hist);
603
- }
604
- IFV {
605
- int sum = n_lt + n_gt;
606
- printf(" n_lt=%ld hist=[", n_lt);
607
- for (int i = 0; i <= maxval; i++) {
608
- printf("%d ", hist[i]);
609
- sum += hist[i];
610
- }
611
- printf("] n_gt=%ld sum=%d\n", n_gt, sum);
612
- assert(sum == n);
613
- }
614
-
615
- size_t sum_below = n_lt;
616
- int i;
617
- for (i = 0; i <= maxval; i++) {
618
- sum_below += hist[i];
619
- if (sum_below >= q_min) {
620
- break;
621
- }
622
- }
623
- IFV printf(" i=%d sum_below=%ld\n", i, sum_below);
624
- if (i <= maxval) {
625
- s0 = s0 + (i << shift);
626
- s1 = s0 + (1 << shift) - 1;
627
- n_lt = sum_below - hist[i];
628
- n_gt = n - sum_below;
629
- } else {
630
- assert(false && "not implemented");
631
- }
632
-
633
- IFV printf(
634
- " new bin: s0=%d s1=%d n_lt=%ld n_gt=%ld\n",
635
- s0,
636
- s1,
637
- n_lt,
638
- n_gt);
639
-
640
- if (s1 > s0) {
641
- if (n_lt >= q_min && q_max >= n_lt) {
642
- IFV printf(" FOUND1\n");
643
- thresh = s0;
644
- q = n_lt;
645
- break;
646
- }
223
+ #undef IFV
647
224
 
648
- size_t n_lt_2 = n - n_gt;
649
- if (n_lt_2 >= q_min && q_max >= n_lt_2) {
650
- thresh = s1 + 1;
651
- q = n_lt_2;
652
- IFV printf(" FOUND2\n");
653
- break;
654
- }
655
- } else {
656
- thresh = s0;
657
- q = q_min;
658
- tot_eq = n - n_gt - n_lt;
659
- n_eq = q_min - n_lt;
660
- IFV printf(" FOUND3\n");
661
- break;
662
- }
663
- }
664
-
665
- IFV printf("end bisection: thresh=%d q=%ld n_eq=%ld\n", thresh, q, n_eq);
666
-
667
- if (!C::is_max) {
668
- if (n_eq == 0) {
669
- thresh--;
670
- } else {
671
- // thresh unchanged
672
- n_eq = tot_eq - n_eq;
673
- }
674
- q = n - q;
675
- IFV printf("revert due to CMin, q->%ld n_eq->%ld\n", q, n_eq);
676
- }
677
-
678
- size_t wp = simd_compress_array<C>(vals, ids, n, thresh, n_eq);
679
- IFV printf("wp=%ld ?= %ld\n", wp, q);
680
- assert(wp == q);
681
- if (q_out) {
682
- *q_out = wp;
683
- }
684
-
685
- return thresh;
686
- }
687
-
688
- template <class C>
689
- uint16_t simd_partition_fuzzy(
690
- uint16_t* vals,
691
- typename C::TI* ids,
692
- size_t n,
693
- size_t q_min,
694
- size_t q_max,
695
- size_t* q_out) {
696
- assert(is_aligned_pointer(vals));
697
-
698
- uint16_t s0i, s1i;
699
- find_minimax(vals, n, s0i, s1i);
700
- // QSelect_stats.t0 += get_cy() - t0;
701
-
702
- return simd_partition_fuzzy_with_bounds<C>(
703
- vals, ids, n, q_min, q_max, q_out, s0i, s1i);
704
- }
705
-
706
- template <class C>
707
- uint16_t simd_partition(
708
- uint16_t* vals,
709
- typename C::TI* ids,
710
- size_t n,
711
- size_t q) {
712
- assert(is_aligned_pointer(vals));
713
-
714
- if (q == 0) {
715
- return 0;
716
- }
717
- if (q >= n) {
718
- return 0xffff;
719
- }
720
-
721
- uint16_t s0i, s1i;
722
- find_minimax(vals, n, s0i, s1i);
723
-
724
- return simd_partition_fuzzy_with_bounds<C>(
725
- vals, ids, n, q, q, nullptr, s0i, s1i);
726
- }
727
-
728
- template <class C>
729
- uint16_t simd_partition_with_bounds(
730
- uint16_t* vals,
731
- typename C::TI* ids,
732
- size_t n,
733
- size_t q,
734
- uint16_t s0i,
735
- uint16_t s1i) {
736
- return simd_partition_fuzzy_with_bounds<C>(
737
- vals, ids, n, q, q, nullptr, s0i, s1i);
738
- }
739
-
740
- } // namespace simd_partitioning
225
+ } // namespace partitioning
741
226
 
742
227
  /******************************************************************
743
228
  * Driver routine
@@ -751,13 +236,20 @@ typename C::T partition_fuzzy(
751
236
  size_t q_min,
752
237
  size_t q_max,
753
238
  size_t* q_out) {
754
- #ifdef __AVX2__
755
239
  constexpr bool is_uint16 = std::is_same<typename C::T, uint16_t>::value;
756
- if (is_uint16 && is_aligned_pointer(vals)) {
757
- return simd_partitioning::simd_partition_fuzzy<C>(
758
- (uint16_t*)vals, ids, n, q_min, q_max, q_out);
240
+ if constexpr (is_uint16) {
241
+ if (is_aligned_pointer(vals)) {
242
+ return with_simd_level_256bit([&]<SIMDLevel SL>() -> typename C::T {
243
+ if constexpr (SL == SIMDLevel::NONE) {
244
+ return partitioning::partition_fuzzy_median3<C>(
245
+ vals, ids, n, q_min, q_max, q_out);
246
+ } else {
247
+ return partition_fuzzy_simd<SL, C>(
248
+ (uint16_t*)vals, ids, n, q_min, q_max, q_out);
249
+ }
250
+ });
251
+ }
759
252
  }
760
- #endif
761
253
  return partitioning::partition_fuzzy_median3<C>(
762
254
  vals, ids, n, q_min, q_max, q_out);
763
255
  }
@@ -813,457 +305,12 @@ template uint16_t partition_fuzzy<CMax<uint16_t, int>>(
813
305
  size_t* q_out);
814
306
 
815
307
  /******************************************************************
816
- * Histogram subroutines
308
+ * Histogram subroutines — scalar fallbacks
817
309
  ******************************************************************/
818
310
 
819
- #if defined(__AVX2__) || defined(__aarch64__)
820
- /// FIXME when MSB of uint16 is set
821
- // this code does not compile properly with GCC 7.4.0
822
-
823
311
  namespace {
824
312
 
825
- /************************************************************
826
- * 8 bins
827
- ************************************************************/
828
-
829
- simd32uint8 accu4to8(simd16uint16 a4) {
830
- simd16uint16 mask4(0x0f0f);
831
-
832
- simd16uint16 a8_0 = a4 & mask4;
833
- simd16uint16 a8_1 = (a4 >> 4) & mask4;
834
-
835
- return simd32uint8(hadd(a8_0, a8_1));
836
- }
837
-
838
- simd16uint16 accu8to16(simd32uint8 a8) {
839
- simd16uint16 mask8(0x00ff);
840
-
841
- simd16uint16 a8_0 = simd16uint16(a8) & mask8;
842
- simd16uint16 a8_1 = (simd16uint16(a8) >> 8) & mask8;
843
-
844
- return hadd(a8_0, a8_1);
845
- }
846
-
847
- static const simd32uint8 shifts = simd32uint8::create<
848
- 1,
849
- 16,
850
- 0,
851
- 0,
852
- 4,
853
- 64,
854
- 0,
855
- 0,
856
- 0,
857
- 0,
858
- 1,
859
- 16,
860
- 0,
861
- 0,
862
- 4,
863
- 64,
864
- 1,
865
- 16,
866
- 0,
867
- 0,
868
- 4,
869
- 64,
870
- 0,
871
- 0,
872
- 0,
873
- 0,
874
- 1,
875
- 16,
876
- 0,
877
- 0,
878
- 4,
879
- 64>();
880
-
881
- // 2-bit accumulator: we can add only up to 3 elements
882
- // on output we return 2*4-bit results
883
- // preproc returns either an index in 0..7 or 0xffff
884
- // that yields a 0 when used in the table look-up
885
- template <int N, class Preproc>
886
- void compute_accu2(
887
- const uint16_t*& data,
888
- Preproc& pp,
889
- simd16uint16& a4lo,
890
- simd16uint16& a4hi) {
891
- simd16uint16 mask2(0x3333);
892
- simd16uint16 a2((uint16_t)0); // 2-bit accu
893
- for (int j = 0; j < N; j++) {
894
- simd16uint16 v(data);
895
- data += 16;
896
- v = pp(v);
897
- // 0x800 -> force second half of table
898
- simd16uint16 idx = v | (v << 8) | simd16uint16(0x800);
899
- a2 += simd16uint16(shifts.lookup_2_lanes(simd32uint8(idx)));
900
- }
901
- a4lo += a2 & mask2;
902
- a4hi += (a2 >> 2) & mask2;
903
- }
904
-
905
- template <class Preproc>
906
- simd16uint16 histogram_8(const uint16_t* data, Preproc pp, size_t n_in) {
907
- assert(n_in % 16 == 0);
908
- int n = n_in / 16;
909
-
910
- simd32uint8 a8lo(0);
911
- simd32uint8 a8hi(0);
912
-
913
- for (int i0 = 0; i0 < n; i0 += 15) {
914
- simd16uint16 a4lo(0); // 4-bit accus
915
- simd16uint16 a4hi(0);
916
-
917
- int i1 = std::min(i0 + 15, n);
918
- int i;
919
- for (i = i0; i + 2 < i1; i += 3) {
920
- compute_accu2<3>(data, pp, a4lo, a4hi); // adds 3 max
921
- }
922
- switch (i1 - i) {
923
- case 2:
924
- compute_accu2<2>(data, pp, a4lo, a4hi);
925
- break;
926
- case 1:
927
- compute_accu2<1>(data, pp, a4lo, a4hi);
928
- break;
929
- }
930
-
931
- a8lo += accu4to8(a4lo);
932
- a8hi += accu4to8(a4hi);
933
- }
934
-
935
- // move to 16-bit accu
936
- simd16uint16 a16lo = accu8to16(a8lo);
937
- simd16uint16 a16hi = accu8to16(a8hi);
938
-
939
- simd16uint16 a16 = hadd(a16lo, a16hi);
940
-
941
- // the 2 lanes must still be combined
942
- return a16;
943
- }
944
-
945
- /************************************************************
946
- * 16 bins
947
- ************************************************************/
948
-
949
- static const simd32uint8 shifts2 = simd32uint8::create<
950
- 1,
951
- 2,
952
- 4,
953
- 8,
954
- 16,
955
- 32,
956
- 64,
957
- 128,
958
- 1,
959
- 2,
960
- 4,
961
- 8,
962
- 16,
963
- 32,
964
- 64,
965
- 128,
966
- 1,
967
- 2,
968
- 4,
969
- 8,
970
- 16,
971
- 32,
972
- 64,
973
- 128,
974
- 1,
975
- 2,
976
- 4,
977
- 8,
978
- 16,
979
- 32,
980
- 64,
981
- 128>();
982
-
983
- simd32uint8 shiftr_16(simd32uint8 x, int n) {
984
- return simd32uint8(simd16uint16(x) >> n);
985
- }
986
-
987
- // 2-bit accumulator: we can add only up to 3 elements
988
- // on output we return 2*4-bit results
989
- template <int N, class Preproc>
990
- void compute_accu2_16(
991
- const uint16_t*& data,
992
- Preproc pp,
993
- simd32uint8& a4_0,
994
- simd32uint8& a4_1,
995
- simd32uint8& a4_2,
996
- simd32uint8& a4_3) {
997
- simd32uint8 mask1(0x55);
998
- simd32uint8 a2_0; // 2-bit accu
999
- simd32uint8 a2_1; // 2-bit accu
1000
- a2_0.clear();
1001
- a2_1.clear();
1002
-
1003
- for (int j = 0; j < N; j++) {
1004
- simd16uint16 v(data);
1005
- data += 16;
1006
- v = pp(v);
1007
-
1008
- simd16uint16 idx = v | (v << 8);
1009
- simd32uint8 a1 = shifts2.lookup_2_lanes(simd32uint8(idx));
1010
- // contains 0s for out-of-bounds elements
1011
-
1012
- simd16uint16 lt8 = (v >> 3) == simd16uint16(0);
1013
- lt8 = lt8 ^ simd16uint16(0xff00);
1014
-
1015
- a1 = a1 & lt8;
1016
-
1017
- a2_0 += a1 & mask1;
1018
- a2_1 += shiftr_16(a1, 1) & mask1;
1019
- }
1020
- simd32uint8 mask2(0x33);
1021
-
1022
- a4_0 += a2_0 & mask2;
1023
- a4_1 += a2_1 & mask2;
1024
- a4_2 += shiftr_16(a2_0, 2) & mask2;
1025
- a4_3 += shiftr_16(a2_1, 2) & mask2;
1026
- }
1027
-
1028
- simd32uint8 accu4to8_2(simd32uint8 a4_0, simd32uint8 a4_1) {
1029
- simd32uint8 mask4(0x0f);
1030
-
1031
- simd16uint16 a8_0 = combine2x2(
1032
- (simd16uint16)(a4_0 & mask4),
1033
- (simd16uint16)(shiftr_16(a4_0, 4) & mask4));
1034
-
1035
- simd16uint16 a8_1 = combine2x2(
1036
- (simd16uint16)(a4_1 & mask4),
1037
- (simd16uint16)(shiftr_16(a4_1, 4) & mask4));
1038
-
1039
- return simd32uint8(hadd(a8_0, a8_1));
1040
- }
1041
-
1042
- template <class Preproc>
1043
- simd16uint16 histogram_16(const uint16_t* data, Preproc pp, size_t n_in) {
1044
- assert(n_in % 16 == 0);
1045
- int n = n_in / 16;
1046
-
1047
- simd32uint8 a8lo((uint8_t)0);
1048
- simd32uint8 a8hi((uint8_t)0);
1049
-
1050
- for (int i0 = 0; i0 < n; i0 += 7) {
1051
- simd32uint8 a4_0(0); // 0, 4, 8, 12
1052
- simd32uint8 a4_1(0); // 1, 5, 9, 13
1053
- simd32uint8 a4_2(0); // 2, 6, 10, 14
1054
- simd32uint8 a4_3(0); // 3, 7, 11, 15
1055
-
1056
- int i1 = std::min(i0 + 7, n);
1057
- int i;
1058
- for (i = i0; i + 2 < i1; i += 3) {
1059
- compute_accu2_16<3>(data, pp, a4_0, a4_1, a4_2, a4_3);
1060
- }
1061
- switch (i1 - i) {
1062
- case 2:
1063
- compute_accu2_16<2>(data, pp, a4_0, a4_1, a4_2, a4_3);
1064
- break;
1065
- case 1:
1066
- compute_accu2_16<1>(data, pp, a4_0, a4_1, a4_2, a4_3);
1067
- break;
1068
- }
1069
-
1070
- a8lo += accu4to8_2(a4_0, a4_1);
1071
- a8hi += accu4to8_2(a4_2, a4_3);
1072
- }
1073
-
1074
- // move to 16-bit accu
1075
- simd16uint16 a16lo = accu8to16(a8lo);
1076
- simd16uint16 a16hi = accu8to16(a8hi);
1077
-
1078
- simd16uint16 a16 = hadd(a16lo, a16hi);
1079
-
1080
- a16 = simd16uint16{simd8uint32{a16}.unzip()};
1081
-
1082
- return a16;
1083
- }
1084
-
1085
- struct PreprocNOP {
1086
- simd16uint16 operator()(simd16uint16 x) {
1087
- return x;
1088
- }
1089
- };
1090
-
1091
- template <int shift, int nbin>
1092
- struct PreprocMinShift {
1093
- simd16uint16 min16;
1094
- simd16uint16 max16;
1095
-
1096
- explicit PreprocMinShift(uint16_t min) {
1097
- min16.set1(min);
1098
- int vmax0 = std::min((nbin << shift) + min, 65536);
1099
- uint16_t vmax = uint16_t(vmax0 - 1 - min);
1100
- max16.set1(vmax); // vmax inclusive
1101
- }
1102
-
1103
- simd16uint16 operator()(simd16uint16 x) {
1104
- x = x - min16;
1105
- simd16uint16 mask = (x == max(x, max16)) - (x == max16);
1106
- return (x >> shift) | mask;
1107
- }
1108
- };
1109
-
1110
- /* unbounded versions of the functions */
1111
-
1112
- void simd_histogram_8_unbounded(const uint16_t* data, int n, int* hist) {
1113
- PreprocNOP pp;
1114
- simd16uint16 a16 = histogram_8(data, pp, (n & ~15));
1115
-
1116
- ALIGNED(32) uint16_t a16_tab[16];
1117
- a16.store(a16_tab);
1118
-
1119
- for (int i = 0; i < 8; i++) {
1120
- hist[i] = a16_tab[i] + a16_tab[i + 8];
1121
- }
1122
-
1123
- for (int i = (n & ~15); i < n; i++) {
1124
- hist[data[i]]++;
1125
- }
1126
- }
1127
-
1128
- void simd_histogram_16_unbounded(const uint16_t* data, int n, int* hist) {
1129
- simd16uint16 a16 = histogram_16(data, PreprocNOP(), (n & ~15));
1130
-
1131
- ALIGNED(32) uint16_t a16_tab[16];
1132
- a16.store(a16_tab);
1133
-
1134
- for (int i = 0; i < 16; i++) {
1135
- hist[i] = a16_tab[i];
1136
- }
1137
-
1138
- for (int i = (n & ~15); i < n; i++) {
1139
- hist[data[i]]++;
1140
- }
1141
- }
1142
-
1143
- } // anonymous namespace
1144
-
1145
- /************************************************************
1146
- * Driver routines
1147
- ************************************************************/
1148
-
1149
- void simd_histogram_8(
1150
- const uint16_t* data,
1151
- int n,
1152
- uint16_t min,
1153
- int shift,
1154
- int* hist) {
1155
- if (shift < 0) {
1156
- simd_histogram_8_unbounded(data, n, hist);
1157
- return;
1158
- }
1159
-
1160
- simd16uint16 a16;
1161
-
1162
- #define DISPATCH(s) \
1163
- case s: \
1164
- a16 = histogram_8(data, PreprocMinShift<s, 8>(min), (n & ~15)); \
1165
- break
1166
-
1167
- switch (shift) {
1168
- DISPATCH(0);
1169
- DISPATCH(1);
1170
- DISPATCH(2);
1171
- DISPATCH(3);
1172
- DISPATCH(4);
1173
- DISPATCH(5);
1174
- DISPATCH(6);
1175
- DISPATCH(7);
1176
- DISPATCH(8);
1177
- DISPATCH(9);
1178
- DISPATCH(10);
1179
- DISPATCH(11);
1180
- DISPATCH(12);
1181
- DISPATCH(13);
1182
- default:
1183
- FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift);
1184
- }
1185
- #undef DISPATCH
1186
-
1187
- ALIGNED(32) uint16_t a16_tab[16];
1188
- a16.store(a16_tab);
1189
-
1190
- for (int i = 0; i < 8; i++) {
1191
- hist[i] = a16_tab[i] + a16_tab[i + 8];
1192
- }
1193
-
1194
- // complete with remaining bins
1195
- for (int i = (n & ~15); i < n; i++) {
1196
- if (data[i] < min) {
1197
- continue;
1198
- }
1199
- uint16_t v = data[i] - min;
1200
- v >>= shift;
1201
- if (v < 8) {
1202
- hist[v]++;
1203
- }
1204
- }
1205
- }
1206
-
1207
- void simd_histogram_16(
1208
- const uint16_t* data,
1209
- int n,
1210
- uint16_t min,
1211
- int shift,
1212
- int* hist) {
1213
- if (shift < 0) {
1214
- simd_histogram_16_unbounded(data, n, hist);
1215
- return;
1216
- }
1217
-
1218
- simd16uint16 a16;
1219
-
1220
- #define DISPATCH(s) \
1221
- case s: \
1222
- a16 = histogram_16(data, PreprocMinShift<s, 16>(min), (n & ~15)); \
1223
- break
1224
-
1225
- switch (shift) {
1226
- DISPATCH(0);
1227
- DISPATCH(1);
1228
- DISPATCH(2);
1229
- DISPATCH(3);
1230
- DISPATCH(4);
1231
- DISPATCH(5);
1232
- DISPATCH(6);
1233
- DISPATCH(7);
1234
- DISPATCH(8);
1235
- DISPATCH(9);
1236
- DISPATCH(10);
1237
- DISPATCH(11);
1238
- DISPATCH(12);
1239
- default:
1240
- FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift);
1241
- }
1242
- #undef DISPATCH
1243
-
1244
- ALIGNED(32) uint16_t a16_tab[16];
1245
- a16.store(a16_tab);
1246
-
1247
- for (int i = 0; i < 16; i++) {
1248
- hist[i] = a16_tab[i];
1249
- }
1250
-
1251
- for (int i = (n & ~15); i < n; i++) {
1252
- if (data[i] < min) {
1253
- continue;
1254
- }
1255
- uint16_t v = data[i] - min;
1256
- v >>= shift;
1257
- if (v < 16) {
1258
- hist[v]++;
1259
- }
1260
- }
1261
- }
1262
-
1263
- // no AVX2
1264
- #else
1265
-
1266
- void simd_histogram_16(
313
+ void simd_histogram_16_scalar(
1267
314
  const uint16_t* data,
1268
315
  int n,
1269
316
  uint16_t min,
@@ -1271,32 +318,25 @@ void simd_histogram_16(
1271
318
  int* hist) {
1272
319
  memset(hist, 0, sizeof(*hist) * 16);
1273
320
  if (shift < 0) {
1274
- for (size_t i = 0; i < n; i++) {
321
+ for (size_t i = 0; i < static_cast<size_t>(n); i++) {
1275
322
  hist[data[i]]++;
1276
323
  }
1277
324
  } else {
1278
325
  int vmax0 = std::min((16 << shift) + min, 65536);
1279
326
  uint16_t vmax = uint16_t(vmax0 - 1 - min);
1280
327
 
1281
- for (size_t i = 0; i < n; i++) {
328
+ for (size_t i = 0; i < static_cast<size_t>(n); i++) {
1282
329
  uint16_t v = data[i];
1283
330
  v -= min;
1284
331
  if (!(v <= vmax))
1285
332
  continue;
1286
333
  v >>= shift;
1287
334
  hist[v]++;
1288
-
1289
- /*
1290
- if (data[i] < min) continue;
1291
- uint16_t v = data[i] - min;
1292
- v >>= shift;
1293
- if (v < 16) hist[v]++;
1294
- */
1295
335
  }
1296
336
  }
1297
337
  }
1298
338
 
1299
- void simd_histogram_8(
339
+ void simd_histogram_8_scalar(
1300
340
  const uint16_t* data,
1301
341
  int n,
1302
342
  uint16_t min,
@@ -1304,11 +344,11 @@ void simd_histogram_8(
1304
344
  int* hist) {
1305
345
  memset(hist, 0, sizeof(*hist) * 8);
1306
346
  if (shift < 0) {
1307
- for (size_t i = 0; i < n; i++) {
347
+ for (size_t i = 0; i < static_cast<size_t>(n); i++) {
1308
348
  hist[data[i]]++;
1309
349
  }
1310
350
  } else {
1311
- for (size_t i = 0; i < n; i++) {
351
+ for (size_t i = 0; i < static_cast<size_t>(n); i++) {
1312
352
  if (data[i] < min)
1313
353
  continue;
1314
354
  uint16_t v = data[i] - min;
@@ -1319,7 +359,46 @@ void simd_histogram_8(
1319
359
  }
1320
360
  }
1321
361
 
362
+ } // anonymous namespace
363
+
364
+ /******************************************************************
365
+ * Histogram subroutines — dispatch to SIMD or scalar
366
+ ******************************************************************/
367
+
368
+ void simd_histogram_8(
369
+ const uint16_t* data,
370
+ int n,
371
+ uint16_t min,
372
+ int shift,
373
+ int* hist) {
374
+ with_simd_level_256bit([&]<SIMDLevel SL>() {
375
+ if constexpr (SL == SIMDLevel::NONE) {
376
+ simd_histogram_8_scalar(data, n, min, shift, hist);
377
+ } else {
378
+ faiss::simd_histogram_8<SL>(data, n, min, shift, hist);
379
+ }
380
+ });
381
+ }
382
+
383
+ void simd_histogram_16(
384
+ const uint16_t* data,
385
+ int n,
386
+ uint16_t min,
387
+ int shift,
388
+ int* hist) {
389
+ // GCC 12 miscompiles the AVX2 SIMD histogram — fall back to scalar.
390
+ #if defined(__GNUC__) && !defined(__clang__) && __GNUC__ == 12
391
+ simd_histogram_16_scalar(data, n, min, shift, hist);
392
+ #else
393
+ with_simd_level_256bit([&]<SIMDLevel SL>() {
394
+ if constexpr (SL == SIMDLevel::NONE) {
395
+ simd_histogram_16_scalar(data, n, min, shift, hist);
396
+ } else {
397
+ faiss::simd_histogram_16<SL>(data, n, min, shift, hist);
398
+ }
399
+ });
1322
400
  #endif
401
+ }
1323
402
 
1324
403
  void PartitionStats::reset() {
1325
404
  memset(this, 0, sizeof(*this));