faiss 0.6.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (378) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/ext/faiss/extconf.rb +2 -1
  4. data/ext/faiss/{index_rb.cpp → index.cpp} +1 -1
  5. data/ext/faiss/index_binary.cpp +1 -1
  6. data/ext/faiss/kmeans.cpp +1 -1
  7. data/ext/faiss/pca_matrix.cpp +1 -1
  8. data/ext/faiss/product_quantizer.cpp +1 -1
  9. data/ext/faiss/{utils_rb.cpp → utils.cpp} +1 -1
  10. data/lib/faiss/version.rb +1 -1
  11. data/vendor/faiss/faiss/AutoTune.cpp +93 -80
  12. data/vendor/faiss/faiss/Clustering.cpp +39 -240
  13. data/vendor/faiss/faiss/Clustering.h +6 -0
  14. data/vendor/faiss/faiss/IVFlib.cpp +41 -21
  15. data/vendor/faiss/faiss/Index.cpp +6 -5
  16. data/vendor/faiss/faiss/Index.h +5 -5
  17. data/vendor/faiss/faiss/Index2Layer.cpp +37 -53
  18. data/vendor/faiss/faiss/IndexAdditiveQuantizer.cpp +49 -37
  19. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp +36 -34
  20. data/vendor/faiss/faiss/IndexAdditiveQuantizerFastScan.h +4 -1
  21. data/vendor/faiss/faiss/IndexBinary.cpp +5 -3
  22. data/vendor/faiss/faiss/IndexBinary.h +4 -4
  23. data/vendor/faiss/faiss/IndexBinaryFlat.cpp +1 -1
  24. data/vendor/faiss/faiss/IndexBinaryFlat.h +1 -1
  25. data/vendor/faiss/faiss/IndexBinaryFromFloat.cpp +4 -4
  26. data/vendor/faiss/faiss/IndexBinaryHNSW.cpp +88 -97
  27. data/vendor/faiss/faiss/IndexBinaryHNSW.h +9 -3
  28. data/vendor/faiss/faiss/IndexBinaryHash.cpp +45 -236
  29. data/vendor/faiss/faiss/IndexBinaryHash.h +6 -6
  30. data/vendor/faiss/faiss/IndexBinaryIVF.cpp +89 -417
  31. data/vendor/faiss/faiss/IndexFastScan.cpp +72 -109
  32. data/vendor/faiss/faiss/IndexFastScan.h +25 -23
  33. data/vendor/faiss/faiss/IndexFlat.cpp +27 -20
  34. data/vendor/faiss/faiss/IndexFlat.h +21 -18
  35. data/vendor/faiss/faiss/IndexFlatCodes.cpp +42 -19
  36. data/vendor/faiss/faiss/IndexHNSW.cpp +374 -206
  37. data/vendor/faiss/faiss/IndexHNSW.h +16 -2
  38. data/vendor/faiss/faiss/IndexIDMap.cpp +25 -21
  39. data/vendor/faiss/faiss/IndexIDMap.h +9 -7
  40. data/vendor/faiss/faiss/IndexIVF.cpp +467 -364
  41. data/vendor/faiss/faiss/IndexIVF.h +33 -12
  42. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +79 -76
  43. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp +96 -93
  44. data/vendor/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h +4 -1
  45. data/vendor/faiss/faiss/IndexIVFFastScan.cpp +357 -238
  46. data/vendor/faiss/faiss/IndexIVFFastScan.h +42 -41
  47. data/vendor/faiss/faiss/IndexIVFFlat.cpp +39 -69
  48. data/vendor/faiss/faiss/IndexIVFFlat.h +32 -0
  49. data/vendor/faiss/faiss/IndexIVFFlatPanorama.cpp +56 -33
  50. data/vendor/faiss/faiss/IndexIVFFlatPanorama.h +3 -1
  51. data/vendor/faiss/faiss/IndexIVFIndependentQuantizer.cpp +18 -15
  52. data/vendor/faiss/faiss/IndexIVFPQ.cpp +73 -846
  53. data/vendor/faiss/faiss/IndexIVFPQFastScan.cpp +151 -121
  54. data/vendor/faiss/faiss/IndexIVFPQFastScan.h +3 -0
  55. data/vendor/faiss/faiss/IndexIVFPQR.cpp +23 -20
  56. data/vendor/faiss/faiss/IndexIVFRaBitQ.cpp +30 -52
  57. data/vendor/faiss/faiss/IndexIVFRaBitQ.h +2 -1
  58. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.cpp +475 -476
  59. data/vendor/faiss/faiss/IndexIVFRaBitQFastScan.h +248 -93
  60. data/vendor/faiss/faiss/IndexIVFSpectralHash.cpp +41 -127
  61. data/vendor/faiss/faiss/IndexIVFSpectralHash.h +1 -1
  62. data/vendor/faiss/faiss/IndexLSH.cpp +36 -19
  63. data/vendor/faiss/faiss/IndexLattice.cpp +13 -13
  64. data/vendor/faiss/faiss/IndexNNDescent.cpp +36 -21
  65. data/vendor/faiss/faiss/IndexNNDescent.h +2 -2
  66. data/vendor/faiss/faiss/IndexNSG.cpp +38 -23
  67. data/vendor/faiss/faiss/IndexNeuralNetCodec.cpp +31 -11
  68. data/vendor/faiss/faiss/IndexPQ.cpp +128 -221
  69. data/vendor/faiss/faiss/IndexPQ.h +3 -2
  70. data/vendor/faiss/faiss/IndexPQFastScan.cpp +20 -14
  71. data/vendor/faiss/faiss/IndexPQFastScan.h +3 -0
  72. data/vendor/faiss/faiss/IndexPreTransform.cpp +25 -18
  73. data/vendor/faiss/faiss/IndexPreTransform.h +1 -1
  74. data/vendor/faiss/faiss/IndexRaBitQ.cpp +11 -36
  75. data/vendor/faiss/faiss/IndexRaBitQ.h +2 -1
  76. data/vendor/faiss/faiss/IndexRaBitQFastScan.cpp +41 -277
  77. data/vendor/faiss/faiss/IndexRaBitQFastScan.h +183 -27
  78. data/vendor/faiss/faiss/IndexRefine.cpp +30 -25
  79. data/vendor/faiss/faiss/IndexRefine.h +4 -4
  80. data/vendor/faiss/faiss/IndexReplicas.cpp +6 -6
  81. data/vendor/faiss/faiss/IndexRowwiseMinMax.cpp +15 -14
  82. data/vendor/faiss/faiss/IndexRowwiseMinMax.h +1 -1
  83. data/vendor/faiss/faiss/IndexScalarQuantizer.cpp +150 -20
  84. data/vendor/faiss/faiss/IndexScalarQuantizer.h +10 -0
  85. data/vendor/faiss/faiss/IndexShards.cpp +10 -9
  86. data/vendor/faiss/faiss/IndexShardsIVF.cpp +21 -15
  87. data/vendor/faiss/faiss/MatrixStats.cpp +5 -4
  88. data/vendor/faiss/faiss/MetaIndexes.cpp +19 -17
  89. data/vendor/faiss/faiss/MetaIndexes.h +1 -1
  90. data/vendor/faiss/faiss/MetricType.h +14 -7
  91. data/vendor/faiss/faiss/SuperKMeans.cpp +656 -0
  92. data/vendor/faiss/faiss/SuperKMeans.h +97 -0
  93. data/vendor/faiss/faiss/VectorTransform.cpp +237 -149
  94. data/vendor/faiss/faiss/VectorTransform.h +16 -16
  95. data/vendor/faiss/faiss/build.cpp +23 -0
  96. data/vendor/faiss/faiss/build.h +15 -0
  97. data/vendor/faiss/faiss/clone_index.cpp +48 -47
  98. data/vendor/faiss/faiss/cppcontrib/SaDecodeKernels.h +1 -1
  99. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h +47 -47
  100. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h +11 -0
  101. data/vendor/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h +902 -12
  102. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h +38 -38
  103. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h +11 -0
  104. data/vendor/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h +702 -10
  105. data/vendor/faiss/faiss/factory_tools.cpp +9 -0
  106. data/vendor/faiss/faiss/gpu/GpuIndexCagra.h +6 -5
  107. data/vendor/faiss/faiss/gpu/GpuResources.h +3 -2
  108. data/vendor/faiss/faiss/gpu/StandardGpuResources.cpp +15 -16
  109. data/vendor/faiss/faiss/gpu/StandardGpuResources.h +5 -4
  110. data/vendor/faiss/faiss/gpu/test/TestGpuIndexFlat.cpp +46 -0
  111. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +56 -0
  112. data/vendor/faiss/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +78 -1
  113. data/vendor/faiss/faiss/gpu/test/TestUtils.cpp +72 -0
  114. data/vendor/faiss/faiss/gpu/test/TestUtils.h +23 -0
  115. data/vendor/faiss/faiss/gpu/utils/CuvsFilterConvert.h +1 -1
  116. data/vendor/faiss/faiss/gpu/utils/CuvsUtils.h +21 -10
  117. data/vendor/faiss/faiss/gpu_metal/GpuIndexFlat.h +22 -0
  118. data/vendor/faiss/faiss/gpu_metal/MetalCloner.h +35 -0
  119. data/vendor/faiss/faiss/gpu_metal/MetalDistance.h +87 -0
  120. data/vendor/faiss/faiss/gpu_metal/MetalFlatKernels.h +40 -0
  121. data/vendor/faiss/faiss/gpu_metal/MetalIndex.h +58 -0
  122. data/vendor/faiss/faiss/gpu_metal/MetalIndexFlat.h +65 -0
  123. data/vendor/faiss/faiss/gpu_metal/MetalIndexIVFFlat.h +181 -0
  124. data/vendor/faiss/faiss/gpu_metal/MetalKernels.h +111 -0
  125. data/vendor/faiss/faiss/gpu_metal/MetalPythonBridge.h +45 -0
  126. data/vendor/faiss/faiss/gpu_metal/MetalResources.h +79 -0
  127. data/vendor/faiss/faiss/gpu_metal/StandardMetalResources.h +35 -0
  128. data/vendor/faiss/faiss/gpu_metal/impl/MetalIVFFlat.h +193 -0
  129. data/vendor/faiss/faiss/impl/AdSampling.cpp +103 -0
  130. data/vendor/faiss/faiss/impl/AdSampling.h +35 -0
  131. data/vendor/faiss/faiss/impl/AdditiveQuantizer.cpp +29 -25
  132. data/vendor/faiss/faiss/impl/AdditiveQuantizer.h +1 -0
  133. data/vendor/faiss/faiss/impl/AuxIndexStructures.cpp +10 -9
  134. data/vendor/faiss/faiss/impl/AuxIndexStructures.h +3 -0
  135. data/vendor/faiss/faiss/impl/ClusteringHelpers.cpp +244 -0
  136. data/vendor/faiss/faiss/impl/ClusteringHelpers.h +94 -0
  137. data/vendor/faiss/faiss/impl/ClusteringInitialization.cpp +16 -16
  138. data/vendor/faiss/faiss/impl/CodePacker.cpp +3 -3
  139. data/vendor/faiss/faiss/impl/CodePackerRaBitQ.cpp +1 -1
  140. data/vendor/faiss/faiss/impl/DistanceComputer.h +8 -8
  141. data/vendor/faiss/faiss/impl/FaissAssert.h +6 -3
  142. data/vendor/faiss/faiss/impl/FaissException.h +50 -3
  143. data/vendor/faiss/faiss/impl/HNSW.cpp +639 -507
  144. data/vendor/faiss/faiss/impl/HNSW.h +61 -44
  145. data/vendor/faiss/faiss/impl/IDSelector.cpp +15 -11
  146. data/vendor/faiss/faiss/impl/IDSelector.h +8 -8
  147. data/vendor/faiss/faiss/impl/InvertedListScannerStats.h +26 -0
  148. data/vendor/faiss/faiss/impl/LocalSearchQuantizer.cpp +82 -77
  149. data/vendor/faiss/faiss/impl/NNDescent.cpp +62 -25
  150. data/vendor/faiss/faiss/impl/NNDescent.h +6 -2
  151. data/vendor/faiss/faiss/impl/NSG.cpp +53 -32
  152. data/vendor/faiss/faiss/impl/NSG.h +4 -4
  153. data/vendor/faiss/faiss/impl/Panorama.cpp +23 -6
  154. data/vendor/faiss/faiss/impl/Panorama.h +269 -87
  155. data/vendor/faiss/faiss/impl/PdxLayout.cpp +93 -0
  156. data/vendor/faiss/faiss/impl/PdxLayout.h +41 -0
  157. data/vendor/faiss/faiss/impl/PolysemousTraining.cpp +46 -32
  158. data/vendor/faiss/faiss/impl/PolysemousTraining.h +3 -3
  159. data/vendor/faiss/faiss/impl/ProductAdditiveQuantizer.cpp +35 -35
  160. data/vendor/faiss/faiss/impl/ProductQuantizer-inl.h +21 -16
  161. data/vendor/faiss/faiss/impl/ProductQuantizer.cpp +55 -25
  162. data/vendor/faiss/faiss/impl/Quantizer.h +2 -2
  163. data/vendor/faiss/faiss/impl/RaBitQUtils.cpp +55 -49
  164. data/vendor/faiss/faiss/impl/RaBitQUtils.h +65 -0
  165. data/vendor/faiss/faiss/impl/RaBitQuantizer.cpp +302 -283
  166. data/vendor/faiss/faiss/impl/ResidualQuantizer.cpp +26 -23
  167. data/vendor/faiss/faiss/impl/ResidualQuantizer.h +1 -1
  168. data/vendor/faiss/faiss/impl/ResultHandler.h +100 -75
  169. data/vendor/faiss/faiss/impl/ScalarQuantizer.cpp +318 -7
  170. data/vendor/faiss/faiss/impl/ScalarQuantizer.h +77 -1
  171. data/vendor/faiss/faiss/impl/ThreadedIndex-inl.h +14 -11
  172. data/vendor/faiss/faiss/impl/VisitedTable.cpp +10 -10
  173. data/vendor/faiss/faiss/impl/VisitedTable.h +70 -28
  174. data/vendor/faiss/faiss/impl/approx_topk/approx_topk.h +276 -0
  175. data/vendor/faiss/faiss/impl/approx_topk/avx2.cpp +68 -0
  176. data/vendor/faiss/faiss/{utils → impl}/approx_topk/generic.h +15 -8
  177. data/vendor/faiss/faiss/impl/approx_topk/neon.cpp +68 -0
  178. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab-inl.h +169 -0
  179. data/vendor/faiss/faiss/impl/approx_topk/rq_beam_search_tab.h +117 -0
  180. data/vendor/faiss/faiss/impl/approx_topk/simdlib256-inl.h +146 -0
  181. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h +73 -0
  182. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryHash_impl.h +270 -0
  183. data/vendor/faiss/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h +460 -0
  184. data/vendor/faiss/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h +159 -0
  185. data/vendor/faiss/faiss/impl/binary_hamming/IndexPQ_impl.h +92 -0
  186. data/vendor/faiss/faiss/impl/binary_hamming/avx2.cpp +26 -0
  187. data/vendor/faiss/faiss/impl/binary_hamming/avx512.cpp +26 -0
  188. data/vendor/faiss/faiss/impl/binary_hamming/dispatch.h +143 -0
  189. data/vendor/faiss/faiss/impl/binary_hamming/neon.cpp +26 -0
  190. data/vendor/faiss/faiss/impl/binary_hamming/rvv.cpp +26 -0
  191. data/vendor/faiss/faiss/impl/expanded_scanners.h +8 -3
  192. data/vendor/faiss/faiss/impl/{FastScanDistancePostProcessing.h → fast_scan/FastScanDistancePostProcessing.h} +13 -6
  193. data/vendor/faiss/faiss/impl/{LookupTableScaler.h → fast_scan/LookupTableScaler.h} +16 -5
  194. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops.h +237 -0
  195. data/vendor/faiss/faiss/impl/fast_scan/accumulate_loops_512.h +185 -0
  196. data/vendor/faiss/faiss/impl/fast_scan/decompose_qbs.h +229 -0
  197. data/vendor/faiss/faiss/impl/fast_scan/dispatching.h +270 -0
  198. data/vendor/faiss/faiss/impl/{pq4_fast_scan.cpp → fast_scan/fast_scan.cpp} +169 -2
  199. data/vendor/faiss/faiss/impl/fast_scan/fast_scan.h +341 -0
  200. data/vendor/faiss/faiss/impl/fast_scan/impl-avx2.cpp +36 -0
  201. data/vendor/faiss/faiss/impl/fast_scan/impl-avx512.cpp +40 -0
  202. data/vendor/faiss/faiss/impl/fast_scan/impl-neon.cpp +120 -0
  203. data/vendor/faiss/faiss/impl/fast_scan/impl-riscv.cpp +104 -0
  204. data/vendor/faiss/faiss/impl/fast_scan/kernels_simd256.h +213 -0
  205. data/vendor/faiss/faiss/impl/{pq4_fast_scan_search_qbs.cpp → fast_scan/kernels_simd512.h} +26 -356
  206. data/vendor/faiss/faiss/impl/fast_scan/rabitq_dispatching.h +90 -0
  207. data/vendor/faiss/faiss/impl/fast_scan/rabitq_result_handler.h +108 -0
  208. data/vendor/faiss/faiss/impl/{simd_result_handlers.h → fast_scan/simd_result_handlers.h} +282 -134
  209. data/vendor/faiss/faiss/impl/hnsw/LockVector.cpp +54 -0
  210. data/vendor/faiss/faiss/impl/hnsw/LockVector.h +64 -0
  211. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.cpp +83 -0
  212. data/vendor/faiss/faiss/impl/hnsw/MinimaxHeap.h +113 -0
  213. data/vendor/faiss/faiss/impl/hnsw/avx2.cpp +150 -0
  214. data/vendor/faiss/faiss/impl/hnsw/avx512.cpp +142 -0
  215. data/vendor/faiss/faiss/impl/index_read.cpp +1227 -79
  216. data/vendor/faiss/faiss/impl/index_read_utils.h +1 -1
  217. data/vendor/faiss/faiss/impl/index_write.cpp +96 -13
  218. data/vendor/faiss/faiss/impl/io.cpp +6 -6
  219. data/vendor/faiss/faiss/impl/io_macros.h +58 -16
  220. data/vendor/faiss/faiss/impl/kmeans1d.cpp +10 -10
  221. data/vendor/faiss/faiss/impl/lattice_Zn.cpp +37 -23
  222. data/vendor/faiss/faiss/impl/lattice_Zn.h +6 -6
  223. data/vendor/faiss/faiss/impl/mapped_io.cpp +6 -6
  224. data/vendor/faiss/faiss/impl/platform_macros.h +15 -4
  225. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQScanner_impl.h +549 -0
  226. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.cpp +245 -0
  227. data/vendor/faiss/faiss/impl/pq_code_distance/IVFPQ_QueryTables.h +105 -0
  228. data/vendor/faiss/faiss/impl/pq_code_distance/PQDistanceComputer_impl.h +106 -0
  229. data/vendor/faiss/faiss/impl/pq_code_distance/avx2.cpp +23 -0
  230. data/vendor/faiss/faiss/impl/pq_code_distance/avx512.cpp +23 -0
  231. data/vendor/faiss/faiss/impl/pq_code_distance/neon.cpp +23 -0
  232. data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx2.cpp → pq_code_distance-avx2.h} +9 -13
  233. data/vendor/faiss/faiss/impl/pq_code_distance/{pq_code_distance-avx512.cpp → pq_code_distance-avx512.h} +9 -57
  234. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.cpp +45 -107
  235. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-generic.h +96 -0
  236. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-inl.h +274 -5
  237. data/vendor/faiss/faiss/impl/pq_code_distance/pq_code_distance-sve.cpp +10 -7
  238. data/vendor/faiss/faiss/impl/pq_code_distance/pq_scan_impl.h +105 -0
  239. data/vendor/faiss/faiss/impl/pq_code_distance/rvv.cpp +70 -0
  240. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.cpp +311 -477
  241. data/vendor/faiss/faiss/impl/residual_quantizer_encode_steps.h +1 -1
  242. data/vendor/faiss/faiss/impl/scalar_quantizer/codecs.h +1 -1
  243. data/vendor/faiss/faiss/impl/scalar_quantizer/distance_computers.h +9 -2
  244. data/vendor/faiss/faiss/impl/scalar_quantizer/quantizers.h +419 -19
  245. data/vendor/faiss/faiss/impl/scalar_quantizer/scanners.h +27 -1
  246. data/vendor/faiss/faiss/impl/scalar_quantizer/similarities.h +3 -3
  247. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx2.cpp +387 -2
  248. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-impl.h +553 -0
  249. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512-spr.cpp +559 -0
  250. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-avx512.cpp +341 -2
  251. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-dispatch.h +425 -3
  252. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-neon.cpp +290 -2
  253. data/vendor/faiss/faiss/impl/scalar_quantizer/sq-rvv.cpp +337 -0
  254. data/vendor/faiss/faiss/impl/scalar_quantizer/training.cpp +192 -8
  255. data/vendor/faiss/faiss/impl/scalar_quantizer/training.h +12 -0
  256. data/vendor/faiss/faiss/impl/simd_dispatch.h +157 -66
  257. data/vendor/faiss/faiss/impl/simdlib/simdlib.h +57 -0
  258. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_avx2.h +264 -172
  259. data/vendor/faiss/faiss/impl/simdlib/simdlib_avx512.h +414 -0
  260. data/vendor/faiss/faiss/impl/simdlib/simdlib_dispatch.h +44 -0
  261. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_emulated.h +231 -166
  262. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_neon.h +270 -218
  263. data/vendor/faiss/faiss/{utils → impl/simdlib}/simdlib_ppc64.h +201 -160
  264. data/vendor/faiss/faiss/impl/svs_io.cpp +12 -3
  265. data/vendor/faiss/faiss/impl/svs_io.h +8 -2
  266. data/vendor/faiss/faiss/index_factory.cpp +90 -18
  267. data/vendor/faiss/faiss/index_io.h +40 -0
  268. data/vendor/faiss/faiss/invlists/BlockInvertedLists.cpp +66 -16
  269. data/vendor/faiss/faiss/invlists/DirectMap.cpp +28 -15
  270. data/vendor/faiss/faiss/invlists/DirectMap.h +4 -3
  271. data/vendor/faiss/faiss/invlists/InvertedLists.cpp +170 -86
  272. data/vendor/faiss/faiss/invlists/InvertedLists.h +88 -25
  273. data/vendor/faiss/faiss/invlists/InvertedListsIOHook.cpp +4 -4
  274. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.cpp +13 -13
  275. data/vendor/faiss/faiss/invlists/OnDiskInvertedLists.h +1 -1
  276. data/vendor/faiss/faiss/svs/IndexSVSFaissUtils.h +1 -1
  277. data/vendor/faiss/faiss/svs/IndexSVSFlat.cpp +2 -2
  278. data/vendor/faiss/faiss/svs/IndexSVSIVF.cpp +350 -0
  279. data/vendor/faiss/faiss/svs/IndexSVSIVF.h +128 -0
  280. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.cpp +40 -0
  281. data/vendor/faiss/faiss/svs/IndexSVSIVFLVQ.h +43 -0
  282. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.cpp +225 -0
  283. data/vendor/faiss/faiss/svs/IndexSVSIVFLeanVec.h +71 -0
  284. data/vendor/faiss/faiss/svs/IndexSVSVamana.cpp +142 -21
  285. data/vendor/faiss/faiss/svs/IndexSVSVamana.h +33 -7
  286. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.cpp +3 -2
  287. data/vendor/faiss/faiss/svs/IndexSVSVamanaLVQ.h +2 -1
  288. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.cpp +77 -27
  289. data/vendor/faiss/faiss/svs/IndexSVSVamanaLeanVec.h +10 -4
  290. data/vendor/faiss/faiss/utils/Heap.cpp +10 -10
  291. data/vendor/faiss/faiss/utils/NeuralNet.cpp +47 -36
  292. data/vendor/faiss/faiss/utils/NeuralNet.h +1 -1
  293. data/vendor/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h +10 -4
  294. data/vendor/faiss/faiss/utils/bf16.h +34 -0
  295. data/vendor/faiss/faiss/utils/distances.cpp +390 -560
  296. data/vendor/faiss/faiss/utils/distances.h +20 -1
  297. data/vendor/faiss/faiss/utils/distances_dispatch.h +117 -37
  298. data/vendor/faiss/faiss/utils/distances_fused/avx512.cpp +8 -7
  299. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.cpp +33 -14
  300. data/vendor/faiss/faiss/utils/distances_fused/distances_fused.h +12 -1
  301. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based.cpp +16 -293
  302. data/vendor/faiss/faiss/utils/distances_fused/simdlib_based_neon.cpp +57 -0
  303. data/vendor/faiss/faiss/utils/distances_fused/simdlib_kernel-inl.h +290 -0
  304. data/vendor/faiss/faiss/utils/distances_simd.cpp +5 -178
  305. data/vendor/faiss/faiss/utils/extra_distances.cpp +9 -8
  306. data/vendor/faiss/faiss/utils/extra_distances.h +32 -6
  307. data/vendor/faiss/faiss/utils/hamming-inl.h +13 -11
  308. data/vendor/faiss/faiss/utils/hamming.cpp +66 -517
  309. data/vendor/faiss/faiss/utils/hamming.h +92 -2
  310. data/vendor/faiss/faiss/utils/hamming_distance/common.h +287 -10
  311. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx2.cpp +16 -0
  312. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512.cpp +15 -0
  313. data/vendor/faiss/faiss/utils/hamming_distance/hamming_avx512_spr.cpp +15 -0
  314. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx2.h +142 -0
  315. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512.h +210 -0
  316. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-avx512_spr.h +171 -0
  317. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-generic.h +368 -0
  318. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-neon.h +322 -0
  319. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer-rvv.h +39 -0
  320. data/vendor/faiss/faiss/utils/hamming_distance/hamming_computer.h +146 -0
  321. data/vendor/faiss/faiss/utils/hamming_distance/hamming_impl.h +481 -0
  322. data/vendor/faiss/faiss/utils/hamming_distance/hamming_neon.cpp +15 -0
  323. data/vendor/faiss/faiss/utils/hamming_distance/hamming_rvv.cpp +15 -0
  324. data/vendor/faiss/faiss/utils/partitioning.cpp +66 -989
  325. data/vendor/faiss/faiss/utils/partitioning.h +31 -0
  326. data/vendor/faiss/faiss/utils/popcount.h +29 -0
  327. data/vendor/faiss/faiss/utils/pq_code_distance.h +2 -2
  328. data/vendor/faiss/faiss/utils/prefetch.h +2 -2
  329. data/vendor/faiss/faiss/utils/quantize_lut.cpp +30 -30
  330. data/vendor/faiss/faiss/utils/quantize_lut.h +1 -1
  331. data/vendor/faiss/faiss/utils/rabitq_simd.h +57 -536
  332. data/vendor/faiss/faiss/utils/random.cpp +6 -6
  333. data/vendor/faiss/faiss/utils/simd_impl/IVFFlatScanner-inl.h +51 -0
  334. data/vendor/faiss/faiss/utils/simd_impl/distances_aarch64.cpp +5 -1
  335. data/vendor/faiss/faiss/utils/simd_impl/distances_arm_sve.cpp +213 -4
  336. data/vendor/faiss/faiss/utils/simd_impl/distances_autovec-inl.h +163 -10
  337. data/vendor/faiss/faiss/utils/simd_impl/distances_avx2.cpp +250 -4
  338. data/vendor/faiss/faiss/utils/simd_impl/distances_avx512.cpp +7 -4
  339. data/vendor/faiss/faiss/utils/simd_impl/distances_rvv.cpp +189 -0
  340. data/vendor/faiss/faiss/utils/simd_impl/distances_simdlib256.h +195 -0
  341. data/vendor/faiss/faiss/utils/simd_impl/distances_sse-inl.h +2 -1
  342. data/vendor/faiss/faiss/utils/{distances_fused/simdlib_based.h → simd_impl/exhaustive_L2sqr_blas_cmax.h} +5 -10
  343. data/vendor/faiss/faiss/utils/simd_impl/hamming_impl.h +481 -0
  344. data/vendor/faiss/faiss/utils/simd_impl/partitioning_avx2.cpp +14 -0
  345. data/vendor/faiss/faiss/utils/simd_impl/partitioning_neon.cpp +14 -0
  346. data/vendor/faiss/faiss/utils/simd_impl/partitioning_simdlib256.h +1031 -0
  347. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx2.cpp +355 -0
  348. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512.cpp +477 -0
  349. data/vendor/faiss/faiss/utils/simd_impl/rabitq_avx512_spr.cpp +343 -0
  350. data/vendor/faiss/faiss/utils/simd_impl/rabitq_neon.cpp +55 -0
  351. data/vendor/faiss/faiss/utils/simd_impl/rabitq_rvv.cpp +55 -0
  352. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_dispatch.h +32 -0
  353. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels.h +43 -0
  354. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx2.cpp +57 -0
  355. data/vendor/faiss/faiss/utils/simd_impl/super_kmeans_kernels_avx512.cpp +45 -0
  356. data/vendor/faiss/faiss/utils/simd_levels.cpp +29 -7
  357. data/vendor/faiss/faiss/utils/simd_levels.h +93 -1
  358. data/vendor/faiss/faiss/utils/sorting.cpp +48 -36
  359. data/vendor/faiss/faiss/utils/utils.cpp +5 -5
  360. data/vendor/faiss/faiss/utils/utils.h +3 -3
  361. metadata +129 -34
  362. data/vendor/faiss/faiss/impl/RaBitQStats.cpp +0 -29
  363. data/vendor/faiss/faiss/impl/RaBitQStats.h +0 -56
  364. data/vendor/faiss/faiss/impl/pq4_fast_scan.h +0 -224
  365. data/vendor/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +0 -230
  366. data/vendor/faiss/faiss/utils/approx_topk/approx_topk.h +0 -84
  367. data/vendor/faiss/faiss/utils/approx_topk/avx2-inl.h +0 -196
  368. data/vendor/faiss/faiss/utils/approx_topk/mode.h +0 -34
  369. data/vendor/faiss/faiss/utils/distances_fused/avx512.h +0 -36
  370. data/vendor/faiss/faiss/utils/extra_distances-inl.h +0 -235
  371. data/vendor/faiss/faiss/utils/hamming_distance/avx2-inl.h +0 -462
  372. data/vendor/faiss/faiss/utils/hamming_distance/avx512-inl.h +0 -490
  373. data/vendor/faiss/faiss/utils/hamming_distance/generic-inl.h +0 -449
  374. data/vendor/faiss/faiss/utils/hamming_distance/hamdis-inl.h +0 -87
  375. data/vendor/faiss/faiss/utils/hamming_distance/neon-inl.h +0 -524
  376. data/vendor/faiss/faiss/utils/simdlib.h +0 -42
  377. data/vendor/faiss/faiss/utils/simdlib_avx512.h +0 -365
  378. /data/ext/faiss/{utils_rb.h → utils.h} +0 -0
@@ -0,0 +1,1031 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ */
7
+
8
+ #pragma once
9
+
10
+ /**
11
+ * @file partitioning_simdlib256.h
12
+ * @brief Per-SIMD TU implementation of partitioning and histogram functions.
13
+ *
14
+ * This header is included once per SIMD TU with THE_SIMD_LEVEL set to the
15
+ * desired SIMDLevel. It uses simdlib 256-bit wrappers (simd16uint16, etc.)
16
+ * which are locally aliased to THE_SIMD_LEVEL — the global aliases from
17
+ * simdlib_dispatch.h resolve through SINGLE_SIMD_LEVEL (= NONE in DD mode)
18
+ * and must NOT be used directly in per-ISA TU code.
19
+ *
20
+ * Usage (in a per-SIMD .cpp file):
21
+ * #define THE_SIMD_LEVEL SIMDLevel::AVX2
22
+ * #include <faiss/utils/simd_impl/partitioning_simdlib256.h>
23
+ */
24
+
25
+ #ifndef THE_SIMD_LEVEL
26
+ #error "Define THE_SIMD_LEVEL before including this header"
27
+ #endif
28
+
29
+ #include <faiss/utils/partitioning.h>
30
+
31
+ #include <cassert>
32
+ #include <cinttypes>
33
+ #include <cmath>
34
+
35
+ #include <faiss/impl/FaissAssert.h>
36
+ #include <faiss/impl/simdlib/simdlib_dispatch.h>
37
+ #include <faiss/utils/AlignedTable.h>
38
+ #include <faiss/utils/ordered_key_value.h>
39
+
40
+ #include <faiss/impl/platform_macros.h>
41
+ #include <faiss/utils/popcount.h>
42
+
43
+ namespace faiss {
44
+
45
+ namespace {
46
+
47
+ // ── Per-TU SIMD type aliases ──────────────────────────────────────────
48
+ // The global aliases (simd16uint16 etc.) from simdlib_dispatch.h resolve
49
+ // through SINGLE_SIMD_LEVEL, which is NONE in DD mode on x86 — meaning
50
+ // scalar emulation even in per-ISA TUs compiled with -mavx2.
51
+ // We shadow them here with THE_SIMD_LEVEL so the implementation actually
52
+ // uses the SIMD level this TU was compiled for.
53
+ static constexpr SIMDLevel THE_SL_256 =
54
+ simd256_level_selector<THE_SIMD_LEVEL>::value;
55
+ static_assert(
56
+ THE_SL_256 == SIMDLevel::NONE || THE_SL_256 == SIMDLevel::AVX2 ||
57
+ THE_SL_256 == SIMDLevel::ARM_NEON,
58
+ "simd256_level_selector must yield a 256-bit (or scalar) level");
59
+ using simd256bit = simd256bit_tpl<THE_SL_256>;
60
+ using simd16uint16 = simd16uint16_tpl<THE_SL_256>;
61
+ using simd32uint8 = simd32uint8_tpl<THE_SL_256>;
62
+ using simd8uint32 = simd8uint32_tpl<THE_SL_256>;
63
+ using simd8float32 = simd8float32_tpl<THE_SL_256>;
64
+
65
+ /******************************************************************
66
+ * SIMD routines when vals is an aligned array of uint16_t
67
+ ******************************************************************/
68
+
69
+ namespace simd_partitioning {
70
+
71
+ void find_minimax(
72
+ const uint16_t* vals,
73
+ size_t n,
74
+ uint16_t& smin,
75
+ uint16_t& smax) {
76
+ simd16uint16 vmin(0xffff), vmax(0);
77
+ for (size_t i = 0; i + 15 < n; i += 16) {
78
+ simd16uint16 v(vals + i);
79
+ vmin.accu_min(v);
80
+ vmax.accu_max(v);
81
+ }
82
+
83
+ ALIGNED(32) uint16_t tab32[32];
84
+ vmin.store(tab32);
85
+ vmax.store(tab32 + 16);
86
+
87
+ smin = tab32[0], smax = tab32[16];
88
+
89
+ for (int i = 1; i < 16; i++) {
90
+ smin = std::min(smin, tab32[i]);
91
+ smax = std::max(smax, tab32[i + 16]);
92
+ }
93
+
94
+ // missing values
95
+ for (size_t i = (n & ~15); i < n; i++) {
96
+ smin = std::min(smin, vals[i]);
97
+ smax = std::max(smax, vals[i]);
98
+ }
99
+ }
100
+
101
+ // max func differentiates between CMin and CMax (keep lowest or largest)
102
+ template <class C>
103
+ simd16uint16 max_func(simd16uint16 v, simd16uint16 thr16) {
104
+ constexpr bool is_max = C::is_max;
105
+ if (is_max) {
106
+ return max(v, thr16);
107
+ } else {
108
+ return min(v, thr16);
109
+ }
110
+ }
111
+
112
+ template <class C>
113
+ void count_lt_and_eq(
114
+ const uint16_t* vals,
115
+ int n,
116
+ uint16_t thresh,
117
+ size_t& n_lt,
118
+ size_t& n_eq) {
119
+ n_lt = n_eq = 0;
120
+ simd16uint16 thr16(thresh);
121
+
122
+ size_t n1 = n / 16;
123
+
124
+ for (size_t i = 0; i < n1; i++) {
125
+ simd16uint16 v(vals);
126
+ vals += 16;
127
+ simd16uint16 eqmask = (v == thr16);
128
+ simd16uint16 max2 = max_func<C>(v, thr16);
129
+ simd16uint16 gemask = (v == max2);
130
+ uint32_t bits = get_MSBs(uint16_to_uint8_saturate(eqmask, gemask));
131
+ int i_eq = popcount32(bits & 0x00ff00ff);
132
+ int i_ge = popcount32(bits) - i_eq;
133
+ n_eq += i_eq;
134
+ n_lt += 16 - i_ge;
135
+ }
136
+
137
+ for (int i = n1 * 16; i < n; i++) {
138
+ uint16_t v = *vals++;
139
+ if (C::cmp(thresh, v)) {
140
+ n_lt++;
141
+ } else if (v == thresh) {
142
+ n_eq++;
143
+ }
144
+ }
145
+ }
146
+
147
+ /* compress separated values and ids table, keeping all values < thresh and at
148
+ * most n_eq equal values */
149
+ template <class C>
150
+ int simd_compress_array(
151
+ uint16_t* vals,
152
+ typename C::TI* ids,
153
+ size_t n,
154
+ uint16_t thresh,
155
+ int n_eq) {
156
+ simd16uint16 thr16(thresh);
157
+ simd16uint16 mixmask(0xff00);
158
+
159
+ int wp = 0;
160
+ size_t i0;
161
+
162
+ // loop while there are eqs to collect
163
+ for (i0 = 0; i0 + 15 < n && n_eq > 0; i0 += 16) {
164
+ simd16uint16 v(vals + i0);
165
+ simd16uint16 max2 = max_func<C>(v, thr16);
166
+ simd16uint16 gemask = (v == max2);
167
+ simd16uint16 eqmask = (v == thr16);
168
+ uint32_t bits = get_MSBs(
169
+ blendv(simd32uint8(eqmask),
170
+ simd32uint8(gemask),
171
+ simd32uint8(mixmask)));
172
+ bits ^= 0xAAAAAAAA;
173
+ // bit 2*i : eq
174
+ // bit 2*i + 1 : lt
175
+
176
+ while (bits) {
177
+ int j = __builtin_ctz(bits) & (~1);
178
+ bool is_eq = (bits >> j) & 1;
179
+ bool is_lt = (bits >> j) & 2;
180
+ bits &= ~(3 << j);
181
+ j >>= 1;
182
+
183
+ if (is_lt) {
184
+ vals[wp] = vals[i0 + j];
185
+ ids[wp] = ids[i0 + j];
186
+ wp++;
187
+ } else if (is_eq && n_eq > 0) {
188
+ vals[wp] = vals[i0 + j];
189
+ ids[wp] = ids[i0 + j];
190
+ wp++;
191
+ n_eq--;
192
+ }
193
+ }
194
+ }
195
+
196
+ // handle remaining, only strictly lt ones.
197
+ for (; i0 + 15 < n; i0 += 16) {
198
+ simd16uint16 v(vals + i0);
199
+ simd16uint16 max2 = max_func<C>(v, thr16);
200
+ simd16uint16 gemask = (v == max2);
201
+ uint32_t bits = ~get_MSBs(simd32uint8(gemask));
202
+
203
+ while (bits) {
204
+ int j = __builtin_ctz(bits);
205
+ bits &= ~(3 << j);
206
+ j >>= 1;
207
+
208
+ vals[wp] = vals[i0 + j];
209
+ ids[wp] = ids[i0 + j];
210
+ wp++;
211
+ }
212
+ }
213
+
214
+ // end with scalar
215
+ for (size_t i = (n & ~size_t(15)); i < n; i++) {
216
+ if (C::cmp(thresh, vals[i])) {
217
+ vals[wp] = vals[i];
218
+ ids[wp] = ids[i];
219
+ wp++;
220
+ } else if (vals[i] == thresh && n_eq > 0) {
221
+ vals[wp] = vals[i];
222
+ ids[wp] = ids[i];
223
+ wp++;
224
+ n_eq--;
225
+ }
226
+ }
227
+ assert(n_eq == 0);
228
+ return wp;
229
+ }
230
+
231
+ // #define MICRO_BENCHMARK
232
+
233
/// Cycle counter used for micro-benchmarking. Returns 0 unless
/// MICRO_BENCHMARK is defined, in which case the 64-bit value produced by
/// the x86 `rdtsc` instruction is returned.
static uint64_t get_cy() {
#ifndef MICRO_BENCHMARK
    return 0;
#else
    uint32_t high, low;
    // rdtsc delivers the low half in eax and the high half in edx
    asm volatile("rdtsc \n\t" : "=a"(low), "=d"(high));
    return ((uint64_t)high << 32) | (low);
#endif
}
242
+
243
+ #define IFV if (false)
244
+
245
+ template <class C>
246
+ uint16_t simd_partition_fuzzy_with_bounds(
247
+ uint16_t* vals,
248
+ typename C::TI* ids,
249
+ size_t n,
250
+ size_t q_min,
251
+ size_t q_max,
252
+ size_t* q_out,
253
+ uint16_t s0i,
254
+ uint16_t s1i) {
255
+ if (q_min == 0) {
256
+ if (q_out) {
257
+ *q_out = 0;
258
+ }
259
+ return 0;
260
+ }
261
+ if (q_max >= n) {
262
+ if (q_out) {
263
+ *q_out = q_max;
264
+ }
265
+ return 0xffff;
266
+ }
267
+ if (s0i == s1i) {
268
+ if (q_out) {
269
+ *q_out = q_min;
270
+ }
271
+ return s0i;
272
+ }
273
+ uint64_t t0 = get_cy();
274
+
275
+ // lower bound inclusive, upper exclusive
276
+ size_t s0 = s0i, s1 = s1i + 1;
277
+
278
+ IFV printf("bounds: %zu %zu\n", s0, s1 - 1);
279
+
280
+ int thresh;
281
+ size_t n_eq = 0, n_lt = 0;
282
+ size_t q = 0;
283
+
284
+ for (int it = 0; it < 200; it++) {
285
+ // while(s0 + 1 < s1) {
286
+ thresh = (s0 + s1) / 2;
287
+ count_lt_and_eq<C>(vals, n, thresh, n_lt, n_eq);
288
+
289
+ IFV printf(
290
+ " [%zu %zu] thresh=%d n_lt=%zu n_eq=%zu, q=%zu:%zu/%zu\n",
291
+ s0,
292
+ s1,
293
+ thresh,
294
+ n_lt,
295
+ n_eq,
296
+ q_min,
297
+ q_max,
298
+ n);
299
+ if (n_lt <= q_min) {
300
+ if (n_lt + n_eq >= q_min) {
301
+ q = q_min;
302
+ break;
303
+ } else {
304
+ if (C::is_max) {
305
+ s0 = thresh;
306
+ } else {
307
+ s1 = thresh;
308
+ }
309
+ }
310
+ } else if (n_lt <= q_max) {
311
+ q = n_lt;
312
+ break;
313
+ } else {
314
+ if (C::is_max) {
315
+ s1 = thresh;
316
+ } else {
317
+ s0 = thresh;
318
+ }
319
+ }
320
+ }
321
+
322
+ uint64_t t1 = get_cy();
323
+
324
+ // number of equal values to keep
325
+ int64_t n_eq_1 = q - n_lt;
326
+
327
+ IFV printf(
328
+ "shrink: thresh=%d q=%zu n_eq_1=%" PRId64 "\n", thresh, q, n_eq_1);
329
+ if (n_eq_1 < 0) { // happens when > q elements are at lower bound
330
+ assert(s0 + 1 == s1);
331
+ q = q_min;
332
+ if (C::is_max) {
333
+ thresh--;
334
+ } else {
335
+ thresh++;
336
+ }
337
+ n_eq_1 = q;
338
+ IFV printf(
339
+ " override: thresh=%d n_eq_1=%" PRId64 "\n", thresh, n_eq_1);
340
+ } else {
341
+ assert(n_eq_1 <= n_eq);
342
+ }
343
+
344
+ size_t wp = simd_compress_array<C>(vals, ids, n, thresh, n_eq_1);
345
+
346
+ IFV printf("wp=%zu\n", wp);
347
+ assert(wp == q);
348
+ if (q_out) {
349
+ *q_out = q;
350
+ }
351
+
352
+ uint64_t t2 = get_cy();
353
+
354
+ partition_stats.bisect_cycles += t1 - t0;
355
+ partition_stats.compress_cycles += t2 - t1;
356
+
357
+ return thresh;
358
+ }
359
+
360
// Forward declarations of the histogram drivers (8 and 16 bins). They are
// defined further down in this file but are already needed by
// simd_partition_fuzzy_with_bounds_histogram.
//   data  : input values
//   n     : number of input values
//   min   : value subtracted from each input before binning
//   shift : right shift applied after subtracting min
//   hist  : output bin counts
void local_simd_histogram_8(
        const uint16_t* data,
        int n,
        uint16_t min,
        int shift,
        int* hist);

void local_simd_histogram_16(
        const uint16_t* data,
        int n,
        uint16_t min,
        int shift,
        int* hist);
373
+
374
+ template <class C>
375
+ uint16_t simd_partition_fuzzy_with_bounds_histogram(
376
+ uint16_t* vals,
377
+ typename C::TI* ids,
378
+ size_t n,
379
+ size_t q_min,
380
+ size_t q_max,
381
+ size_t* q_out,
382
+ uint16_t s0i,
383
+ uint16_t s1i) {
384
+ if (q_min == 0) {
385
+ if (q_out) {
386
+ *q_out = 0;
387
+ }
388
+ return 0;
389
+ }
390
+ if (q_max >= n) {
391
+ if (q_out) {
392
+ *q_out = q_max;
393
+ }
394
+ return 0xffff;
395
+ }
396
+ if (s0i == s1i) {
397
+ if (q_out) {
398
+ *q_out = q_min;
399
+ }
400
+ return s0i;
401
+ }
402
+
403
+ IFV printf(
404
+ "partition fuzzy, q=%ld:%ld / %ld, bounds=%d %d\n",
405
+ q_min,
406
+ q_max,
407
+ n,
408
+ s0i,
409
+ s1i);
410
+
411
+ if (!C::is_max) {
412
+ IFV printf(
413
+ "revert due to CMin, q_min:q_max -> %ld:%ld\n", q_min, q_max);
414
+ q_min = n - q_min;
415
+ q_max = n - q_max;
416
+ }
417
+
418
+ // lower and upper bound of range, inclusive
419
+ int s0 = s0i, s1 = s1i;
420
+ // number of values < s0 and > s1
421
+ size_t n_lt = 0, n_gt = 0;
422
+
423
+ // output of loop:
424
+ int thresh; // final threshold
425
+ uint64_t tot_eq = 0; // total nb of equal values
426
+ uint64_t n_eq = 0; // nb of equal values to keep
427
+ size_t q; // final quantile
428
+
429
+ // buffer for the histograms
430
+ int hist[16];
431
+
432
+ for (int it = 0; it < 20; it++) {
433
+ // otherwise we would be done already
434
+
435
+ int shift = 0;
436
+
437
+ IFV printf(
438
+ " it %d bounds: %d %d n_lt=%ld n_gt=%ld\n",
439
+ it,
440
+ s0,
441
+ s1,
442
+ n_lt,
443
+ n_gt);
444
+
445
+ int maxval = s1 - s0;
446
+
447
+ while (maxval > 15) {
448
+ shift++;
449
+ maxval >>= 1;
450
+ }
451
+
452
+ IFV printf(
453
+ " histogram shift %d maxval %d ?= %d\n",
454
+ shift,
455
+ maxval,
456
+ int((s1 - s0) >> shift));
457
+
458
+ if (maxval > 7) {
459
+ local_simd_histogram_16(vals, n, s0, shift, hist);
460
+ } else {
461
+ local_simd_histogram_8(vals, n, s0, shift, hist);
462
+ }
463
+ IFV {
464
+ int sum = n_lt + n_gt;
465
+ printf(" n_lt=%ld hist=[", n_lt);
466
+ for (int i = 0; i <= maxval; i++) {
467
+ printf("%d ", hist[i]);
468
+ sum += hist[i];
469
+ }
470
+ printf("] n_gt=%ld sum=%d\n", n_gt, sum);
471
+ assert(sum == n);
472
+ }
473
+
474
+ size_t sum_below = n_lt;
475
+ int i;
476
+ for (i = 0; i <= maxval; i++) {
477
+ sum_below += hist[i];
478
+ if (sum_below >= q_min) {
479
+ break;
480
+ }
481
+ }
482
+ IFV printf(" i=%d sum_below=%ld\n", i, sum_below);
483
+ if (i <= maxval) {
484
+ s0 = s0 + (i << shift);
485
+ s1 = s0 + (1 << shift) - 1;
486
+ n_lt = sum_below - hist[i];
487
+ n_gt = n - sum_below;
488
+ } else {
489
+ assert(false && "not implemented");
490
+ }
491
+
492
+ IFV printf(
493
+ " new bin: s0=%d s1=%d n_lt=%ld n_gt=%ld\n",
494
+ s0,
495
+ s1,
496
+ n_lt,
497
+ n_gt);
498
+
499
+ if (s1 > s0) {
500
+ if (n_lt >= q_min && q_max >= n_lt) {
501
+ IFV printf(" FOUND1\n");
502
+ thresh = s0;
503
+ q = n_lt;
504
+ break;
505
+ }
506
+
507
+ size_t n_lt_2 = n - n_gt;
508
+ if (n_lt_2 >= q_min && q_max >= n_lt_2) {
509
+ thresh = s1 + 1;
510
+ q = n_lt_2;
511
+ IFV printf(" FOUND2\n");
512
+ break;
513
+ }
514
+ } else {
515
+ thresh = s0;
516
+ q = q_min;
517
+ tot_eq = n - n_gt - n_lt;
518
+ n_eq = q_min - n_lt;
519
+ IFV printf(" FOUND3\n");
520
+ break;
521
+ }
522
+ }
523
+
524
+ IFV printf("end bisection: thresh=%d q=%ld n_eq=%ld\n", thresh, q, n_eq);
525
+
526
+ if (!C::is_max) {
527
+ if (n_eq == 0) {
528
+ thresh--;
529
+ } else {
530
+ // thresh unchanged
531
+ n_eq = tot_eq - n_eq;
532
+ }
533
+ q = n - q;
534
+ IFV printf("revert due to CMin, q->%ld n_eq->%ld\n", q, n_eq);
535
+ }
536
+
537
+ size_t wp = simd_compress_array<C>(vals, ids, n, thresh, n_eq);
538
+ IFV printf("wp=%ld ?= %ld\n", wp, q);
539
+ assert(wp == q);
540
+ if (q_out) {
541
+ *q_out = wp;
542
+ }
543
+
544
+ return thresh;
545
+ }
546
+
547
+ template <class C>
548
+ uint16_t simd_partition_fuzzy(
549
+ uint16_t* vals,
550
+ typename C::TI* ids,
551
+ size_t n,
552
+ size_t q_min,
553
+ size_t q_max,
554
+ size_t* q_out) {
555
+ assert(is_aligned_pointer(vals));
556
+
557
+ uint16_t s0i, s1i;
558
+ find_minimax(vals, n, s0i, s1i);
559
+ // QSelect_stats.t0 += get_cy() - t0;
560
+
561
+ return simd_partition_fuzzy_with_bounds<C>(
562
+ vals, ids, n, q_min, q_max, q_out, s0i, s1i);
563
+ }
564
+
565
+ #undef IFV
566
+
567
+ } // namespace simd_partitioning
568
+
569
+ /******************************************************************
570
+ * Histogram subroutines
571
+ ******************************************************************/
572
+
573
+ /************************************************************
574
+ * 8 bins
575
+ ************************************************************/
576
+
577
+ simd32uint8 accu4to8(simd16uint16 a4) {
578
+ simd16uint16 mask4(0x0f0f);
579
+
580
+ simd16uint16 a8_0 = a4 & mask4;
581
+ simd16uint16 a8_1 = (a4 >> 4) & mask4;
582
+
583
+ return simd32uint8(hadd(a8_0, a8_1));
584
+ }
585
+
586
+ simd16uint16 accu8to16(simd32uint8 a8) {
587
+ simd16uint16 mask8(0x00ff);
588
+
589
+ simd16uint16 a8_0 = simd16uint16(a8) & mask8;
590
+ simd16uint16 a8_1 = (simd16uint16(a8) >> 8) & mask8;
591
+
592
+ return hadd(a8_0, a8_1);
593
+ }
594
+
595
// Look-up table for the 8-bin histogram. It is deliberately a plain byte
// array living in .rodata: a `simd32uint8` global would need an AVX2
// initializer in `.init_array`, which runs at dlopen, before runtime SIMD
// dispatch, and SIGILLs on non-AVX2 CPUs.
alignas(32) static const uint8_t shifts[32] = {
        1, 16, 0, 0, 4, 64, 0, 0,
        0, 0, 1, 16, 0, 0, 4, 64,
        1, 16, 0, 0, 4, 64, 0, 0,
        0, 0, 1, 16, 0, 0, 4, 64};
601
+
602
+ // 2-bit accumulator: we can add only up to 3 elements
603
+ // on output we return 2*4-bit results
604
+ // preproc returns either an index in 0..7 or 0xffff
605
+ // that yields a 0 when used in the table look-up
606
+ template <int N, class Preproc>
607
+ void compute_accu2(
608
+ const uint16_t*& data,
609
+ Preproc& pp,
610
+ simd16uint16& a4lo,
611
+ simd16uint16& a4hi) {
612
+ simd16uint16 mask2(0x3333);
613
+ simd16uint16 a2((uint16_t)0); // 2-bit accu
614
+ for (int j = 0; j < N; j++) {
615
+ simd16uint16 v(data);
616
+ data += 16;
617
+ v = pp(v);
618
+ // 0x800 -> force second half of table
619
+ simd16uint16 idx = v | (v << 8) | simd16uint16(0x800);
620
+ a2 += simd16uint16(
621
+ simd32uint8(shifts).lookup_2_lanes(simd32uint8(idx)));
622
+ }
623
+ a4lo += a2 & mask2;
624
+ a4hi += (a2 >> 2) & mask2;
625
+ }
626
+
627
+ template <class Preproc>
628
+ simd16uint16 histogram_8(const uint16_t* data, Preproc pp, size_t n_in) {
629
+ assert(n_in % 16 == 0);
630
+ int n = n_in / 16;
631
+
632
+ simd32uint8 a8lo(0);
633
+ simd32uint8 a8hi(0);
634
+
635
+ for (int i0 = 0; i0 < n; i0 += 15) {
636
+ simd16uint16 a4lo(0); // 4-bit accus
637
+ simd16uint16 a4hi(0);
638
+
639
+ int i1 = std::min(i0 + 15, n);
640
+ int i;
641
+ for (i = i0; i + 2 < i1; i += 3) {
642
+ compute_accu2<3>(data, pp, a4lo, a4hi); // adds 3 max
643
+ }
644
+ switch (i1 - i) {
645
+ case 2:
646
+ compute_accu2<2>(data, pp, a4lo, a4hi);
647
+ break;
648
+ case 1:
649
+ compute_accu2<1>(data, pp, a4lo, a4hi);
650
+ break;
651
+ }
652
+
653
+ a8lo += accu4to8(a4lo);
654
+ a8hi += accu4to8(a4hi);
655
+ }
656
+
657
+ // move to 16-bit accu
658
+ simd16uint16 a16lo = accu8to16(a8lo);
659
+ simd16uint16 a16hi = accu8to16(a8hi);
660
+
661
+ simd16uint16 a16 = hadd(a16lo, a16hi);
662
+
663
+ // the 2 lanes must still be combined
664
+ return a16;
665
+ }
666
+
667
+ /************************************************************
668
+ * 16 bins
669
+ ************************************************************/
670
+
671
// Look-up table for the 16-bin histogram: entry i is 1 << (i % 8).
// Kept as a plain .rodata byte array (not a SIMD-typed global) so that its
// initializer does not emit AVX2 code into `.init_array`.
alignas(32) static const uint8_t shifts2[32] = {
        1, 2, 4, 8, 16, 32, 64, 128,
        1, 2, 4, 8, 16, 32, 64, 128,
        1, 2, 4, 8, 16, 32, 64, 128,
        1, 2, 4, 8, 16, 32, 64, 128};
676
+
677
+ simd32uint8 shiftr_16(simd32uint8 x, int n) {
678
+ return simd32uint8(simd16uint16(x) >> n);
679
+ }
680
+
681
+ // 2-bit accumulator: we can add only up to 3 elements
682
+ // on output we return 2*4-bit results
683
+ template <int N, class Preproc>
684
+ void compute_accu2_16(
685
+ const uint16_t*& data,
686
+ Preproc pp,
687
+ simd32uint8& a4_0,
688
+ simd32uint8& a4_1,
689
+ simd32uint8& a4_2,
690
+ simd32uint8& a4_3) {
691
+ simd32uint8 mask1(0x55);
692
+ simd32uint8 a2_0; // 2-bit accu
693
+ simd32uint8 a2_1; // 2-bit accu
694
+ a2_0.clear();
695
+ a2_1.clear();
696
+
697
+ for (int j = 0; j < N; j++) {
698
+ simd16uint16 v(data);
699
+ data += 16;
700
+ v = pp(v);
701
+
702
+ simd16uint16 idx = v | (v << 8);
703
+ simd32uint8 a1 = simd32uint8(shifts2).lookup_2_lanes(simd32uint8(idx));
704
+ // contains 0s for out-of-bounds elements
705
+
706
+ simd16uint16 lt8 = (v >> 3) == simd16uint16(0);
707
+ lt8 = lt8 ^ simd16uint16(0xff00);
708
+
709
+ a1 = a1 & lt8;
710
+
711
+ a2_0 += a1 & mask1;
712
+ a2_1 += shiftr_16(a1, 1) & mask1;
713
+ }
714
+ simd32uint8 mask2(0x33);
715
+
716
+ a4_0 += a2_0 & mask2;
717
+ a4_1 += a2_1 & mask2;
718
+ a4_2 += shiftr_16(a2_0, 2) & mask2;
719
+ a4_3 += shiftr_16(a2_1, 2) & mask2;
720
+ }
721
+
722
+ simd32uint8 accu4to8_2(simd32uint8 a4_0, simd32uint8 a4_1) {
723
+ simd32uint8 mask4(0x0f);
724
+
725
+ simd16uint16 a8_0 = combine2x2(
726
+ (simd16uint16)(a4_0 & mask4),
727
+ (simd16uint16)(shiftr_16(a4_0, 4) & mask4));
728
+
729
+ simd16uint16 a8_1 = combine2x2(
730
+ (simd16uint16)(a4_1 & mask4),
731
+ (simd16uint16)(shiftr_16(a4_1, 4) & mask4));
732
+
733
+ return simd32uint8(hadd(a8_0, a8_1));
734
+ }
735
+
736
+ template <class Preproc>
737
+ simd16uint16 histogram_16(const uint16_t* data, Preproc pp, size_t n_in) {
738
+ assert(n_in % 16 == 0);
739
+ int n = n_in / 16;
740
+
741
+ simd32uint8 a8lo((uint8_t)0);
742
+ simd32uint8 a8hi((uint8_t)0);
743
+
744
+ for (int i0 = 0; i0 < n; i0 += 7) {
745
+ simd32uint8 a4_0(0); // 0, 4, 8, 12
746
+ simd32uint8 a4_1(0); // 1, 5, 9, 13
747
+ simd32uint8 a4_2(0); // 2, 6, 10, 14
748
+ simd32uint8 a4_3(0); // 3, 7, 11, 15
749
+
750
+ int i1 = std::min(i0 + 7, n);
751
+ int i;
752
+ for (i = i0; i + 2 < i1; i += 3) {
753
+ compute_accu2_16<3>(data, pp, a4_0, a4_1, a4_2, a4_3);
754
+ }
755
+ switch (i1 - i) {
756
+ case 2:
757
+ compute_accu2_16<2>(data, pp, a4_0, a4_1, a4_2, a4_3);
758
+ break;
759
+ case 1:
760
+ compute_accu2_16<1>(data, pp, a4_0, a4_1, a4_2, a4_3);
761
+ break;
762
+ }
763
+
764
+ a8lo += accu4to8_2(a4_0, a4_1);
765
+ a8hi += accu4to8_2(a4_2, a4_3);
766
+ }
767
+
768
+ // move to 16-bit accu
769
+ simd16uint16 a16lo = accu8to16(a8lo);
770
+ simd16uint16 a16hi = accu8to16(a8hi);
771
+
772
+ simd16uint16 a16 = hadd(a16lo, a16hi);
773
+
774
+ a16 = simd16uint16{simd8uint32{a16}.unzip()};
775
+
776
+ return a16;
777
+ }
778
+
779
+ struct PreprocNOP {
780
+ simd16uint16 operator()(simd16uint16 x) {
781
+ return x;
782
+ }
783
+ };
784
+
785
+ template <int shift, int nbin>
786
+ struct PreprocMinShift {
787
+ simd16uint16 min16;
788
+ simd16uint16 max16;
789
+
790
+ explicit PreprocMinShift(uint16_t min) {
791
+ min16.set1(min);
792
+ int vmax0 = std::min((nbin << shift) + min, 65536);
793
+ uint16_t vmax = uint16_t(vmax0 - 1 - min);
794
+ max16.set1(vmax); // vmax inclusive
795
+ }
796
+
797
+ simd16uint16 operator()(simd16uint16 x) {
798
+ x = x - min16;
799
+ simd16uint16 mask = (x == max(x, max16)) - (x == max16);
800
+ return (x >> shift) | mask;
801
+ }
802
+ };
803
+
804
+ /* unbounded versions of the functions */
805
+
806
+ void simd_histogram_8_unbounded(const uint16_t* data, int n, int* hist) {
807
+ PreprocNOP pp;
808
+ simd16uint16 a16 = histogram_8(data, pp, (n & ~15));
809
+
810
+ ALIGNED(32) uint16_t a16_tab[16];
811
+ a16.store(a16_tab);
812
+
813
+ for (int i = 0; i < 8; i++) {
814
+ hist[i] = a16_tab[i] + a16_tab[i + 8];
815
+ }
816
+
817
+ for (int i = (n & ~15); i < n; i++) {
818
+ hist[data[i]]++;
819
+ }
820
+ }
821
+
822
+ void simd_histogram_16_unbounded(const uint16_t* data, int n, int* hist) {
823
+ simd16uint16 a16 = histogram_16(data, PreprocNOP(), (n & ~15));
824
+
825
+ ALIGNED(32) uint16_t a16_tab[16];
826
+ a16.store(a16_tab);
827
+
828
+ for (int i = 0; i < 16; i++) {
829
+ hist[i] = a16_tab[i];
830
+ }
831
+
832
+ for (int i = (n & ~15); i < n; i++) {
833
+ hist[data[i]]++;
834
+ }
835
+ }
836
+
837
+ /************************************************************
838
+ * Histogram driver routines
839
+ ************************************************************/
840
+
841
+ void local_simd_histogram_8(
842
+ const uint16_t* data,
843
+ int n,
844
+ uint16_t min,
845
+ int shift,
846
+ int* hist) {
847
+ if (shift < 0) {
848
+ simd_histogram_8_unbounded(data, n, hist);
849
+ return;
850
+ }
851
+
852
+ simd16uint16 a16;
853
+
854
+ #define DISPATCH(s) \
855
+ case s: \
856
+ a16 = histogram_8(data, PreprocMinShift<s, 8>(min), (n & ~15)); \
857
+ break
858
+
859
+ switch (shift) {
860
+ DISPATCH(0);
861
+ DISPATCH(1);
862
+ DISPATCH(2);
863
+ DISPATCH(3);
864
+ DISPATCH(4);
865
+ DISPATCH(5);
866
+ DISPATCH(6);
867
+ DISPATCH(7);
868
+ DISPATCH(8);
869
+ DISPATCH(9);
870
+ DISPATCH(10);
871
+ DISPATCH(11);
872
+ DISPATCH(12);
873
+ DISPATCH(13);
874
+ default:
875
+ FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift);
876
+ }
877
+ #undef DISPATCH
878
+
879
+ ALIGNED(32) uint16_t a16_tab[16];
880
+ a16.store(a16_tab);
881
+
882
+ for (int i = 0; i < 8; i++) {
883
+ hist[i] = a16_tab[i] + a16_tab[i + 8];
884
+ }
885
+
886
+ // complete with remaining bins
887
+ for (int i = (n & ~15); i < n; i++) {
888
+ if (data[i] < min) {
889
+ continue;
890
+ }
891
+ uint16_t v = data[i] - min;
892
+ v >>= shift;
893
+ if (v < 8) {
894
+ hist[v]++;
895
+ }
896
+ }
897
+ }
898
+
899
+ void local_simd_histogram_16(
900
+ const uint16_t* data,
901
+ int n,
902
+ uint16_t min,
903
+ int shift,
904
+ int* hist) {
905
+ if (shift < 0) {
906
+ simd_histogram_16_unbounded(data, n, hist);
907
+ return;
908
+ }
909
+
910
+ simd16uint16 a16;
911
+
912
+ #define DISPATCH(s) \
913
+ case s: \
914
+ a16 = histogram_16(data, PreprocMinShift<s, 16>(min), (n & ~15)); \
915
+ break
916
+
917
+ switch (shift) {
918
+ DISPATCH(0);
919
+ DISPATCH(1);
920
+ DISPATCH(2);
921
+ DISPATCH(3);
922
+ DISPATCH(4);
923
+ DISPATCH(5);
924
+ DISPATCH(6);
925
+ DISPATCH(7);
926
+ DISPATCH(8);
927
+ DISPATCH(9);
928
+ DISPATCH(10);
929
+ DISPATCH(11);
930
+ DISPATCH(12);
931
+ default:
932
+ FAISS_THROW_FMT("dispatch for shift=%d not instantiated", shift);
933
+ }
934
+ #undef DISPATCH
935
+
936
+ ALIGNED(32) uint16_t a16_tab[16];
937
+ a16.store(a16_tab);
938
+
939
+ for (int i = 0; i < 16; i++) {
940
+ hist[i] = a16_tab[i];
941
+ }
942
+
943
+ for (int i = (n & ~15); i < n; i++) {
944
+ if (data[i] < min) {
945
+ continue;
946
+ }
947
+ uint16_t v = data[i] - min;
948
+ v >>= shift;
949
+ if (v < 16) {
950
+ hist[v]++;
951
+ }
952
+ }
953
+ }
954
+
955
+ } // anonymous namespace
956
+
957
+ /******************************************************************
958
+ * Template specializations — entry points called from partitioning.cpp
959
+ ******************************************************************/
960
+
961
+ constexpr SIMDLevel SL = THE_SIMD_LEVEL;
962
+
963
+ template <>
964
+ uint16_t partition_fuzzy_simd<SL, CMax<uint16_t, int64_t>>(
965
+ uint16_t* vals,
966
+ int64_t* ids,
967
+ size_t n,
968
+ size_t q_min,
969
+ size_t q_max,
970
+ size_t* q_out) {
971
+ return simd_partitioning::simd_partition_fuzzy<CMax<uint16_t, int64_t>>(
972
+ vals, ids, n, q_min, q_max, q_out);
973
+ }
974
+
975
+ template <>
976
+ uint16_t partition_fuzzy_simd<SL, CMin<uint16_t, int64_t>>(
977
+ uint16_t* vals,
978
+ int64_t* ids,
979
+ size_t n,
980
+ size_t q_min,
981
+ size_t q_max,
982
+ size_t* q_out) {
983
+ return simd_partitioning::simd_partition_fuzzy<CMin<uint16_t, int64_t>>(
984
+ vals, ids, n, q_min, q_max, q_out);
985
+ }
986
+
987
+ template <>
988
+ uint16_t partition_fuzzy_simd<SL, CMax<uint16_t, int>>(
989
+ uint16_t* vals,
990
+ int* ids,
991
+ size_t n,
992
+ size_t q_min,
993
+ size_t q_max,
994
+ size_t* q_out) {
995
+ return simd_partitioning::simd_partition_fuzzy<CMax<uint16_t, int>>(
996
+ vals, ids, n, q_min, q_max, q_out);
997
+ }
998
+
999
+ template <>
1000
+ uint16_t partition_fuzzy_simd<SL, CMin<uint16_t, int>>(
1001
+ uint16_t* vals,
1002
+ int* ids,
1003
+ size_t n,
1004
+ size_t q_min,
1005
+ size_t q_max,
1006
+ size_t* q_out) {
1007
+ return simd_partitioning::simd_partition_fuzzy<CMin<uint16_t, int>>(
1008
+ vals, ids, n, q_min, q_max, q_out);
1009
+ }
1010
+
1011
+ template <>
1012
+ void simd_histogram_8<SL>(
1013
+ const uint16_t* data,
1014
+ int n,
1015
+ uint16_t min,
1016
+ int shift,
1017
+ int* hist) {
1018
+ local_simd_histogram_8(data, n, min, shift, hist);
1019
+ }
1020
+
1021
+ template <>
1022
+ void simd_histogram_16<SL>(
1023
+ const uint16_t* data,
1024
+ int n,
1025
+ uint16_t min,
1026
+ int shift,
1027
+ int* hist) {
1028
+ local_simd_histogram_16(data, n, min, shift, hist);
1029
+ }
1030
+
1031
+ } // namespace faiss