local-llm-rn 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (626)
  1. package/cpp/CMakeLists.txt +285 -0
  2. package/cpp/common/CMakeLists.txt +149 -0
  3. package/cpp/common/arg.cpp +3799 -0
  4. package/cpp/common/arg.h +131 -0
  5. package/cpp/common/base64.hpp +392 -0
  6. package/cpp/common/build-info.cpp.in +4 -0
  7. package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
  8. package/cpp/common/chat-parser-xml-toolcall.h +45 -0
  9. package/cpp/common/chat-parser.cpp +1649 -0
  10. package/cpp/common/chat-parser.h +133 -0
  11. package/cpp/common/chat-peg-parser.cpp +124 -0
  12. package/cpp/common/chat-peg-parser.h +105 -0
  13. package/cpp/common/chat.cpp +3355 -0
  14. package/cpp/common/chat.h +252 -0
  15. package/cpp/common/common.cpp +1824 -0
  16. package/cpp/common/common.h +930 -0
  17. package/cpp/common/console.cpp +1137 -0
  18. package/cpp/common/console.h +41 -0
  19. package/cpp/common/debug.cpp +167 -0
  20. package/cpp/common/debug.h +43 -0
  21. package/cpp/common/download.cpp +792 -0
  22. package/cpp/common/download.h +84 -0
  23. package/cpp/common/http.h +84 -0
  24. package/cpp/common/jinja/README.md +88 -0
  25. package/cpp/common/jinja/caps.cpp +285 -0
  26. package/cpp/common/jinja/caps.h +30 -0
  27. package/cpp/common/jinja/lexer.cpp +341 -0
  28. package/cpp/common/jinja/lexer.h +157 -0
  29. package/cpp/common/jinja/parser.cpp +591 -0
  30. package/cpp/common/jinja/parser.h +21 -0
  31. package/cpp/common/jinja/runtime.cpp +867 -0
  32. package/cpp/common/jinja/runtime.h +638 -0
  33. package/cpp/common/jinja/string.cpp +213 -0
  34. package/cpp/common/jinja/string.h +61 -0
  35. package/cpp/common/jinja/utils.h +149 -0
  36. package/cpp/common/jinja/value.cpp +1393 -0
  37. package/cpp/common/jinja/value.h +756 -0
  38. package/cpp/common/json-partial.cpp +324 -0
  39. package/cpp/common/json-partial.h +39 -0
  40. package/cpp/common/json-schema-to-grammar.cpp +1153 -0
  41. package/cpp/common/json-schema-to-grammar.h +43 -0
  42. package/cpp/common/llguidance.cpp +258 -0
  43. package/cpp/common/log.cpp +446 -0
  44. package/cpp/common/log.h +119 -0
  45. package/cpp/common/ngram-cache.cpp +285 -0
  46. package/cpp/common/ngram-cache.h +101 -0
  47. package/cpp/common/ngram-map.cpp +530 -0
  48. package/cpp/common/ngram-map.h +115 -0
  49. package/cpp/common/ngram-mod.cpp +60 -0
  50. package/cpp/common/ngram-mod.h +38 -0
  51. package/cpp/common/peg-parser.cpp +1712 -0
  52. package/cpp/common/peg-parser.h +459 -0
  53. package/cpp/common/preset.cpp +483 -0
  54. package/cpp/common/preset.h +83 -0
  55. package/cpp/common/regex-partial.cpp +204 -0
  56. package/cpp/common/regex-partial.h +56 -0
  57. package/cpp/common/sampling.cpp +745 -0
  58. package/cpp/common/sampling.h +119 -0
  59. package/cpp/common/speculative.cpp +1074 -0
  60. package/cpp/common/speculative.h +41 -0
  61. package/cpp/common/unicode.cpp +64 -0
  62. package/cpp/common/unicode.h +22 -0
  63. package/cpp/ggml/CMakeLists.txt +494 -0
  64. package/cpp/ggml/cmake/GitVars.cmake +22 -0
  65. package/cpp/ggml/cmake/common.cmake +50 -0
  66. package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
  67. package/cpp/ggml/include/ggml-alloc.h +85 -0
  68. package/cpp/ggml/include/ggml-backend.h +373 -0
  69. package/cpp/ggml/include/ggml-blas.h +25 -0
  70. package/cpp/ggml/include/ggml-cann.h +123 -0
  71. package/cpp/ggml/include/ggml-cpp.h +39 -0
  72. package/cpp/ggml/include/ggml-cpu.h +151 -0
  73. package/cpp/ggml/include/ggml-cuda.h +47 -0
  74. package/cpp/ggml/include/ggml-hexagon.h +19 -0
  75. package/cpp/ggml/include/ggml-metal.h +61 -0
  76. package/cpp/ggml/include/ggml-opencl.h +26 -0
  77. package/cpp/ggml/include/ggml-opt.h +256 -0
  78. package/cpp/ggml/include/ggml-rpc.h +30 -0
  79. package/cpp/ggml/include/ggml-sycl.h +49 -0
  80. package/cpp/ggml/include/ggml-virtgpu.h +14 -0
  81. package/cpp/ggml/include/ggml-vulkan.h +29 -0
  82. package/cpp/ggml/include/ggml-webgpu.h +19 -0
  83. package/cpp/ggml/include/ggml-zdnn.h +17 -0
  84. package/cpp/ggml/include/ggml-zendnn.h +22 -0
  85. package/cpp/ggml/include/ggml.h +2753 -0
  86. package/cpp/ggml/include/gguf.h +204 -0
  87. package/cpp/ggml/src/CMakeLists.txt +492 -0
  88. package/cpp/ggml/src/ggml-alloc.c +1244 -0
  89. package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
  90. package/cpp/ggml/src/ggml-backend-dl.h +45 -0
  91. package/cpp/ggml/src/ggml-backend-impl.h +255 -0
  92. package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
  93. package/cpp/ggml/src/ggml-backend.cpp +2270 -0
  94. package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  95. package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
  96. package/cpp/ggml/src/ggml-common.h +1878 -0
  97. package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
  98. package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
  99. package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  100. package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  101. package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  102. package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  103. package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  104. package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
  105. package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
  106. package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
  107. package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  108. package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  109. package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  110. package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
  111. package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  112. package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  113. package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  114. package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  115. package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  116. package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  117. package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  118. package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
  119. package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  120. package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  121. package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  122. package/cpp/ggml/src/ggml-cpu/common.h +95 -0
  123. package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
  124. package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
  125. package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
  126. package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
  127. package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
  128. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
  129. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  130. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
  131. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  132. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
  133. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  134. package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
  135. package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
  136. package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
  137. package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
  138. package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
  139. package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
  140. package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  141. package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
  142. package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  143. package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  144. package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  145. package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  146. package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
  147. package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
  148. package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  149. package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
  150. package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
  151. package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
  152. package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  153. package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
  154. package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
  155. package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
  156. package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  157. package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
  158. package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  159. package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
  160. package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
  161. package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
  162. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  163. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  164. package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  165. package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  166. package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  167. package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  168. package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
  169. package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
  170. package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  171. package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
  172. package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
  173. package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  174. package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
  175. package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  176. package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  177. package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  178. package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
  179. package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
  180. package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  181. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  182. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  183. package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  184. package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
  185. package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
  186. package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
  187. package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
  188. package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
  189. package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
  190. package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
  191. package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
  192. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  193. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  194. package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  195. package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  196. package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
  197. package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  198. package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  199. package/cpp/ggml/src/ggml-impl.h +724 -0
  200. package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  201. package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  202. package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  203. package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  204. package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
  205. package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
  206. package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
  207. package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
  208. package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
  209. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
  210. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  211. package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
  212. package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
  213. package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
  214. package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  215. package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  216. package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
  217. package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
  218. package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  219. package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  220. package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  221. package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  222. package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  223. package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  224. package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  225. package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  226. package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
  227. package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  228. package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  229. package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  230. package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  231. package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  232. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  233. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  234. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  235. package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  236. package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  237. package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  238. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  239. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  240. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  241. package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  242. package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  243. package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  244. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  245. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  246. package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  247. package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  248. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  249. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  250. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  251. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  252. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  253. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  254. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  255. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  256. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  257. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  258. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  259. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  260. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  261. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  262. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  263. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  264. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  265. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  266. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  267. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  268. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  269. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  270. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  271. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  272. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  273. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  274. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  275. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  276. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  277. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  278. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  279. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  280. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  281. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  282. package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  283. package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  284. package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  285. package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  286. package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  287. package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  288. package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  289. package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  290. package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  291. package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  292. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  293. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  294. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  295. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  296. package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  297. package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  298. package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  299. package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  300. package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  301. package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  302. package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  303. package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  304. package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
  305. package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  306. package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  307. package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  308. package/cpp/ggml/src/ggml-opt.cpp +1093 -0
  309. package/cpp/ggml/src/ggml-quants.c +5325 -0
  310. package/cpp/ggml/src/ggml-quants.h +106 -0
  311. package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  312. package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
  313. package/cpp/ggml/src/ggml-threading.cpp +12 -0
  314. package/cpp/ggml/src/ggml-threading.h +14 -0
  315. package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  316. package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  317. package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  318. package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  319. package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  320. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  321. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  322. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  323. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  324. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  325. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  326. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  327. package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  328. package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  329. package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  330. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  331. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  332. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  333. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  334. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  335. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  336. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  337. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  338. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  339. package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  340. package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  341. package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  342. package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  343. package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  344. package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  345. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  346. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  347. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  348. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  349. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  350. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  351. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  352. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  353. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  354. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  355. package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  356. package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  357. package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  358. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
  359. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
  360. package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  361. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  362. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  363. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  364. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
  365. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
  366. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  367. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  368. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
  369. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  370. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
  371. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  372. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  373. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
  374. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
  375. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
  376. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
  377. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
  378. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  379. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  380. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  381. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  382. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  383. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  384. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  385. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  386. package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  387. package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  388. package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
  389. package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  390. package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  391. package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  392. package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  393. package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  394. package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  395. package/cpp/ggml/src/ggml.c +7669 -0
  396. package/cpp/ggml/src/ggml.cpp +26 -0
  397. package/cpp/ggml/src/gguf.cpp +1699 -0
  398. package/cpp/include/llama-cpp.h +32 -0
  399. package/cpp/include/llama.h +1568 -0
  400. package/cpp/mtmd/CMakeLists.txt +98 -0
  401. package/cpp/mtmd/README.md +63 -0
  402. package/cpp/mtmd/clip-graph.h +117 -0
  403. package/cpp/mtmd/clip-impl.h +586 -0
  404. package/cpp/mtmd/clip-model.h +390 -0
  405. package/cpp/mtmd/clip.cpp +4154 -0
  406. package/cpp/mtmd/clip.h +121 -0
  407. package/cpp/mtmd/deprecation-warning.cpp +22 -0
  408. package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
  409. package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
  410. package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
  411. package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
  412. package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
  413. package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
  414. package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
  415. package/cpp/mtmd/models/cogvlm.cpp +98 -0
  416. package/cpp/mtmd/models/conformer.cpp +216 -0
  417. package/cpp/mtmd/models/glm4v.cpp +122 -0
  418. package/cpp/mtmd/models/internvl.cpp +69 -0
  419. package/cpp/mtmd/models/kimik25.cpp +101 -0
  420. package/cpp/mtmd/models/kimivl.cpp +63 -0
  421. package/cpp/mtmd/models/llama4.cpp +96 -0
  422. package/cpp/mtmd/models/llava.cpp +374 -0
  423. package/cpp/mtmd/models/minicpmv.cpp +114 -0
  424. package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
  425. package/cpp/mtmd/models/models.h +128 -0
  426. package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
  427. package/cpp/mtmd/models/paddleocr.cpp +52 -0
  428. package/cpp/mtmd/models/pixtral.cpp +86 -0
  429. package/cpp/mtmd/models/qwen2vl.cpp +183 -0
  430. package/cpp/mtmd/models/qwen3vl.cpp +193 -0
  431. package/cpp/mtmd/models/siglip.cpp +86 -0
  432. package/cpp/mtmd/models/whisper-enc.cpp +115 -0
  433. package/cpp/mtmd/models/youtuvl.cpp +179 -0
  434. package/cpp/mtmd/mtmd-audio.cpp +730 -0
  435. package/cpp/mtmd/mtmd-audio.h +113 -0
  436. package/cpp/mtmd/mtmd-cli.cpp +437 -0
  437. package/cpp/mtmd/mtmd-helper.cpp +521 -0
  438. package/cpp/mtmd/mtmd-helper.h +96 -0
  439. package/cpp/mtmd/mtmd.cpp +1156 -0
  440. package/cpp/mtmd/mtmd.h +319 -0
  441. package/cpp/mtmd/requirements.txt +5 -0
  442. package/cpp/mtmd/test-1.jpeg +0 -0
  443. package/cpp/mtmd/test-2.mp3 +0 -0
  444. package/cpp/mtmd/tests.sh +192 -0
  445. package/cpp/src/CMakeLists.txt +169 -0
  446. package/cpp/src/llama-adapter.cpp +488 -0
  447. package/cpp/src/llama-adapter.h +89 -0
  448. package/cpp/src/llama-arch.cpp +2855 -0
  449. package/cpp/src/llama-arch.h +619 -0
  450. package/cpp/src/llama-batch.cpp +917 -0
  451. package/cpp/src/llama-batch.h +173 -0
  452. package/cpp/src/llama-chat.cpp +896 -0
  453. package/cpp/src/llama-chat.h +71 -0
  454. package/cpp/src/llama-context.cpp +3512 -0
  455. package/cpp/src/llama-context.h +359 -0
  456. package/cpp/src/llama-cparams.cpp +5 -0
  457. package/cpp/src/llama-cparams.h +44 -0
  458. package/cpp/src/llama-grammar.cpp +1464 -0
  459. package/cpp/src/llama-grammar.h +194 -0
  460. package/cpp/src/llama-graph.cpp +2685 -0
  461. package/cpp/src/llama-graph.h +1026 -0
  462. package/cpp/src/llama-hparams.cpp +234 -0
  463. package/cpp/src/llama-hparams.h +339 -0
  464. package/cpp/src/llama-impl.cpp +171 -0
  465. package/cpp/src/llama-impl.h +73 -0
  466. package/cpp/src/llama-io.cpp +15 -0
  467. package/cpp/src/llama-io.h +35 -0
  468. package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
  469. package/cpp/src/llama-kv-cache-iswa.h +137 -0
  470. package/cpp/src/llama-kv-cache.cpp +2271 -0
  471. package/cpp/src/llama-kv-cache.h +388 -0
  472. package/cpp/src/llama-kv-cells.h +533 -0
  473. package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
  474. package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
  475. package/cpp/src/llama-memory-hybrid.cpp +268 -0
  476. package/cpp/src/llama-memory-hybrid.h +139 -0
  477. package/cpp/src/llama-memory-recurrent.cpp +1165 -0
  478. package/cpp/src/llama-memory-recurrent.h +182 -0
  479. package/cpp/src/llama-memory.cpp +59 -0
  480. package/cpp/src/llama-memory.h +122 -0
  481. package/cpp/src/llama-mmap.cpp +785 -0
  482. package/cpp/src/llama-mmap.h +92 -0
  483. package/cpp/src/llama-model-loader.cpp +1414 -0
  484. package/cpp/src/llama-model-loader.h +203 -0
  485. package/cpp/src/llama-model-saver.cpp +286 -0
  486. package/cpp/src/llama-model-saver.h +37 -0
  487. package/cpp/src/llama-model.cpp +9253 -0
  488. package/cpp/src/llama-model.h +576 -0
  489. package/cpp/src/llama-quant.cpp +1119 -0
  490. package/cpp/src/llama-quant.h +1 -0
  491. package/cpp/src/llama-sampler.cpp +3885 -0
  492. package/cpp/src/llama-sampler.h +42 -0
  493. package/cpp/src/llama-vocab.cpp +3970 -0
  494. package/cpp/src/llama-vocab.h +187 -0
  495. package/cpp/src/llama.cpp +1313 -0
  496. package/cpp/src/models/afmoe.cpp +191 -0
  497. package/cpp/src/models/apertus.cpp +125 -0
  498. package/cpp/src/models/arcee.cpp +135 -0
  499. package/cpp/src/models/arctic.cpp +138 -0
  500. package/cpp/src/models/arwkv7.cpp +86 -0
  501. package/cpp/src/models/baichuan.cpp +122 -0
  502. package/cpp/src/models/bailingmoe.cpp +144 -0
  503. package/cpp/src/models/bailingmoe2.cpp +135 -0
  504. package/cpp/src/models/bert.cpp +178 -0
  505. package/cpp/src/models/bitnet.cpp +160 -0
  506. package/cpp/src/models/bloom.cpp +101 -0
  507. package/cpp/src/models/chameleon.cpp +178 -0
  508. package/cpp/src/models/chatglm.cpp +132 -0
  509. package/cpp/src/models/codeshell.cpp +111 -0
  510. package/cpp/src/models/cogvlm.cpp +102 -0
  511. package/cpp/src/models/cohere2-iswa.cpp +134 -0
  512. package/cpp/src/models/command-r.cpp +122 -0
  513. package/cpp/src/models/dbrx.cpp +123 -0
  514. package/cpp/src/models/deci.cpp +135 -0
  515. package/cpp/src/models/deepseek.cpp +144 -0
  516. package/cpp/src/models/deepseek2.cpp +262 -0
  517. package/cpp/src/models/delta-net-base.cpp +376 -0
  518. package/cpp/src/models/dots1.cpp +134 -0
  519. package/cpp/src/models/dream.cpp +105 -0
  520. package/cpp/src/models/ernie4-5-moe.cpp +150 -0
  521. package/cpp/src/models/ernie4-5.cpp +110 -0
  522. package/cpp/src/models/eurobert.cpp +97 -0
  523. package/cpp/src/models/exaone-moe.cpp +146 -0
  524. package/cpp/src/models/exaone.cpp +114 -0
  525. package/cpp/src/models/exaone4.cpp +123 -0
  526. package/cpp/src/models/falcon-h1.cpp +111 -0
  527. package/cpp/src/models/falcon.cpp +120 -0
  528. package/cpp/src/models/gemma-embedding.cpp +116 -0
  529. package/cpp/src/models/gemma.cpp +112 -0
  530. package/cpp/src/models/gemma2-iswa.cpp +128 -0
  531. package/cpp/src/models/gemma3.cpp +155 -0
  532. package/cpp/src/models/gemma3n-iswa.cpp +384 -0
  533. package/cpp/src/models/glm4-moe.cpp +170 -0
  534. package/cpp/src/models/glm4.cpp +157 -0
  535. package/cpp/src/models/gpt2.cpp +105 -0
  536. package/cpp/src/models/gptneox.cpp +144 -0
  537. package/cpp/src/models/granite-hybrid.cpp +196 -0
  538. package/cpp/src/models/granite.cpp +211 -0
  539. package/cpp/src/models/grok.cpp +159 -0
  540. package/cpp/src/models/grovemoe.cpp +141 -0
  541. package/cpp/src/models/hunyuan-dense.cpp +132 -0
  542. package/cpp/src/models/hunyuan-moe.cpp +154 -0
  543. package/cpp/src/models/internlm2.cpp +120 -0
  544. package/cpp/src/models/jais.cpp +86 -0
  545. package/cpp/src/models/jais2.cpp +123 -0
  546. package/cpp/src/models/jamba.cpp +106 -0
  547. package/cpp/src/models/kimi-linear.cpp +392 -0
  548. package/cpp/src/models/lfm2.cpp +190 -0
  549. package/cpp/src/models/llada-moe.cpp +122 -0
  550. package/cpp/src/models/llada.cpp +99 -0
  551. package/cpp/src/models/llama-iswa.cpp +178 -0
  552. package/cpp/src/models/llama.cpp +168 -0
  553. package/cpp/src/models/maincoder.cpp +117 -0
  554. package/cpp/src/models/mamba-base.cpp +285 -0
  555. package/cpp/src/models/mamba.cpp +54 -0
  556. package/cpp/src/models/mimo2-iswa.cpp +123 -0
  557. package/cpp/src/models/minicpm3.cpp +200 -0
  558. package/cpp/src/models/minimax-m2.cpp +124 -0
  559. package/cpp/src/models/mistral3.cpp +160 -0
  560. package/cpp/src/models/models.h +684 -0
  561. package/cpp/src/models/modern-bert.cpp +109 -0
  562. package/cpp/src/models/mpt.cpp +126 -0
  563. package/cpp/src/models/nemotron-h.cpp +148 -0
  564. package/cpp/src/models/nemotron.cpp +122 -0
  565. package/cpp/src/models/neo-bert.cpp +104 -0
  566. package/cpp/src/models/olmo.cpp +121 -0
  567. package/cpp/src/models/olmo2.cpp +150 -0
  568. package/cpp/src/models/olmoe.cpp +124 -0
  569. package/cpp/src/models/openai-moe-iswa.cpp +127 -0
  570. package/cpp/src/models/openelm.cpp +124 -0
  571. package/cpp/src/models/orion.cpp +123 -0
  572. package/cpp/src/models/paddleocr.cpp +122 -0
  573. package/cpp/src/models/pangu-embedded.cpp +121 -0
  574. package/cpp/src/models/phi2.cpp +121 -0
  575. package/cpp/src/models/phi3.cpp +152 -0
  576. package/cpp/src/models/plamo.cpp +110 -0
  577. package/cpp/src/models/plamo2.cpp +318 -0
  578. package/cpp/src/models/plamo3.cpp +128 -0
  579. package/cpp/src/models/plm.cpp +169 -0
  580. package/cpp/src/models/qwen.cpp +108 -0
  581. package/cpp/src/models/qwen2.cpp +126 -0
  582. package/cpp/src/models/qwen2moe.cpp +151 -0
  583. package/cpp/src/models/qwen2vl.cpp +117 -0
  584. package/cpp/src/models/qwen3.cpp +117 -0
  585. package/cpp/src/models/qwen35.cpp +386 -0
  586. package/cpp/src/models/qwen35moe.cpp +420 -0
  587. package/cpp/src/models/qwen3moe.cpp +124 -0
  588. package/cpp/src/models/qwen3next.cpp +525 -0
  589. package/cpp/src/models/qwen3vl-moe.cpp +140 -0
  590. package/cpp/src/models/qwen3vl.cpp +132 -0
  591. package/cpp/src/models/refact.cpp +94 -0
  592. package/cpp/src/models/rnd1.cpp +126 -0
  593. package/cpp/src/models/rwkv6-base.cpp +164 -0
  594. package/cpp/src/models/rwkv6.cpp +94 -0
  595. package/cpp/src/models/rwkv6qwen2.cpp +86 -0
  596. package/cpp/src/models/rwkv7-base.cpp +137 -0
  597. package/cpp/src/models/rwkv7.cpp +90 -0
  598. package/cpp/src/models/seed-oss.cpp +124 -0
  599. package/cpp/src/models/smallthinker.cpp +126 -0
  600. package/cpp/src/models/smollm3.cpp +128 -0
  601. package/cpp/src/models/stablelm.cpp +146 -0
  602. package/cpp/src/models/starcoder.cpp +100 -0
  603. package/cpp/src/models/starcoder2.cpp +121 -0
  604. package/cpp/src/models/step35-iswa.cpp +168 -0
  605. package/cpp/src/models/t5-dec.cpp +166 -0
  606. package/cpp/src/models/t5-enc.cpp +96 -0
  607. package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
  608. package/cpp/src/models/xverse.cpp +108 -0
  609. package/cpp/src/unicode-data.cpp +7034 -0
  610. package/cpp/src/unicode-data.h +20 -0
  611. package/cpp/src/unicode.cpp +1103 -0
  612. package/cpp/src/unicode.h +111 -0
  613. package/cpp/vendor/nlohmann/json.hpp +25526 -0
  614. package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  615. package/cpp/vendor/stb/stb_image.h +7988 -0
  616. package/ios/LocalLLM-Bridging-Header.h +2 -0
  617. package/ios/LocalLLM.h +5 -0
  618. package/ios/LocalLLM.mm +1267 -0
  619. package/local-llm-rn.podspec +60 -0
  620. package/package.json +35 -0
  621. package/src/NativeLocalLLM.ts +73 -0
  622. package/src/device.ts +50 -0
  623. package/src/download-adapter.ts +17 -0
  624. package/src/index.ts +21 -0
  625. package/src/native-bridge.ts +142 -0
  626. package/src/rn-downloader.ts +37 -0
@@ -0,0 +1,1074 @@
1
+ #include "speculative.h"
2
+
3
+ #include "common.h"
4
+ #include "ggml.h"
5
+ #include "llama.h"
6
+ #include "log.h"
7
+ #include "ngram-cache.h"
8
+ #include "ngram-map.h"
9
+ #include "ngram-mod.h"
10
+ #include "sampling.h"
11
+
12
+ #include <algorithm>
13
+ #include <cstring>
14
+ #include <iomanip>
15
+ #include <map>
16
+
17
// largest tolerated difference between the token counts of the target and draft vocabularies
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128

// first token id whose text is compared between the two vocabularies (lower ids are skipped)
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
19
+
20
+ const std::vector<enum common_speculative_type> common_speculative_types = {
21
+ COMMON_SPECULATIVE_TYPE_NONE,
22
+ COMMON_SPECULATIVE_TYPE_DRAFT,
23
+ COMMON_SPECULATIVE_TYPE_EAGLE3,
24
+ COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,
25
+ COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,
26
+ COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V,
27
+ COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
28
+ COMMON_SPECULATIVE_TYPE_NGRAM_CACHE
29
+ };
30
+
31
+ const std::map<std::string, enum common_speculative_type> common_speculative_type_from_name_map = {
32
+ {"none", COMMON_SPECULATIVE_TYPE_NONE},
33
+ {"draft", COMMON_SPECULATIVE_TYPE_DRAFT},
34
+ {"eagle3", COMMON_SPECULATIVE_TYPE_EAGLE3},
35
+ {"ngram_simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE},
36
+ {"ngram_map_k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K},
37
+ {"ngram_map_k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V},
38
+ {"ngram_mod", COMMON_SPECULATIVE_TYPE_NGRAM_MOD},
39
+ {"ngram_cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE}
40
+ };
41
+
42
+ struct common_speculative_config {
43
+ common_speculative_type type;
44
+ common_params_speculative params;
45
+
46
+ common_speculative_config(common_speculative_type t,
47
+ const common_params_speculative & p = common_params_speculative{}) : type(t), params(p) {}
48
+ };
49
+
50
+ static bool common_speculative_are_compatible(
51
+ const llama_model * model_tgt,
52
+ const llama_model * model_dft) {
53
+ const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
54
+ const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
55
+
56
+ const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
57
+ LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
58
+
59
+ const bool vocab_type_dft = llama_vocab_type(vocab_dft);
60
+ LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
61
+
62
+ if (vocab_type_tgt != vocab_type_dft) {
63
+ LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__);
64
+ LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
65
+ return false;
66
+ }
67
+
68
+ if (
69
+ llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
70
+ llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
71
+ llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
72
+ llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
73
+ ) {
74
+ LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__);
75
+ return false;
76
+ }
77
+
78
+ {
79
+ const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
80
+ const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
81
+ const int vocab_diff = n_vocab_tgt > n_vocab_dft
82
+ ? n_vocab_tgt - n_vocab_dft
83
+ : n_vocab_dft - n_vocab_tgt;
84
+
85
+ if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
86
+ LOG_DBG("%s: draft model vocab must closely match target model to use speculation but ", __func__);
87
+ LOG_DBG("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
88
+ n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
89
+ return false;
90
+ }
91
+
92
+ for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
93
+ const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
94
+ const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
95
+
96
+ if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
97
+ LOG_DBG("%s: draft model vocab must match target model to use speculation but ", __func__);
98
+ LOG_DBG("token %d content differs - target '%s', draft '%s'\n", i,
99
+ common_token_to_piece(vocab_tgt, i).c_str(),
100
+ common_token_to_piece(vocab_dft, i).c_str());
101
+ return false;
102
+ }
103
+ }
104
+ }
105
+
106
+ return true;
107
+ }
108
+
109
+ // state of an implementation of speculative decoding
110
+ //
111
+ // each implementation has a unique type and a state that is implementation-specific
112
+ // in a subclass of common_speculative_state
113
+ struct common_speculative_state {
114
+ const enum common_speculative_type type;
115
+
116
+ size_t n_call_begin = 0; // number of times this implementation was called for refresh.
117
+ size_t n_call_draft = 0; // number of times this implementation was called for generation.
118
+ size_t n_call_accept = 0; // number of times this implementation was called for accumulation.
119
+
120
+ size_t n_gen_drafts = 0; // number of times a draft or part was generated by this implementation.
121
+ size_t n_acc_drafts = 0; // number of times a draft or part was accepted by the target model.
122
+ size_t n_gen_tokens = 0; // number of tokens generated by this implementation.
123
+ size_t n_acc_tokens = 0; // number of tokens accepted by the target model.
124
+
125
+ // TODO: track performance of most recent calls
126
+ const bool gen_perf = true; // whether to generate performance stats.
127
+
128
+ int64_t t_begin_us = 0; // total time spent in refresh of this implementation in microseconds.
129
+ int64_t t_draft_us = 0; // total time spent in generating drafts in this implementation in microseconds.
130
+ int64_t t_accept_us = 0; // total time spent in accumulation of this implementation in microseconds.
131
+
132
+ common_speculative_state(enum common_speculative_type type) : type(type) {}
133
+
134
+ virtual ~common_speculative_state() = default;
135
+
136
+ virtual void begin(const llama_tokens & prompt) = 0;
137
+
138
+ virtual void draft(
139
+ const common_params_speculative & params,
140
+ const llama_tokens & prompt_tgt,
141
+ llama_token id_last,
142
+ llama_tokens & result) = 0;
143
+
144
+ virtual void accept(uint16_t n_accepted) = 0;
145
+ };
146
+
147
+ struct common_speculative_state_draft : public common_speculative_state {
148
+ llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
149
+ llama_context * ctx_dft;
150
+
151
+ common_sampler * smpl;
152
+
153
+ llama_batch batch;
154
+ llama_tokens prompt_dft;
155
+
156
+ bool vocab_cmpt = true; // whether retokenization is needed
157
+ std::unordered_map<std::string, std::string> vocab_map;
158
+
159
+ common_speculative_state_draft(
160
+ enum common_speculative_type type,
161
+ llama_context * ctx_tgt,
162
+ llama_context * ctx_dft,
163
+ const std::vector<std::pair<std::string, std::string>> & replacements)
164
+ : common_speculative_state(type)
165
+ , ctx_tgt(ctx_tgt)
166
+ , ctx_dft(ctx_dft)
167
+ {
168
+ batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
169
+ smpl = nullptr;
170
+
171
+ // TODO: optimize or pass from outside?
172
+ // {
173
+ // common_params_sampling params;
174
+ // params.no_perf = false;
175
+ //
176
+ // params.top_k = 40;
177
+ // params.top_p = 0.9;
178
+ //
179
+ // params.samplers = {
180
+ // COMMON_SAMPLER_TYPE_TOP_K,
181
+ // COMMON_SAMPLER_TYPE_TOP_P,
182
+ // COMMON_SAMPLER_TYPE_INFILL,
183
+ // };
184
+ //
185
+ // result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
186
+ // }
187
+ {
188
+ common_params_sampling params;
189
+ params.no_perf = false;
190
+ params.top_k = 10;
191
+ params.samplers = {
192
+ COMMON_SAMPLER_TYPE_TOP_K,
193
+ };
194
+
195
+ smpl = common_sampler_init(llama_get_model(ctx_dft), params);
196
+ }
197
+
198
+ vocab_cmpt = common_speculative_are_compatible(llama_get_model(ctx_tgt), llama_get_model(ctx_dft));
199
+ LOG_DBG("vocab_cmpt = %d\n", vocab_cmpt);
200
+
201
+ if (!vocab_cmpt) {
202
+ LOG_WRN("the target and draft vocabs are not compatible - tokens will be translated between the two\n");
203
+
204
+ for (const auto & pair : replacements) {
205
+ vocab_map[pair.first] = pair.second;
206
+ }
207
+ }
208
+ }
209
+
210
+ ~common_speculative_state_draft() override {
211
+ llama_perf_context_print(ctx_dft);
212
+
213
+ llama_free(ctx_dft);
214
+
215
+ common_sampler_free(smpl);
216
+
217
+ llama_batch_free(batch);
218
+ }
219
+
220
+ void begin(const llama_tokens & prompt) override {
221
+ GGML_UNUSED(prompt);
222
+ }
223
+
224
+ void draft(
225
+ const common_params_speculative & params,
226
+ const llama_tokens & prompt_tgt,
227
+ llama_token id_last,
228
+ llama_tokens & result) override {
229
+ auto * spec = this;
230
+
231
+ auto & batch = spec->batch;
232
+ auto & ctx_tgt = spec->ctx_tgt;
233
+ auto & ctx_dft = spec->ctx_dft;
234
+ auto & smpl = spec->smpl;
235
+ auto & prompt_dft = spec->prompt_dft;
236
+
237
+ auto * mem_dft = llama_get_memory(ctx_dft);
238
+
239
+ int reuse_i = 0;
240
+ int reuse_n = 0;
241
+
242
+ const int n_ctx = llama_n_ctx(ctx_dft) - params.n_max;
243
+
244
+ llama_tokens prompt_cnv;
245
+ if (!spec->vocab_cmpt) {
246
+ std::string text;
247
+
248
+ text = common_detokenize(ctx_tgt, prompt_tgt, true);
249
+ text = replace_to_dft(text);
250
+
251
+ LOG_DBG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
252
+
253
+ prompt_cnv = common_tokenize(ctx_dft, text, false, true);
254
+
255
+ // convert id_last to draft vocab. llama_detokenize is called directly to avoid an allocation
256
+ const auto * model_tgt = llama_get_model(ctx_tgt);
257
+ const auto * vocab_tgt = llama_model_get_vocab(model_tgt);
258
+
259
+ int32_t n_chars = llama_detokenize(vocab_tgt, &id_last, 1, nullptr, 0, false, false);
260
+ GGML_ASSERT(n_chars < 0 && "failed to detokenize id_last");
261
+
262
+ text.resize(-n_chars);
263
+ llama_detokenize(vocab_tgt, &id_last, 1, text.data(), text.size(), false, false);
264
+ text = replace_to_dft(text);
265
+
266
+ LOG_DBG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
267
+ id_last = common_tokenize(ctx_dft, text, false, true)[0];
268
+ }
269
+
270
+ const llama_tokens & prompt_cur = spec->vocab_cmpt ? prompt_tgt : prompt_cnv;
271
+
272
+ const int i_start = std::max<int>(0, (int) prompt_cur.size() - n_ctx);
273
+
274
+ // reuse as much as possible from the old draft context
275
+ // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
276
+ for (int i = 0; i < (int) prompt_dft.size(); ++i) {
277
+ int cur = 0;
278
+ while (i_start + cur < (int) prompt_cur.size() &&
279
+ i + cur < (int) prompt_dft.size() &&
280
+ prompt_cur[i_start + cur] == prompt_dft[i + cur]) {
281
+ cur++;
282
+ }
283
+
284
+ if ((cur >= 256 || n_ctx >= (int) prompt_cur.size()) && cur > reuse_n) {
285
+ reuse_i = i;
286
+ reuse_n = cur;
287
+ }
288
+ }
289
+
290
+ LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
291
+
292
+ result.clear();
293
+ result.reserve(params.n_max);
294
+
295
+ if (reuse_n == 0) {
296
+ llama_memory_clear(mem_dft, false);
297
+ prompt_dft.clear();
298
+ } else {
299
+ // this happens when a previous draft has been discarded (for example, due to being too small), but the
300
+ // target model agreed with it. in this case, we simply pass back the previous results to save compute
301
+ if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
302
+ for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
303
+ result.push_back(prompt_dft[i]);
304
+
305
+ if (params.n_max <= (int) result.size()) {
306
+ break;
307
+ }
308
+ }
309
+
310
+ return;
311
+ }
312
+
313
+ if (reuse_i > 0) {
314
+ llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
315
+ llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
316
+
317
+ prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
318
+ }
319
+
320
+ if (reuse_n < (int) prompt_dft.size()) {
321
+ llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
322
+ prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
323
+ }
324
+ }
325
+
326
+ // prepare a batch to evaluate any new tokens in the prompt
327
+ common_batch_clear(batch);
328
+
329
+ for (size_t i = i_start + reuse_n; i < prompt_cur.size(); ++i) {
330
+ //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_cur[i]);
331
+ common_batch_add(batch, prompt_cur[i], i - i_start, { 0 }, false);
332
+
333
+ prompt_dft.push_back(prompt_cur[i]);
334
+ }
335
+
336
+ // we should rarely end-up here during normal decoding
337
+ if (batch.n_tokens > 0) {
338
+ //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
339
+
340
+ llama_decode(ctx_dft, batch);
341
+ }
342
+
343
+ const llama_pos n_past = prompt_dft.size();
344
+
345
+ LOG_DBG("%s: n_past = %d\n", __func__, n_past);
346
+
347
+ common_batch_clear(batch);
348
+ common_batch_add (batch, id_last, n_past, { 0 }, true);
349
+
350
+ prompt_dft.push_back(id_last);
351
+
352
+ LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
353
+
354
+ llama_decode(ctx_dft, batch);
355
+
356
+ common_sampler_reset(smpl);
357
+
358
+ // sample n_draft tokens from the draft model
359
+ for (int i = 0; i < params.n_max; ++i) {
360
+ common_batch_clear(batch);
361
+
362
+ common_sampler_sample(smpl, ctx_dft, 0, true);
363
+
364
+ const auto * cur_p = common_sampler_get_candidates(smpl, true);
365
+
366
+ for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
367
+ LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
368
+ k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
369
+ }
370
+
371
+ // add drafted token for each sequence
372
+ const llama_token id = cur_p->data[0].id;
373
+
374
+ common_sampler_accept(smpl, id, true);
375
+
376
+ result.push_back(id);
377
+
378
+ if (params.n_max <= (int) result.size()) {
379
+ break;
380
+ }
381
+
382
+ // only collect very high-confidence draft tokens
383
+ if (cur_p->data[0].p < params.p_min) {
384
+ break;
385
+ }
386
+
387
+ common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
388
+
389
+ // evaluate the drafted tokens on the draft model
390
+ llama_decode(ctx_dft, batch);
391
+
392
+ prompt_dft.push_back(id);
393
+ }
394
+
395
+ if (!spec->vocab_cmpt) {
396
+ std::string detokenized = common_detokenize(ctx_dft, result, true);
397
+ detokenized = replace_to_tgt(detokenized);
398
+ LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str());
399
+ result = common_tokenize(ctx_tgt, detokenized, false, true);
400
+ if (result.size() > (size_t)params.n_max) {
401
+ result.resize(params.n_max);
402
+ }
403
+ }
404
+ }
405
+
406
+ void accept(uint16_t n_accepted) override {
407
+ // noop
408
+ GGML_UNUSED(n_accepted);
409
+ }
410
+
411
+ std::string replace_to_dft(const std::string & input) const {
412
+ std::string result = input;
413
+
414
+ for (const auto & pair : this->vocab_map) {
415
+ size_t pos = result.find(pair.first);
416
+ while (pos != std::string::npos) {
417
+ result.replace(pos, pair.first.length(), pair.second);
418
+ pos = result.find(pair.first, pos + pair.second.length());
419
+ }
420
+ }
421
+
422
+ return result;
423
+ }
424
+
425
+ std::string replace_to_tgt(const std::string & input) const {
426
+ std::string result = input;
427
+
428
+ for (const auto & pair : this->vocab_map) {
429
+ size_t pos = result.find(pair.second);
430
+ while (pos != std::string::npos) {
431
+ result.replace(pos, pair.second.length(), pair.first);
432
+ pos = result.find(pair.second, pos + pair.first.length());
433
+ }
434
+ }
435
+
436
+ return result;
437
+ }
438
+ };
439
+
440
+ struct common_speculative_state_eagle3 : public common_speculative_state {
441
+ common_speculative_state_eagle3(enum common_speculative_type type) : common_speculative_state(type) {}
442
+
443
+ void begin(const llama_tokens & prompt) override {
444
+ GGML_UNUSED(prompt);
445
+ }
446
+
447
+ void draft(
448
+ const common_params_speculative & params,
449
+ const llama_tokens & prompt_tgt,
450
+ llama_token id_last,
451
+ llama_tokens & draft_tokens) override {
452
+ // TODO: implement
453
+ GGML_UNUSED(params);
454
+ GGML_UNUSED(prompt_tgt);
455
+ GGML_UNUSED(id_last);
456
+ GGML_UNUSED(draft_tokens);
457
+ }
458
+
459
+ void accept(uint16_t n_accepted) override {
460
+ // noop
461
+ GGML_UNUSED(n_accepted);
462
+ }
463
+ };
464
+
465
+ // state of self-speculation (simple implementation, not ngram-map)
466
+ struct common_speculative_state_ngram_simple : public common_speculative_state {
467
+ common_ngram_simple_config config;
468
+
469
+ common_speculative_state_ngram_simple(
470
+ enum common_speculative_type type,
471
+ common_ngram_simple_config config)
472
+ : common_speculative_state(type), config(config) {}
473
+
474
+ void begin(const llama_tokens & prompt) override {
475
+ GGML_UNUSED(prompt);
476
+ }
477
+
478
+ void draft(
479
+ const common_params_speculative & params,
480
+ const llama_tokens & prompt_tgt,
481
+ llama_token id_last,
482
+ llama_tokens & result) override {
483
+
484
+ result = common_ngram_simple_draft(config, prompt_tgt, id_last);
485
+ GGML_UNUSED(params);
486
+ }
487
+
488
+ void accept(uint16_t n_accepted) override {
489
+ // noop
490
+ GGML_UNUSED(n_accepted);
491
+ }
492
+ };
493
+
494
+ struct common_speculative_state_ngram_map_k : public common_speculative_state {
495
+ // draft ngram map for speculative decoding without draft model
496
+ common_ngram_map map;
497
+
498
+ common_speculative_state_ngram_map_k(
499
+ enum common_speculative_type type,
500
+ common_ngram_map map)
501
+ : common_speculative_state(type), map(std::move(map)) {}
502
+
503
+ void begin(const llama_tokens & prompt) override {
504
+ common_ngram_map_begin(map, prompt);
505
+ }
506
+
507
+ void draft(
508
+ const common_params_speculative & params,
509
+ const llama_tokens & prompt_tgt,
510
+ llama_token id_last,
511
+ llama_tokens & result) override {
512
+ common_ngram_map_draft(map, prompt_tgt, id_last, result);
513
+ GGML_UNUSED(params);
514
+ }
515
+
516
+ void accept(uint16_t n_accepted) override {
517
+ common_ngram_map_accept(map, n_accepted);
518
+ }
519
+ };
520
+
521
+ struct common_speculative_state_ngram_mod : public common_speculative_state {
522
+ common_ngram_mod & mod;
523
+
524
+ // the last position in the prompt that was added to the ngram container
525
+ size_t i_last = 0;
526
+
527
+ // length of the last drafted n‑gram (number of tokens returned by draft)
528
+ size_t n_draft_last = 0;
529
+
530
+ // consecutive accept rounds with low acceptance fraction (< 0.5)
531
+ int n_low = 0;
532
+
533
+ // enable trace logging if LLAMA_TRACE is set
534
+ const bool verbose;
535
+
536
+ common_speculative_state_ngram_mod(enum common_speculative_type type, common_ngram_mod & mod)
537
+ : common_speculative_state(type), mod(mod), verbose(std::getenv("LLAMA_TRACE") != nullptr) {
538
+ static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t));
539
+ }
540
+
541
+ void begin(const llama_tokens & prompt) override {
542
+ i_last = 0;
543
+
544
+ n_draft_last = 0;
545
+
546
+ const size_t n = mod.get_n();
547
+
548
+ if (prompt.size() < n) {
549
+ return;
550
+ }
551
+
552
+ for (size_t i = 0; i < prompt.size() - n; ++i) {
553
+ mod.add(prompt.data() + i);
554
+ }
555
+
556
+ i_last = prompt.size() - n;
557
+
558
+ const double f = (double)mod.get_used() / (double)mod.size();
559
+ LOG_INF("%s: ngram_mod occupancy = %zu/%zu (%.2f)\n", __func__, mod.get_used(), mod.size(), f);
560
+
561
+ constexpr double f_thold = 0.25;
562
+ if (f > f_thold) {
563
+ LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting\n", __func__, f, f_thold);
564
+
565
+ mod.reset();
566
+ }
567
+ }
568
+
569
+ void draft(
570
+ const common_params_speculative & params,
571
+ const llama_tokens & prompt_tgt,
572
+ llama_token id_last,
573
+ llama_tokens & result) override {
574
+ GGML_UNUSED(params);
575
+
576
+ n_draft_last = 0;
577
+
578
+ const size_t cur_len = prompt_tgt.size();
579
+ if (cur_len < mod.get_n()) {
580
+ return;
581
+ }
582
+
583
+ const size_t n = mod.get_n();
584
+
585
+ // add new ngrams in chunks
586
+ if (i_last + 32 < cur_len) {
587
+ for (size_t i = i_last; i < cur_len - n; ++i) {
588
+ mod.add(prompt_tgt.data() + i);
589
+ }
590
+
591
+ i_last = cur_len - n;
592
+ }
593
+
594
+ result.resize(n + params.n_max);
595
+ for (size_t i = 0; i < n - 1; ++i) {
596
+ result[i] = prompt_tgt[cur_len - n + 1 + i];
597
+ }
598
+ result[n - 1] = id_last;
599
+
600
+ for (int i = 0; i < params.n_max; ++i) {
601
+ const llama_token token = mod.get(result.data() + i);
602
+ if (token == common_ngram_mod::EMPTY) {
603
+ if (i < params.n_min) {
604
+ result.clear();
605
+ return;
606
+ }
607
+
608
+ result.resize(n + i);
609
+ break;
610
+ }
611
+ result[n + i] = token;
612
+ }
613
+
614
+ // only return the m tokens that were drafted
615
+ for (size_t i = 0; n + i < result.size(); ++i) {
616
+ result[i] = result[n + i];
617
+ }
618
+ result.resize(result.size() - n);
619
+
620
+ // store length of drafted n‑gram for later acceptance analysis
621
+ n_draft_last = result.size();
622
+ }
623
+
624
+ void accept(uint16_t n_accepted) override {
625
+ if (verbose) {
626
+ LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last);
627
+ }
628
+
629
+ // compute acceptance fraction if we have a recorded draft length
630
+ if (n_draft_last > 0) {
631
+ const double f_acc = (double)n_accepted / (double)n_draft_last;
632
+ if (f_acc < 0.5) {
633
+ n_low++;
634
+ if (n_low >= 3) {
635
+ LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low);
636
+
637
+ mod.reset();
638
+ n_low = 0;
639
+ }
640
+ } else {
641
+ n_low = 0;
642
+ }
643
+ }
644
+ }
645
+ };
646
+
647
+ struct common_speculative_state_ngram_cache : public common_speculative_state {
648
+ uint16_t n_draft;
649
+ bool save_dynamic;
650
+ bool save_static;
651
+
652
+ common_ngram_cache ngram_cache_context;
653
+ common_ngram_cache ngram_cache_dynamic;
654
+ common_ngram_cache ngram_cache_static;
655
+
656
+ size_t cache_size = 0; // number of tokens in n-gram cache
657
+
658
+ common_speculative_state_ngram_cache(
659
+ const enum common_speculative_type type,
660
+ const std::string & path_static,
661
+ const std::string & path_dynamic,
662
+ uint16_t n_draft,
663
+ bool save_dynamic,
664
+ bool save_static)
665
+ : common_speculative_state(type)
666
+ , n_draft(n_draft)
667
+ , save_dynamic(save_dynamic)
668
+ , save_static(save_static)
669
+ {
670
+ if (!path_static.empty()) {
671
+ try {
672
+ ngram_cache_static = common_ngram_cache_load(path_static);
673
+ } catch (...) {
674
+ LOG_ERR("failed to open static lookup cache: %s", path_static.c_str());
675
+ GGML_ABORT("Couldn't read static lookup cache");
676
+ }
677
+ }
678
+
679
+ if (!path_dynamic.empty()) {
680
+ try {
681
+ ngram_cache_dynamic = common_ngram_cache_load(path_dynamic);
682
+ } catch (...) {
683
+ LOG_ERR("failed to open dynamic lookup cache: %s", path_dynamic.c_str());
684
+ GGML_ABORT("Couldn't read dynamic lookup cache");
685
+ }
686
+ }
687
+ }
688
+
689
+ void begin(const llama_tokens & prompt) override {
690
+ GGML_UNUSED(prompt);
691
+ }
692
+
693
+ void draft(
694
+ const common_params_speculative & params,
695
+ const llama_tokens & prompt_tgt,
696
+ llama_token id_last,
697
+ llama_tokens & result) override {
698
+ GGML_UNUSED(params);
699
+
700
+ if (cache_size < prompt_tgt.size() + 1) {
701
+ llama_tokens tokens_new;
702
+ tokens_new.reserve(prompt_tgt.size() + 1 - cache_size);
703
+ for (size_t j = cache_size; j < prompt_tgt.size(); ++j) {
704
+ tokens_new.push_back(prompt_tgt[j]);
705
+ }
706
+ tokens_new.push_back(id_last); // add the last token
707
+
708
+ // Update context ngram cache with new prompt_tgt:
709
+ common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
710
+ tokens_new, tokens_new.size(), false);
711
+ cache_size = prompt_tgt.size() + 1;
712
+ }
713
+
714
+ llama_tokens inp;
715
+ inp.reserve(prompt_tgt.size() + 1);
716
+ for (size_t j = 0; j < prompt_tgt.size(); ++j) {
717
+ inp.push_back(prompt_tgt[j]);
718
+ }
719
+ inp.push_back(id_last);
720
+
721
+ result.push_back(id_last);
722
+
723
+ common_ngram_cache_draft(inp, result, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
724
+ ngram_cache_context,
725
+ ngram_cache_dynamic,
726
+ ngram_cache_static);
727
+
728
+ if (result.size() > 0) {
729
+ // delete first token in result (which is the id_last token)
730
+ result.erase(result.begin());
731
+ }
732
+ }
733
+
734
+ void accept(uint16_t n_accepted) override {
735
+ // TODO: noop
736
+ GGML_UNUSED(n_accepted);
737
+ }
738
+ };
739
+
740
+ struct common_speculative {
741
+ std::vector<std::unique_ptr<common_speculative_state>> impls; // list of implementations to use and their states
742
+ common_speculative_state * curr_impl = nullptr; // current implementation in use (for stats)
743
+ };
744
+
745
+ static common_ngram_map get_common_ngram_map(const common_speculative_config & config) {
746
+ uint16_t size_key = config.params.ngram_size_n;
747
+ uint16_t size_value = config.params.ngram_size_m;
748
+ bool key_only = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
749
+ uint16_t min_hits = config.params.ngram_min_hits;
750
+
751
+ return common_ngram_map(size_key, size_value, key_only, min_hits);
752
+ }
753
+
754
+ static common_speculative_state_ngram_cache create_state_ngram_cache(
755
+ const std::string & path_static, const std::string & path_dynamic,
756
+ const common_speculative_config & config) {
757
+ uint16_t n_draft = 8; // TODO get from config?
758
+
759
+ // TODO bool param in common/common.h to set save_static/save_dynamic?
760
+ bool save_static = false;
761
+ bool save_dynamic = false;
762
+
763
+ common_speculative_state_ngram_cache state(config.type, path_static, path_dynamic, n_draft, save_static, save_dynamic);
764
+
765
+ return state;
766
+ }
767
+
768
+ std::string common_speculative_type_name_str() {
769
+ std::string result;
770
+ for (size_t i = 0; i < common_speculative_types.size(); i++) {
771
+ if (i > 0) {
772
+ result += ", ";
773
+ }
774
+ result += common_speculative_type_to_str(common_speculative_types[i]);
775
+ }
776
+ return result;
777
+ }
778
+
779
+ std::string common_speculative_type_to_str(enum common_speculative_type type) {
780
+ switch (type) {
781
+ case COMMON_SPECULATIVE_TYPE_NONE: return "none";
782
+ case COMMON_SPECULATIVE_TYPE_DRAFT: return "draft";
783
+ case COMMON_SPECULATIVE_TYPE_EAGLE3: return "eagle3";
784
+ case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram_simple";
785
+ case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram_map_k";
786
+ case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram_map_k4v";
787
+ case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: return "ngram_mod";
788
+ case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: return "ngram_cache";
789
+ default: return "unknown";
790
+ }
791
+ }
792
+
793
+ enum common_speculative_type common_speculative_type_from_name(const std::string & name) {
794
+ const auto it = common_speculative_type_from_name_map.find(name);
795
+ if (it == common_speculative_type_from_name_map.end()) {
796
+ return COMMON_SPECULATIVE_TYPE_COUNT;
797
+ }
798
+ return it->second;
799
+ }
800
+
801
+ bool common_speculative_is_compat(llama_context * ctx_tgt) {
802
+ auto * mem = llama_get_memory(ctx_tgt);
803
+ if (mem == nullptr) {
804
+ return false;
805
+ }
806
+
807
+ bool res = true;
808
+
809
+ llama_memory_clear(mem, true);
810
+
811
+ // eval 2 tokens to check if the context is compatible
812
+ std::vector<llama_token> tmp;
813
+ tmp.push_back(0);
814
+ tmp.push_back(0);
815
+
816
+ int ret = llama_decode(ctx_tgt, llama_batch_get_one(tmp.data(), tmp.size()));
817
+ if (ret != 0) {
818
+ LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
819
+ res = false;
820
+ goto done;
821
+ }
822
+
823
+ // try to remove the last tokens
824
+ if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
825
+ LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
826
+ res = false;
827
+ goto done;
828
+ }
829
+
830
+ done:
831
+ llama_memory_clear(mem, true);
832
+ llama_synchronize(ctx_tgt);
833
+
834
+ return res;
835
+ }
836
+
837
+ // initialization of the speculative decoding system
838
+ //
839
+ common_speculative * common_speculative_init(
840
+ common_params_speculative & params,
841
+ llama_context * ctx_tgt) {
842
+ llama_context * ctx_dft = nullptr;
843
+ if (params.model_dft) {
844
+ ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
845
+ if (ctx_dft == nullptr) {
846
+ LOG_ERR("%s", "failed to create draft context\n");
847
+ return nullptr;
848
+ }
849
+ }
850
+
851
+ // Compute the implementations to use based on the config and their order of preference
852
+ std::vector<common_speculative_config> configs = {}; // list of speculative configs to try
853
+ {
854
+ bool has_draft = !params.mparams_dft.path.empty();
855
+ bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
856
+
857
+ bool has_ngram_cache = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_CACHE);
858
+ bool has_ngram_simple = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE);
859
+ bool has_ngram_map_k = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
860
+ bool has_ngram_map_k4v = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V);
861
+ bool has_ngram_mod = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MOD);
862
+
863
+ // In a more complex implementation we could use the same implementation but with different parameters.
864
+ // This was initially used in PR-18471 but removed to simplify the code.
865
+ if (has_ngram_simple) {
866
+ // This implementation can guess a lot of tokens without any draft model.
867
+ configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, params));
868
+ }
869
+ if (has_ngram_map_k) {
870
+ configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, params));
871
+ }
872
+ if (has_ngram_map_k4v) {
873
+ // This implementation can guess tokens with high acceptance rate but is more expensive.
874
+ configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params));
875
+ }
876
+ if (has_ngram_mod) {
877
+ // shared instance for all speculative decoding contexts
878
+ if (!params.ngram_mod) {
879
+ params.ngram_mod = std::make_shared<common_ngram_mod>(params.ngram_size_n, 4*1024*1024);
880
+
881
+ LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__,
882
+ params.ngram_size_n, params.ngram_mod->size(),
883
+ (float)(params.ngram_mod->size_bytes())/1024/1024);
884
+
885
+ if (params.ngram_size_n < 16) {
886
+ LOG_WRN("%s: ngram_mod n=%d is too small - poor quality is possible, see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, params.ngram_size_n);
887
+ }
888
+ }
889
+
890
+ configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MOD, params));
891
+ }
892
+ if (has_ngram_cache) {
893
+ configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params));
894
+ }
895
+ if (has_draft) {
896
+ configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT, params));
897
+ }
898
+ if (has_draft_eagle3) {
899
+ configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_EAGLE3, params));
900
+ }
901
+ }
902
+
903
+ std::vector<std::unique_ptr<common_speculative_state>> impls = {};
904
+
905
+ for (const common_speculative_config & config : configs) {
906
+ LOG_DBG("%s: adding implementation %s\n", __func__, common_speculative_type_to_str(config.type).c_str());
907
+ switch (config.type) {
908
+ case COMMON_SPECULATIVE_TYPE_NONE:
909
+ break;
910
+ case COMMON_SPECULATIVE_TYPE_DRAFT: {
911
+ impls.push_back(std::make_unique<common_speculative_state_draft>(config.type,
912
+ /* .ctx_tgt = */ ctx_tgt,
913
+ /* .ctx_dft = */ ctx_dft,
914
+ /* .replacements = */ params.replacements
915
+ ));
916
+ break;
917
+ }
918
+ case COMMON_SPECULATIVE_TYPE_EAGLE3: {
919
+ impls.push_back(std::make_unique<common_speculative_state_eagle3>(config.type));
920
+ break;
921
+ }
922
+ case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: {
923
+ common_ngram_map ngram_map = get_common_ngram_map(config);
924
+
925
+ uint16_t ngram_size_key = ngram_map.size_key;
926
+ uint16_t mgram_size_value = ngram_map.size_value;
927
+
928
+ auto config_simple = common_ngram_simple_config {
929
+ /* .size_ngram = */ ngram_size_key,
930
+ /* .size_mgram = */ mgram_size_value
931
+ };
932
+ auto state = std::make_unique<common_speculative_state_ngram_simple>(
933
+ /* .type = */ config.type,
934
+ /* .state = */ config_simple
935
+ );
936
+ impls.push_back(std::move(state));
937
+ break;
938
+ }
939
+ case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K:
940
+ case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: {
941
+ impls.push_back(std::make_unique<common_speculative_state_ngram_map_k>(
942
+ (config.type),
943
+ get_common_ngram_map(config)
944
+ ));
945
+ break;
946
+ }
947
+ case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
948
+ GGML_ASSERT(config.params.ngram_mod);
949
+ impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod));
950
+ break;
951
+ }
952
+ case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: {
953
+ auto state = create_state_ngram_cache(
954
+ params.lookup_cache_static, params.lookup_cache_dynamic, config);
955
+ impls.push_back(std::make_unique<common_speculative_state_ngram_cache>(state));
956
+ break;
957
+ }
958
+ default:
959
+ break;
960
+ }
961
+ }
962
+
963
+ if (impls.empty()) {
964
+ LOG_WRN("%s", "no implementations specified for speculative decoding\n");
965
+ return nullptr;
966
+ }
967
+
968
+ auto * result = new common_speculative {
969
+ /* .impls = */ std::move(impls)
970
+ };
971
+
972
+ return result;
973
+ }
974
+
975
+ void common_speculative_free(common_speculative * spec) {
976
+ if (spec == nullptr) {
977
+ return;
978
+ }
979
+
980
+ delete spec;
981
+ }
982
+
983
+ void common_speculative_begin(common_speculative * spec, const llama_tokens & prompt) {
984
+ if (spec == nullptr) {
985
+ return;
986
+ }
987
+
988
+ for (auto & impl : spec->impls) {
989
+ common_time_meas tm(impl->t_begin_us, !impl->gen_perf);
990
+ impl->begin(prompt);
991
+ impl->n_call_begin++;
992
+ }
993
+ }
994
+
995
+ llama_tokens common_speculative_draft(
996
+ common_speculative * spec,
997
+ const common_params_speculative & params,
998
+ const llama_tokens & prompt_tgt, // specified in target model vocab
999
+ llama_token id_last) {
1000
+ llama_tokens result;
1001
+
1002
+ spec->curr_impl = nullptr; // reset current implementation
1003
+
1004
+ for (auto & impl : spec->impls) {
1005
+ {
1006
+ common_time_meas tm(impl->t_draft_us, !impl->gen_perf);
1007
+ impl->draft(params, prompt_tgt, id_last, result);
1008
+ impl->n_call_draft++;
1009
+ }
1010
+
1011
+ if (!result.empty()) {
1012
+ LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
1013
+ common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(),
1014
+ impl.get()->n_call_draft, result.size());
1015
+
1016
+ spec->curr_impl = impl.get(); // set current implementation for stats
1017
+ impl->n_gen_drafts++;
1018
+ impl->n_gen_tokens += result.size();
1019
+
1020
+ break; // We have a draft, so break out of the loop and return it.
1021
+ }
1022
+ }
1023
+
1024
+ return result;
1025
+ }
1026
+
1027
+ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {
1028
+ if (n_accepted == 0) {
1029
+ return;
1030
+ }
1031
+
1032
+ common_speculative_state * impl = spec->curr_impl;
1033
+
1034
+ GGML_ASSERT(impl);
1035
+
1036
+ {
1037
+ common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
1038
+ if (n_accepted > 0) {
1039
+ impl->n_acc_drafts++;
1040
+ impl->n_acc_tokens += n_accepted;
1041
+ }
1042
+
1043
+ impl->accept(n_accepted);
1044
+ impl->n_call_accept++;
1045
+ }
1046
+ }
1047
+
1048
+ void common_speculative_print_stats(const common_speculative * spec) {
1049
+ if (spec == nullptr) {
1050
+ return;
1051
+ }
1052
+
1053
+ for (const auto & impl : spec->impls) {
1054
+ std::string str_perf;
1055
+ if (impl->gen_perf) {
1056
+ std::ostringstream oss;
1057
+ oss << std::fixed << std::setprecision(3) << impl->t_begin_us / 1000.0 << ", ";
1058
+ oss << std::fixed << std::setprecision(3) << impl->t_draft_us / 1000.0 << ", ";
1059
+ oss << std::fixed << std::setprecision(3) << impl->t_accept_us / 1000.0;
1060
+ str_perf = ", dur(b,g,a) = " + oss.str() + " ms";
1061
+ } else {
1062
+ str_perf = "";
1063
+ }
1064
+
1065
+ LOG_INF("statistics %s: #calls(b,g,a) = %zu %zu %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
1066
+ common_speculative_type_to_str(impl->type).c_str(),
1067
+ impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
1068
+ impl->n_gen_drafts,
1069
+ impl->n_acc_drafts,
1070
+ impl->n_gen_tokens,
1071
+ impl->n_acc_tokens,
1072
+ str_perf.c_str());
1073
+ }
1074
+ }