local-llm-rn 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (626):
  1. package/cpp/CMakeLists.txt +285 -0
  2. package/cpp/common/CMakeLists.txt +149 -0
  3. package/cpp/common/arg.cpp +3799 -0
  4. package/cpp/common/arg.h +131 -0
  5. package/cpp/common/base64.hpp +392 -0
  6. package/cpp/common/build-info.cpp.in +4 -0
  7. package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
  8. package/cpp/common/chat-parser-xml-toolcall.h +45 -0
  9. package/cpp/common/chat-parser.cpp +1649 -0
  10. package/cpp/common/chat-parser.h +133 -0
  11. package/cpp/common/chat-peg-parser.cpp +124 -0
  12. package/cpp/common/chat-peg-parser.h +105 -0
  13. package/cpp/common/chat.cpp +3355 -0
  14. package/cpp/common/chat.h +252 -0
  15. package/cpp/common/common.cpp +1824 -0
  16. package/cpp/common/common.h +930 -0
  17. package/cpp/common/console.cpp +1137 -0
  18. package/cpp/common/console.h +41 -0
  19. package/cpp/common/debug.cpp +167 -0
  20. package/cpp/common/debug.h +43 -0
  21. package/cpp/common/download.cpp +792 -0
  22. package/cpp/common/download.h +84 -0
  23. package/cpp/common/http.h +84 -0
  24. package/cpp/common/jinja/README.md +88 -0
  25. package/cpp/common/jinja/caps.cpp +285 -0
  26. package/cpp/common/jinja/caps.h +30 -0
  27. package/cpp/common/jinja/lexer.cpp +341 -0
  28. package/cpp/common/jinja/lexer.h +157 -0
  29. package/cpp/common/jinja/parser.cpp +591 -0
  30. package/cpp/common/jinja/parser.h +21 -0
  31. package/cpp/common/jinja/runtime.cpp +867 -0
  32. package/cpp/common/jinja/runtime.h +638 -0
  33. package/cpp/common/jinja/string.cpp +213 -0
  34. package/cpp/common/jinja/string.h +61 -0
  35. package/cpp/common/jinja/utils.h +149 -0
  36. package/cpp/common/jinja/value.cpp +1393 -0
  37. package/cpp/common/jinja/value.h +756 -0
  38. package/cpp/common/json-partial.cpp +324 -0
  39. package/cpp/common/json-partial.h +39 -0
  40. package/cpp/common/json-schema-to-grammar.cpp +1153 -0
  41. package/cpp/common/json-schema-to-grammar.h +43 -0
  42. package/cpp/common/llguidance.cpp +258 -0
  43. package/cpp/common/log.cpp +446 -0
  44. package/cpp/common/log.h +119 -0
  45. package/cpp/common/ngram-cache.cpp +285 -0
  46. package/cpp/common/ngram-cache.h +101 -0
  47. package/cpp/common/ngram-map.cpp +530 -0
  48. package/cpp/common/ngram-map.h +115 -0
  49. package/cpp/common/ngram-mod.cpp +60 -0
  50. package/cpp/common/ngram-mod.h +38 -0
  51. package/cpp/common/peg-parser.cpp +1712 -0
  52. package/cpp/common/peg-parser.h +459 -0
  53. package/cpp/common/preset.cpp +483 -0
  54. package/cpp/common/preset.h +83 -0
  55. package/cpp/common/regex-partial.cpp +204 -0
  56. package/cpp/common/regex-partial.h +56 -0
  57. package/cpp/common/sampling.cpp +745 -0
  58. package/cpp/common/sampling.h +119 -0
  59. package/cpp/common/speculative.cpp +1074 -0
  60. package/cpp/common/speculative.h +41 -0
  61. package/cpp/common/unicode.cpp +64 -0
  62. package/cpp/common/unicode.h +22 -0
  63. package/cpp/ggml/CMakeLists.txt +494 -0
  64. package/cpp/ggml/cmake/GitVars.cmake +22 -0
  65. package/cpp/ggml/cmake/common.cmake +50 -0
  66. package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
  67. package/cpp/ggml/include/ggml-alloc.h +85 -0
  68. package/cpp/ggml/include/ggml-backend.h +373 -0
  69. package/cpp/ggml/include/ggml-blas.h +25 -0
  70. package/cpp/ggml/include/ggml-cann.h +123 -0
  71. package/cpp/ggml/include/ggml-cpp.h +39 -0
  72. package/cpp/ggml/include/ggml-cpu.h +151 -0
  73. package/cpp/ggml/include/ggml-cuda.h +47 -0
  74. package/cpp/ggml/include/ggml-hexagon.h +19 -0
  75. package/cpp/ggml/include/ggml-metal.h +61 -0
  76. package/cpp/ggml/include/ggml-opencl.h +26 -0
  77. package/cpp/ggml/include/ggml-opt.h +256 -0
  78. package/cpp/ggml/include/ggml-rpc.h +30 -0
  79. package/cpp/ggml/include/ggml-sycl.h +49 -0
  80. package/cpp/ggml/include/ggml-virtgpu.h +14 -0
  81. package/cpp/ggml/include/ggml-vulkan.h +29 -0
  82. package/cpp/ggml/include/ggml-webgpu.h +19 -0
  83. package/cpp/ggml/include/ggml-zdnn.h +17 -0
  84. package/cpp/ggml/include/ggml-zendnn.h +22 -0
  85. package/cpp/ggml/include/ggml.h +2753 -0
  86. package/cpp/ggml/include/gguf.h +204 -0
  87. package/cpp/ggml/src/CMakeLists.txt +492 -0
  88. package/cpp/ggml/src/ggml-alloc.c +1244 -0
  89. package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
  90. package/cpp/ggml/src/ggml-backend-dl.h +45 -0
  91. package/cpp/ggml/src/ggml-backend-impl.h +255 -0
  92. package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
  93. package/cpp/ggml/src/ggml-backend.cpp +2270 -0
  94. package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  95. package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
  96. package/cpp/ggml/src/ggml-common.h +1878 -0
  97. package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
  98. package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
  99. package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  100. package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  101. package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  102. package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  103. package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  104. package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
  105. package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
  106. package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
  107. package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  108. package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  109. package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  110. package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
  111. package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  112. package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  113. package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  114. package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  115. package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  116. package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  117. package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  118. package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
  119. package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  120. package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  121. package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  122. package/cpp/ggml/src/ggml-cpu/common.h +95 -0
  123. package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
  124. package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
  125. package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
  126. package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
  127. package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
  128. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
  129. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  130. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
  131. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  132. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
  133. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  134. package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
  135. package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
  136. package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
  137. package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
  138. package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
  139. package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
  140. package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  141. package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
  142. package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  143. package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  144. package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  145. package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  146. package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
  147. package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
  148. package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  149. package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
  150. package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
  151. package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
  152. package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  153. package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
  154. package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
  155. package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
  156. package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  157. package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
  158. package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  159. package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
  160. package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
  161. package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
  162. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  163. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  164. package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  165. package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  166. package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  167. package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  168. package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
  169. package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
  170. package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  171. package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
  172. package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
  173. package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  174. package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
  175. package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  176. package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  177. package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  178. package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
  179. package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
  180. package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  181. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  182. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  183. package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  184. package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
  185. package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
  186. package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
  187. package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
  188. package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
  189. package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
  190. package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
  191. package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
  192. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  193. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  194. package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  195. package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  196. package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
  197. package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  198. package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  199. package/cpp/ggml/src/ggml-impl.h +724 -0
  200. package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  201. package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  202. package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  203. package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  204. package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
  205. package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
  206. package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
  207. package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
  208. package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
  209. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
  210. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  211. package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
  212. package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
  213. package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
  214. package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  215. package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  216. package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
  217. package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
  218. package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  219. package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  220. package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  221. package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  222. package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  223. package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  224. package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  225. package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  226. package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
  227. package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  228. package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  229. package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  230. package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  231. package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  232. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  233. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  234. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  235. package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  236. package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  237. package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  238. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  239. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  240. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  241. package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  242. package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  243. package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  244. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  245. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  246. package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  247. package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  248. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  249. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  250. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  251. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  252. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  253. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  254. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  255. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  256. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  257. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  258. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  259. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  260. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  261. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  262. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  263. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  264. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  265. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  266. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  267. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  268. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  269. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  270. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  271. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  272. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  273. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  274. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  275. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  276. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  277. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  278. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  279. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  280. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  281. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  282. package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  283. package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  284. package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  285. package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  286. package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  287. package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  288. package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  289. package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  290. package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  291. package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  292. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  293. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  294. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  295. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  296. package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  297. package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  298. package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  299. package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  300. package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  301. package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  302. package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  303. package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  304. package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
  305. package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  306. package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  307. package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  308. package/cpp/ggml/src/ggml-opt.cpp +1093 -0
  309. package/cpp/ggml/src/ggml-quants.c +5325 -0
  310. package/cpp/ggml/src/ggml-quants.h +106 -0
  311. package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  312. package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
  313. package/cpp/ggml/src/ggml-threading.cpp +12 -0
  314. package/cpp/ggml/src/ggml-threading.h +14 -0
  315. package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  316. package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  317. package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  318. package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  319. package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  320. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  321. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  322. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  323. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  324. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  325. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  326. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  327. package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  328. package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  329. package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  330. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  331. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  332. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  333. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  334. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  335. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  336. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  337. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  338. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  339. package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  340. package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  341. package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  342. package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  343. package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  344. package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  345. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  346. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  347. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  348. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  349. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  350. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  351. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  352. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  353. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  354. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  355. package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  356. package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  357. package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  358. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
  359. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
  360. package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  361. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  362. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  363. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  364. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
  365. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
  366. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  367. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  368. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
  369. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  370. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
  371. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  372. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  373. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
  374. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
  375. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
  376. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
  377. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
  378. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  379. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  380. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  381. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  382. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  383. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  384. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  385. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  386. package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  387. package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  388. package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
  389. package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  390. package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  391. package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  392. package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  393. package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  394. package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  395. package/cpp/ggml/src/ggml.c +7669 -0
  396. package/cpp/ggml/src/ggml.cpp +26 -0
  397. package/cpp/ggml/src/gguf.cpp +1699 -0
  398. package/cpp/include/llama-cpp.h +32 -0
  399. package/cpp/include/llama.h +1568 -0
  400. package/cpp/mtmd/CMakeLists.txt +98 -0
  401. package/cpp/mtmd/README.md +63 -0
  402. package/cpp/mtmd/clip-graph.h +117 -0
  403. package/cpp/mtmd/clip-impl.h +586 -0
  404. package/cpp/mtmd/clip-model.h +390 -0
  405. package/cpp/mtmd/clip.cpp +4154 -0
  406. package/cpp/mtmd/clip.h +121 -0
  407. package/cpp/mtmd/deprecation-warning.cpp +22 -0
  408. package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
  409. package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
  410. package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
  411. package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
  412. package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
  413. package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
  414. package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
  415. package/cpp/mtmd/models/cogvlm.cpp +98 -0
  416. package/cpp/mtmd/models/conformer.cpp +216 -0
  417. package/cpp/mtmd/models/glm4v.cpp +122 -0
  418. package/cpp/mtmd/models/internvl.cpp +69 -0
  419. package/cpp/mtmd/models/kimik25.cpp +101 -0
  420. package/cpp/mtmd/models/kimivl.cpp +63 -0
  421. package/cpp/mtmd/models/llama4.cpp +96 -0
  422. package/cpp/mtmd/models/llava.cpp +374 -0
  423. package/cpp/mtmd/models/minicpmv.cpp +114 -0
  424. package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
  425. package/cpp/mtmd/models/models.h +128 -0
  426. package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
  427. package/cpp/mtmd/models/paddleocr.cpp +52 -0
  428. package/cpp/mtmd/models/pixtral.cpp +86 -0
  429. package/cpp/mtmd/models/qwen2vl.cpp +183 -0
  430. package/cpp/mtmd/models/qwen3vl.cpp +193 -0
  431. package/cpp/mtmd/models/siglip.cpp +86 -0
  432. package/cpp/mtmd/models/whisper-enc.cpp +115 -0
  433. package/cpp/mtmd/models/youtuvl.cpp +179 -0
  434. package/cpp/mtmd/mtmd-audio.cpp +730 -0
  435. package/cpp/mtmd/mtmd-audio.h +113 -0
  436. package/cpp/mtmd/mtmd-cli.cpp +437 -0
  437. package/cpp/mtmd/mtmd-helper.cpp +521 -0
  438. package/cpp/mtmd/mtmd-helper.h +96 -0
  439. package/cpp/mtmd/mtmd.cpp +1156 -0
  440. package/cpp/mtmd/mtmd.h +319 -0
  441. package/cpp/mtmd/requirements.txt +5 -0
  442. package/cpp/mtmd/test-1.jpeg +0 -0
  443. package/cpp/mtmd/test-2.mp3 +0 -0
  444. package/cpp/mtmd/tests.sh +192 -0
  445. package/cpp/src/CMakeLists.txt +169 -0
  446. package/cpp/src/llama-adapter.cpp +488 -0
  447. package/cpp/src/llama-adapter.h +89 -0
  448. package/cpp/src/llama-arch.cpp +2855 -0
  449. package/cpp/src/llama-arch.h +619 -0
  450. package/cpp/src/llama-batch.cpp +917 -0
  451. package/cpp/src/llama-batch.h +173 -0
  452. package/cpp/src/llama-chat.cpp +896 -0
  453. package/cpp/src/llama-chat.h +71 -0
  454. package/cpp/src/llama-context.cpp +3512 -0
  455. package/cpp/src/llama-context.h +359 -0
  456. package/cpp/src/llama-cparams.cpp +5 -0
  457. package/cpp/src/llama-cparams.h +44 -0
  458. package/cpp/src/llama-grammar.cpp +1464 -0
  459. package/cpp/src/llama-grammar.h +194 -0
  460. package/cpp/src/llama-graph.cpp +2685 -0
  461. package/cpp/src/llama-graph.h +1026 -0
  462. package/cpp/src/llama-hparams.cpp +234 -0
  463. package/cpp/src/llama-hparams.h +339 -0
  464. package/cpp/src/llama-impl.cpp +171 -0
  465. package/cpp/src/llama-impl.h +73 -0
  466. package/cpp/src/llama-io.cpp +15 -0
  467. package/cpp/src/llama-io.h +35 -0
  468. package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
  469. package/cpp/src/llama-kv-cache-iswa.h +137 -0
  470. package/cpp/src/llama-kv-cache.cpp +2271 -0
  471. package/cpp/src/llama-kv-cache.h +388 -0
  472. package/cpp/src/llama-kv-cells.h +533 -0
  473. package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
  474. package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
  475. package/cpp/src/llama-memory-hybrid.cpp +268 -0
  476. package/cpp/src/llama-memory-hybrid.h +139 -0
  477. package/cpp/src/llama-memory-recurrent.cpp +1165 -0
  478. package/cpp/src/llama-memory-recurrent.h +182 -0
  479. package/cpp/src/llama-memory.cpp +59 -0
  480. package/cpp/src/llama-memory.h +122 -0
  481. package/cpp/src/llama-mmap.cpp +785 -0
  482. package/cpp/src/llama-mmap.h +92 -0
  483. package/cpp/src/llama-model-loader.cpp +1414 -0
  484. package/cpp/src/llama-model-loader.h +203 -0
  485. package/cpp/src/llama-model-saver.cpp +286 -0
  486. package/cpp/src/llama-model-saver.h +37 -0
  487. package/cpp/src/llama-model.cpp +9253 -0
  488. package/cpp/src/llama-model.h +576 -0
  489. package/cpp/src/llama-quant.cpp +1119 -0
  490. package/cpp/src/llama-quant.h +1 -0
  491. package/cpp/src/llama-sampler.cpp +3885 -0
  492. package/cpp/src/llama-sampler.h +42 -0
  493. package/cpp/src/llama-vocab.cpp +3970 -0
  494. package/cpp/src/llama-vocab.h +187 -0
  495. package/cpp/src/llama.cpp +1313 -0
  496. package/cpp/src/models/afmoe.cpp +191 -0
  497. package/cpp/src/models/apertus.cpp +125 -0
  498. package/cpp/src/models/arcee.cpp +135 -0
  499. package/cpp/src/models/arctic.cpp +138 -0
  500. package/cpp/src/models/arwkv7.cpp +86 -0
  501. package/cpp/src/models/baichuan.cpp +122 -0
  502. package/cpp/src/models/bailingmoe.cpp +144 -0
  503. package/cpp/src/models/bailingmoe2.cpp +135 -0
  504. package/cpp/src/models/bert.cpp +178 -0
  505. package/cpp/src/models/bitnet.cpp +160 -0
  506. package/cpp/src/models/bloom.cpp +101 -0
  507. package/cpp/src/models/chameleon.cpp +178 -0
  508. package/cpp/src/models/chatglm.cpp +132 -0
  509. package/cpp/src/models/codeshell.cpp +111 -0
  510. package/cpp/src/models/cogvlm.cpp +102 -0
  511. package/cpp/src/models/cohere2-iswa.cpp +134 -0
  512. package/cpp/src/models/command-r.cpp +122 -0
  513. package/cpp/src/models/dbrx.cpp +123 -0
  514. package/cpp/src/models/deci.cpp +135 -0
  515. package/cpp/src/models/deepseek.cpp +144 -0
  516. package/cpp/src/models/deepseek2.cpp +262 -0
  517. package/cpp/src/models/delta-net-base.cpp +376 -0
  518. package/cpp/src/models/dots1.cpp +134 -0
  519. package/cpp/src/models/dream.cpp +105 -0
  520. package/cpp/src/models/ernie4-5-moe.cpp +150 -0
  521. package/cpp/src/models/ernie4-5.cpp +110 -0
  522. package/cpp/src/models/eurobert.cpp +97 -0
  523. package/cpp/src/models/exaone-moe.cpp +146 -0
  524. package/cpp/src/models/exaone.cpp +114 -0
  525. package/cpp/src/models/exaone4.cpp +123 -0
  526. package/cpp/src/models/falcon-h1.cpp +111 -0
  527. package/cpp/src/models/falcon.cpp +120 -0
  528. package/cpp/src/models/gemma-embedding.cpp +116 -0
  529. package/cpp/src/models/gemma.cpp +112 -0
  530. package/cpp/src/models/gemma2-iswa.cpp +128 -0
  531. package/cpp/src/models/gemma3.cpp +155 -0
  532. package/cpp/src/models/gemma3n-iswa.cpp +384 -0
  533. package/cpp/src/models/glm4-moe.cpp +170 -0
  534. package/cpp/src/models/glm4.cpp +157 -0
  535. package/cpp/src/models/gpt2.cpp +105 -0
  536. package/cpp/src/models/gptneox.cpp +144 -0
  537. package/cpp/src/models/granite-hybrid.cpp +196 -0
  538. package/cpp/src/models/granite.cpp +211 -0
  539. package/cpp/src/models/grok.cpp +159 -0
  540. package/cpp/src/models/grovemoe.cpp +141 -0
  541. package/cpp/src/models/hunyuan-dense.cpp +132 -0
  542. package/cpp/src/models/hunyuan-moe.cpp +154 -0
  543. package/cpp/src/models/internlm2.cpp +120 -0
  544. package/cpp/src/models/jais.cpp +86 -0
  545. package/cpp/src/models/jais2.cpp +123 -0
  546. package/cpp/src/models/jamba.cpp +106 -0
  547. package/cpp/src/models/kimi-linear.cpp +392 -0
  548. package/cpp/src/models/lfm2.cpp +190 -0
  549. package/cpp/src/models/llada-moe.cpp +122 -0
  550. package/cpp/src/models/llada.cpp +99 -0
  551. package/cpp/src/models/llama-iswa.cpp +178 -0
  552. package/cpp/src/models/llama.cpp +168 -0
  553. package/cpp/src/models/maincoder.cpp +117 -0
  554. package/cpp/src/models/mamba-base.cpp +285 -0
  555. package/cpp/src/models/mamba.cpp +54 -0
  556. package/cpp/src/models/mimo2-iswa.cpp +123 -0
  557. package/cpp/src/models/minicpm3.cpp +200 -0
  558. package/cpp/src/models/minimax-m2.cpp +124 -0
  559. package/cpp/src/models/mistral3.cpp +160 -0
  560. package/cpp/src/models/models.h +684 -0
  561. package/cpp/src/models/modern-bert.cpp +109 -0
  562. package/cpp/src/models/mpt.cpp +126 -0
  563. package/cpp/src/models/nemotron-h.cpp +148 -0
  564. package/cpp/src/models/nemotron.cpp +122 -0
  565. package/cpp/src/models/neo-bert.cpp +104 -0
  566. package/cpp/src/models/olmo.cpp +121 -0
  567. package/cpp/src/models/olmo2.cpp +150 -0
  568. package/cpp/src/models/olmoe.cpp +124 -0
  569. package/cpp/src/models/openai-moe-iswa.cpp +127 -0
  570. package/cpp/src/models/openelm.cpp +124 -0
  571. package/cpp/src/models/orion.cpp +123 -0
  572. package/cpp/src/models/paddleocr.cpp +122 -0
  573. package/cpp/src/models/pangu-embedded.cpp +121 -0
  574. package/cpp/src/models/phi2.cpp +121 -0
  575. package/cpp/src/models/phi3.cpp +152 -0
  576. package/cpp/src/models/plamo.cpp +110 -0
  577. package/cpp/src/models/plamo2.cpp +318 -0
  578. package/cpp/src/models/plamo3.cpp +128 -0
  579. package/cpp/src/models/plm.cpp +169 -0
  580. package/cpp/src/models/qwen.cpp +108 -0
  581. package/cpp/src/models/qwen2.cpp +126 -0
  582. package/cpp/src/models/qwen2moe.cpp +151 -0
  583. package/cpp/src/models/qwen2vl.cpp +117 -0
  584. package/cpp/src/models/qwen3.cpp +117 -0
  585. package/cpp/src/models/qwen35.cpp +386 -0
  586. package/cpp/src/models/qwen35moe.cpp +420 -0
  587. package/cpp/src/models/qwen3moe.cpp +124 -0
  588. package/cpp/src/models/qwen3next.cpp +525 -0
  589. package/cpp/src/models/qwen3vl-moe.cpp +140 -0
  590. package/cpp/src/models/qwen3vl.cpp +132 -0
  591. package/cpp/src/models/refact.cpp +94 -0
  592. package/cpp/src/models/rnd1.cpp +126 -0
  593. package/cpp/src/models/rwkv6-base.cpp +164 -0
  594. package/cpp/src/models/rwkv6.cpp +94 -0
  595. package/cpp/src/models/rwkv6qwen2.cpp +86 -0
  596. package/cpp/src/models/rwkv7-base.cpp +137 -0
  597. package/cpp/src/models/rwkv7.cpp +90 -0
  598. package/cpp/src/models/seed-oss.cpp +124 -0
  599. package/cpp/src/models/smallthinker.cpp +126 -0
  600. package/cpp/src/models/smollm3.cpp +128 -0
  601. package/cpp/src/models/stablelm.cpp +146 -0
  602. package/cpp/src/models/starcoder.cpp +100 -0
  603. package/cpp/src/models/starcoder2.cpp +121 -0
  604. package/cpp/src/models/step35-iswa.cpp +168 -0
  605. package/cpp/src/models/t5-dec.cpp +166 -0
  606. package/cpp/src/models/t5-enc.cpp +96 -0
  607. package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
  608. package/cpp/src/models/xverse.cpp +108 -0
  609. package/cpp/src/unicode-data.cpp +7034 -0
  610. package/cpp/src/unicode-data.h +20 -0
  611. package/cpp/src/unicode.cpp +1103 -0
  612. package/cpp/src/unicode.h +111 -0
  613. package/cpp/vendor/nlohmann/json.hpp +25526 -0
  614. package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  615. package/cpp/vendor/stb/stb_image.h +7988 -0
  616. package/ios/LocalLLM-Bridging-Header.h +2 -0
  617. package/ios/LocalLLM.h +5 -0
  618. package/ios/LocalLLM.mm +1267 -0
  619. package/local-llm-rn.podspec +60 -0
  620. package/package.json +35 -0
  621. package/src/NativeLocalLLM.ts +73 -0
  622. package/src/device.ts +50 -0
  623. package/src/download-adapter.ts +17 -0
  624. package/src/index.ts +21 -0
  625. package/src/native-bridge.ts +142 -0
  626. package/src/rn-downloader.ts +37 -0
@@ -0,0 +1,930 @@
1
+ // Various helper functions and utilities
2
+
3
+ #pragma once
4
+
5
+ #include "ggml-opt.h"
6
+ #include "llama-cpp.h"
7
+
8
+ #include <set>
9
+ #include <sstream>
10
+ #include <string>
11
+ #include <string_view>
12
+ #include <vector>
13
+ #include <map>
14
+
15
+ #if defined(_WIN32) && !defined(_WIN32_WINNT)
16
+ #define _WIN32_WINNT 0x0A00
17
+ #endif
18
+
19
+ #ifdef _WIN32
20
+ #define DIRECTORY_SEPARATOR '\\'
21
+ #else
22
+ #define DIRECTORY_SEPARATOR '/'
23
+ #endif // _WIN32
24
+
25
+ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) // write `error: <msg>` to stderr and exit with status 1; msg must be a string literal because it is pasted between two literals
26
+ #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) // printf-style variant of die(); fmt must be a string literal and at least one variadic argument has to be supplied
27
+ // print_build_info(): writes two lines to stderr (build number + commit, then compiler + build target), each prefixed with the name of the calling function
28
+ #define print_build_info() do { \
29
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
30
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
31
+ } while(0)
32
+
33
+ struct common_time_meas { // timing helper: stores a start timestamp and a reference to a caller-owned accumulator
34
+ common_time_meas(int64_t & t_acc, bool disable = false); // defined elsewhere; binds the accumulator, `disable` defaults to false
35
+ ~common_time_meas(); // defined elsewhere; presumably adds the elapsed time to t_acc -- TODO confirm against the implementation
36
+ // NOTE: the const member and the reference member below make objects of this type non-assignable
37
+ const int64_t t_start_us; // start timestamp (the _us suffix suggests microseconds -- confirm in the implementation)
38
+
39
+ int64_t & t_acc; // accumulator owned by the caller; it must outlive this object
40
+ };
41
+
42
+ struct common_adapter_lora_info { // describes one LoRA adapter: where it comes from, its user-defined scale and the loaded handle
43
+ std::string path; // adapter path
44
+ float scale = 0.0f; // user-defined scale; initialized so a default-constructed object never holds an indeterminate value (0.0f matches value-initialization)
45
+
46
+ std::string task_name;
47
+ std::string prompt_prefix;
48
+
49
+ struct llama_adapter_lora * ptr = nullptr; // handle of the loaded adapter; nullptr until one is assigned
50
+ };
51
+
52
+ using llama_tokens = std::vector<llama_token>;
53
+
54
+ // build info (declarations only; the definitions of these four variables are not part of this header)
55
+ extern int LLAMA_BUILD_NUMBER;
56
+ extern const char * LLAMA_COMMIT;
57
+ extern const char * LLAMA_COMPILER;
58
+ extern const char * LLAMA_BUILD_TARGET;
59
+ // build tag of the form b<build number>-<commit>; it is `static`, so every translation unit that includes this header gets its own copy
60
+ const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
61
+
62
+ struct common_control_vector_load_info; // forward declaration; used as the element type of common_params::control_vectors
63
+
64
+ //
65
+ // CPU utils
66
+ //
67
+
68
+ struct cpu_params { // CPU threading settings: thread count, affinity mask, scheduling priority and polling level
69
+ int n_threads = -1; // thread count; -1 is the unset default (presumably resolved later, e.g. by postprocess_cpu_params -- TODO confirm)
70
+ bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
71
+ bool mask_valid = false; // Default: any CPU
72
+ enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
73
+ bool strict_cpu = false; // Use strict CPU placement
74
+ uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
75
+ };
76
+
77
+ int32_t cpu_get_num_physical_cores(); // defined elsewhere; presumably the number of physical CPU cores -- confirm in the implementation
78
+ int32_t cpu_get_num_math(); // defined elsewhere; presumably the number of cores suited for math-heavy work -- confirm in the implementation
79
+
80
+ //
81
+ // Common params
82
+ //
83
+
84
+ enum llama_example { // identifies an example/tool program; enumerators are implicitly numbered from 0 in declaration order
85
+ LLAMA_EXAMPLE_BATCHED,
86
+ LLAMA_EXAMPLE_DEBUG,
87
+ LLAMA_EXAMPLE_COMMON,
88
+ LLAMA_EXAMPLE_SPECULATIVE,
89
+ LLAMA_EXAMPLE_COMPLETION,
90
+ LLAMA_EXAMPLE_CLI,
91
+ LLAMA_EXAMPLE_EMBEDDING,
92
+ LLAMA_EXAMPLE_PERPLEXITY,
93
+ LLAMA_EXAMPLE_RETRIEVAL,
94
+ LLAMA_EXAMPLE_PASSKEY,
95
+ LLAMA_EXAMPLE_IMATRIX,
96
+ LLAMA_EXAMPLE_BENCH,
97
+ LLAMA_EXAMPLE_SERVER,
98
+ LLAMA_EXAMPLE_CVECTOR_GENERATOR,
99
+ LLAMA_EXAMPLE_EXPORT_LORA,
100
+ LLAMA_EXAMPLE_MTMD,
101
+ LLAMA_EXAMPLE_LOOKUP,
102
+ LLAMA_EXAMPLE_PARALLEL,
103
+ LLAMA_EXAMPLE_TTS,
104
+ LLAMA_EXAMPLE_DIFFUSION,
105
+ LLAMA_EXAMPLE_FINETUNE,
106
+ LLAMA_EXAMPLE_FIT_PARAMS,
107
+ // keep the next entry last: with implicit numbering it equals the number of examples listed above
108
+ LLAMA_EXAMPLE_COUNT,
109
+ };
110
+
111
+ enum common_sampler_type { // sampler identifiers with explicit numeric values; 5 is unused because the TFS_Z entry is commented out
112
+ COMMON_SAMPLER_TYPE_NONE = 0,
113
+ COMMON_SAMPLER_TYPE_DRY = 1,
114
+ COMMON_SAMPLER_TYPE_TOP_K = 2,
115
+ COMMON_SAMPLER_TYPE_TOP_P = 3,
116
+ COMMON_SAMPLER_TYPE_MIN_P = 4,
117
+ //COMMON_SAMPLER_TYPE_TFS_Z = 5,
118
+ COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
119
+ COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
120
+ COMMON_SAMPLER_TYPE_XTC = 8,
121
+ COMMON_SAMPLER_TYPE_INFILL = 9,
122
+ COMMON_SAMPLER_TYPE_PENALTIES = 10,
123
+ COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
124
+ COMMON_SAMPLER_TYPE_ADAPTIVE_P = 12,
125
+ };
126
+
127
+ // dimensionality reduction methods, used by cvector-generator
128
+ enum dimre_method {
129
+ DIMRE_METHOD_PCA, // default value of common_params::cvector_dimre_method
130
+ DIMRE_METHOD_MEAN,
131
+ };
132
+
133
+ enum common_conversation_mode { // conversation-mode setting with explicit values; common_params::conversation_mode defaults to AUTO
134
+ COMMON_CONVERSATION_MODE_DISABLED = 0,
135
+ COMMON_CONVERSATION_MODE_ENABLED = 1,
136
+ COMMON_CONVERSATION_MODE_AUTO = 2,
137
+ };
138
+
139
+ enum common_grammar_trigger_type { // kind of a common_grammar_trigger; implicitly numbered from 0
140
+ COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
141
+ COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
142
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
143
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
144
+ };
145
+
146
+ struct common_grammar_trigger { // one trigger entry, stored in common_params_sampling::grammar_triggers (used for lazy grammars)
147
+ common_grammar_trigger_type type; // NOTE(review): no default initializer; set it explicitly when constructing a trigger
148
+ std::string value;
149
+ llama_token token = LLAMA_TOKEN_NULL; // presumably only meaningful for the TOKEN trigger type -- TODO confirm
150
+ };
151
+
152
+ enum common_params_sampling_config : uint64_t { // single-bit flags (bits 0 through 11), one per sampling setting; presumably OR-ed into common_params_sampling::user_sampling_config
153
+ COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS = 1 << 0,
154
+ COMMON_PARAMS_SAMPLING_CONFIG_TOP_K = 1 << 1,
155
+ COMMON_PARAMS_SAMPLING_CONFIG_TOP_P = 1 << 2,
156
+ COMMON_PARAMS_SAMPLING_CONFIG_MIN_P = 1 << 3,
157
+ COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
158
+ COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD = 1 << 5,
159
+ COMMON_PARAMS_SAMPLING_CONFIG_TEMP = 1 << 6,
160
+ COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N = 1 << 7,
161
+ COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT = 1 << 8,
162
+ COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT = 1 << 9,
163
+ COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU = 1 << 10,
164
+ COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA = 1 << 11,
165
+ };
166
+
167
+ enum common_speculative_type { // speculative decoding strategy; implicitly numbered from 0
168
+ COMMON_SPECULATIVE_TYPE_NONE, // no speculative decoding
169
+ COMMON_SPECULATIVE_TYPE_DRAFT, // draft model
170
+ COMMON_SPECULATIVE_TYPE_EAGLE3, // eagle draft model
171
+ COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
172
+ COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
173
+ COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
174
+ COMMON_SPECULATIVE_TYPE_NGRAM_MOD, // undocumented in the original; presumably backed by common_params_speculative::ngram_mod -- TODO confirm
175
+ COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache
176
+ COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
177
+ };
178
+
179
+ // sampling parameters (every member has a default, so a default-constructed object is a complete configuration)
180
+ struct common_params_sampling {
181
+ uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
182
+
183
+ int32_t n_prev = 64; // number of previous tokens to remember
184
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
185
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
186
+ int32_t top_k = 40; // <= 0 to use vocab size
187
+ float top_p = 0.95f; // 1.0 = disabled
188
+ float min_p = 0.05f; // 0.0 = disabled
189
+ float xtc_probability = 0.00f; // 0.0 = disabled
190
+ float xtc_threshold = 0.10f; // > 0.5 disables XTC
191
+ float typ_p = 1.00f; // typical_p, 1.0 = disabled
192
+ float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
193
+ float dynatemp_range = 0.00f; // 0.0 = disabled
194
+ float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
195
+ int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
196
+ float penalty_repeat = 1.00f; // 1.0 = disabled
197
+ float penalty_freq = 0.00f; // 0.0 = disabled
198
+ float penalty_present = 0.00f; // 0.0 = disabled
199
+ float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
200
+ float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
201
+ int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
202
+ int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
203
+ float adaptive_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
204
+ float adaptive_decay = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
205
+ int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
206
+ float top_n_sigma = -1.00f; // -1.0 = disabled
207
+ float mirostat_tau = 5.00f; // target entropy
208
+ float mirostat_eta = 0.10f; // learning rate
209
+ bool ignore_eos = false;
210
+ bool no_perf = false; // disable performance metrics
211
+ bool timing_per_token = false;
212
+
213
+ uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
214
+
215
+ std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
216
+ // default sampler list; presumably applied in this order -- TODO confirm against the sampler construction code
217
+ std::vector<enum common_sampler_type> samplers = {
218
+ COMMON_SAMPLER_TYPE_PENALTIES,
219
+ COMMON_SAMPLER_TYPE_DRY,
220
+ COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
221
+ COMMON_SAMPLER_TYPE_TOP_K,
222
+ COMMON_SAMPLER_TYPE_TYPICAL_P,
223
+ COMMON_SAMPLER_TYPE_TOP_P,
224
+ COMMON_SAMPLER_TYPE_MIN_P,
225
+ COMMON_SAMPLER_TYPE_XTC,
226
+ COMMON_SAMPLER_TYPE_TEMPERATURE,
227
+ };
228
+
229
+ std::string grammar; // optional BNF-like grammar to constrain sampling
230
+ bool grammar_lazy = false;
231
+ std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
232
+ std::set<llama_token> preserved_tokens;
233
+
234
+ std::vector<llama_logit_bias> logit_bias; // logit biases to apply
235
+ std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
236
+
237
+ bool backend_sampling = false;
238
+ // true when at least one entry is present in logit_bias (logit_bias_eog is not consulted)
239
+ bool has_logit_bias() const {
240
+ return !logit_bias.empty();
241
+ }
242
+
243
+ // print the parameters into a string
244
+ std::string print() const;
245
+ };
246
+
247
+ struct common_params_model { // where a model comes from; every field defaults to the empty string
248
+ std::string path = ""; // model local path // NOLINT
249
+ std::string url = ""; // model url to download // NOLINT
250
+ std::string hf_repo = ""; // HF repo // NOLINT
251
+ std::string hf_file = ""; // HF file // NOLINT
252
+ std::string docker_repo = ""; // Docker repo // NOLINT
253
+ std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
254
+ };
255
+
256
+ struct common_ngram_mod;
257
+
258
+ struct common_params_speculative { // settings for speculative decoding (general, n-gram based and draft-model based)
259
+ common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
260
+
261
+ // general-purpose speculative decoding parameters
262
+
263
+ int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
264
+ int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
265
+ float p_split = 0.1f; // speculative decoding split probability
266
+ float p_min = 0.75f; // minimum speculative decoding probability (greedy)
267
+
268
+ // ngram-based speculative decoding
269
+
270
+ uint16_t ngram_size_n = 12; // ngram size for lookup
271
+ uint16_t ngram_size_m = 48; // mgram size for speculative tokens
272
+ uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
273
+
274
+ std::shared_ptr<common_ngram_mod> ngram_mod; // shared handle to a common_ngram_mod (only forward-declared in this header); empty by default
275
+
276
+ std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
277
+ std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
278
+
279
+ // draft-model speculative decoding
280
+
281
+ struct common_params_model mparams_dft;
282
+
283
+ llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts
284
+
285
+ llama_context_params cparams_dft; // these are the parameters for the draft llama_context
286
+ // NOTE(review): cparams_dft above has no default initializer -- confirm it is always filled in before use
287
+ int32_t n_ctx = 0; // draft context size
288
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
289
+
290
+ ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
291
+ ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
292
+
293
+ struct cpu_params cpuparams;
294
+ struct cpu_params cpuparams_batch;
295
+
296
+ std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
297
+
298
+ std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
299
+ std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
300
+ // true when a draft model is configured, i.e. its local path or its HF repo is non-empty (url and docker_repo are not checked)
301
+ bool has_dft() const {
302
+ return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
303
+ }
304
+ };
305
+
306
+ struct common_params_vocoder { // vocoder settings (the guide-token comment below ties them to TTS)
307
+ struct common_params_model model; // model source for the vocoder
308
+
309
+ std::string speaker_file = ""; // speaker file path // NOLINT
310
+
311
+ bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
312
+ };
313
+
314
+ struct common_params_diffusion { // settings for diffusion-based generation
315
+ int32_t steps = 128; // presumably the number of diffusion steps -- TODO confirm
316
+ bool visual_mode = false;
317
+
318
+ float eps = 0; // epsilon for timesteps
319
+ int32_t block_length = 0; // block length for generation
320
+
321
+ int32_t algorithm = 4; // default algorithm: low-confidence
322
+ float alg_temp = 0.0f; // algorithm temperature
323
+
324
+ float cfg_scale = 0; // classifier-free guidance scale
325
+ bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
326
+ };
327
+
328
+ // reasoning API response format (not to be confused with the chat template's reasoning format)
329
+ // only used by server
330
+ enum common_reasoning_format {
331
+ COMMON_REASONING_FORMAT_NONE,
332
+ COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
333
+ COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
334
+ COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
335
+ // do not extend this enum unless you absolutely have to
336
+ // in most cases, use COMMON_REASONING_FORMAT_AUTO
337
+ // see: https://github.com/ggml-org/llama.cpp/pull/15408
338
+ };
339
+
340
+
341
+ struct lr_opt { // learning-rate options; embedded in common_params as the finetune member `lr`
342
+ float lr0 = 1e-5; // learning rate at first epoch
343
+ float lr_min = -1;
344
+ float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
345
+ float scale_epoch = 0;
346
+ float wd = 0;
347
+ unsigned epochs = 2;
348
+
349
+ unsigned epoch = 0; // set by optimizer outer (epochs) loop; zero-initialized so get_lr() never reads an indeterminate value
350
+ // learning rate decay - constant LR per epoch only for now
351
+ float get_lr(float e) const;
352
+ float get_lr() const { return get_lr(epoch); } // convenience overload: learning rate for the current `epoch`
353
+ // must call after arg parse, before get_lr
354
+ void init();
355
+ };
356
+
357
+ struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata); // defined elsewhere; presumably `userdata` points to an lr_opt -- TODO confirm against the implementation
358
+
359
+ struct common_params { // aggregate of all options shared by the example/tool programs; sections below are grouped per tool
360
+ int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit
361
+ int32_t n_ctx = 0; // context size, 0 == context the model was trained with
362
+ int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
363
+ int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
364
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
365
+ int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
366
+ int32_t n_parallel = 1; // number of parallel sequences to decode
367
+ int32_t n_sequences = 1; // number of sequences to decode
368
+ int32_t grp_attn_n = 1; // group-attention factor
369
+ int32_t grp_attn_w = 512; // group-attention width
370
+ int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
371
+ float rope_freq_base = 0.0f; // RoPE base frequency
372
+ float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
373
+ float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
374
+ float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
375
+ float yarn_beta_fast = -1.0f; // YaRN low correction dim
376
+ float yarn_beta_slow = -1.0f; // YaRN high correction dim
377
+ int32_t yarn_orig_ctx = 0; // YaRN original context length
378
+
379
+ // offload params
380
+ std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
381
+
382
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
383
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
384
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
385
+ bool fit_params = true; // whether to fit unset model/context parameters to free device memory
386
+ int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
387
+
388
+ // margin per device in bytes for fitting parameters to free memory (default: llama_max_devices() entries of 1 GiB each):
389
+ std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
390
+
391
+ enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
392
+
393
+ struct cpu_params cpuparams;
394
+ struct cpu_params cpuparams_batch;
395
+
396
+ ggml_backend_sched_eval_callback cb_eval = nullptr;
397
+ void * cb_eval_user_data = nullptr;
398
+
399
+ ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
400
+
401
+ enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
402
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
403
+ enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
404
+ enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
405
+
406
+ struct common_params_sampling sampling;
407
+ struct common_params_speculative speculative;
408
+ struct common_params_vocoder vocoder;
409
+ struct common_params_diffusion diffusion;
410
+
411
+ struct common_params_model model;
412
+
413
+ std::string model_alias = ""; // model alias // NOLINT
414
+ std::string hf_token = ""; // HF token // NOLINT
415
+ std::string prompt = ""; // NOLINT
416
+ std::string system_prompt = ""; // NOLINT
417
+ std::string prompt_file = ""; // store the external prompt file name // NOLINT
418
+ std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
419
+ std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
420
+ std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
421
+ std::string logits_file = ""; // file for saving *all* logits // NOLINT
422
+
423
+ // llama-debug specific options
424
+ std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
425
+ bool save_logits = false; // whether to save logits to files // NOLINT
426
+ std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT
427
+
428
+ std::vector<std::string> in_files; // all input files
429
+ std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
430
+ std::vector<llama_model_kv_override> kv_overrides;
431
+ std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
432
+
433
+ bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
434
+ std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
435
+
436
+ std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
437
+
438
+ int32_t verbosity = 3; // LOG_LEVEL_INFO
439
+ int32_t control_vector_layer_start = -1; // layer range for control vector
440
+ int32_t control_vector_layer_end = -1; // layer range for control vector
441
+ bool offline = false;
442
+
443
+ int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
444
+ int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
445
+ // (which is more convenient to use for plotting)
446
+ //
447
+ bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
448
+ size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
449
+
450
+ bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
451
+ size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
452
+
453
+ bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
454
+ size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
455
+
456
+ bool kl_divergence = false; // compute KL divergence
457
+
458
+ bool usage = false; // print usage
459
+ bool completion = false; // print source-able completion script
460
+ bool use_color = false; // use color to distinguish generations and inputs
461
+ bool special = false; // enable special token output
462
+ bool interactive = false; // interactive mode
463
+ bool interactive_first = false; // wait for user input immediately
464
+ bool prompt_cache_all = false; // save user input and generations to prompt cache
465
+ bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
466
+
467
+ bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
468
+ bool multiline_input = false; // reverse the usage of `\`
469
+ bool simple_io = false; // improves compatibility with subprocesses and limited consoles
470
+ bool cont_batching = true; // insert new sequences for decoding on-the-fly
471
+ bool no_perf = false; // disable performance metrics
472
+ bool show_timings = true; // show timing information on CLI
473
+ bool ctx_shift = false; // context shift on infinite text generation
474
+ bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
475
+ bool kv_unified = false; // enable unified KV cache
476
+
477
+ bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
478
+ bool use_mmap = true; // enable mmap to use filesystem cache
479
+ bool use_direct_io = false; // read from disk without buffering
480
+ bool use_mlock = false; // use mlock to keep model in memory
481
+ bool verbose_prompt = false; // print prompt tokens before generation
482
+ bool display_prompt = true; // print prompt before generation
483
+ bool no_kv_offload = false; // disable KV offloading
484
+ bool warmup = true; // warmup run
485
+ bool check_tensors = false; // validate tensor data
486
+ bool no_op_offload = false; // globally disable offload host tensor operations to device
487
+ bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
488
+ bool no_host = false; // bypass host buffer allowing extra buffers to be used
489
+
490
+ bool single_turn = false; // single turn chat conversation
491
+
492
+ ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
493
+ ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
494
+
495
+ common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
496
+
497
+ // multimodal models (see tools/mtmd)
498
+ struct common_params_model mmproj;
499
+ bool mmproj_use_gpu = true; // use GPU for multimodal model
500
+ bool no_mmproj = false; // explicitly disable multimodal model
501
+ std::vector<std::string> image; // path to image file(s)
502
+ int image_min_tokens = -1;
503
+ int image_max_tokens = -1;
504
+
505
+ // finetune
506
+ struct lr_opt lr;
507
+ enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
508
+ float val_split = 0.05f; // fraction of the data used for the validation set
509
+
510
+ // embedding
511
+ bool embedding = false; // get only sentence embedding
512
+ int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
513
+ std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
514
+ std::string embd_sep = "\n"; // separator of embeddings
515
+ std::string cls_sep = "\t"; // separator of classification sequences
516
+
517
+ // server params
518
+ int32_t port = 8080; // server listens on this network port
519
+ int32_t timeout_read = 600; // http read timeout in seconds
520
+ int32_t timeout_write = timeout_read; // http write timeout in seconds (initialized from timeout_read, which is declared just above)
521
+ int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
522
+ int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
523
+ bool cache_prompt = true; // whether to enable prompt caching
524
+ int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
525
+ int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
526
+
527
+ std::string hostname = "127.0.0.1";
528
+ std::string public_path = ""; // NOLINT
529
+ std::string api_prefix = ""; // NOLINT
530
+ std::string chat_template = ""; // NOLINT
531
+ bool use_jinja = true; // NOLINT
532
+ bool enable_chat_template = true;
533
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
534
+ int reasoning_budget = -1;
535
+ bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
536
+ int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
537
+
538
+ std::vector<std::string> api_keys;
539
+
540
+ std::string ssl_file_key = ""; // NOLINT
541
+ std::string ssl_file_cert = ""; // NOLINT
542
+
543
+ std::map<std::string, std::string> default_template_kwargs;
544
+
545
+ // webui configs
546
+ bool webui = true;
547
+ std::string webui_config_json;
548
+
549
+ // "advanced" endpoints: props and metrics are disabled by default for better security (note that slots is enabled by default)
550
+ bool endpoint_slots = true;
551
+ bool endpoint_props = false; // only control POST requests, not GET
552
+ bool endpoint_metrics = false;
553
+
554
+ // router server configs
555
+ std::string models_dir = ""; // directory containing models for the router server
556
+ std::string models_preset = ""; // directory containing model presets for the router server
557
+ int models_max = 4; // maximum number of models to load simultaneously
558
+ bool models_autoload = true; // automatically load models when requested via the router server
559
+
560
+ bool log_json = false;
561
+
562
+ std::string slot_save_path;
563
+ std::string media_path; // path to directory for loading media files
564
+
565
+ float slot_prompt_similarity = 0.1f;
566
+
567
+ // batched-bench params
568
+ bool is_pp_shared = false;
569
+ bool is_tg_separate = false;
570
+
571
+ std::vector<int32_t> n_pp;
572
+ std::vector<int32_t> n_tg;
573
+ std::vector<int32_t> n_pl;
574
+
575
+ // retrieval params
576
+ std::vector<std::string> context_files; // context files to embed
577
+
578
+ int32_t chunk_size = 64; // chunk size for context embedding
579
+
580
+ std::string chunk_separator = "\n"; // chunk separator for context embedding
581
+
582
+ // passkey params
583
+ int32_t n_junk = 250; // number of times to repeat the junk text
584
+ int32_t i_pos = -1; // position of the passkey in the junk text
585
+
586
+ // imatrix params
587
+ int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
588
+ int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
589
+ int32_t i_chunk = 0; // start processing from this chunk
590
+ int8_t imat_dat = 0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)
591
+
592
+ bool process_output = false; // collect data for the output tensor
593
+ bool compute_ppl = true; // whether to compute perplexity
594
+ bool show_statistics = false; // show imatrix statistics per tensor
595
+ bool parse_special = false; // whether to parse special tokens during imatrix tokenization
596
+
597
+ // cvector-generator params
598
+ int n_pca_batch = 100;
599
+ int n_pca_iterations = 1000;
600
+ dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
601
+ std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
602
+ std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
603
+
604
+ bool spm_infill = false; // suffix/prefix/middle pattern for infill
605
+
606
+ // batched-bench params (second group; the first one is further up in this struct)
607
+ bool batched_bench_output_jsonl = false;
608
+
609
+ // common params
610
+ std::string out_file; // output filename for all example programs
611
+ // optional callback for model loading progress and cancellation:
612
+ // called with a progress value between 0.0 and 1.0.
613
+ // return false from callback to abort model loading or true to continue
614
+ llama_progress_callback load_progress_callback = NULL;
615
+ void * load_progress_callback_user_data = NULL; // NOTE(review): NULL is used here while cb_eval above uses nullptr; consider unifying
616
+ };
617
+
618
+ // call once at the start of a program if it uses libcommon
619
+ // initializes the logging system and prints info about the build
620
+ void common_init();
621
+
622
+ std::string common_params_get_system_info(const common_params & params);
623
+
624
+ bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
625
+ bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
626
+ void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
627
+ bool set_process_priority(enum ggml_sched_priority prio);
628
+
629
+ //
630
+ // String utils
631
+ //
632
+
633
+ #ifdef __GNUC__ // compilers defining __GNUC__: attach a printf-style format attribute
634
+ # if defined(__MINGW32__) && !defined(__clang__) // MinGW without clang: use the gnu_printf archetype
635
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
636
+ # else // every other __GNUC__ compiler: use the plain printf archetype
637
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
638
+ # endif
639
+ #else // no __GNUC__: the macro expands to nothing
640
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
641
+ #endif // __GNUC__
642
+
643
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) // parameter 1 (fmt) is the format string, the checked variadic arguments start at parameter 2
644
+ std::string string_format(const char * fmt, ...); // printf-style signature: format string followed by variadic arguments
645
+
646
+ std::string string_strip(const std::string & str);
647
+ std::string string_get_sortable_timestamp();
648
+
649
+ std::string string_join(const std::vector<std::string> & values, const std::string & separator);
650
+ std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
651
+ std::string string_repeat(const std::string & str, size_t n);
652
+
653
+ void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
654
+
655
+ std::string regex_escape(const std::string & s);
656
+
657
// Splits `str` on every occurrence of `delim` and converts each piece to T
// with stream extraction (operator>>).
//
//  - an empty input yields an empty vector (std::getline produces no token)
//  - a trailing delimiter does not produce a trailing element
//  - an empty piece (e.g. the middle one in "1,,3") yields a value-initialized T
//
// std::string is handled by the explicit specialization below.
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        // value-initialize: for an empty token the extraction fails before
        // anything is written, which used to leave `value` indeterminate
        T value{};
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}

// std::string specialization: returns the raw substrings between delimiters.
// Every piece is kept, including empty ones and the one following a trailing
// delimiter, so the result always has (number of delimiters + 1) elements and
// an empty input yields a single empty string.
template<>
inline std::vector<std::string> string_split<std::string>(const std::string & str, char delim)
{
    std::vector<std::string> parts;
    size_t begin_pos = 0;
    size_t delim_pos = str.find(delim);
    while (delim_pos != std::string::npos) {
        parts.emplace_back(str.substr(begin_pos, delim_pos - begin_pos));
        begin_pos = delim_pos + 1;
        delim_pos = str.find(delim, begin_pos);
    }
    // remainder after the last delimiter (the whole string if there was none)
    parts.emplace_back(str.substr(begin_pos));
    return parts;
}
687
+
688
+ // remove when moving to c++20
689
// Returns true when `str` begins with `prefix` (an empty prefix always matches).
inline bool string_starts_with(std::string_view str, std::string_view prefix) {
    // a prefix longer than the string can never match
    if (prefix.size() > str.size()) {
        return false;
    }
    const std::string_view head = str.substr(0, prefix.size());
    return head == prefix;
}
693
+
694
+ // remove when moving to c++20
695
// Returns true when `str` ends with `suffix` (an empty suffix always matches).
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
    // a suffix longer than the string can never match
    if (suffix.size() > str.size()) {
        return false;
    }
    const std::string_view tail = str.substr(str.size() - suffix.size());
    return tail == suffix;
}
699
+
700
+ inline bool string_remove_suffix(std::string & str, std::string_view suffix) {
701
+ if (string_ends_with(str, suffix)) {
702
+ str.resize(str.size() - suffix.size());
703
+ return true;
704
+ }
705
+ return false;
706
+ }
707
+
708
+ inline size_t string_find_partial_stop(std::string_view str, std::string_view stop) {
709
+ if (!str.empty() && !stop.empty()) {
710
+ const size_t max_len = std::min(str.size(), stop.size());
711
+ const char last_char = str.back();
712
+ for (size_t len = max_len; len > 0; --len) {
713
+ if (stop[len - 1] == last_char) {
714
+ if (string_ends_with(str, stop.substr(0, len))) {
715
+ return str.size() - len;
716
+ }
717
+ }
718
+ }
719
+ }
720
+ return std::string::npos;
721
+ }
722
+
723
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
724
+ void string_process_escapes(std::string & input);
725
+
726
+ std::string string_from(bool value);
727
+ std::string string_from(const std::vector<int> & values);
728
+ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
729
+ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
730
+
731
+ //
732
+ // Filesystem utils
733
+ //
734
+
735
+ bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
736
+ bool fs_create_directory_with_parents(const std::string & path);
737
+ bool fs_is_directory(const std::string & path);
738
+
739
+ std::string fs_get_cache_directory();
740
+ std::string fs_get_cache_file(const std::string & filename);
741
+
742
// Element type of the vector returned by fs_list().
struct common_file_info {
    std::string path;
    std::string name;
    size_t      size{0};        // in bytes
    bool        is_dir{false};
};
748
+ std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
749
+
750
+ //
751
+ // TTY utils
752
+ //
753
+
754
+ // Auto-detect if colors can be enabled based on terminal and environment
755
+ bool tty_can_use_colors();
756
+
757
+ //
758
+ // Model utils
759
+ //
760
+
761
+ struct common_sampler;
762
+
763
+ // note: defines the model, context, samplers, etc. lifetimes
764
+ struct common_init_result { // all state lives behind the private `pimpl` pointer; `impl` is only forward-declared here
764
+ common_init_result(common_params & params); // takes params by non-const reference - NOTE(review): presumably may modify params; confirm in the definition
765
+ ~common_init_result(); // declared here and defined elsewhere: destroying std::unique_ptr<impl> needs the complete `impl` type
766
+
767
+ llama_model * model(); // NOTE(review): presumably a non-owning pointer valid for the lifetime of this object - confirm
768
+ llama_context * context(); // NOTE(review): presumably a non-owning pointer valid for the lifetime of this object - confirm
769
+
770
+ common_sampler * sampler(llama_seq_id seq_id); // sampler looked up by sequence id
771
+ void reset_samplers();
772
+
773
+ std::vector<llama_adapter_lora_ptr> & lora(); // returns a mutable reference, so callers can modify the adapter list
774
+
775
+ private:
776
+ struct impl; // defined outside this header
777
+ std::unique_ptr<impl> pimpl;
778
+ };
780
+
781
+ using common_init_result_ptr = std::unique_ptr<common_init_result>;
782
+
783
+ common_init_result_ptr common_init_from_params(common_params & params);
784
+
785
+ struct llama_model_params common_model_params_to_llama ( common_params & params);
786
+ struct llama_context_params common_context_params_to_llama(const common_params & params);
787
+ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
788
+
789
+ // clear LoRA adapters from context, then apply new list of adapters
790
+ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
791
+
792
+ std::string get_model_endpoint();
793
+
794
+ //
795
+ // Batch utils
796
+ //
797
+
798
+ void common_batch_clear(struct llama_batch & batch);
799
+
800
+ void common_batch_add(
801
+ struct llama_batch & batch,
802
+ llama_token id,
803
+ llama_pos pos,
804
+ const std::vector<llama_seq_id> & seq_ids,
805
+ bool logits);
806
+
807
+ // decodes a single batch of tokens for a prompt and manages session tokens
808
+ //
809
+ // Note: We save state before the last token so that we can replay it to ensure
810
+ // compatibility with all memory types. Recurrent/hybrid models cannot remove
811
+ // tokens from memory, so this approach works across all model architectures.
812
+ bool common_prompt_batch_decode(
813
+ struct llama_context * ctx,
814
+ const std::vector<llama_token> & embd,
815
+ int & n_past,
816
+ int n_batch,
817
+ std::string_view state_path,
818
+ bool save_state);
819
+
820
+ // replays the last token after loading state to regenerate logits
821
+ // used after loading session state to ensure the sampling context has valid logits
822
+ bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos);
823
+
824
+ //
825
+ // Vocab utils
826
+ //
827
+
828
+ // tokenizes a string into a vector of tokens
829
+ // should work similar to Python's `tokenizer.encode`
830
+ std::vector<llama_token> common_tokenize(
831
+ const struct llama_context * ctx,
832
+ const std::string & text,
833
+ bool add_special,
834
+ bool parse_special = false);
835
+
836
+ std::vector<llama_token> common_tokenize(
837
+ const struct llama_vocab * vocab,
838
+ const std::string & text,
839
+ bool add_special,
840
+ bool parse_special = false);
841
+
842
+ // tokenizes a token into a piece, optionally renders special/control tokens
843
+ // should work similar to Python's `tokenizer.id_to_piece`
844
+ std::string common_token_to_piece(
845
+ const struct llama_context * ctx,
846
+ llama_token token,
847
+ bool special = true);
848
+
849
+ std::string common_token_to_piece(
850
+ const struct llama_vocab * vocab,
851
+ llama_token token,
852
+ bool special = true);
853
+
854
+ // detokenizes a vector of tokens into a string
855
+ // should work similar to Python's `tokenizer.decode`
856
+ // optionally renders special/control tokens
857
+ std::string common_detokenize(
858
+ const struct llama_context * ctx,
859
+ const std::vector<llama_token> & tokens,
860
+ bool special = true);
861
+
862
+ std::string common_detokenize(
863
+ const struct llama_vocab * vocab,
864
+ const std::vector<llama_token> & tokens,
865
+ bool special = true);
866
+
867
+ //
868
+ // Embedding utils
869
+ //
870
+
871
+ // TODO: replace embd_norm with an enum
872
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
873
+
874
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
875
+
876
+ //
877
+ // Control vector utils
878
+ //
879
+
880
// Control vector values for a run of layers, stored back to back in `data`.
struct common_control_vector_data {
    // number of floats per layer; defaults to 0 so that a default-initialized
    // object never carries an indeterminate value (aggregate initialization
    // such as { -1, {} } keeps working)
    int n_embd = 0;

    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
    std::vector<float> data;
};
886
+
887
// One control vector file to load together with the strength to scale it by.
struct common_control_vector_load_info {
    // scale factor applied to the loaded vector; defaults to 0 so that a
    // default-initialized object never carries an indeterminate value
    float strength = 0.0f;

    // file the control vector is read from
    std::string fname;
};
892
+
893
+ // Load control vectors, scale each by strength, and add them together.
894
+ // On error, returns {-1, empty}
895
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
896
+
897
+ //
898
+ // Split utils
899
+ //
900
+
901
+ namespace { // unnamed namespace: every translation unit including this header gets its own internal-linkage copy of these constants
902
+
903
+ const char * const LLM_KV_SPLIT_NO = "split.no"; // NOTE(review): presumably the metadata key holding the index of a split file - confirm
904
+ const char * const LLM_KV_SPLIT_COUNT = "split.count"; // NOTE(review): presumably the metadata key holding the total number of splits - confirm
905
+ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; // NOTE(review): presumably the metadata key holding the tensor count across splits - confirm
906
+
907
+ }
908
+
909
+ //
910
+ // MoE utils
911
+ //
912
+
913
+ const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps"; // matches ".ffn_", then up/down/gate, then "_", an optional "ch", then "exps"
914
+
915
+ inline std::string llm_ffn_exps_block_regex(int idx) {
916
+ return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
917
+ }
918
+
919
+ inline llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
920
+ return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
921
+ }
922
+
923
+ //
924
+ // training utils
925
+ //
926
+
927
+ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
928
+
929
+ // "adamw" or "sgd" (case insensitive)
930
+ enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);