local-llm-rn 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (626)
  1. package/cpp/CMakeLists.txt +285 -0
  2. package/cpp/common/CMakeLists.txt +149 -0
  3. package/cpp/common/arg.cpp +3799 -0
  4. package/cpp/common/arg.h +131 -0
  5. package/cpp/common/base64.hpp +392 -0
  6. package/cpp/common/build-info.cpp.in +4 -0
  7. package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
  8. package/cpp/common/chat-parser-xml-toolcall.h +45 -0
  9. package/cpp/common/chat-parser.cpp +1649 -0
  10. package/cpp/common/chat-parser.h +133 -0
  11. package/cpp/common/chat-peg-parser.cpp +124 -0
  12. package/cpp/common/chat-peg-parser.h +105 -0
  13. package/cpp/common/chat.cpp +3355 -0
  14. package/cpp/common/chat.h +252 -0
  15. package/cpp/common/common.cpp +1824 -0
  16. package/cpp/common/common.h +930 -0
  17. package/cpp/common/console.cpp +1137 -0
  18. package/cpp/common/console.h +41 -0
  19. package/cpp/common/debug.cpp +167 -0
  20. package/cpp/common/debug.h +43 -0
  21. package/cpp/common/download.cpp +792 -0
  22. package/cpp/common/download.h +84 -0
  23. package/cpp/common/http.h +84 -0
  24. package/cpp/common/jinja/README.md +88 -0
  25. package/cpp/common/jinja/caps.cpp +285 -0
  26. package/cpp/common/jinja/caps.h +30 -0
  27. package/cpp/common/jinja/lexer.cpp +341 -0
  28. package/cpp/common/jinja/lexer.h +157 -0
  29. package/cpp/common/jinja/parser.cpp +591 -0
  30. package/cpp/common/jinja/parser.h +21 -0
  31. package/cpp/common/jinja/runtime.cpp +867 -0
  32. package/cpp/common/jinja/runtime.h +638 -0
  33. package/cpp/common/jinja/string.cpp +213 -0
  34. package/cpp/common/jinja/string.h +61 -0
  35. package/cpp/common/jinja/utils.h +149 -0
  36. package/cpp/common/jinja/value.cpp +1393 -0
  37. package/cpp/common/jinja/value.h +756 -0
  38. package/cpp/common/json-partial.cpp +324 -0
  39. package/cpp/common/json-partial.h +39 -0
  40. package/cpp/common/json-schema-to-grammar.cpp +1153 -0
  41. package/cpp/common/json-schema-to-grammar.h +43 -0
  42. package/cpp/common/llguidance.cpp +258 -0
  43. package/cpp/common/log.cpp +446 -0
  44. package/cpp/common/log.h +119 -0
  45. package/cpp/common/ngram-cache.cpp +285 -0
  46. package/cpp/common/ngram-cache.h +101 -0
  47. package/cpp/common/ngram-map.cpp +530 -0
  48. package/cpp/common/ngram-map.h +115 -0
  49. package/cpp/common/ngram-mod.cpp +60 -0
  50. package/cpp/common/ngram-mod.h +38 -0
  51. package/cpp/common/peg-parser.cpp +1712 -0
  52. package/cpp/common/peg-parser.h +459 -0
  53. package/cpp/common/preset.cpp +483 -0
  54. package/cpp/common/preset.h +83 -0
  55. package/cpp/common/regex-partial.cpp +204 -0
  56. package/cpp/common/regex-partial.h +56 -0
  57. package/cpp/common/sampling.cpp +745 -0
  58. package/cpp/common/sampling.h +119 -0
  59. package/cpp/common/speculative.cpp +1074 -0
  60. package/cpp/common/speculative.h +41 -0
  61. package/cpp/common/unicode.cpp +64 -0
  62. package/cpp/common/unicode.h +22 -0
  63. package/cpp/ggml/CMakeLists.txt +494 -0
  64. package/cpp/ggml/cmake/GitVars.cmake +22 -0
  65. package/cpp/ggml/cmake/common.cmake +50 -0
  66. package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
  67. package/cpp/ggml/include/ggml-alloc.h +85 -0
  68. package/cpp/ggml/include/ggml-backend.h +373 -0
  69. package/cpp/ggml/include/ggml-blas.h +25 -0
  70. package/cpp/ggml/include/ggml-cann.h +123 -0
  71. package/cpp/ggml/include/ggml-cpp.h +39 -0
  72. package/cpp/ggml/include/ggml-cpu.h +151 -0
  73. package/cpp/ggml/include/ggml-cuda.h +47 -0
  74. package/cpp/ggml/include/ggml-hexagon.h +19 -0
  75. package/cpp/ggml/include/ggml-metal.h +61 -0
  76. package/cpp/ggml/include/ggml-opencl.h +26 -0
  77. package/cpp/ggml/include/ggml-opt.h +256 -0
  78. package/cpp/ggml/include/ggml-rpc.h +30 -0
  79. package/cpp/ggml/include/ggml-sycl.h +49 -0
  80. package/cpp/ggml/include/ggml-virtgpu.h +14 -0
  81. package/cpp/ggml/include/ggml-vulkan.h +29 -0
  82. package/cpp/ggml/include/ggml-webgpu.h +19 -0
  83. package/cpp/ggml/include/ggml-zdnn.h +17 -0
  84. package/cpp/ggml/include/ggml-zendnn.h +22 -0
  85. package/cpp/ggml/include/ggml.h +2753 -0
  86. package/cpp/ggml/include/gguf.h +204 -0
  87. package/cpp/ggml/src/CMakeLists.txt +492 -0
  88. package/cpp/ggml/src/ggml-alloc.c +1244 -0
  89. package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
  90. package/cpp/ggml/src/ggml-backend-dl.h +45 -0
  91. package/cpp/ggml/src/ggml-backend-impl.h +255 -0
  92. package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
  93. package/cpp/ggml/src/ggml-backend.cpp +2270 -0
  94. package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  95. package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
  96. package/cpp/ggml/src/ggml-common.h +1878 -0
  97. package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
  98. package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
  99. package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  100. package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  101. package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  102. package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  103. package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  104. package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
  105. package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
  106. package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
  107. package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  108. package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  109. package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  110. package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
  111. package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  112. package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  113. package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  114. package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  115. package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  116. package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  117. package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  118. package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
  119. package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  120. package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  121. package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  122. package/cpp/ggml/src/ggml-cpu/common.h +95 -0
  123. package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
  124. package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
  125. package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
  126. package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
  127. package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
  128. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
  129. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  130. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
  131. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  132. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
  133. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  134. package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
  135. package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
  136. package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
  137. package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
  138. package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
  139. package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
  140. package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  141. package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
  142. package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  143. package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  144. package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  145. package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  146. package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
  147. package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
  148. package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  149. package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
  150. package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
  151. package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
  152. package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  153. package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
  154. package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
  155. package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
  156. package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  157. package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
  158. package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  159. package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
  160. package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
  161. package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
  162. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  163. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  164. package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  165. package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  166. package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  167. package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  168. package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
  169. package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
  170. package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  171. package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
  172. package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
  173. package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  174. package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
  175. package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  176. package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  177. package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  178. package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
  179. package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
  180. package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  181. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  182. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  183. package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  184. package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
  185. package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
  186. package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
  187. package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
  188. package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
  189. package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
  190. package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
  191. package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
  192. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  193. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  194. package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  195. package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  196. package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
  197. package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  198. package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  199. package/cpp/ggml/src/ggml-impl.h +724 -0
  200. package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  201. package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  202. package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  203. package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  204. package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
  205. package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
  206. package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
  207. package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
  208. package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
  209. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
  210. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  211. package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
  212. package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
  213. package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
  214. package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  215. package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  216. package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
  217. package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
  218. package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  219. package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  220. package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  221. package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  222. package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  223. package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  224. package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  225. package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  226. package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
  227. package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  228. package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  229. package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  230. package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  231. package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  232. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  233. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  234. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  235. package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  236. package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  237. package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  238. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  239. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  240. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  241. package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  242. package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  243. package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  244. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  245. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  246. package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  247. package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  248. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  249. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  250. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  251. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  252. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  253. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  254. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  255. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  256. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  257. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  258. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  259. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  260. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  261. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  262. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  263. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  264. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  265. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  266. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  267. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  268. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  269. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  270. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  271. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  272. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  273. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  274. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  275. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  276. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  277. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  278. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  279. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  280. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  281. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  282. package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  283. package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  284. package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  285. package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  286. package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  287. package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  288. package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  289. package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  290. package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  291. package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  292. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  293. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  294. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  295. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  296. package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  297. package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  298. package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  299. package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  300. package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  301. package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  302. package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  303. package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  304. package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
  305. package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  306. package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  307. package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  308. package/cpp/ggml/src/ggml-opt.cpp +1093 -0
  309. package/cpp/ggml/src/ggml-quants.c +5325 -0
  310. package/cpp/ggml/src/ggml-quants.h +106 -0
  311. package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  312. package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
  313. package/cpp/ggml/src/ggml-threading.cpp +12 -0
  314. package/cpp/ggml/src/ggml-threading.h +14 -0
  315. package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  316. package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  317. package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  318. package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  319. package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  320. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  321. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  322. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  323. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  324. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  325. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  326. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  327. package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  328. package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  329. package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  330. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  331. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  332. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  333. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  334. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  335. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  336. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  337. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  338. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  339. package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  340. package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  341. package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  342. package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  343. package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  344. package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  345. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  346. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  347. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  348. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  349. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  350. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  351. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  352. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  353. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  354. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  355. package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  356. package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  357. package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  358. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
  359. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
  360. package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  361. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  362. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  363. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  364. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
  365. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
  366. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  367. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  368. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
  369. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  370. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
  371. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  372. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  373. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
  374. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
  375. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
  376. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
  377. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
  378. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  379. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  380. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  381. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  382. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  383. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  384. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  385. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  386. package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  387. package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  388. package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
  389. package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  390. package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  391. package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  392. package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  393. package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  394. package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  395. package/cpp/ggml/src/ggml.c +7669 -0
  396. package/cpp/ggml/src/ggml.cpp +26 -0
  397. package/cpp/ggml/src/gguf.cpp +1699 -0
  398. package/cpp/include/llama-cpp.h +32 -0
  399. package/cpp/include/llama.h +1568 -0
  400. package/cpp/mtmd/CMakeLists.txt +98 -0
  401. package/cpp/mtmd/README.md +63 -0
  402. package/cpp/mtmd/clip-graph.h +117 -0
  403. package/cpp/mtmd/clip-impl.h +586 -0
  404. package/cpp/mtmd/clip-model.h +390 -0
  405. package/cpp/mtmd/clip.cpp +4154 -0
  406. package/cpp/mtmd/clip.h +121 -0
  407. package/cpp/mtmd/deprecation-warning.cpp +22 -0
  408. package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
  409. package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
  410. package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
  411. package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
  412. package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
  413. package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
  414. package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
  415. package/cpp/mtmd/models/cogvlm.cpp +98 -0
  416. package/cpp/mtmd/models/conformer.cpp +216 -0
  417. package/cpp/mtmd/models/glm4v.cpp +122 -0
  418. package/cpp/mtmd/models/internvl.cpp +69 -0
  419. package/cpp/mtmd/models/kimik25.cpp +101 -0
  420. package/cpp/mtmd/models/kimivl.cpp +63 -0
  421. package/cpp/mtmd/models/llama4.cpp +96 -0
  422. package/cpp/mtmd/models/llava.cpp +374 -0
  423. package/cpp/mtmd/models/minicpmv.cpp +114 -0
  424. package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
  425. package/cpp/mtmd/models/models.h +128 -0
  426. package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
  427. package/cpp/mtmd/models/paddleocr.cpp +52 -0
  428. package/cpp/mtmd/models/pixtral.cpp +86 -0
  429. package/cpp/mtmd/models/qwen2vl.cpp +183 -0
  430. package/cpp/mtmd/models/qwen3vl.cpp +193 -0
  431. package/cpp/mtmd/models/siglip.cpp +86 -0
  432. package/cpp/mtmd/models/whisper-enc.cpp +115 -0
  433. package/cpp/mtmd/models/youtuvl.cpp +179 -0
  434. package/cpp/mtmd/mtmd-audio.cpp +730 -0
  435. package/cpp/mtmd/mtmd-audio.h +113 -0
  436. package/cpp/mtmd/mtmd-cli.cpp +437 -0
  437. package/cpp/mtmd/mtmd-helper.cpp +521 -0
  438. package/cpp/mtmd/mtmd-helper.h +96 -0
  439. package/cpp/mtmd/mtmd.cpp +1156 -0
  440. package/cpp/mtmd/mtmd.h +319 -0
  441. package/cpp/mtmd/requirements.txt +5 -0
  442. package/cpp/mtmd/test-1.jpeg +0 -0
  443. package/cpp/mtmd/test-2.mp3 +0 -0
  444. package/cpp/mtmd/tests.sh +192 -0
  445. package/cpp/src/CMakeLists.txt +169 -0
  446. package/cpp/src/llama-adapter.cpp +488 -0
  447. package/cpp/src/llama-adapter.h +89 -0
  448. package/cpp/src/llama-arch.cpp +2855 -0
  449. package/cpp/src/llama-arch.h +619 -0
  450. package/cpp/src/llama-batch.cpp +917 -0
  451. package/cpp/src/llama-batch.h +173 -0
  452. package/cpp/src/llama-chat.cpp +896 -0
  453. package/cpp/src/llama-chat.h +71 -0
  454. package/cpp/src/llama-context.cpp +3512 -0
  455. package/cpp/src/llama-context.h +359 -0
  456. package/cpp/src/llama-cparams.cpp +5 -0
  457. package/cpp/src/llama-cparams.h +44 -0
  458. package/cpp/src/llama-grammar.cpp +1464 -0
  459. package/cpp/src/llama-grammar.h +194 -0
  460. package/cpp/src/llama-graph.cpp +2685 -0
  461. package/cpp/src/llama-graph.h +1026 -0
  462. package/cpp/src/llama-hparams.cpp +234 -0
  463. package/cpp/src/llama-hparams.h +339 -0
  464. package/cpp/src/llama-impl.cpp +171 -0
  465. package/cpp/src/llama-impl.h +73 -0
  466. package/cpp/src/llama-io.cpp +15 -0
  467. package/cpp/src/llama-io.h +35 -0
  468. package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
  469. package/cpp/src/llama-kv-cache-iswa.h +137 -0
  470. package/cpp/src/llama-kv-cache.cpp +2271 -0
  471. package/cpp/src/llama-kv-cache.h +388 -0
  472. package/cpp/src/llama-kv-cells.h +533 -0
  473. package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
  474. package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
  475. package/cpp/src/llama-memory-hybrid.cpp +268 -0
  476. package/cpp/src/llama-memory-hybrid.h +139 -0
  477. package/cpp/src/llama-memory-recurrent.cpp +1165 -0
  478. package/cpp/src/llama-memory-recurrent.h +182 -0
  479. package/cpp/src/llama-memory.cpp +59 -0
  480. package/cpp/src/llama-memory.h +122 -0
  481. package/cpp/src/llama-mmap.cpp +785 -0
  482. package/cpp/src/llama-mmap.h +92 -0
  483. package/cpp/src/llama-model-loader.cpp +1414 -0
  484. package/cpp/src/llama-model-loader.h +203 -0
  485. package/cpp/src/llama-model-saver.cpp +286 -0
  486. package/cpp/src/llama-model-saver.h +37 -0
  487. package/cpp/src/llama-model.cpp +9253 -0
  488. package/cpp/src/llama-model.h +576 -0
  489. package/cpp/src/llama-quant.cpp +1119 -0
  490. package/cpp/src/llama-quant.h +1 -0
  491. package/cpp/src/llama-sampler.cpp +3885 -0
  492. package/cpp/src/llama-sampler.h +42 -0
  493. package/cpp/src/llama-vocab.cpp +3970 -0
  494. package/cpp/src/llama-vocab.h +187 -0
  495. package/cpp/src/llama.cpp +1313 -0
  496. package/cpp/src/models/afmoe.cpp +191 -0
  497. package/cpp/src/models/apertus.cpp +125 -0
  498. package/cpp/src/models/arcee.cpp +135 -0
  499. package/cpp/src/models/arctic.cpp +138 -0
  500. package/cpp/src/models/arwkv7.cpp +86 -0
  501. package/cpp/src/models/baichuan.cpp +122 -0
  502. package/cpp/src/models/bailingmoe.cpp +144 -0
  503. package/cpp/src/models/bailingmoe2.cpp +135 -0
  504. package/cpp/src/models/bert.cpp +178 -0
  505. package/cpp/src/models/bitnet.cpp +160 -0
  506. package/cpp/src/models/bloom.cpp +101 -0
  507. package/cpp/src/models/chameleon.cpp +178 -0
  508. package/cpp/src/models/chatglm.cpp +132 -0
  509. package/cpp/src/models/codeshell.cpp +111 -0
  510. package/cpp/src/models/cogvlm.cpp +102 -0
  511. package/cpp/src/models/cohere2-iswa.cpp +134 -0
  512. package/cpp/src/models/command-r.cpp +122 -0
  513. package/cpp/src/models/dbrx.cpp +123 -0
  514. package/cpp/src/models/deci.cpp +135 -0
  515. package/cpp/src/models/deepseek.cpp +144 -0
  516. package/cpp/src/models/deepseek2.cpp +262 -0
  517. package/cpp/src/models/delta-net-base.cpp +376 -0
  518. package/cpp/src/models/dots1.cpp +134 -0
  519. package/cpp/src/models/dream.cpp +105 -0
  520. package/cpp/src/models/ernie4-5-moe.cpp +150 -0
  521. package/cpp/src/models/ernie4-5.cpp +110 -0
  522. package/cpp/src/models/eurobert.cpp +97 -0
  523. package/cpp/src/models/exaone-moe.cpp +146 -0
  524. package/cpp/src/models/exaone.cpp +114 -0
  525. package/cpp/src/models/exaone4.cpp +123 -0
  526. package/cpp/src/models/falcon-h1.cpp +111 -0
  527. package/cpp/src/models/falcon.cpp +120 -0
  528. package/cpp/src/models/gemma-embedding.cpp +116 -0
  529. package/cpp/src/models/gemma.cpp +112 -0
  530. package/cpp/src/models/gemma2-iswa.cpp +128 -0
  531. package/cpp/src/models/gemma3.cpp +155 -0
  532. package/cpp/src/models/gemma3n-iswa.cpp +384 -0
  533. package/cpp/src/models/glm4-moe.cpp +170 -0
  534. package/cpp/src/models/glm4.cpp +157 -0
  535. package/cpp/src/models/gpt2.cpp +105 -0
  536. package/cpp/src/models/gptneox.cpp +144 -0
  537. package/cpp/src/models/granite-hybrid.cpp +196 -0
  538. package/cpp/src/models/granite.cpp +211 -0
  539. package/cpp/src/models/grok.cpp +159 -0
  540. package/cpp/src/models/grovemoe.cpp +141 -0
  541. package/cpp/src/models/hunyuan-dense.cpp +132 -0
  542. package/cpp/src/models/hunyuan-moe.cpp +154 -0
  543. package/cpp/src/models/internlm2.cpp +120 -0
  544. package/cpp/src/models/jais.cpp +86 -0
  545. package/cpp/src/models/jais2.cpp +123 -0
  546. package/cpp/src/models/jamba.cpp +106 -0
  547. package/cpp/src/models/kimi-linear.cpp +392 -0
  548. package/cpp/src/models/lfm2.cpp +190 -0
  549. package/cpp/src/models/llada-moe.cpp +122 -0
  550. package/cpp/src/models/llada.cpp +99 -0
  551. package/cpp/src/models/llama-iswa.cpp +178 -0
  552. package/cpp/src/models/llama.cpp +168 -0
  553. package/cpp/src/models/maincoder.cpp +117 -0
  554. package/cpp/src/models/mamba-base.cpp +285 -0
  555. package/cpp/src/models/mamba.cpp +54 -0
  556. package/cpp/src/models/mimo2-iswa.cpp +123 -0
  557. package/cpp/src/models/minicpm3.cpp +200 -0
  558. package/cpp/src/models/minimax-m2.cpp +124 -0
  559. package/cpp/src/models/mistral3.cpp +160 -0
  560. package/cpp/src/models/models.h +684 -0
  561. package/cpp/src/models/modern-bert.cpp +109 -0
  562. package/cpp/src/models/mpt.cpp +126 -0
  563. package/cpp/src/models/nemotron-h.cpp +148 -0
  564. package/cpp/src/models/nemotron.cpp +122 -0
  565. package/cpp/src/models/neo-bert.cpp +104 -0
  566. package/cpp/src/models/olmo.cpp +121 -0
  567. package/cpp/src/models/olmo2.cpp +150 -0
  568. package/cpp/src/models/olmoe.cpp +124 -0
  569. package/cpp/src/models/openai-moe-iswa.cpp +127 -0
  570. package/cpp/src/models/openelm.cpp +124 -0
  571. package/cpp/src/models/orion.cpp +123 -0
  572. package/cpp/src/models/paddleocr.cpp +122 -0
  573. package/cpp/src/models/pangu-embedded.cpp +121 -0
  574. package/cpp/src/models/phi2.cpp +121 -0
  575. package/cpp/src/models/phi3.cpp +152 -0
  576. package/cpp/src/models/plamo.cpp +110 -0
  577. package/cpp/src/models/plamo2.cpp +318 -0
  578. package/cpp/src/models/plamo3.cpp +128 -0
  579. package/cpp/src/models/plm.cpp +169 -0
  580. package/cpp/src/models/qwen.cpp +108 -0
  581. package/cpp/src/models/qwen2.cpp +126 -0
  582. package/cpp/src/models/qwen2moe.cpp +151 -0
  583. package/cpp/src/models/qwen2vl.cpp +117 -0
  584. package/cpp/src/models/qwen3.cpp +117 -0
  585. package/cpp/src/models/qwen35.cpp +386 -0
  586. package/cpp/src/models/qwen35moe.cpp +420 -0
  587. package/cpp/src/models/qwen3moe.cpp +124 -0
  588. package/cpp/src/models/qwen3next.cpp +525 -0
  589. package/cpp/src/models/qwen3vl-moe.cpp +140 -0
  590. package/cpp/src/models/qwen3vl.cpp +132 -0
  591. package/cpp/src/models/refact.cpp +94 -0
  592. package/cpp/src/models/rnd1.cpp +126 -0
  593. package/cpp/src/models/rwkv6-base.cpp +164 -0
  594. package/cpp/src/models/rwkv6.cpp +94 -0
  595. package/cpp/src/models/rwkv6qwen2.cpp +86 -0
  596. package/cpp/src/models/rwkv7-base.cpp +137 -0
  597. package/cpp/src/models/rwkv7.cpp +90 -0
  598. package/cpp/src/models/seed-oss.cpp +124 -0
  599. package/cpp/src/models/smallthinker.cpp +126 -0
  600. package/cpp/src/models/smollm3.cpp +128 -0
  601. package/cpp/src/models/stablelm.cpp +146 -0
  602. package/cpp/src/models/starcoder.cpp +100 -0
  603. package/cpp/src/models/starcoder2.cpp +121 -0
  604. package/cpp/src/models/step35-iswa.cpp +168 -0
  605. package/cpp/src/models/t5-dec.cpp +166 -0
  606. package/cpp/src/models/t5-enc.cpp +96 -0
  607. package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
  608. package/cpp/src/models/xverse.cpp +108 -0
  609. package/cpp/src/unicode-data.cpp +7034 -0
  610. package/cpp/src/unicode-data.h +20 -0
  611. package/cpp/src/unicode.cpp +1103 -0
  612. package/cpp/src/unicode.h +111 -0
  613. package/cpp/vendor/nlohmann/json.hpp +25526 -0
  614. package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  615. package/cpp/vendor/stb/stb_image.h +7988 -0
  616. package/ios/LocalLLM-Bridging-Header.h +2 -0
  617. package/ios/LocalLLM.h +5 -0
  618. package/ios/LocalLLM.mm +1267 -0
  619. package/local-llm-rn.podspec +60 -0
  620. package/package.json +35 -0
  621. package/src/NativeLocalLLM.ts +73 -0
  622. package/src/device.ts +50 -0
  623. package/src/download-adapter.ts +17 -0
  624. package/src/index.ts +21 -0
  625. package/src/native-bridge.ts +142 -0
  626. package/src/rn-downloader.ts +37 -0
@@ -0,0 +1,745 @@
1
+ #include "sampling.h"
2
+
3
+ #include "common.h"
4
+ #include "log.h"
5
+
6
+ #include <algorithm>
7
+ #include <cmath>
8
+ #include <cstring>
9
+ #include <unordered_map>
10
+
11
+ // the ring buffer works similarly to std::deque, but with a fixed capacity
12
+ // TODO: deduplicate with llama-impl.h
13
// Fixed-capacity FIFO buffer. Once `capacity` elements are stored, every
// further push_back() overwrites the oldest element.
//
// Layout: `first` is the index of the oldest element, `pos` is the index of
// the slot the next push_back() will write to, `sz` is the number of stored
// elements.
template<typename T>
struct ring_buffer {
    ring_buffer(size_t cap) : capacity(cap), data(cap) {}

    // Oldest stored element. Throws std::runtime_error if the buffer is empty.
    T & front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    const T & front() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    // Most recently pushed element. Throws std::runtime_error if the buffer is empty.
    T & back() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[last_index()];
    }

    const T & back() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[last_index()];
    }

    // Appends `value`; when the buffer is full the oldest element is dropped.
    // Throws std::runtime_error for a zero-capacity buffer (the index
    // arithmetic below would otherwise divide by zero).
    void push_back(const T & value) {
        if (capacity == 0) {
            throw std::runtime_error("ring buffer: capacity is zero");
        }

        if (sz == capacity) {
            // advance the start when buffer is full
            first = (first + 1) % capacity;
        } else {
            sz++;
        }
        data[pos] = value;
        pos = (pos + 1) % capacity;
    }

    // Removes and returns the oldest element. Throws std::runtime_error if empty.
    T pop_front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        T value = data[first];
        first = (first + 1) % capacity;
        sz--;
        return value;
    }

    // Reverse access: rat(0) is the newest element, rat(size() - 1) the oldest.
    // Throws std::runtime_error when `i` is out of range.
    const T & rat(size_t i) const {
        if (i >= sz) {
            throw std::runtime_error("ring buffer: index out of bounds");
        }
        return data[(first + sz - i - 1) % capacity];
    }

    // Copies the stored elements, oldest first.
    std::vector<T> to_vector() const {
        std::vector<T> result;
        result.reserve(sz);
        for (size_t i = 0; i < sz; i++) {
            result.push_back(data[(first + i) % capacity]);
        }
        return result;
    }

    void clear() {
        // here only reset the status of the buffer
        sz = 0;
        first = 0;
        pos = 0;
    }

    bool empty() const {
        return sz == 0;
    }

    size_t size() const {
        return sz;
    }

    // Index of the newest element. `pos` already points one past it, so step
    // back by one with wrap-around. Only valid while sz > 0 (hence capacity > 0).
    size_t last_index() const {
        return (pos + capacity - 1) % capacity;
    }

    size_t capacity = 0;
    size_t sz = 0;
    size_t first = 0;
    size_t pos = 0;
    std::vector<T> data;
};
103
+
104
+ struct common_sampler {
105
+ common_params_sampling params;
106
+
107
+ struct llama_sampler * grmr;
108
+ struct llama_sampler * chain;
109
+
110
+ ring_buffer<llama_token> prev;
111
+
112
+ std::vector<llama_token_data> cur;
113
+
114
+ llama_token_data_array cur_p;
115
+
116
+ void reset() {
117
+ prev.clear();
118
+
119
+ llama_sampler_reset(chain);
120
+ }
121
+
122
+ void set_logits(struct llama_context * ctx, int idx) {
123
+ const float * sampled_probs = llama_get_sampled_probs_ith (ctx, idx);
124
+ const float * sampled_logits = llama_get_sampled_logits_ith (ctx, idx);
125
+ const llama_token * sampled_ids = llama_get_sampled_candidates_ith(ctx, idx);
126
+
127
+ const llama_model * model = llama_get_model(ctx);
128
+ const llama_vocab * vocab = llama_model_get_vocab(model);
129
+
130
+ const int n_vocab = llama_vocab_n_tokens(vocab);
131
+
132
+ if (sampled_probs) {
133
+ const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
134
+ cur.resize(sampled_probs_count);
135
+ for (uint32_t i = 0; i < sampled_probs_count; ++i) {
136
+ cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
137
+ }
138
+ } else if (sampled_logits) {
139
+ const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
140
+ cur.resize(sampled_logits_count);
141
+ for (uint32_t i = 0; i < sampled_logits_count; i++) {
142
+ cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
143
+ }
144
+ } else {
145
+ const auto * logits = llama_get_logits_ith(ctx, idx);
146
+ GGML_ASSERT(logits != nullptr);
147
+ cur.resize(n_vocab);
148
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
149
+ cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
150
+ }
151
+ }
152
+
153
+ cur_p = { cur.data(), cur.size(), -1, false };
154
+ }
155
+
156
+ common_time_meas tm() {
157
+ return common_time_meas(t_total_us, params.no_perf);
158
+ }
159
+
160
+ mutable int64_t t_total_us = 0;
161
+ };
162
+
163
+ std::string common_params_sampling::print() const {
164
+ char result[1024];
165
+
166
+ snprintf(result, sizeof(result),
167
+ "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
168
+ "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
169
+ "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
170
+ "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, adaptive_target = %.3f, adaptive_decay = %.3f",
171
+ penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
172
+ dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
173
+ top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
174
+ mirostat, mirostat_eta, mirostat_tau, adaptive_target, adaptive_decay);
175
+
176
+ return std::string(result);
177
+ }
178
+
179
+ struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params) {
180
+ const llama_vocab * vocab = llama_model_get_vocab(model);
181
+
182
+ llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
183
+
184
+ lparams.no_perf = params.no_perf;
185
+
186
+ llama_sampler * grmr = nullptr;
187
+ llama_sampler * chain = llama_sampler_chain_init(lparams);
188
+
189
+ std::vector<llama_sampler *> samplers;
190
+
191
+ if (params.grammar.compare(0, 11, "%llguidance") == 0) {
192
+ #ifdef LLAMA_USE_LLGUIDANCE
193
+ grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
194
+ #else
195
+ GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
196
+ #endif // LLAMA_USE_LLGUIDANCE
197
+ } else {
198
+ std::vector<std::string> trigger_patterns;
199
+ std::vector<llama_token> trigger_tokens;
200
+ for (const auto & trigger : params.grammar_triggers) {
201
+ switch (trigger.type) {
202
+ case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
203
+ {
204
+ const auto & word = trigger.value;
205
+ trigger_patterns.push_back(regex_escape(word));
206
+ break;
207
+ }
208
+ case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
209
+ {
210
+ trigger_patterns.push_back(trigger.value);
211
+ break;
212
+ }
213
+ case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
214
+ {
215
+ const auto & pattern = trigger.value;
216
+ std::string anchored = "^$";
217
+ if (!pattern.empty()) {
218
+ anchored = (pattern.front() != '^' ? "^" : "")
219
+ + pattern
220
+ + (pattern.back() != '$' ? "$" : "");
221
+ }
222
+ trigger_patterns.push_back(anchored);
223
+ break;
224
+ }
225
+ case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
226
+ {
227
+ const auto token = trigger.token;
228
+ trigger_tokens.push_back(token);
229
+ break;
230
+ }
231
+ default:
232
+ GGML_ASSERT(false && "unknown trigger type");
233
+ }
234
+ }
235
+
236
+ std::vector<const char *> trigger_patterns_c;
237
+ trigger_patterns_c.reserve(trigger_patterns.size());
238
+ for (const auto & regex : trigger_patterns) {
239
+ trigger_patterns_c.push_back(regex.c_str());
240
+ }
241
+
242
+ if (!params.grammar.empty()) {
243
+ if (params.grammar_lazy) {
244
+ grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
245
+ trigger_patterns_c.data(), trigger_patterns_c.size(),
246
+ trigger_tokens.data(), trigger_tokens.size());
247
+ } else {
248
+ grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
249
+ }
250
+ }
251
+ }
252
+
253
+ if (params.has_logit_bias()) {
254
+ samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
255
+ }
256
+
257
+ if (params.mirostat == 0) {
258
+
259
+ bool use_adaptive_p = false; // see below
260
+
261
+ for (const auto & cnstr : params.samplers) {
262
+ switch (cnstr) {
263
+ case COMMON_SAMPLER_TYPE_DRY:
264
+ {
265
+ std::vector<const char *> c_breakers;
266
+ c_breakers.reserve(params.dry_sequence_breakers.size());
267
+ for (const auto & str : params.dry_sequence_breakers) {
268
+ c_breakers.push_back(str.c_str());
269
+ }
270
+ samplers.push_back(llama_sampler_init_dry(vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
271
+ }
272
+ break;
273
+ case COMMON_SAMPLER_TYPE_TOP_K:
274
+ samplers.push_back(llama_sampler_init_top_k(params.top_k));
275
+ break;
276
+ case COMMON_SAMPLER_TYPE_TOP_P:
277
+ samplers.push_back(llama_sampler_init_top_p(params.top_p, params.min_keep));
278
+ break;
279
+ case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
280
+ samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
281
+ break;
282
+ case COMMON_SAMPLER_TYPE_MIN_P:
283
+ samplers.push_back(llama_sampler_init_min_p(params.min_p, params.min_keep));
284
+ break;
285
+ case COMMON_SAMPLER_TYPE_XTC:
286
+ samplers.push_back(llama_sampler_init_xtc(params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
287
+ break;
288
+ case COMMON_SAMPLER_TYPE_TYPICAL_P:
289
+ samplers.push_back(llama_sampler_init_typical(params.typ_p, params.min_keep));
290
+ break;
291
+ case COMMON_SAMPLER_TYPE_TEMPERATURE:
292
+ samplers.push_back(llama_sampler_init_temp_ext(params.temp, params.dynatemp_range, params.dynatemp_exponent));
293
+ break;
294
+ case COMMON_SAMPLER_TYPE_INFILL:
295
+ samplers.push_back(llama_sampler_init_infill(vocab));
296
+ break;
297
+ case COMMON_SAMPLER_TYPE_PENALTIES:
298
+ samplers.push_back(llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
299
+ break;
300
+ case COMMON_SAMPLER_TYPE_ADAPTIVE_P:
301
+ // the `adaptive-p` sampler is like `dist` and `mirostat` in that it selects
302
+ // a single token, so we will add `dist` at the end of the chain by default,
303
+ // unless the user specifically included `adaptive-p`. we set this flag here
304
+ // so we know to add the sampler at the very end.
305
+ use_adaptive_p = true;
306
+ break;
307
+ default:
308
+ GGML_ASSERT(false && "unknown sampler type");
309
+ }
310
+ }
311
+ if (use_adaptive_p) {
312
+ // only if user explicitly included adaptive-p sampler
313
+ samplers.push_back(llama_sampler_init_adaptive_p(params.adaptive_target, params.adaptive_decay, params.seed));
314
+ } else {
315
+ // default: sample from distribution
316
+ samplers.push_back(llama_sampler_init_dist(params.seed));
317
+ }
318
+ } else if (params.mirostat == 1) {
319
+ samplers.push_back(llama_sampler_init_temp(params.temp));
320
+ samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
321
+ } else if (params.mirostat == 2) {
322
+ samplers.push_back(llama_sampler_init_temp(params.temp));
323
+ samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
324
+ } else {
325
+ GGML_ASSERT(false && "unknown mirostat version");
326
+ }
327
+
328
+ for (auto * smpl : samplers) {
329
+ llama_sampler_chain_add(chain, smpl);
330
+ }
331
+
332
+ if (grmr && params.backend_sampling) {
333
+ LOG_WRN("%s: backend sampling is not compatible with grammar, disabling\n", __func__);
334
+
335
+ params.backend_sampling = false;
336
+ }
337
+
338
+ auto * result = new common_sampler {
339
+ /* .params = */ params,
340
+ /* .grmr = */ grmr,
341
+ /* .chain = */ chain,
342
+ /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
343
+ /* .cur = */ {},
344
+ /* .cur_p = */ {},
345
+ };
346
+
347
+ return result;
348
+ }
349
+
350
+ void common_sampler_free(struct common_sampler * gsmpl) {
351
+ if (!gsmpl) {
352
+ return;
353
+ }
354
+
355
+ llama_sampler_free(gsmpl->grmr);
356
+ llama_sampler_free(gsmpl->chain);
357
+
358
+ delete gsmpl;
359
+ }
360
+
361
+ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
362
+ if (!gsmpl) {
363
+ return;
364
+ }
365
+
366
+ const auto tm = gsmpl->tm();
367
+
368
+ if (gsmpl->grmr && accept_grammar) {
369
+ llama_sampler_accept(gsmpl->grmr, token);
370
+ }
371
+
372
+ llama_sampler_accept(gsmpl->chain, token);
373
+
374
+ gsmpl->prev.push_back(token);
375
+ }
376
+
377
+ void common_sampler_reset(struct common_sampler * gsmpl) {
378
+ if (!gsmpl) {
379
+ return;
380
+ }
381
+
382
+ gsmpl->reset();
383
+ }
384
+
385
+ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
386
+ return new common_sampler {
387
+ /* .params = */ gsmpl->params,
388
+ /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
389
+ /* .chain = */ llama_sampler_clone(gsmpl->chain),
390
+ /* .prev = */ gsmpl->prev,
391
+ /* .cur = */ gsmpl->cur,
392
+ /* .cur_p = */ gsmpl->cur_p,
393
+ };
394
+ }
395
+
396
+ void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
397
+ // TODO: measure grammar performance
398
+
399
+ const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;
400
+
401
+ llama_perf_sampler_data data_smpl;
402
+ llama_perf_context_data data_ctx;
403
+
404
+ memset(&data_smpl, 0, sizeof(data_smpl));
405
+ memset(&data_ctx, 0, sizeof(data_ctx));
406
+
407
+ if (gsmpl) {
408
+ auto & data = data_smpl;
409
+
410
+ data = llama_perf_sampler(gsmpl->chain);
411
+
412
+ // note: the sampling time includes the samplers time + extra time spent in common/sampling
413
+ LOG_INF("%s: sampling time = %10.2f ms\n", __func__, t_sampling_ms);
414
+ LOG_INF("%s: samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
415
+ }
416
+
417
+ if (ctx) {
418
+ auto & data = data_ctx;
419
+
420
+ data = llama_perf_context(ctx);
421
+
422
+ const double t_end_ms = 1e-3 * ggml_time_us();
423
+
424
+ const double t_total_ms = t_end_ms - data.t_start_ms;
425
+ const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
426
+ const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;
427
+
428
+ LOG_INF("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
429
+ LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
430
+ __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
431
+ LOG_INF("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
432
+ __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
433
+ LOG_INF("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
434
+ LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
435
+ LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
436
+
437
+ llama_memory_breakdown_print(ctx);
438
+ }
439
+ }
440
+
441
+ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
442
+ if (!gsmpl) {
443
+ return nullptr;
444
+ }
445
+
446
+ return gsmpl->chain;
447
+ }
448
+
449
+ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
450
+ llama_synchronize(ctx);
451
+
452
+ // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
453
+ const auto tm = gsmpl->tm();
454
+
455
+ llama_token id = LLAMA_TOKEN_NULL;
456
+
457
+ auto & grmr = gsmpl->grmr;
458
+ auto & chain = gsmpl->chain;
459
+ auto & cur_p = gsmpl->cur_p; // initialized by set_logits
460
+
461
+ // Check if a backend sampler has already sampled a token in which case we
462
+ // return that token id directly.
463
+ {
464
+ id = llama_get_sampled_token_ith(ctx, idx);
465
+
466
+ if (id != LLAMA_TOKEN_NULL) {
467
+ LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
468
+
469
+ GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
470
+
471
+ // TODO: simplify
472
+ gsmpl->cur.resize(1);
473
+ gsmpl->cur[0] = { id, 0.0f, 1.0f };
474
+ cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
475
+
476
+ return id;
477
+ }
478
+ }
479
+
480
+ gsmpl->set_logits(ctx, idx);
481
+
482
+ if (grammar_first) {
483
+ llama_sampler_apply(grmr, &cur_p);
484
+ }
485
+
486
+ llama_sampler_apply(chain, &cur_p);
487
+
488
+ id = cur_p.data[cur_p.selected].id;
489
+
490
+ if (grammar_first) {
491
+ return id;
492
+ }
493
+
494
+ // check if it the sampled token fits the grammar (grammar-based rejection sampling)
495
+ {
496
+ llama_token_data single_token_data = { id, 1.0f, 0.0f };
497
+ llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
498
+
499
+ llama_sampler_apply(grmr, &single_token_data_array);
500
+
501
+ const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
502
+ if (is_valid) {
503
+ return id;
504
+ }
505
+ }
506
+
507
+ // resampling:
508
+ // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
509
+ gsmpl->set_logits(ctx, idx);
510
+
511
+ llama_sampler_apply(grmr, &cur_p);
512
+ llama_sampler_apply(chain, &cur_p);
513
+
514
+ GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
515
+
516
+ id = cur_p.data[cur_p.selected].id;
517
+
518
+ return id;
519
+ }
520
+
521
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
522
+ GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
523
+
524
+ std::vector<llama_token> result;
525
+ result.reserve(idxs.size());
526
+
527
+ size_t i = 0;
528
+ for (; i < draft.size(); i++) {
529
+ const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
530
+
531
+ common_sampler_accept(gsmpl, id, true);
532
+
533
+ result.push_back(id);
534
+
535
+ if (draft[i] != id) {
536
+ break;
537
+ }
538
+ }
539
+
540
+ if (i == draft.size()) {
541
+ const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
542
+
543
+ common_sampler_accept(gsmpl, id, true);
544
+
545
+ result.push_back(id);
546
+ }
547
+
548
+ return result;
549
+ }
550
+
551
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
552
+ std::vector<int> idxs(draft.size() + 1);
553
+ for (size_t i = 0; i < idxs.size(); ++i) {
554
+ idxs[i] = i;
555
+ }
556
+
557
+ return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
558
+ }
559
+
560
+ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
561
+ return llama_sampler_get_seed(gsmpl->chain);
562
+ }
563
+
564
+ // helpers
565
+
566
+ llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
567
+ const auto tm = gsmpl->tm();
568
+
569
+ auto * res = &gsmpl->cur_p;
570
+
571
+ if (do_sort && !res->sorted) {
572
+ // remember the selected token before sorting
573
+ const llama_token id = res->data[res->selected].id;
574
+
575
+ std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
576
+ return a.p > b.p;
577
+ });
578
+
579
+ // restore the selected token after sorting
580
+ for (size_t i = 0; i < res->size; ++i) {
581
+ if (res->data[i].id == id) {
582
+ res->selected = i;
583
+ break;
584
+ }
585
+ }
586
+
587
+ res->sorted = true;
588
+ }
589
+
590
+ return res;
591
+ }
592
+
593
+ llama_token common_sampler_last(const struct common_sampler * gsmpl) {
594
+ return gsmpl->prev.rat(0);
595
+ }
596
+
597
+ std::string common_sampler_print(const struct common_sampler * gsmpl) {
598
+ std::string result = "logits ";
599
+
600
+ for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
601
+ const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
602
+ result += std::string("-> ");
603
+ result += std::string(llama_sampler_name(smpl)) + " ";
604
+ }
605
+
606
+ return result;
607
+ }
608
+
609
+ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
610
+ n = std::min(n, (int) gsmpl->prev.size());
611
+
612
+ if (n <= 0) {
613
+ return "";
614
+ }
615
+
616
+ std::string result;
617
+ result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
618
+
619
+ for (int i = n - 1; i >= 0; i--) {
620
+ const llama_token id = gsmpl->prev.rat(i);
621
+
622
+ GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
623
+
624
+ result += common_token_to_piece(ctx_main, id);
625
+ }
626
+
627
+ return result;
628
+ }
629
+
630
+ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
631
+ switch (cnstr) {
632
+ case COMMON_SAMPLER_TYPE_DRY: return 'd';
633
+ case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
634
+ case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
635
+ case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
636
+ case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
637
+ case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
638
+ case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
639
+ case COMMON_SAMPLER_TYPE_XTC: return 'x';
640
+ case COMMON_SAMPLER_TYPE_INFILL: return 'i';
641
+ case COMMON_SAMPLER_TYPE_PENALTIES: return 'e';
642
+ case COMMON_SAMPLER_TYPE_ADAPTIVE_P: return 'a';
643
+ default : return '?';
644
+ }
645
+ }
646
+
647
+ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
648
+ switch (cnstr) {
649
+ case COMMON_SAMPLER_TYPE_DRY: return "dry";
650
+ case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
651
+ case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
652
+ case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
653
+ case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
654
+ case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
655
+ case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
656
+ case COMMON_SAMPLER_TYPE_XTC: return "xtc";
657
+ case COMMON_SAMPLER_TYPE_INFILL: return "infill";
658
+ case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties";
659
+ case COMMON_SAMPLER_TYPE_ADAPTIVE_P: return "adaptive_p";
660
+ default : return "";
661
+ }
662
+ }
663
+
664
+ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
665
+ std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
666
+ { "dry", COMMON_SAMPLER_TYPE_DRY },
667
+ { "top_k", COMMON_SAMPLER_TYPE_TOP_K },
668
+ { "top_p", COMMON_SAMPLER_TYPE_TOP_P },
669
+ { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
670
+ { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
671
+ { "min_p", COMMON_SAMPLER_TYPE_MIN_P },
672
+ { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
673
+ { "xtc", COMMON_SAMPLER_TYPE_XTC },
674
+ { "infill", COMMON_SAMPLER_TYPE_INFILL },
675
+ { "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
676
+ { "adaptive_p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
677
+ };
678
+
679
+ // since samplers names are written multiple ways
680
+ // make it ready for both system names and input names
681
+ std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
682
+ { "top-k", COMMON_SAMPLER_TYPE_TOP_K },
683
+ { "top-p", COMMON_SAMPLER_TYPE_TOP_P },
684
+ { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
685
+ { "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
686
+ { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
687
+ { "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
688
+ { "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
689
+ { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
690
+ { "min-p", COMMON_SAMPLER_TYPE_MIN_P },
691
+ { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
692
+ { "adaptive-p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
693
+ };
694
+
695
+ std::vector<common_sampler_type> samplers;
696
+ samplers.reserve(names.size());
697
+
698
+ for (const auto & name : names) {
699
+ auto sampler = sampler_canonical_name_map.find(name);
700
+ if (sampler != sampler_canonical_name_map.end()) {
701
+ samplers.push_back(sampler->second);
702
+ continue;
703
+ }
704
+ if (allow_alt_names) {
705
+ sampler = sampler_alt_name_map.find(name);
706
+ if (sampler != sampler_alt_name_map.end()) {
707
+ samplers.push_back(sampler->second);
708
+ continue;
709
+ }
710
+ }
711
+ LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
712
+ }
713
+
714
+ return samplers;
715
+ }
716
+
717
+ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
718
+ std::unordered_map<char, common_sampler_type> sampler_name_map = {
719
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY), COMMON_SAMPLER_TYPE_DRY },
720
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
721
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
722
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
723
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
724
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
725
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
726
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
727
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
728
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES },
729
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_ADAPTIVE_P), COMMON_SAMPLER_TYPE_ADAPTIVE_P },
730
+ };
731
+
732
+ std::vector<common_sampler_type> samplers;
733
+ samplers.reserve(chars.size());
734
+
735
+ for (const auto & c : chars) {
736
+ const auto sampler = sampler_name_map.find(c);
737
+ if (sampler != sampler_name_map.end()) {
738
+ samplers.push_back(sampler->second);
739
+ } else {
740
+ LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
741
+ }
742
+ }
743
+
744
+ return samplers;
745
+ }