local-llm-rn 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (626):
  1. package/cpp/CMakeLists.txt +285 -0
  2. package/cpp/common/CMakeLists.txt +149 -0
  3. package/cpp/common/arg.cpp +3799 -0
  4. package/cpp/common/arg.h +131 -0
  5. package/cpp/common/base64.hpp +392 -0
  6. package/cpp/common/build-info.cpp.in +4 -0
  7. package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
  8. package/cpp/common/chat-parser-xml-toolcall.h +45 -0
  9. package/cpp/common/chat-parser.cpp +1649 -0
  10. package/cpp/common/chat-parser.h +133 -0
  11. package/cpp/common/chat-peg-parser.cpp +124 -0
  12. package/cpp/common/chat-peg-parser.h +105 -0
  13. package/cpp/common/chat.cpp +3355 -0
  14. package/cpp/common/chat.h +252 -0
  15. package/cpp/common/common.cpp +1824 -0
  16. package/cpp/common/common.h +930 -0
  17. package/cpp/common/console.cpp +1137 -0
  18. package/cpp/common/console.h +41 -0
  19. package/cpp/common/debug.cpp +167 -0
  20. package/cpp/common/debug.h +43 -0
  21. package/cpp/common/download.cpp +792 -0
  22. package/cpp/common/download.h +84 -0
  23. package/cpp/common/http.h +84 -0
  24. package/cpp/common/jinja/README.md +88 -0
  25. package/cpp/common/jinja/caps.cpp +285 -0
  26. package/cpp/common/jinja/caps.h +30 -0
  27. package/cpp/common/jinja/lexer.cpp +341 -0
  28. package/cpp/common/jinja/lexer.h +157 -0
  29. package/cpp/common/jinja/parser.cpp +591 -0
  30. package/cpp/common/jinja/parser.h +21 -0
  31. package/cpp/common/jinja/runtime.cpp +867 -0
  32. package/cpp/common/jinja/runtime.h +638 -0
  33. package/cpp/common/jinja/string.cpp +213 -0
  34. package/cpp/common/jinja/string.h +61 -0
  35. package/cpp/common/jinja/utils.h +149 -0
  36. package/cpp/common/jinja/value.cpp +1393 -0
  37. package/cpp/common/jinja/value.h +756 -0
  38. package/cpp/common/json-partial.cpp +324 -0
  39. package/cpp/common/json-partial.h +39 -0
  40. package/cpp/common/json-schema-to-grammar.cpp +1153 -0
  41. package/cpp/common/json-schema-to-grammar.h +43 -0
  42. package/cpp/common/llguidance.cpp +258 -0
  43. package/cpp/common/log.cpp +446 -0
  44. package/cpp/common/log.h +119 -0
  45. package/cpp/common/ngram-cache.cpp +285 -0
  46. package/cpp/common/ngram-cache.h +101 -0
  47. package/cpp/common/ngram-map.cpp +530 -0
  48. package/cpp/common/ngram-map.h +115 -0
  49. package/cpp/common/ngram-mod.cpp +60 -0
  50. package/cpp/common/ngram-mod.h +38 -0
  51. package/cpp/common/peg-parser.cpp +1712 -0
  52. package/cpp/common/peg-parser.h +459 -0
  53. package/cpp/common/preset.cpp +483 -0
  54. package/cpp/common/preset.h +83 -0
  55. package/cpp/common/regex-partial.cpp +204 -0
  56. package/cpp/common/regex-partial.h +56 -0
  57. package/cpp/common/sampling.cpp +745 -0
  58. package/cpp/common/sampling.h +119 -0
  59. package/cpp/common/speculative.cpp +1074 -0
  60. package/cpp/common/speculative.h +41 -0
  61. package/cpp/common/unicode.cpp +64 -0
  62. package/cpp/common/unicode.h +22 -0
  63. package/cpp/ggml/CMakeLists.txt +494 -0
  64. package/cpp/ggml/cmake/GitVars.cmake +22 -0
  65. package/cpp/ggml/cmake/common.cmake +50 -0
  66. package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
  67. package/cpp/ggml/include/ggml-alloc.h +85 -0
  68. package/cpp/ggml/include/ggml-backend.h +373 -0
  69. package/cpp/ggml/include/ggml-blas.h +25 -0
  70. package/cpp/ggml/include/ggml-cann.h +123 -0
  71. package/cpp/ggml/include/ggml-cpp.h +39 -0
  72. package/cpp/ggml/include/ggml-cpu.h +151 -0
  73. package/cpp/ggml/include/ggml-cuda.h +47 -0
  74. package/cpp/ggml/include/ggml-hexagon.h +19 -0
  75. package/cpp/ggml/include/ggml-metal.h +61 -0
  76. package/cpp/ggml/include/ggml-opencl.h +26 -0
  77. package/cpp/ggml/include/ggml-opt.h +256 -0
  78. package/cpp/ggml/include/ggml-rpc.h +30 -0
  79. package/cpp/ggml/include/ggml-sycl.h +49 -0
  80. package/cpp/ggml/include/ggml-virtgpu.h +14 -0
  81. package/cpp/ggml/include/ggml-vulkan.h +29 -0
  82. package/cpp/ggml/include/ggml-webgpu.h +19 -0
  83. package/cpp/ggml/include/ggml-zdnn.h +17 -0
  84. package/cpp/ggml/include/ggml-zendnn.h +22 -0
  85. package/cpp/ggml/include/ggml.h +2753 -0
  86. package/cpp/ggml/include/gguf.h +204 -0
  87. package/cpp/ggml/src/CMakeLists.txt +492 -0
  88. package/cpp/ggml/src/ggml-alloc.c +1244 -0
  89. package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
  90. package/cpp/ggml/src/ggml-backend-dl.h +45 -0
  91. package/cpp/ggml/src/ggml-backend-impl.h +255 -0
  92. package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
  93. package/cpp/ggml/src/ggml-backend.cpp +2270 -0
  94. package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  95. package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
  96. package/cpp/ggml/src/ggml-common.h +1878 -0
  97. package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
  98. package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
  99. package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  100. package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  101. package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  102. package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  103. package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  104. package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
  105. package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
  106. package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
  107. package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  108. package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  109. package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  110. package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
  111. package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  112. package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  113. package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  114. package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  115. package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  116. package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  117. package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  118. package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
  119. package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  120. package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  121. package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  122. package/cpp/ggml/src/ggml-cpu/common.h +95 -0
  123. package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
  124. package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
  125. package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
  126. package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
  127. package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
  128. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
  129. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  130. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
  131. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  132. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
  133. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  134. package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
  135. package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
  136. package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
  137. package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
  138. package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
  139. package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
  140. package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  141. package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
  142. package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  143. package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  144. package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  145. package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  146. package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
  147. package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
  148. package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  149. package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
  150. package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
  151. package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
  152. package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  153. package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
  154. package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
  155. package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
  156. package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  157. package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
  158. package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  159. package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
  160. package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
  161. package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
  162. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  163. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  164. package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  165. package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  166. package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  167. package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  168. package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
  169. package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
  170. package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  171. package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
  172. package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
  173. package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  174. package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
  175. package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  176. package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  177. package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  178. package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
  179. package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
  180. package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  181. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  182. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  183. package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  184. package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
  185. package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
  186. package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
  187. package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
  188. package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
  189. package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
  190. package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
  191. package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
  192. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  193. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  194. package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  195. package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  196. package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
  197. package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  198. package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  199. package/cpp/ggml/src/ggml-impl.h +724 -0
  200. package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  201. package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  202. package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  203. package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  204. package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
  205. package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
  206. package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
  207. package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
  208. package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
  209. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
  210. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  211. package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
  212. package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
  213. package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
  214. package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  215. package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  216. package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
  217. package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
  218. package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  219. package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  220. package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  221. package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  222. package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  223. package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  224. package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  225. package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  226. package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
  227. package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  228. package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  229. package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  230. package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  231. package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  232. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  233. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  234. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  235. package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  236. package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  237. package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  238. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  239. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  240. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  241. package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  242. package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  243. package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  244. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  245. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  246. package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  247. package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  248. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  249. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  250. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  251. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  252. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  253. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  254. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  255. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  256. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  257. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  258. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  259. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  260. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  261. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  262. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  263. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  264. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  265. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  266. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  267. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  268. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  269. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  270. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  271. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  272. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  273. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  274. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  275. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  276. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  277. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  278. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  279. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  280. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  281. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  282. package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  283. package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  284. package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  285. package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  286. package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  287. package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  288. package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  289. package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  290. package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  291. package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  292. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  293. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  294. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  295. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  296. package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  297. package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  298. package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  299. package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  300. package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  301. package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  302. package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  303. package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  304. package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
  305. package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  306. package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  307. package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  308. package/cpp/ggml/src/ggml-opt.cpp +1093 -0
  309. package/cpp/ggml/src/ggml-quants.c +5325 -0
  310. package/cpp/ggml/src/ggml-quants.h +106 -0
  311. package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  312. package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
  313. package/cpp/ggml/src/ggml-threading.cpp +12 -0
  314. package/cpp/ggml/src/ggml-threading.h +14 -0
  315. package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  316. package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  317. package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  318. package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  319. package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  320. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  321. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  322. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  323. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  324. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  325. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  326. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  327. package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  328. package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  329. package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  330. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  331. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  332. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  333. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  334. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  335. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  336. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  337. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  338. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  339. package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  340. package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  341. package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  342. package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  343. package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  344. package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  345. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  346. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  347. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  348. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  349. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  350. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  351. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  352. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  353. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  354. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  355. package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  356. package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  357. package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  358. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
  359. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
  360. package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  361. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  362. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  363. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  364. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
  365. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
  366. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  367. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  368. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
  369. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  370. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
  371. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  372. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  373. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
  374. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
  375. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
  376. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
  377. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
  378. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  379. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  380. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  381. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  382. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  383. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  384. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  385. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  386. package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  387. package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  388. package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
  389. package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  390. package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  391. package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  392. package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  393. package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  394. package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  395. package/cpp/ggml/src/ggml.c +7669 -0
  396. package/cpp/ggml/src/ggml.cpp +26 -0
  397. package/cpp/ggml/src/gguf.cpp +1699 -0
  398. package/cpp/include/llama-cpp.h +32 -0
  399. package/cpp/include/llama.h +1568 -0
  400. package/cpp/mtmd/CMakeLists.txt +98 -0
  401. package/cpp/mtmd/README.md +63 -0
  402. package/cpp/mtmd/clip-graph.h +117 -0
  403. package/cpp/mtmd/clip-impl.h +586 -0
  404. package/cpp/mtmd/clip-model.h +390 -0
  405. package/cpp/mtmd/clip.cpp +4154 -0
  406. package/cpp/mtmd/clip.h +121 -0
  407. package/cpp/mtmd/deprecation-warning.cpp +22 -0
  408. package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
  409. package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
  410. package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
  411. package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
  412. package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
  413. package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
  414. package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
  415. package/cpp/mtmd/models/cogvlm.cpp +98 -0
  416. package/cpp/mtmd/models/conformer.cpp +216 -0
  417. package/cpp/mtmd/models/glm4v.cpp +122 -0
  418. package/cpp/mtmd/models/internvl.cpp +69 -0
  419. package/cpp/mtmd/models/kimik25.cpp +101 -0
  420. package/cpp/mtmd/models/kimivl.cpp +63 -0
  421. package/cpp/mtmd/models/llama4.cpp +96 -0
  422. package/cpp/mtmd/models/llava.cpp +374 -0
  423. package/cpp/mtmd/models/minicpmv.cpp +114 -0
  424. package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
  425. package/cpp/mtmd/models/models.h +128 -0
  426. package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
  427. package/cpp/mtmd/models/paddleocr.cpp +52 -0
  428. package/cpp/mtmd/models/pixtral.cpp +86 -0
  429. package/cpp/mtmd/models/qwen2vl.cpp +183 -0
  430. package/cpp/mtmd/models/qwen3vl.cpp +193 -0
  431. package/cpp/mtmd/models/siglip.cpp +86 -0
  432. package/cpp/mtmd/models/whisper-enc.cpp +115 -0
  433. package/cpp/mtmd/models/youtuvl.cpp +179 -0
  434. package/cpp/mtmd/mtmd-audio.cpp +730 -0
  435. package/cpp/mtmd/mtmd-audio.h +113 -0
  436. package/cpp/mtmd/mtmd-cli.cpp +437 -0
  437. package/cpp/mtmd/mtmd-helper.cpp +521 -0
  438. package/cpp/mtmd/mtmd-helper.h +96 -0
  439. package/cpp/mtmd/mtmd.cpp +1156 -0
  440. package/cpp/mtmd/mtmd.h +319 -0
  441. package/cpp/mtmd/requirements.txt +5 -0
  442. package/cpp/mtmd/test-1.jpeg +0 -0
  443. package/cpp/mtmd/test-2.mp3 +0 -0
  444. package/cpp/mtmd/tests.sh +192 -0
  445. package/cpp/src/CMakeLists.txt +169 -0
  446. package/cpp/src/llama-adapter.cpp +488 -0
  447. package/cpp/src/llama-adapter.h +89 -0
  448. package/cpp/src/llama-arch.cpp +2855 -0
  449. package/cpp/src/llama-arch.h +619 -0
  450. package/cpp/src/llama-batch.cpp +917 -0
  451. package/cpp/src/llama-batch.h +173 -0
  452. package/cpp/src/llama-chat.cpp +896 -0
  453. package/cpp/src/llama-chat.h +71 -0
  454. package/cpp/src/llama-context.cpp +3512 -0
  455. package/cpp/src/llama-context.h +359 -0
  456. package/cpp/src/llama-cparams.cpp +5 -0
  457. package/cpp/src/llama-cparams.h +44 -0
  458. package/cpp/src/llama-grammar.cpp +1464 -0
  459. package/cpp/src/llama-grammar.h +194 -0
  460. package/cpp/src/llama-graph.cpp +2685 -0
  461. package/cpp/src/llama-graph.h +1026 -0
  462. package/cpp/src/llama-hparams.cpp +234 -0
  463. package/cpp/src/llama-hparams.h +339 -0
  464. package/cpp/src/llama-impl.cpp +171 -0
  465. package/cpp/src/llama-impl.h +73 -0
  466. package/cpp/src/llama-io.cpp +15 -0
  467. package/cpp/src/llama-io.h +35 -0
  468. package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
  469. package/cpp/src/llama-kv-cache-iswa.h +137 -0
  470. package/cpp/src/llama-kv-cache.cpp +2271 -0
  471. package/cpp/src/llama-kv-cache.h +388 -0
  472. package/cpp/src/llama-kv-cells.h +533 -0
  473. package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
  474. package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
  475. package/cpp/src/llama-memory-hybrid.cpp +268 -0
  476. package/cpp/src/llama-memory-hybrid.h +139 -0
  477. package/cpp/src/llama-memory-recurrent.cpp +1165 -0
  478. package/cpp/src/llama-memory-recurrent.h +182 -0
  479. package/cpp/src/llama-memory.cpp +59 -0
  480. package/cpp/src/llama-memory.h +122 -0
  481. package/cpp/src/llama-mmap.cpp +785 -0
  482. package/cpp/src/llama-mmap.h +92 -0
  483. package/cpp/src/llama-model-loader.cpp +1414 -0
  484. package/cpp/src/llama-model-loader.h +203 -0
  485. package/cpp/src/llama-model-saver.cpp +286 -0
  486. package/cpp/src/llama-model-saver.h +37 -0
  487. package/cpp/src/llama-model.cpp +9253 -0
  488. package/cpp/src/llama-model.h +576 -0
  489. package/cpp/src/llama-quant.cpp +1119 -0
  490. package/cpp/src/llama-quant.h +1 -0
  491. package/cpp/src/llama-sampler.cpp +3885 -0
  492. package/cpp/src/llama-sampler.h +42 -0
  493. package/cpp/src/llama-vocab.cpp +3970 -0
  494. package/cpp/src/llama-vocab.h +187 -0
  495. package/cpp/src/llama.cpp +1313 -0
  496. package/cpp/src/models/afmoe.cpp +191 -0
  497. package/cpp/src/models/apertus.cpp +125 -0
  498. package/cpp/src/models/arcee.cpp +135 -0
  499. package/cpp/src/models/arctic.cpp +138 -0
  500. package/cpp/src/models/arwkv7.cpp +86 -0
  501. package/cpp/src/models/baichuan.cpp +122 -0
  502. package/cpp/src/models/bailingmoe.cpp +144 -0
  503. package/cpp/src/models/bailingmoe2.cpp +135 -0
  504. package/cpp/src/models/bert.cpp +178 -0
  505. package/cpp/src/models/bitnet.cpp +160 -0
  506. package/cpp/src/models/bloom.cpp +101 -0
  507. package/cpp/src/models/chameleon.cpp +178 -0
  508. package/cpp/src/models/chatglm.cpp +132 -0
  509. package/cpp/src/models/codeshell.cpp +111 -0
  510. package/cpp/src/models/cogvlm.cpp +102 -0
  511. package/cpp/src/models/cohere2-iswa.cpp +134 -0
  512. package/cpp/src/models/command-r.cpp +122 -0
  513. package/cpp/src/models/dbrx.cpp +123 -0
  514. package/cpp/src/models/deci.cpp +135 -0
  515. package/cpp/src/models/deepseek.cpp +144 -0
  516. package/cpp/src/models/deepseek2.cpp +262 -0
  517. package/cpp/src/models/delta-net-base.cpp +376 -0
  518. package/cpp/src/models/dots1.cpp +134 -0
  519. package/cpp/src/models/dream.cpp +105 -0
  520. package/cpp/src/models/ernie4-5-moe.cpp +150 -0
  521. package/cpp/src/models/ernie4-5.cpp +110 -0
  522. package/cpp/src/models/eurobert.cpp +97 -0
  523. package/cpp/src/models/exaone-moe.cpp +146 -0
  524. package/cpp/src/models/exaone.cpp +114 -0
  525. package/cpp/src/models/exaone4.cpp +123 -0
  526. package/cpp/src/models/falcon-h1.cpp +111 -0
  527. package/cpp/src/models/falcon.cpp +120 -0
  528. package/cpp/src/models/gemma-embedding.cpp +116 -0
  529. package/cpp/src/models/gemma.cpp +112 -0
  530. package/cpp/src/models/gemma2-iswa.cpp +128 -0
  531. package/cpp/src/models/gemma3.cpp +155 -0
  532. package/cpp/src/models/gemma3n-iswa.cpp +384 -0
  533. package/cpp/src/models/glm4-moe.cpp +170 -0
  534. package/cpp/src/models/glm4.cpp +157 -0
  535. package/cpp/src/models/gpt2.cpp +105 -0
  536. package/cpp/src/models/gptneox.cpp +144 -0
  537. package/cpp/src/models/granite-hybrid.cpp +196 -0
  538. package/cpp/src/models/granite.cpp +211 -0
  539. package/cpp/src/models/grok.cpp +159 -0
  540. package/cpp/src/models/grovemoe.cpp +141 -0
  541. package/cpp/src/models/hunyuan-dense.cpp +132 -0
  542. package/cpp/src/models/hunyuan-moe.cpp +154 -0
  543. package/cpp/src/models/internlm2.cpp +120 -0
  544. package/cpp/src/models/jais.cpp +86 -0
  545. package/cpp/src/models/jais2.cpp +123 -0
  546. package/cpp/src/models/jamba.cpp +106 -0
  547. package/cpp/src/models/kimi-linear.cpp +392 -0
  548. package/cpp/src/models/lfm2.cpp +190 -0
  549. package/cpp/src/models/llada-moe.cpp +122 -0
  550. package/cpp/src/models/llada.cpp +99 -0
  551. package/cpp/src/models/llama-iswa.cpp +178 -0
  552. package/cpp/src/models/llama.cpp +168 -0
  553. package/cpp/src/models/maincoder.cpp +117 -0
  554. package/cpp/src/models/mamba-base.cpp +285 -0
  555. package/cpp/src/models/mamba.cpp +54 -0
  556. package/cpp/src/models/mimo2-iswa.cpp +123 -0
  557. package/cpp/src/models/minicpm3.cpp +200 -0
  558. package/cpp/src/models/minimax-m2.cpp +124 -0
  559. package/cpp/src/models/mistral3.cpp +160 -0
  560. package/cpp/src/models/models.h +684 -0
  561. package/cpp/src/models/modern-bert.cpp +109 -0
  562. package/cpp/src/models/mpt.cpp +126 -0
  563. package/cpp/src/models/nemotron-h.cpp +148 -0
  564. package/cpp/src/models/nemotron.cpp +122 -0
  565. package/cpp/src/models/neo-bert.cpp +104 -0
  566. package/cpp/src/models/olmo.cpp +121 -0
  567. package/cpp/src/models/olmo2.cpp +150 -0
  568. package/cpp/src/models/olmoe.cpp +124 -0
  569. package/cpp/src/models/openai-moe-iswa.cpp +127 -0
  570. package/cpp/src/models/openelm.cpp +124 -0
  571. package/cpp/src/models/orion.cpp +123 -0
  572. package/cpp/src/models/paddleocr.cpp +122 -0
  573. package/cpp/src/models/pangu-embedded.cpp +121 -0
  574. package/cpp/src/models/phi2.cpp +121 -0
  575. package/cpp/src/models/phi3.cpp +152 -0
  576. package/cpp/src/models/plamo.cpp +110 -0
  577. package/cpp/src/models/plamo2.cpp +318 -0
  578. package/cpp/src/models/plamo3.cpp +128 -0
  579. package/cpp/src/models/plm.cpp +169 -0
  580. package/cpp/src/models/qwen.cpp +108 -0
  581. package/cpp/src/models/qwen2.cpp +126 -0
  582. package/cpp/src/models/qwen2moe.cpp +151 -0
  583. package/cpp/src/models/qwen2vl.cpp +117 -0
  584. package/cpp/src/models/qwen3.cpp +117 -0
  585. package/cpp/src/models/qwen35.cpp +386 -0
  586. package/cpp/src/models/qwen35moe.cpp +420 -0
  587. package/cpp/src/models/qwen3moe.cpp +124 -0
  588. package/cpp/src/models/qwen3next.cpp +525 -0
  589. package/cpp/src/models/qwen3vl-moe.cpp +140 -0
  590. package/cpp/src/models/qwen3vl.cpp +132 -0
  591. package/cpp/src/models/refact.cpp +94 -0
  592. package/cpp/src/models/rnd1.cpp +126 -0
  593. package/cpp/src/models/rwkv6-base.cpp +164 -0
  594. package/cpp/src/models/rwkv6.cpp +94 -0
  595. package/cpp/src/models/rwkv6qwen2.cpp +86 -0
  596. package/cpp/src/models/rwkv7-base.cpp +137 -0
  597. package/cpp/src/models/rwkv7.cpp +90 -0
  598. package/cpp/src/models/seed-oss.cpp +124 -0
  599. package/cpp/src/models/smallthinker.cpp +126 -0
  600. package/cpp/src/models/smollm3.cpp +128 -0
  601. package/cpp/src/models/stablelm.cpp +146 -0
  602. package/cpp/src/models/starcoder.cpp +100 -0
  603. package/cpp/src/models/starcoder2.cpp +121 -0
  604. package/cpp/src/models/step35-iswa.cpp +168 -0
  605. package/cpp/src/models/t5-dec.cpp +166 -0
  606. package/cpp/src/models/t5-enc.cpp +96 -0
  607. package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
  608. package/cpp/src/models/xverse.cpp +108 -0
  609. package/cpp/src/unicode-data.cpp +7034 -0
  610. package/cpp/src/unicode-data.h +20 -0
  611. package/cpp/src/unicode.cpp +1103 -0
  612. package/cpp/src/unicode.h +111 -0
  613. package/cpp/vendor/nlohmann/json.hpp +25526 -0
  614. package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  615. package/cpp/vendor/stb/stb_image.h +7988 -0
  616. package/ios/LocalLLM-Bridging-Header.h +2 -0
  617. package/ios/LocalLLM.h +5 -0
  618. package/ios/LocalLLM.mm +1267 -0
  619. package/local-llm-rn.podspec +60 -0
  620. package/package.json +35 -0
  621. package/src/NativeLocalLLM.ts +73 -0
  622. package/src/device.ts +50 -0
  623. package/src/download-adapter.ts +17 -0
  624. package/src/index.ts +21 -0
  625. package/src/native-bridge.ts +142 -0
  626. package/src/rn-downloader.ts +37 -0
@@ -0,0 +1,530 @@
1
+ #include "common.h"
2
+ #include "log.h"
3
+ #include "ngram-map.h"
4
+
5
+ #include <cinttypes>
6
+ #include <cstdint>
7
+ #include <cstdio>
8
+ #include <sstream>
9
+
10
+ // prime number used for LCG hash function (32 bit), it is near (sqrt(5) - 1)/2 * 2^32.
11
+ #define LCG_FACTOR 2654435761UL
12
+
13
+ // Compute the LCG hash of a n-gram of size len at offset start.
14
+ static uint32_t common_ngram_map_hash(const llama_tokens & tokens, size_t start, size_t len) {
15
+ uint32_t hash = 0;
16
+ for (size_t i = 0; i < len; ++i) {
17
+ hash = hash * LCG_FACTOR + tokens[start + i];
18
+ }
19
+ return hash;
20
+ }
21
+
22
+ // Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
23
+ static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
24
+ std::ostringstream oss;
25
+ oss << '[';
26
+ for (size_t i = 0; i < length; ++i) {
27
+ if (i > 0) {
28
+ oss << ", ";
29
+ }
30
+ oss << inp[start + i];
31
+ }
32
+ oss << ']';
33
+ return oss.str();
34
+ }
35
+
36
+
37
+ // n-gram simple
38
+ //
39
+
40
+ /**
41
+ * Perform speculative generation using the model's own token history.
42
+ * Searches for a matching pattern in the token history and returns draft tokens.
43
+ *
44
+ * @param state Current state of this implementation
45
+ * @param tokens Token history to search in
46
+ * @param sampled Last sampled token
47
+ * @return Vector of draft tokens, empty if no matching pattern is found
48
+ */
49
+ llama_tokens common_ngram_simple_draft(
50
+ const common_ngram_simple_config & config,
51
+ const llama_tokens & tokens, llama_token sampled) {
52
+
53
+ // Simple implementation of self-speculative decoding without a draft model.
54
+ //
55
+ const size_t cur_len = tokens.size();
56
+
57
+ const size_t n_draft_min = config.size_ngram; // size of n-gram to lookup in token history
58
+ const size_t n_draft_max = config.size_mgram; // the m-gram following the found n-gram is used for draft
59
+
60
+ // vector for tokens we want to verify.
61
+ // return empty vector if there is no match.
62
+ llama_tokens draft_tokens;
63
+
64
+ // We need at least n_draft_min + n_draft_max + 1 tokens.
65
+ if (cur_len <= static_cast<size_t>(n_draft_min + n_draft_max + 1)) {
66
+ return draft_tokens;
67
+ }
68
+
69
+ // pattern search
70
+ llama_tokens pattern;
71
+ pattern.reserve(n_draft_min);
72
+ for (size_t j = cur_len - n_draft_min + 1; j < cur_len; ++j) {
73
+ pattern.push_back(tokens[j]);
74
+ }
75
+ pattern.push_back(sampled); // add the last token to the pattern
76
+
77
+ size_t match_pos = 0; // we ignore position 0, position 0 == no match
78
+ // search backwards, but skip the current match (we are currently there)
79
+ for (size_t j = cur_len - n_draft_min - 1; j > 0; --j) {
80
+ bool match = true;
81
+ for (size_t k = 0; k < pattern.size(); ++k) {
82
+ if (tokens[j + k] != pattern[k]) {
83
+ match = false;
84
+ break;
85
+ }
86
+ }
87
+ if (match) {
88
+ match_pos = j;
89
+ break;
90
+ }
91
+ }
92
+ if (match_pos == 0) {
93
+ return draft_tokens;
94
+ }
95
+
96
+ const size_t copy_max = std::min(
97
+ n_draft_max,
98
+ cur_len - (match_pos + n_draft_min)
99
+ );
100
+ if (copy_max < n_draft_min) {
101
+ return draft_tokens;
102
+ }
103
+ LOG_DBG("%s: #tokens = %zu: found matching pattern at pos %zu, length %zu, draft length %zu\n",
104
+ __func__, cur_len,
105
+ match_pos, pattern.size(), copy_max);
106
+
107
+ draft_tokens.reserve(copy_max);
108
+ for (size_t j = 0; j < copy_max; ++j) {
109
+ draft_tokens.push_back(tokens[match_pos + n_draft_min + j]);
110
+ }
111
+ return draft_tokens;
112
+ }
113
+
114
+
115
+ // n-gram map
116
+ //
117
+
118
+ // maximum number of counted values of a ngram map value.
119
+ #define COMMON_NGRAM_MAX_VALUE_COUNT 16380
120
+
121
+ void common_ngram_map_begin(
122
+ common_ngram_map & map, const llama_tokens & tokens) {
123
+ size_t size_begin = tokens.size();
124
+
125
+ LOG_DBG("%s: begin, idx_last_draft=%zu, new begin=%zu, #keys=%zu\n", __func__,
126
+ map.idx_last_check, size_begin, map.keys.size());
127
+
128
+ size_t count_map_entries_upd = 0;
129
+ if (!map.key_map.empty() && size_begin < map.idx_last_check) {
130
+ if (map.show_key_map_stats) {
131
+ // Print statistics of hash map map_key.
132
+ size_t count_nonzero = 0;
133
+ uint32_t min_idx = UINT32_MAX;
134
+ uint32_t max_idx = 0;
135
+ for (size_t i = 0; i < map.key_map.size(); ++i) {
136
+ uint32_t key_idx = map.key_map[i];
137
+ if (key_idx != 0) {
138
+ ++count_nonzero;
139
+ if (key_idx < min_idx) min_idx = key_idx;
140
+ if (key_idx > max_idx) max_idx = key_idx;
141
+ }
142
+ }
143
+ if (count_nonzero == 0) {
144
+ min_idx = 0;
145
+ }
146
+ LOG_INF("%s: key_map stats: entries=%zu, min_idx=%u, max_idx=%u, key_map_last_idx=%u\n",
147
+ __func__, count_nonzero, min_idx, max_idx, map.key_map_last_idx);
148
+ }
149
+
150
+ // Update the map from hash to key index (clear outdated entries).
151
+ for (size_t i = 0; i < map.key_map.size(); ++i) {
152
+ uint32_t key_idx = map.key_map[i];
153
+ if (key_idx >= map.size_last_begin) {
154
+ map.key_map[i] = 0;
155
+ count_map_entries_upd++;
156
+ }
157
+ }
158
+ map.key_map_last_idx = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
159
+ }
160
+
161
+ if (size_begin < map.idx_last_check && !map.keys.empty()) {
162
+ // The next token generation will start at index size_begin.
163
+ // The tokens between map.size_last_begin and size_begin are no longer valid.
164
+ //
165
+ // Refresh map: Remove all entries with index >= map.size_last_begin.
166
+ size_t count_keys = map.keys.size();
167
+ size_t count_keys_del = 0;
168
+ size_t count_values_del = 0;
169
+ for (int32_t i = map.keys.size() - 1; i >= 0; --i) {
170
+ common_ngram_map_key & key = map.keys[i];
171
+ if (key.key_idx >= map.size_last_begin) {
172
+ // Delete the key.
173
+ LOG_DBG("%s: delete key %d at index %zu (>= size_last_begin=%zu)\n", __func__, i, key.key_idx, map.size_last_begin);
174
+ map.keys.erase(map.keys.begin() + i);
175
+ count_keys_del++;
176
+ continue;
177
+ }
178
+ if (map.key_only) {
179
+ continue;
180
+ }
181
+
182
+ // Check the indices of the values.
183
+ for (int16_t j = COMMON_NGRAM_MAX_VALUES - 1; j >= 0; --j) {
184
+ common_ngram_map_value & value = key.values[j];
185
+ if (value.value_idx >= map.size_last_begin) {
186
+ // Delete the value.
187
+ count_values_del++;
188
+
189
+ // Move all values after this value to the left.
190
+ for (uint16_t k = j; k < COMMON_NGRAM_MAX_VALUES - 1; ++k) {
191
+ key.values[k] = key.values[k + 1];
192
+ }
193
+ // Clear the last value.
194
+ key.values[COMMON_NGRAM_MAX_VALUES - 1].value_idx = 0;
195
+ key.values[COMMON_NGRAM_MAX_VALUES - 1].value_num = 0;
196
+ }
197
+ }
198
+ if (key.values[0].value_idx == 0) {
199
+ // No values left, delete the key.
200
+ LOG_DBG("%s: delete key %d at index %zu (no values left)\n", __func__, i, key.key_idx);
201
+ map.keys.erase(map.keys.begin() + i);
202
+ count_keys_del++;
203
+ }
204
+ }
205
+
206
+ LOG_INF("%s: refresh map: idx_last_draft=%zu, new begin=%zu, #keys_checked=%zu, #keys_del=%zu, #values_del=%zu, #hashes_upd=%zu\n", __func__,
207
+ map.idx_last_check, size_begin,
208
+ count_keys, count_keys_del, count_values_del, count_map_entries_upd);
209
+ }
210
+
211
+ map.idx_last_check = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
212
+ map.size_last_begin = size_begin;
213
+ }
214
+
215
+ void common_ngram_map_draft(common_ngram_map & map,
216
+ const llama_tokens & inp, llama_token sampled,
217
+ llama_tokens & draft) {
218
+ // reset last key and value.
219
+ map.last_draft_created = false;
220
+ map.last_draft_key_idx = 0;
221
+ map.last_draft_value_idx = 0;
222
+
223
+ const size_t cur_len = inp.size();
224
+ const uint16_t n = map.size_key;
225
+ const uint16_t m = map.size_value;
226
+ if (cur_len < static_cast<size_t>(2 * n + m)) {
227
+ return;
228
+ }
229
+ if (cur_len >= static_cast<size_t>(UINT32_MAX)) {
230
+ // key_map uses uint32_t instead of size_t.
231
+ GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
232
+ }
233
+
234
+ if (map.idx_last_check > cur_len) {
235
+ // Should not happen because of common_ngram_map_begin().
236
+ GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len);
237
+ }
238
+ map.idx_last_check = cur_len;
239
+
240
+ // search pattern, the key n-gram
241
+ std::vector<llama_token> key_tokens;
242
+ key_tokens.reserve(n);
243
+ for (size_t j = cur_len - n + 1; j < cur_len; ++j) {
244
+ key_tokens.push_back(inp[j]);
245
+ }
246
+ key_tokens.push_back(sampled);
247
+
248
+ // search for the key in the map
249
+ size_t match_pos = 0;
250
+ if (map.size_last_begin > cur_len) {
251
+ GGML_ABORT("%s: map.size_last_begin > cur_len: %zu > %zu", __func__, map.size_last_begin, cur_len);
252
+ }
253
+ if (!map.key_map.empty()) {
254
+ // Search for the key in the map key_map from hash of ngrams to index of ngram.
255
+ uint32_t idx_hash = (common_ngram_map_hash(key_tokens, 0, n) % map.key_map.size());
256
+ uint32_t idx_key = map.key_map[idx_hash];
257
+ if (idx_key != 0 && idx_key < cur_len - n - m - 1) {
258
+ // Check if the key matches the key at idx_key (because of possible collisions).
259
+ bool match = true;
260
+ for (size_t k = 0; k < n; ++k) {
261
+ if (inp[idx_key + k] != key_tokens[k]) {
262
+ match = false;
263
+ break;
264
+ }
265
+ }
266
+ LOG_DBG("%s: key hash %x -> idx_key %d: match %d\n", __func__, idx_hash, idx_key, match ? 1 : 0);
267
+ if (match) {
268
+ match_pos = idx_key;
269
+ }
270
+ }
271
+ }
272
+ if (match_pos == 0 && map.size_last_begin > (size_t) (n + m + 1)) {
273
+ // Search for the key in [1, map.size_last_begin - n - m -1], descending.
274
+ for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) {
275
+ // Check if the key matches the key.
276
+ bool match = true;
277
+ for (size_t k = 0; k < n; ++k) {
278
+ if (inp[j + k] != key_tokens[k]) {
279
+ match = false;
280
+ break;
281
+ }
282
+ }
283
+ if (match) {
284
+ match_pos = j;
285
+ break;
286
+ }
287
+ }
288
+ }
289
+ if (match_pos == 0) {
290
+ // In case of a reasoning chat, the part after size_last_begin may be deleted/reordered later.
291
+ //
292
+ // Search in [size_last_begin, cur_len - n - m - 1], descending.
293
+ for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) {
294
+ bool match = true;
295
+ for (size_t k = 0; k < n; ++k) {
296
+ if (inp[j + k] != key_tokens[k]) {
297
+ match = false;
298
+ break;
299
+ }
300
+ }
301
+ if (match) {
302
+ match_pos = j;
303
+ break;
304
+ }
305
+ }
306
+ }
307
+ if (match_pos > 0) {
308
+ LOG_DBG("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__,
309
+ cur_len, n, m, key_tokens.size(), sampled, match_pos);
310
+ }
311
+
312
+ if (!map.key_map.empty()) {
313
+ // Add hashes of new ngrams in key_map.
314
+ //
315
+ // Use the same order as above.
316
+ if (map.size_last_begin > (size_t) (n + m + 1)) {
317
+ for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) {
318
+ // compute hash and store index of ngram at idx j in the map.
319
+ uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size());
320
+ if (map.key_map[idx_hash] == 0) {
321
+ map.key_map[idx_hash] = j; // collisions may occur
322
+ }
323
+ }
324
+ }
325
+
326
+ for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) {
327
+ // compute hash and store index of ngram at idx j in the map.
328
+ uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size());
329
+ if (map.key_map[idx_hash] == 0) {
330
+ map.key_map[idx_hash] = j;
331
+ }
332
+ }
333
+ map.key_map_last_idx = std::max(static_cast<uint32_t>(cur_len - n - m - 1), map.key_map_last_idx);
334
+ }
335
+
336
+ if (match_pos == 0) {
337
+ return;
338
+ }
339
+
340
+ // We have a match, now we look for the statistics of the key.
341
+ size_t key_offset = map.keys.size(); // offset in the map
342
+ // We iterate through the std::vector<common_ngram_map_key> map->keys.
343
+ for (size_t i = 0; i < map.keys.size(); ++i) {
344
+ bool match = true;
345
+ for (size_t j = 0; j < n; ++j) {
346
+ if (inp[map.keys[i].key_idx + j] != key_tokens[j]) {
347
+ match = false;
348
+ break;
349
+ }
350
+ }
351
+ if (match) {
352
+ key_offset = i;
353
+ break;
354
+ }
355
+ }
356
+ if (key_offset == map.keys.size()) {
357
+ // We create a new key-entry, it will get offset key_offset.
358
+ common_ngram_map_key new_key;
359
+ new_key.key_idx = match_pos;
360
+ new_key.stat_idx = 0;
361
+ new_key.key_num = 0;
362
+ for (int i = 0; i < COMMON_NGRAM_MAX_VALUES; ++i) {
363
+ new_key.values[i].value_num = 0;
364
+ new_key.values[i].n_accepted = m;
365
+ }
366
+ map.keys.push_back(new_key);
367
+ }
368
+
369
+ // our key n-gram:
370
+ common_ngram_map_key & curr_key = map.keys[key_offset];
371
+
372
+ // update number of key hits
373
+ curr_key.key_num = (uint16_t) std::min((int) map.keys[key_offset].key_num + 1,
374
+ (int) COMMON_NGRAM_MAX_VALUE_COUNT);
375
+
376
+ if (map.key_only) {
377
+ // simple mode:
378
+ // Fill in the draft with the m tokens following the key.
379
+ // We work with value values[0] only.
380
+ int n_draft_tokens = std::min((int) m, (int) curr_key.values[0].n_accepted);
381
+
382
+ for (int i = 0; i < n_draft_tokens; ++i) {
383
+ draft.push_back(inp[match_pos + n + i]);
384
+ }
385
+
386
+ LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
387
+ curr_key.key_idx, key_offset, curr_key.key_num, draft.size());
388
+
389
+ map.last_draft_created = false;
390
+ map.last_draft_key_idx = key_offset;
391
+ map.last_draft_value_idx = 0; // value 0 is used for simple mode
392
+ return;
393
+ }
394
+
395
+ if (curr_key.key_num < map.min_hits) {
396
+ // not enough hits to consider this a good draft
397
+ LOG_DBG("%s: key_offset = %zu, key_num = %d, min_hits = %d, no draft\n", __func__,
398
+ key_offset, curr_key.key_num, map.min_hits);
399
+ return;
400
+ }
401
+
402
+ // complex mode: examine the different m-grams after this key n-gram.
403
+ //
404
+
405
+ // determine all (max COMMON_NGRAM_MAX_VALUES) m-grams after the key n-gram.
406
+ for (size_t i = curr_key.stat_idx; i <= match_pos; ++i) {
407
+ // begins the key n-gram at index i?
408
+ bool match_key = true;
409
+ for (size_t k = 0; k < n; ++k) {
410
+ if (inp[i + k] != key_tokens[k]) {
411
+ match_key = false;
412
+ break;
413
+ }
414
+ }
415
+ if (!match_key) {
416
+ continue;
417
+ }
418
+
419
+ // Do we haven a existing value m-gram or a new one after the key at index i?
420
+ size_t idx_begin_value_key = i + n;
421
+ int idx_value = -1;
422
+ for (int v = 0; v < COMMON_NGRAM_MAX_VALUES; ++v) {
423
+ size_t idx_begin_value_v = curr_key.values[v].value_idx;
424
+ if (idx_begin_value_v == 0) {
425
+ // We found an empty value slot => we found a new value m-gram after the key n-gram.
426
+ curr_key.values[v].value_idx = idx_begin_value_key;
427
+ curr_key.values[v].value_num = 0;
428
+ curr_key.values[v].n_accepted = m;
429
+ idx_value = v;
430
+ break;
431
+ }
432
+ bool match = true;
433
+ for (size_t j = 0; j < m; ++j) {
434
+ if (inp[idx_begin_value_key + j] != inp[idx_begin_value_v + j]) {
435
+ match = false;
436
+ break;
437
+ }
438
+ }
439
+ if (match) {
440
+ // We found an existing value m-gram after the key n-gram.
441
+ idx_value = v;
442
+ break;
443
+ }
444
+ }
445
+ if (idx_value >= 0) {
446
+ // We found a value m-gram of the key n-gram.
447
+ curr_key.values[idx_value].value_num = (uint16_t) std::min((int) curr_key.values[idx_value].value_num + 1,
448
+ (int) COMMON_NGRAM_MAX_VALUE_COUNT);
449
+ }
450
+ }
451
+ // the statistics are updated up to match_pos.
452
+ curr_key.stat_idx = match_pos;
453
+
454
+ // Do we have a value we could use for the draft?
455
+ uint16_t max_occur = 0;
456
+ int slot_max = 0;
457
+ for (int v = 0; v < COMMON_NGRAM_MAX_VALUES; ++v) {
458
+ uint16_t curr_occur = curr_key.values[v].value_num;
459
+ if (curr_occur > max_occur) {
460
+ max_occur = curr_occur;
461
+ slot_max = v;
462
+ }
463
+ }
464
+ // What is sum of the other occurrences?
465
+ uint32_t sum_occur = 0;
466
+ for (int v = 0; v < COMMON_NGRAM_MAX_VALUES; ++v) {
467
+ if (v == slot_max) {
468
+ continue;
469
+ }
470
+ uint16_t curr_occur = curr_key.values[v].value_num;
471
+ sum_occur += curr_occur;
472
+ }
473
+
474
+ LOG_INF("%s: key_offset = %zu, max_occur = %d, sum_occur = %d, slot_max = %d [%zu/%d, %zu/%d, %zu/%d, %zu/%d]\n", __func__,
475
+ key_offset,
476
+ max_occur, sum_occur, slot_max,
477
+ curr_key.values[0].value_idx, curr_key.values[0].value_num,
478
+ curr_key.values[1].value_idx, curr_key.values[1].value_num,
479
+ curr_key.values[2].value_idx, curr_key.values[2].value_num,
480
+ curr_key.values[3].value_idx, curr_key.values[3].value_num
481
+ );
482
+ // Print the tokens of the four values (if idx != 0), use LOG_INF
483
+ for (int v = 0; v < COMMON_NGRAM_MAX_VALUES; ++v) {
484
+ if (curr_key.values[v].value_idx != 0) {
485
+ LOG_INF("%s: value[%d] = %s\n", __func__, v, common_tokens_to_str(inp, curr_key.values[v].value_idx, m).c_str());
486
+ }
487
+ }
488
+
489
+ if (sum_occur > 0 && max_occur < 2 * sum_occur) {
490
+ // The most frequent value is not much more frequent than the other values.
491
+ // We do not use the draft.
492
+ return;
493
+ }
494
+
495
+ // We use the most frequent value values[slot_max] for the draft.
496
+ // Fill in the draft with the m tokens following the key.
497
+ int n_draft_tokens = std::min((int) m, (int) curr_key.values[slot_max].n_accepted);
498
+
499
+ for (int i = 0; i < n_draft_tokens; ++i) {
500
+ draft.push_back(inp[match_pos + n + i]);
501
+ }
502
+
503
+ LOG_INF("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__,
504
+ key_offset, slot_max,
505
+ curr_key.key_num, draft.size());
506
+
507
+ map.last_draft_created = true;
508
+ map.last_draft_key_idx = key_offset;
509
+ map.last_draft_value_idx = slot_max; // value used for draft generation.
510
+ }
511
+
512
+ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
513
+ if (!map.last_draft_created) {
514
+ return;
515
+ }
516
+
517
+ // find the key and its chosen value.
518
+ const size_t key_idx = map.last_draft_key_idx;
519
+ const size_t val_idx = map.last_draft_value_idx;
520
+
521
+ // find key corresponding to key_idx.
522
+ common_ngram_map_key & curr_key = map.keys[key_idx];
523
+ // find value corresponding to val_idx.
524
+ struct common_ngram_map_value & curr_value = curr_key.values[val_idx]; // value used for draft generation.
525
+
526
+ // update the value statistics
527
+ LOG_INF("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
528
+ n_accepted, curr_value.n_accepted);
529
+ curr_value.n_accepted = n_accepted;
530
+ }
@@ -0,0 +1,115 @@
1
+ #pragma once
2
+ //
3
+ // common/ngram-map.h: structures used to manage a map from n-grams to a list of m-grams
4
+ //
5
+ // These structures are used to do a lookup of n-grams followed by m-grams in token history.
6
+ //
7
+ // There are two algorithms implemented:
8
+ // 1. ngram_simple: lookup of n-grams followed by m-grams in token history.
9
+ // 2. ngram_map: lookup of n-grams followed by m-grams in token history using a map.
10
+ // The map is a vector of key n-grams, and for each key n-gram there is a list of value m-grams.
11
+ //
12
+ // ref: https://github.com/ggml-org/llama.cpp/pull/18471
13
+ //
14
+
15
+ #include "llama.h"
16
+ #include "common.h"
17
+
18
+ #include <vector>
19
+
20
+ // n-gram simple
21
+ //
22
+
23
// Parameters of the "n-gram simple" lookup.
struct common_ngram_simple_config {
    // number of tokens of the n-gram that is looked up in the token history (self-mode)
    uint16_t size_ngram;
    // number of tokens of the m-gram that is drafted after a found n-gram (self-mode)
    uint16_t size_mgram;
};
28
+
29
+ // Searches for a n-gram in the history and checks whether a draft sequence should be generated.
30
+ llama_tokens common_ngram_simple_draft(
31
+ const common_ngram_simple_config & config,
32
+ const llama_tokens & tokens, llama_token sampled);
33
+
34
+
35
+ // n-gram map
36
+ //
37
+
38
// maximum number of m-gram values stored for each key n-gram.
#define COMMON_NGRAM_MAX_VALUES 4

// number of entries in the (optional, size 0 to disable) map from ngram-hash to ngram-index.
#define COMMON_NGRAM_HASH_MAP_SIZE 262144

// statistics of a m-gram after a known n-gram
struct common_ngram_map_value {
    size_t   value_idx  =  0; // index of value m-gram in token-history (0 if unused)
    uint16_t value_num  =  0; // number of occurrences of this value m-gram after the key n-gram (0 in an unused values-slot)
    int16_t  n_accepted = -1; // number of accepted tokens at last draft (-1 if unused)
};

// statistics of a n-gram
//
// All members carry default initializers, so a default-constructed key is fully defined
// (index 0, no hits, all value slots unused).
struct common_ngram_map_key {
    size_t   key_idx  = 0; // index of key n-gram in token-history
    size_t   stat_idx = 0; // index of last token of statistics computation (key_num, values)

    uint16_t key_num  = 0; // number of occurrences of this key n-gram in token-history
    common_ngram_map_value values[COMMON_NGRAM_MAX_VALUES]; // some known values after the key
};
59
+
60
+ // map from n-grams to following m-grams in token-history
61
+ struct common_ngram_map {
62
+ uint16_t size_key; // size of key n-grams
63
+ uint16_t size_value; // size of value m-grams
64
+
65
+ bool key_only; // true if only key n-grams are used, no values.
66
+
67
+ std::vector<common_ngram_map_key> keys; // key n-grams which occur several times in token-history
68
+ uint16_t min_hits; // minimum number of key hits to consider a draft
69
+
70
+ bool show_key_map_stats = false; // true, if statistics of the key_map should be printed.
71
+
72
+ common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys,
73
+ uint16_t min_hits)
74
+ : size_key(sz_key), size_value(sz_value), key_only(only_keys),
75
+ min_hits(min_hits) {
76
+ key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used
77
+ }
78
+
79
+ // In reasoning chats the previous reasoning block will be removed from context history.
80
+ // A rebuild of the ngram map is needed after that.
81
+
82
+ size_t size_last_begin = 0; // number of tokens at previous start of generation
83
+
84
+ bool last_draft_created = false; // true if a draft was created at last call.
85
+ size_t last_draft_key_idx = 0; // index of last key used for draft generation (0 = no draft)
86
+ uint16_t last_draft_value_idx = 0; // index of last value used for draft generation.
87
+
88
+ size_t idx_last_check = 0; // index of last check in context history
89
+
90
+ // optional map "hash to ngram-index" for faster lookup of n-grams. map is empty if unused.
91
+ //
92
+ // uint32_t instead of size_t (size of current histories is << UINT32_MAX)
93
+ std::vector<uint32_t> key_map; // key_map[hash] = index of ngram in context window
94
+ uint32_t key_map_last_idx = 0; // index of the last ngram added to key_map
95
+ };
96
+
97
+ // Initialize the n-gram map with the given token history.
98
+ // map: the ngram map to initialize.
99
+ // tokens: the token history to base the map on.
100
+ void common_ngram_map_begin(
101
+ common_ngram_map & map,
102
+ const llama_tokens & tokens);
103
+
104
+ // Searches for the n-gram in the history and checks whether a draft sequence should be generated.
105
+ // map: the ngram map to search in.
106
+ // inp: the tokens generated so far.
107
+ // sampled: the token that was just sampled.
108
+ // draft: vector to store the draft tokens, initially empty.
109
+ void common_ngram_map_draft(
110
+ common_ngram_map & map,
111
+ const llama_tokens & inp, llama_token sampled,
112
+ llama_tokens & draft);
113
+
114
+ // Update the statistics of a value after a draft was processed.
115
+ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted);