local-llm-rn 1.0.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (626)
  1. package/cpp/CMakeLists.txt +285 -0
  2. package/cpp/common/CMakeLists.txt +149 -0
  3. package/cpp/common/arg.cpp +3799 -0
  4. package/cpp/common/arg.h +131 -0
  5. package/cpp/common/base64.hpp +392 -0
  6. package/cpp/common/build-info.cpp.in +4 -0
  7. package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
  8. package/cpp/common/chat-parser-xml-toolcall.h +45 -0
  9. package/cpp/common/chat-parser.cpp +1649 -0
  10. package/cpp/common/chat-parser.h +133 -0
  11. package/cpp/common/chat-peg-parser.cpp +124 -0
  12. package/cpp/common/chat-peg-parser.h +105 -0
  13. package/cpp/common/chat.cpp +3355 -0
  14. package/cpp/common/chat.h +252 -0
  15. package/cpp/common/common.cpp +1824 -0
  16. package/cpp/common/common.h +930 -0
  17. package/cpp/common/console.cpp +1137 -0
  18. package/cpp/common/console.h +41 -0
  19. package/cpp/common/debug.cpp +167 -0
  20. package/cpp/common/debug.h +43 -0
  21. package/cpp/common/download.cpp +792 -0
  22. package/cpp/common/download.h +84 -0
  23. package/cpp/common/http.h +84 -0
  24. package/cpp/common/jinja/README.md +88 -0
  25. package/cpp/common/jinja/caps.cpp +285 -0
  26. package/cpp/common/jinja/caps.h +30 -0
  27. package/cpp/common/jinja/lexer.cpp +341 -0
  28. package/cpp/common/jinja/lexer.h +157 -0
  29. package/cpp/common/jinja/parser.cpp +591 -0
  30. package/cpp/common/jinja/parser.h +21 -0
  31. package/cpp/common/jinja/runtime.cpp +867 -0
  32. package/cpp/common/jinja/runtime.h +638 -0
  33. package/cpp/common/jinja/string.cpp +213 -0
  34. package/cpp/common/jinja/string.h +61 -0
  35. package/cpp/common/jinja/utils.h +149 -0
  36. package/cpp/common/jinja/value.cpp +1393 -0
  37. package/cpp/common/jinja/value.h +756 -0
  38. package/cpp/common/json-partial.cpp +324 -0
  39. package/cpp/common/json-partial.h +39 -0
  40. package/cpp/common/json-schema-to-grammar.cpp +1153 -0
  41. package/cpp/common/json-schema-to-grammar.h +43 -0
  42. package/cpp/common/llguidance.cpp +258 -0
  43. package/cpp/common/log.cpp +446 -0
  44. package/cpp/common/log.h +119 -0
  45. package/cpp/common/ngram-cache.cpp +285 -0
  46. package/cpp/common/ngram-cache.h +101 -0
  47. package/cpp/common/ngram-map.cpp +530 -0
  48. package/cpp/common/ngram-map.h +115 -0
  49. package/cpp/common/ngram-mod.cpp +60 -0
  50. package/cpp/common/ngram-mod.h +38 -0
  51. package/cpp/common/peg-parser.cpp +1712 -0
  52. package/cpp/common/peg-parser.h +459 -0
  53. package/cpp/common/preset.cpp +483 -0
  54. package/cpp/common/preset.h +83 -0
  55. package/cpp/common/regex-partial.cpp +204 -0
  56. package/cpp/common/regex-partial.h +56 -0
  57. package/cpp/common/sampling.cpp +745 -0
  58. package/cpp/common/sampling.h +119 -0
  59. package/cpp/common/speculative.cpp +1074 -0
  60. package/cpp/common/speculative.h +41 -0
  61. package/cpp/common/unicode.cpp +64 -0
  62. package/cpp/common/unicode.h +22 -0
  63. package/cpp/ggml/CMakeLists.txt +494 -0
  64. package/cpp/ggml/cmake/GitVars.cmake +22 -0
  65. package/cpp/ggml/cmake/common.cmake +50 -0
  66. package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
  67. package/cpp/ggml/include/ggml-alloc.h +85 -0
  68. package/cpp/ggml/include/ggml-backend.h +373 -0
  69. package/cpp/ggml/include/ggml-blas.h +25 -0
  70. package/cpp/ggml/include/ggml-cann.h +123 -0
  71. package/cpp/ggml/include/ggml-cpp.h +39 -0
  72. package/cpp/ggml/include/ggml-cpu.h +151 -0
  73. package/cpp/ggml/include/ggml-cuda.h +47 -0
  74. package/cpp/ggml/include/ggml-hexagon.h +19 -0
  75. package/cpp/ggml/include/ggml-metal.h +61 -0
  76. package/cpp/ggml/include/ggml-opencl.h +26 -0
  77. package/cpp/ggml/include/ggml-opt.h +256 -0
  78. package/cpp/ggml/include/ggml-rpc.h +30 -0
  79. package/cpp/ggml/include/ggml-sycl.h +49 -0
  80. package/cpp/ggml/include/ggml-virtgpu.h +14 -0
  81. package/cpp/ggml/include/ggml-vulkan.h +29 -0
  82. package/cpp/ggml/include/ggml-webgpu.h +19 -0
  83. package/cpp/ggml/include/ggml-zdnn.h +17 -0
  84. package/cpp/ggml/include/ggml-zendnn.h +22 -0
  85. package/cpp/ggml/include/ggml.h +2753 -0
  86. package/cpp/ggml/include/gguf.h +204 -0
  87. package/cpp/ggml/src/CMakeLists.txt +492 -0
  88. package/cpp/ggml/src/ggml-alloc.c +1244 -0
  89. package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
  90. package/cpp/ggml/src/ggml-backend-dl.h +45 -0
  91. package/cpp/ggml/src/ggml-backend-impl.h +255 -0
  92. package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
  93. package/cpp/ggml/src/ggml-backend.cpp +2270 -0
  94. package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  95. package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
  96. package/cpp/ggml/src/ggml-common.h +1878 -0
  97. package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
  98. package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
  99. package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  100. package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  101. package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  102. package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  103. package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  104. package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
  105. package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
  106. package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
  107. package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  108. package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  109. package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  110. package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
  111. package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  112. package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  113. package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  114. package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  115. package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  116. package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  117. package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  118. package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
  119. package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  120. package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  121. package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  122. package/cpp/ggml/src/ggml-cpu/common.h +95 -0
  123. package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
  124. package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
  125. package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
  126. package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
  127. package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
  128. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
  129. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  130. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
  131. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  132. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
  133. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  134. package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
  135. package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
  136. package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
  137. package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
  138. package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
  139. package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
  140. package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  141. package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
  142. package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  143. package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  144. package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  145. package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  146. package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
  147. package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
  148. package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  149. package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
  150. package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
  151. package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
  152. package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  153. package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
  154. package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
  155. package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
  156. package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  157. package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
  158. package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  159. package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
  160. package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
  161. package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
  162. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  163. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  164. package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  165. package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  166. package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  167. package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  168. package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
  169. package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
  170. package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  171. package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
  172. package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
  173. package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  174. package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
  175. package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  176. package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  177. package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  178. package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
  179. package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
  180. package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  181. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  182. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  183. package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  184. package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
  185. package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
  186. package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
  187. package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
  188. package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
  189. package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
  190. package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
  191. package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
  192. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  193. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  194. package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  195. package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  196. package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
  197. package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  198. package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  199. package/cpp/ggml/src/ggml-impl.h +724 -0
  200. package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  201. package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  202. package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  203. package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  204. package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
  205. package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
  206. package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
  207. package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
  208. package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
  209. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
  210. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  211. package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
  212. package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
  213. package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
  214. package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  215. package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  216. package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
  217. package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
  218. package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  219. package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  220. package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  221. package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  222. package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  223. package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  224. package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  225. package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  226. package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
  227. package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  228. package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  229. package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  230. package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  231. package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  232. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  233. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  234. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  235. package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  236. package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  237. package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  238. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  239. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  240. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  241. package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  242. package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  243. package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  244. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  245. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  246. package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  247. package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  248. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  249. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  250. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  251. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  252. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  253. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  254. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  255. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  256. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  257. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  258. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  259. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  260. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  261. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  262. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  263. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  264. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  265. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  266. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  267. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  268. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  269. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  270. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  271. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  272. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  273. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  274. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  275. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  276. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  277. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  278. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  279. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  280. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  281. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  282. package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  283. package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  284. package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  285. package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  286. package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  287. package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  288. package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  289. package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  290. package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  291. package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  292. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  293. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  294. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  295. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  296. package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  297. package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  298. package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  299. package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  300. package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  301. package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  302. package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  303. package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  304. package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
  305. package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  306. package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  307. package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  308. package/cpp/ggml/src/ggml-opt.cpp +1093 -0
  309. package/cpp/ggml/src/ggml-quants.c +5325 -0
  310. package/cpp/ggml/src/ggml-quants.h +106 -0
  311. package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  312. package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
  313. package/cpp/ggml/src/ggml-threading.cpp +12 -0
  314. package/cpp/ggml/src/ggml-threading.h +14 -0
  315. package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  316. package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  317. package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  318. package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  319. package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  320. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  321. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  322. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  323. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  324. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  325. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  326. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  327. package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  328. package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  329. package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  330. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  331. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  332. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  333. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  334. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  335. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  336. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  337. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  338. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  339. package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  340. package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  341. package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  342. package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  343. package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  344. package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  345. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  346. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  347. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  348. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  349. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  350. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  351. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  352. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  353. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  354. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  355. package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  356. package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  357. package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  358. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
  359. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
  360. package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  361. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  362. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  363. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  364. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
  365. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
  366. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  367. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  368. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
  369. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  370. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
  371. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  372. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  373. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
  374. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
  375. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
  376. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
  377. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
  378. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  379. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  380. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  381. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  382. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  383. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  384. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  385. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  386. package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  387. package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  388. package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
  389. package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  390. package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  391. package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  392. package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  393. package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  394. package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  395. package/cpp/ggml/src/ggml.c +7669 -0
  396. package/cpp/ggml/src/ggml.cpp +26 -0
  397. package/cpp/ggml/src/gguf.cpp +1699 -0
  398. package/cpp/include/llama-cpp.h +32 -0
  399. package/cpp/include/llama.h +1568 -0
  400. package/cpp/mtmd/CMakeLists.txt +98 -0
  401. package/cpp/mtmd/README.md +63 -0
  402. package/cpp/mtmd/clip-graph.h +117 -0
  403. package/cpp/mtmd/clip-impl.h +586 -0
  404. package/cpp/mtmd/clip-model.h +390 -0
  405. package/cpp/mtmd/clip.cpp +4154 -0
  406. package/cpp/mtmd/clip.h +121 -0
  407. package/cpp/mtmd/deprecation-warning.cpp +22 -0
  408. package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
  409. package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
  410. package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
  411. package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
  412. package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
  413. package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
  414. package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
  415. package/cpp/mtmd/models/cogvlm.cpp +98 -0
  416. package/cpp/mtmd/models/conformer.cpp +216 -0
  417. package/cpp/mtmd/models/glm4v.cpp +122 -0
  418. package/cpp/mtmd/models/internvl.cpp +69 -0
  419. package/cpp/mtmd/models/kimik25.cpp +101 -0
  420. package/cpp/mtmd/models/kimivl.cpp +63 -0
  421. package/cpp/mtmd/models/llama4.cpp +96 -0
  422. package/cpp/mtmd/models/llava.cpp +374 -0
  423. package/cpp/mtmd/models/minicpmv.cpp +114 -0
  424. package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
  425. package/cpp/mtmd/models/models.h +128 -0
  426. package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
  427. package/cpp/mtmd/models/paddleocr.cpp +52 -0
  428. package/cpp/mtmd/models/pixtral.cpp +86 -0
  429. package/cpp/mtmd/models/qwen2vl.cpp +183 -0
  430. package/cpp/mtmd/models/qwen3vl.cpp +193 -0
  431. package/cpp/mtmd/models/siglip.cpp +86 -0
  432. package/cpp/mtmd/models/whisper-enc.cpp +115 -0
  433. package/cpp/mtmd/models/youtuvl.cpp +179 -0
  434. package/cpp/mtmd/mtmd-audio.cpp +730 -0
  435. package/cpp/mtmd/mtmd-audio.h +113 -0
  436. package/cpp/mtmd/mtmd-cli.cpp +437 -0
  437. package/cpp/mtmd/mtmd-helper.cpp +521 -0
  438. package/cpp/mtmd/mtmd-helper.h +96 -0
  439. package/cpp/mtmd/mtmd.cpp +1156 -0
  440. package/cpp/mtmd/mtmd.h +319 -0
  441. package/cpp/mtmd/requirements.txt +5 -0
  442. package/cpp/mtmd/test-1.jpeg +0 -0
  443. package/cpp/mtmd/test-2.mp3 +0 -0
  444. package/cpp/mtmd/tests.sh +192 -0
  445. package/cpp/src/CMakeLists.txt +169 -0
  446. package/cpp/src/llama-adapter.cpp +488 -0
  447. package/cpp/src/llama-adapter.h +89 -0
  448. package/cpp/src/llama-arch.cpp +2855 -0
  449. package/cpp/src/llama-arch.h +619 -0
  450. package/cpp/src/llama-batch.cpp +917 -0
  451. package/cpp/src/llama-batch.h +173 -0
  452. package/cpp/src/llama-chat.cpp +896 -0
  453. package/cpp/src/llama-chat.h +71 -0
  454. package/cpp/src/llama-context.cpp +3512 -0
  455. package/cpp/src/llama-context.h +359 -0
  456. package/cpp/src/llama-cparams.cpp +5 -0
  457. package/cpp/src/llama-cparams.h +44 -0
  458. package/cpp/src/llama-grammar.cpp +1464 -0
  459. package/cpp/src/llama-grammar.h +194 -0
  460. package/cpp/src/llama-graph.cpp +2685 -0
  461. package/cpp/src/llama-graph.h +1026 -0
  462. package/cpp/src/llama-hparams.cpp +234 -0
  463. package/cpp/src/llama-hparams.h +339 -0
  464. package/cpp/src/llama-impl.cpp +171 -0
  465. package/cpp/src/llama-impl.h +73 -0
  466. package/cpp/src/llama-io.cpp +15 -0
  467. package/cpp/src/llama-io.h +35 -0
  468. package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
  469. package/cpp/src/llama-kv-cache-iswa.h +137 -0
  470. package/cpp/src/llama-kv-cache.cpp +2271 -0
  471. package/cpp/src/llama-kv-cache.h +388 -0
  472. package/cpp/src/llama-kv-cells.h +533 -0
  473. package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
  474. package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
  475. package/cpp/src/llama-memory-hybrid.cpp +268 -0
  476. package/cpp/src/llama-memory-hybrid.h +139 -0
  477. package/cpp/src/llama-memory-recurrent.cpp +1165 -0
  478. package/cpp/src/llama-memory-recurrent.h +182 -0
  479. package/cpp/src/llama-memory.cpp +59 -0
  480. package/cpp/src/llama-memory.h +122 -0
  481. package/cpp/src/llama-mmap.cpp +785 -0
  482. package/cpp/src/llama-mmap.h +92 -0
  483. package/cpp/src/llama-model-loader.cpp +1414 -0
  484. package/cpp/src/llama-model-loader.h +203 -0
  485. package/cpp/src/llama-model-saver.cpp +286 -0
  486. package/cpp/src/llama-model-saver.h +37 -0
  487. package/cpp/src/llama-model.cpp +9253 -0
  488. package/cpp/src/llama-model.h +576 -0
  489. package/cpp/src/llama-quant.cpp +1119 -0
  490. package/cpp/src/llama-quant.h +1 -0
  491. package/cpp/src/llama-sampler.cpp +3885 -0
  492. package/cpp/src/llama-sampler.h +42 -0
  493. package/cpp/src/llama-vocab.cpp +3970 -0
  494. package/cpp/src/llama-vocab.h +187 -0
  495. package/cpp/src/llama.cpp +1313 -0
  496. package/cpp/src/models/afmoe.cpp +191 -0
  497. package/cpp/src/models/apertus.cpp +125 -0
  498. package/cpp/src/models/arcee.cpp +135 -0
  499. package/cpp/src/models/arctic.cpp +138 -0
  500. package/cpp/src/models/arwkv7.cpp +86 -0
  501. package/cpp/src/models/baichuan.cpp +122 -0
  502. package/cpp/src/models/bailingmoe.cpp +144 -0
  503. package/cpp/src/models/bailingmoe2.cpp +135 -0
  504. package/cpp/src/models/bert.cpp +178 -0
  505. package/cpp/src/models/bitnet.cpp +160 -0
  506. package/cpp/src/models/bloom.cpp +101 -0
  507. package/cpp/src/models/chameleon.cpp +178 -0
  508. package/cpp/src/models/chatglm.cpp +132 -0
  509. package/cpp/src/models/codeshell.cpp +111 -0
  510. package/cpp/src/models/cogvlm.cpp +102 -0
  511. package/cpp/src/models/cohere2-iswa.cpp +134 -0
  512. package/cpp/src/models/command-r.cpp +122 -0
  513. package/cpp/src/models/dbrx.cpp +123 -0
  514. package/cpp/src/models/deci.cpp +135 -0
  515. package/cpp/src/models/deepseek.cpp +144 -0
  516. package/cpp/src/models/deepseek2.cpp +262 -0
  517. package/cpp/src/models/delta-net-base.cpp +376 -0
  518. package/cpp/src/models/dots1.cpp +134 -0
  519. package/cpp/src/models/dream.cpp +105 -0
  520. package/cpp/src/models/ernie4-5-moe.cpp +150 -0
  521. package/cpp/src/models/ernie4-5.cpp +110 -0
  522. package/cpp/src/models/eurobert.cpp +97 -0
  523. package/cpp/src/models/exaone-moe.cpp +146 -0
  524. package/cpp/src/models/exaone.cpp +114 -0
  525. package/cpp/src/models/exaone4.cpp +123 -0
  526. package/cpp/src/models/falcon-h1.cpp +111 -0
  527. package/cpp/src/models/falcon.cpp +120 -0
  528. package/cpp/src/models/gemma-embedding.cpp +116 -0
  529. package/cpp/src/models/gemma.cpp +112 -0
  530. package/cpp/src/models/gemma2-iswa.cpp +128 -0
  531. package/cpp/src/models/gemma3.cpp +155 -0
  532. package/cpp/src/models/gemma3n-iswa.cpp +384 -0
  533. package/cpp/src/models/glm4-moe.cpp +170 -0
  534. package/cpp/src/models/glm4.cpp +157 -0
  535. package/cpp/src/models/gpt2.cpp +105 -0
  536. package/cpp/src/models/gptneox.cpp +144 -0
  537. package/cpp/src/models/granite-hybrid.cpp +196 -0
  538. package/cpp/src/models/granite.cpp +211 -0
  539. package/cpp/src/models/grok.cpp +159 -0
  540. package/cpp/src/models/grovemoe.cpp +141 -0
  541. package/cpp/src/models/hunyuan-dense.cpp +132 -0
  542. package/cpp/src/models/hunyuan-moe.cpp +154 -0
  543. package/cpp/src/models/internlm2.cpp +120 -0
  544. package/cpp/src/models/jais.cpp +86 -0
  545. package/cpp/src/models/jais2.cpp +123 -0
  546. package/cpp/src/models/jamba.cpp +106 -0
  547. package/cpp/src/models/kimi-linear.cpp +392 -0
  548. package/cpp/src/models/lfm2.cpp +190 -0
  549. package/cpp/src/models/llada-moe.cpp +122 -0
  550. package/cpp/src/models/llada.cpp +99 -0
  551. package/cpp/src/models/llama-iswa.cpp +178 -0
  552. package/cpp/src/models/llama.cpp +168 -0
  553. package/cpp/src/models/maincoder.cpp +117 -0
  554. package/cpp/src/models/mamba-base.cpp +285 -0
  555. package/cpp/src/models/mamba.cpp +54 -0
  556. package/cpp/src/models/mimo2-iswa.cpp +123 -0
  557. package/cpp/src/models/minicpm3.cpp +200 -0
  558. package/cpp/src/models/minimax-m2.cpp +124 -0
  559. package/cpp/src/models/mistral3.cpp +160 -0
  560. package/cpp/src/models/models.h +684 -0
  561. package/cpp/src/models/modern-bert.cpp +109 -0
  562. package/cpp/src/models/mpt.cpp +126 -0
  563. package/cpp/src/models/nemotron-h.cpp +148 -0
  564. package/cpp/src/models/nemotron.cpp +122 -0
  565. package/cpp/src/models/neo-bert.cpp +104 -0
  566. package/cpp/src/models/olmo.cpp +121 -0
  567. package/cpp/src/models/olmo2.cpp +150 -0
  568. package/cpp/src/models/olmoe.cpp +124 -0
  569. package/cpp/src/models/openai-moe-iswa.cpp +127 -0
  570. package/cpp/src/models/openelm.cpp +124 -0
  571. package/cpp/src/models/orion.cpp +123 -0
  572. package/cpp/src/models/paddleocr.cpp +122 -0
  573. package/cpp/src/models/pangu-embedded.cpp +121 -0
  574. package/cpp/src/models/phi2.cpp +121 -0
  575. package/cpp/src/models/phi3.cpp +152 -0
  576. package/cpp/src/models/plamo.cpp +110 -0
  577. package/cpp/src/models/plamo2.cpp +318 -0
  578. package/cpp/src/models/plamo3.cpp +128 -0
  579. package/cpp/src/models/plm.cpp +169 -0
  580. package/cpp/src/models/qwen.cpp +108 -0
  581. package/cpp/src/models/qwen2.cpp +126 -0
  582. package/cpp/src/models/qwen2moe.cpp +151 -0
  583. package/cpp/src/models/qwen2vl.cpp +117 -0
  584. package/cpp/src/models/qwen3.cpp +117 -0
  585. package/cpp/src/models/qwen35.cpp +386 -0
  586. package/cpp/src/models/qwen35moe.cpp +420 -0
  587. package/cpp/src/models/qwen3moe.cpp +124 -0
  588. package/cpp/src/models/qwen3next.cpp +525 -0
  589. package/cpp/src/models/qwen3vl-moe.cpp +140 -0
  590. package/cpp/src/models/qwen3vl.cpp +132 -0
  591. package/cpp/src/models/refact.cpp +94 -0
  592. package/cpp/src/models/rnd1.cpp +126 -0
  593. package/cpp/src/models/rwkv6-base.cpp +164 -0
  594. package/cpp/src/models/rwkv6.cpp +94 -0
  595. package/cpp/src/models/rwkv6qwen2.cpp +86 -0
  596. package/cpp/src/models/rwkv7-base.cpp +137 -0
  597. package/cpp/src/models/rwkv7.cpp +90 -0
  598. package/cpp/src/models/seed-oss.cpp +124 -0
  599. package/cpp/src/models/smallthinker.cpp +126 -0
  600. package/cpp/src/models/smollm3.cpp +128 -0
  601. package/cpp/src/models/stablelm.cpp +146 -0
  602. package/cpp/src/models/starcoder.cpp +100 -0
  603. package/cpp/src/models/starcoder2.cpp +121 -0
  604. package/cpp/src/models/step35-iswa.cpp +168 -0
  605. package/cpp/src/models/t5-dec.cpp +166 -0
  606. package/cpp/src/models/t5-enc.cpp +96 -0
  607. package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
  608. package/cpp/src/models/xverse.cpp +108 -0
  609. package/cpp/src/unicode-data.cpp +7034 -0
  610. package/cpp/src/unicode-data.h +20 -0
  611. package/cpp/src/unicode.cpp +1103 -0
  612. package/cpp/src/unicode.h +111 -0
  613. package/cpp/vendor/nlohmann/json.hpp +25526 -0
  614. package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  615. package/cpp/vendor/stb/stb_image.h +7988 -0
  616. package/ios/LocalLLM-Bridging-Header.h +2 -0
  617. package/ios/LocalLLM.h +5 -0
  618. package/ios/LocalLLM.mm +1267 -0
  619. package/local-llm-rn.podspec +60 -0
  620. package/package.json +35 -0
  621. package/src/NativeLocalLLM.ts +73 -0
  622. package/src/device.ts +50 -0
  623. package/src/download-adapter.ts +17 -0
  624. package/src/index.ts +21 -0
  625. package/src/native-bridge.ts +142 -0
  626. package/src/rn-downloader.ts +37 -0
@@ -0,0 +1,1464 @@
1
+ #include "llama-grammar.h"
2
+
3
+ #include "llama-impl.h"
4
+ #include "llama-vocab.h"
5
+ #include "llama-sampler.h"
6
+
7
+ #include <cmath>
8
+ #include <algorithm>
9
+ #include <cstdint>
10
+ #include <stdexcept>
11
+
12
+ #define MAX_REPETITION_THRESHOLD 2000
13
+ //
14
+ // helpers
15
+ //
16
+
17
// Decodes one UTF-8 code point starting at `src`.
// NOTE: assumes valid utf8; the only safety net is that decoding stops at a NUL
// byte, so a truncated sequence never reads past the string terminator.
// Returns the decoded value together with a pointer just past the bytes consumed.
static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
    // sequence length, indexed by the high nibble of the lead byte
    static const int seq_len_by_nibble[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };

    const uint8_t lead         = static_cast<uint8_t>(*src);
    const uint8_t nibble       = lead >> 4;
    const int     seq_len      = seq_len_by_nibble[nibble];
    const uint8_t payload_mask = (1 << (8 - seq_len)) - 1;

    uint32_t code_point = lead & payload_mask;

    const char * const limit  = src + seq_len; // may lie beyond the terminator
    const char *       cursor = src + 1;
    while (cursor < limit && *cursor) {
        code_point = (code_point << 6) + (static_cast<uint8_t>(*cursor) & 0x3F);
        ++cursor;
    }
    return std::make_pair(code_point, cursor);
}
32
+
33
+ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
34
+ const std::string & src,
35
+ llama_partial_utf8 partial_start) {
36
+ static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
37
+ const char * pos = src.c_str();
38
+ std::vector<uint32_t> code_points;
39
+
40
+ // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
41
+ code_points.reserve(src.size() + 1);
42
+ uint32_t value = partial_start.value;
43
+ int n_remain = partial_start.n_remain;
44
+
45
+ // continue previous decode, if applicable
46
+ while (*pos != 0 && n_remain > 0) {
47
+ uint8_t next_byte = static_cast<uint8_t>(*pos);
48
+ if ((next_byte >> 6) != 2) {
49
+ // invalid sequence, abort
50
+ code_points.push_back(0);
51
+ return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
52
+ }
53
+ value = (value << 6) + (next_byte & 0x3F);
54
+ ++pos;
55
+ --n_remain;
56
+ }
57
+
58
+ if (partial_start.n_remain > 0 && n_remain == 0) {
59
+ code_points.push_back(value);
60
+ }
61
+
62
+ // decode any subsequent utf-8 sequences, which may end in an incomplete one
63
+ while (*pos != 0) {
64
+ uint8_t first_byte = static_cast<uint8_t>(*pos);
65
+ uint8_t highbits = first_byte >> 4;
66
+ n_remain = lookup[highbits] - 1;
67
+
68
+ if (n_remain < 0) {
69
+ // invalid sequence, abort
70
+ code_points.clear();
71
+ code_points.push_back(0);
72
+ return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
73
+ }
74
+
75
+ uint8_t mask = (1 << (7 - n_remain)) - 1;
76
+ value = first_byte & mask;
77
+
78
+ ++pos;
79
+ while (*pos != 0 && n_remain > 0) {
80
+ value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
81
+ ++pos;
82
+ --n_remain;
83
+ }
84
+ if (n_remain == 0) {
85
+ code_points.push_back(value);
86
+ }
87
+ }
88
+ code_points.push_back(0);
89
+
90
+ return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
91
+ }
92
+
93
// True when `c` is an ASCII decimal digit.
static bool is_digit_char(char c) {
    return c >= '0' && c <= '9';
}
96
+
97
// True when `c` may appear in a rule name: ASCII letters, digits, or '-'.
static bool is_word_char(char c) {
    const bool is_lower = c >= 'a' && c <= 'z';
    const bool is_upper = c >= 'A' && c <= 'Z';
    const bool is_digit = c >= '0' && c <= '9';
    return is_lower || is_upper || c == '-' || is_digit;
}
100
+
101
// Parses exactly `size` hexadecimal digits starting at `src`.
// Returns the parsed value and a pointer just past the digits.
// Throws std::runtime_error when fewer than `size` hex digits are available.
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
    const char * const limit  = src + size;
    const char *       cursor = src;
    uint32_t           acc    = 0;

    while (cursor < limit && *cursor) {
        const char c = *cursor;
        uint32_t nibble;
        if (c >= 'a' && c <= 'f') {
            nibble = c - 'a' + 10;
        } else if (c >= 'A' && c <= 'F') {
            nibble = c - 'A' + 10;
        } else if (c >= '0' && c <= '9') {
            nibble = c - '0';
        } else {
            break; // non-hex character: reported below because cursor != limit
        }
        acc = (acc << 4) + nibble;
        ++cursor;
    }

    if (cursor != limit) {
        throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
    }
    return std::make_pair(acc, cursor);
}
123
+
124
// Skips spaces, tabs and '#' comments (which run to the end of the line) starting
// at `src`. Line breaks are skipped only when `newline_ok` is set.
// Returns a pointer to the first character that was not skipped.
static const char * parse_space(const char * src, bool newline_ok) {
    const char * cursor = src;
    for (;;) {
        const char c = *cursor;
        if (c == '#') {
            // comment: consume up to, but not including, the line break or terminator
            while (*cursor && *cursor != '\r' && *cursor != '\n') {
                ++cursor;
            }
        } else if (c == ' ' || c == '\t' || (newline_ok && (c == '\r' || c == '\n'))) {
            ++cursor;
        } else {
            return cursor;
        }
    }
}
138
+
139
+ static const char * parse_name(const char * src) {
140
+ const char * pos = src;
141
+ while (is_word_char(*pos)) {
142
+ pos++;
143
+ }
144
+ if (pos == src) {
145
+ throw std::runtime_error(std::string("expecting name at ") + src);
146
+ }
147
+ return pos;
148
+ }
149
+
150
+ static const char * parse_int(const char * src) {
151
+ const char * pos = src;
152
+ while (is_digit_char(*pos)) {
153
+ pos++;
154
+ }
155
+ if (pos == src) {
156
+ throw std::runtime_error(std::string("expecting integer at ") + src);
157
+ }
158
+ return pos;
159
+ }
160
+
161
+ static std::pair<uint32_t, const char *> parse_char(const char * src) {
162
+ if (*src == '\\') {
163
+ switch (src[1]) {
164
+ case 'x': return parse_hex(src + 2, 2);
165
+ case 'u': return parse_hex(src + 2, 4);
166
+ case 'U': return parse_hex(src + 2, 8);
167
+ case 't': return std::make_pair('\t', src + 2);
168
+ case 'r': return std::make_pair('\r', src + 2);
169
+ case 'n': return std::make_pair('\n', src + 2);
170
+ case '\\':
171
+ case '"':
172
+ case '[':
173
+ case ']':
174
+ return std::make_pair(src[1], src + 2);
175
+ default:
176
+ throw std::runtime_error(std::string("unknown escape at ") + src);
177
+ }
178
+ } else if (*src) {
179
+ return decode_utf8(src);
180
+ }
181
+ throw std::runtime_error("unexpected end of input");
182
+ }
183
+
184
+ static std::pair<uint32_t, const char *> parse_token(const llama_vocab * vocab, const char * src) {
185
+ const char * pos = src;
186
+ if (*pos != '<') {
187
+ throw std::runtime_error(std::string("expecting '<' at ") + pos);
188
+ }
189
+ pos++;
190
+
191
+ // Parse <[id]>
192
+ if (*pos == '[') {
193
+ pos++;
194
+ const char * int_end = parse_int(pos);
195
+ uint32_t token_id = std::stoul(std::string(pos, int_end - pos));
196
+ pos = int_end;
197
+ if (*pos != ']') {
198
+ throw std::runtime_error(std::string("expecting ']' at ") + pos);
199
+ }
200
+ pos++;
201
+ if (*pos != '>') {
202
+ throw std::runtime_error(std::string("expecting '>' at ") + pos);
203
+ }
204
+ pos++;
205
+ return std::make_pair(token_id, pos);
206
+ }
207
+
208
+ if (vocab == nullptr) {
209
+ throw std::runtime_error(std::string("no vocab to parse token at ") + src);
210
+ }
211
+
212
+ // Parse <token> and tokenize to obtain the token id
213
+ while (*pos != 0 && *pos != '>') {
214
+ pos++;
215
+ }
216
+ if (*pos != '>') {
217
+ throw std::runtime_error(std::string("expecting '>' at ") + pos);
218
+ }
219
+ pos++;
220
+
221
+ llama_token tokens[2];
222
+ int32_t n_tokens = vocab->tokenize(src, static_cast<int32_t>(pos - src), tokens, 2, false, true);
223
+ if (n_tokens != 1) {
224
+ // must tokenize to exactly 1 token
225
+ throw std::runtime_error("invalid token '" + std::string(src, pos - src) + "'");
226
+ }
227
+ return std::make_pair(tokens[0], pos);
228
+ }
229
+
230
// Writes code point `c` to `file`: verbatim when it lies in 0x20..0x7f,
// otherwise as a <U+XXXX> placeholder (no UTF-8 encoding is attempted).
static void print_grammar_char(FILE * file, uint32_t c) {
    const bool in_ascii_range = c >= 0x20 && c <= 0x7f;
    if (!in_ascii_range) {
        // cop out of encoding UTF-8
        fprintf(file, "<U+%04X>", c);
        return;
    }
    fprintf(file, "%c", static_cast<char>(c));
}
238
+
239
+ static bool is_char_element(llama_grammar_element elem) {
240
+ switch (elem.type) {
241
+ case LLAMA_GRETYPE_CHAR: return true;
242
+ case LLAMA_GRETYPE_CHAR_NOT: return true;
243
+ case LLAMA_GRETYPE_CHAR_ALT: return true;
244
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
245
+ case LLAMA_GRETYPE_CHAR_ANY: return true;
246
+ default: return false;
247
+ }
248
+ }
249
+
250
+ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
251
+ for (auto elem : rule) {
252
+ switch (elem.type) {
253
+ case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
254
+ case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break;
255
+ case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
256
+ case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
257
+ case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
258
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
259
+ case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
260
+ case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
261
+ case LLAMA_GRETYPE_TOKEN: fprintf(file, "TOKEN"); break;
262
+ case LLAMA_GRETYPE_TOKEN_NOT: fprintf(file, "TOKEN_NOT"); break;
263
+ }
264
+ switch (elem.type) {
265
+ case LLAMA_GRETYPE_END:
266
+ case LLAMA_GRETYPE_ALT:
267
+ case LLAMA_GRETYPE_RULE_REF:
268
+ fprintf(file, "(%u) ", elem.value);
269
+ break;
270
+ case LLAMA_GRETYPE_CHAR:
271
+ case LLAMA_GRETYPE_CHAR_NOT:
272
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER:
273
+ case LLAMA_GRETYPE_CHAR_ALT:
274
+ case LLAMA_GRETYPE_CHAR_ANY:
275
+ fprintf(file, "(\"");
276
+ print_grammar_char(file, elem.value);
277
+ fprintf(file, "\") ");
278
+ break;
279
+ case LLAMA_GRETYPE_TOKEN:
280
+ fprintf(file, "<[");
281
+ fprintf(file, "%u", elem.value);
282
+ fprintf(file, "]> ");
283
+ break;
284
+ case LLAMA_GRETYPE_TOKEN_NOT:
285
+ fprintf(file, "!");
286
+ fprintf(file, "<[");
287
+ fprintf(file, "%u", elem.value);
288
+ fprintf(file, "]> ");
289
+ break;
290
+ }
291
+ }
292
+ fprintf(file, "\n");
293
+ }
294
+
295
+ static void print_rule(
296
+ FILE * file,
297
+ uint32_t rule_id,
298
+ const llama_grammar_rule & rule,
299
+ const std::map<uint32_t, std::string> & symbol_id_names) {
300
+ if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
301
+ throw std::runtime_error(
302
+ "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
303
+ }
304
+ fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
305
+ for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
306
+ llama_grammar_element elem = rule[i];
307
+ switch (elem.type) {
308
+ case LLAMA_GRETYPE_END:
309
+ throw std::runtime_error(
310
+ "unexpected end of rule: " + std::to_string(rule_id) + "," +
311
+ std::to_string(i));
312
+ case LLAMA_GRETYPE_ALT:
313
+ fprintf(file, "| ");
314
+ break;
315
+ case LLAMA_GRETYPE_RULE_REF:
316
+ fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
317
+ break;
318
+ case LLAMA_GRETYPE_CHAR:
319
+ fprintf(file, "[");
320
+ print_grammar_char(file, elem.value);
321
+ break;
322
+ case LLAMA_GRETYPE_CHAR_NOT:
323
+ fprintf(file, "[^");
324
+ print_grammar_char(file, elem.value);
325
+ break;
326
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER:
327
+ if (i == 0 || !is_char_element(rule[i - 1])) {
328
+ throw std::runtime_error(
329
+ "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
330
+ std::to_string(rule_id) + "," + std::to_string(i));
331
+ }
332
+ fprintf(file, "-");
333
+ print_grammar_char(file, elem.value);
334
+ break;
335
+ case LLAMA_GRETYPE_CHAR_ALT:
336
+ if (i == 0 || !is_char_element(rule[i - 1])) {
337
+ throw std::runtime_error(
338
+ "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
339
+ std::to_string(rule_id) + "," + std::to_string(i));
340
+ }
341
+ print_grammar_char(file, elem.value);
342
+ break;
343
+ case LLAMA_GRETYPE_CHAR_ANY:
344
+ fprintf(file, ".");
345
+ break;
346
+ case LLAMA_GRETYPE_TOKEN:
347
+ fprintf(file, "<[");
348
+ fprintf(file, "%u", elem.value);
349
+ fprintf(file, "]> ");
350
+ break;
351
+ case LLAMA_GRETYPE_TOKEN_NOT:
352
+ fprintf(file, "!");
353
+ fprintf(file, "<[");
354
+ fprintf(file, "%u", elem.value);
355
+ fprintf(file, "]> ");
356
+ break;
357
+ }
358
+ if (is_char_element(elem)) {
359
+ switch (rule[i + 1].type) {
360
+ case LLAMA_GRETYPE_CHAR_ALT:
361
+ case LLAMA_GRETYPE_CHAR_RNG_UPPER:
362
+ case LLAMA_GRETYPE_CHAR_ANY:
363
+ break;
364
+ default:
365
+ fprintf(file, "] ");
366
+ }
367
+ }
368
+ }
369
+ fprintf(file, "\n");
370
+ }
371
+
372
+ //
373
+ // Regex utilities
374
+ //
375
+
376
+ size_t llama_grammar_trigger_pattern::find(const std::string & input) const {
377
+ auto find_start_pos = [](const std::smatch & match) {
378
+ // get from the first matched capturing group to the end of the string
379
+ size_t start = std::string::npos;
380
+ for (auto i = 1u; i < match.size(); i++) {
381
+ if (match.length(i) > 0) {
382
+ start = match.position(i);
383
+ break;
384
+ }
385
+ }
386
+ if (start == std::string::npos) {
387
+ start = match.position(0);
388
+ }
389
+ return start;
390
+ };
391
+
392
+ if (!pattern.empty() && pattern.front() == '^' && pattern.back() == '$') {
393
+ // match against the entire input
394
+ std::smatch match;
395
+ if (std::regex_match(input, match, regex)) {
396
+ return find_start_pos(match);
397
+ }
398
+ }
399
+
400
+ // search anywhere
401
+ std::smatch match;
402
+ if (std::regex_search(input, match, regex)) {
403
+ return find_start_pos(match);
404
+ }
405
+
406
+ return std::string::npos;
407
+ }
408
+
409
+
410
+ //
411
+ // implementation
412
+ //
413
+
414
+ uint32_t llama_grammar_parser::get_symbol_id(const char * src, size_t len) {
415
+ uint32_t next_id = static_cast<uint32_t>(symbol_ids.size());
416
+ auto result = symbol_ids.emplace(std::string(src, len), next_id);
417
+ return result.first->second;
418
+ }
419
+
420
+ uint32_t llama_grammar_parser::generate_symbol_id(const std::string & base_name) {
421
+ uint32_t next_id = static_cast<uint32_t>(symbol_ids.size());
422
+ symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
423
+ return next_id;
424
+ }
425
+
426
+ void llama_grammar_parser::add_rule(uint32_t rule_id, const llama_grammar_rule & rule) {
427
+ if (rules.size() <= rule_id) {
428
+ rules.resize(rule_id + 1);
429
+ }
430
+ rules[rule_id] = rule;
431
+ }
432
+
433
+ const char * llama_grammar_parser::parse_alternates(
434
+ const char * src,
435
+ const std::string & rule_name,
436
+ uint32_t rule_id,
437
+ bool is_nested) {
438
+ llama_grammar_rule rule;
439
+ const char * pos = parse_sequence(src, rule_name, rule, is_nested);
440
+ while (*pos == '|') {
441
+ rule.push_back({LLAMA_GRETYPE_ALT, 0});
442
+ pos = parse_space(pos + 1, true);
443
+ pos = parse_sequence(pos, rule_name, rule, is_nested);
444
+ }
445
+ rule.push_back({LLAMA_GRETYPE_END, 0});
446
+ add_rule(rule_id, rule);
447
+ return pos;
448
+ }
449
+
450
+ const char * llama_grammar_parser::parse_sequence(
451
+ const char * src,
452
+ const std::string & rule_name,
453
+ llama_grammar_rule & rule,
454
+ bool is_nested) {
455
+ size_t last_sym_start = rule.size();
456
+ const char * pos = src;
457
+
458
+ // use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used
459
+ // (though it's technically the same as -1 now)
460
+ auto handle_repetitions = [&](uint64_t min_times, uint64_t max_times) {
461
+ bool no_max = max_times == UINT64_MAX;
462
+ if (last_sym_start == rule.size()) {
463
+ throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
464
+ }
465
+
466
+ // apply transformation to previous symbol (last_sym_start to end) according to
467
+ // the following rewrite rules:
468
+ // S{m,n} --> S S S (m times) S'(n-m)
469
+ // S'(x) ::= S S'(x-1) |
470
+ // (... n-m definitions of these S' rules ...)
471
+ // S'(1) ::= S |
472
+ // S{m,} --> S S S (m times) S'
473
+ // S' ::= S S' |
474
+ // S* --> S{0,}
475
+ // --> S' ::= S S' |
476
+ // S+ --> S{1,}
477
+ // --> S S'
478
+ // S' ::= S S' |
479
+ // S? --> S{0,1}
480
+ // --> S'
481
+ // S' ::= S |
482
+
483
+ llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end());
484
+ if (min_times == 0) {
485
+ rule.resize(last_sym_start);
486
+ } else {
487
+ // Repeat the previous elements (min_times - 1) times
488
+ for (uint64_t i = 1; i < min_times; i++) {
489
+ rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
490
+ }
491
+ }
492
+
493
+ uint32_t last_rec_rule_id = 0;
494
+ auto n_opt = no_max ? 1 : max_times - min_times;
495
+
496
+ llama_grammar_rule rec_rule(prev_rule);
497
+ for (uint64_t i = 0; i < n_opt; i++) {
498
+ rec_rule.resize(prev_rule.size());
499
+ uint32_t rec_rule_id = generate_symbol_id( rule_name);
500
+ if (i > 0 || no_max) {
501
+ rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, no_max ? rec_rule_id : last_rec_rule_id});
502
+ }
503
+ rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
504
+ rec_rule.push_back({LLAMA_GRETYPE_END, 0});
505
+ add_rule( rec_rule_id, rec_rule);
506
+ last_rec_rule_id = rec_rule_id;
507
+ }
508
+ if (n_opt > 0) {
509
+ rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
510
+ }
511
+ };
512
+
513
+ while (*pos) {
514
+ if (*pos == '"') { // literal string
515
+ pos++;
516
+ last_sym_start = rule.size();
517
+ while (*pos != '"') {
518
+ if (!*pos) {
519
+ throw std::runtime_error("unexpected end of input");
520
+ }
521
+ auto char_pair = parse_char(pos);
522
+ pos = char_pair.second;
523
+ rule.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
524
+ }
525
+ pos = parse_space(pos + 1, is_nested);
526
+ } else if (*pos == '[') { // char range(s)
527
+ pos++;
528
+ enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
529
+ if (*pos == '^') {
530
+ pos++;
531
+ start_type = LLAMA_GRETYPE_CHAR_NOT;
532
+ }
533
+ last_sym_start = rule.size();
534
+ while (*pos != ']') {
535
+ if (!*pos) {
536
+ throw std::runtime_error("unexpected end of input");
537
+ }
538
+ auto char_pair = parse_char(pos);
539
+ pos = char_pair.second;
540
+ enum llama_gretype type = last_sym_start < rule.size()
541
+ ? LLAMA_GRETYPE_CHAR_ALT
542
+ : start_type;
543
+
544
+ rule.push_back({type, char_pair.first});
545
+ if (pos[0] == '-' && pos[1] != ']') {
546
+ if (!pos[1]) {
547
+ throw std::runtime_error("unexpected end of input");
548
+ }
549
+ auto endchar_pair = parse_char(pos + 1);
550
+ pos = endchar_pair.second;
551
+ rule.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
552
+ }
553
+ }
554
+ pos = parse_space(pos + 1, is_nested);
555
+ } else if (*pos == '<' || *pos == '!') { // token
556
+ auto type = LLAMA_GRETYPE_TOKEN;
557
+ if (*pos == '!') { // token inverse
558
+ type = LLAMA_GRETYPE_TOKEN_NOT;
559
+ pos++;
560
+ }
561
+ auto token_pair = parse_token(vocab, pos);
562
+ const char * token_end = token_pair.second;
563
+ last_sym_start = rule.size();
564
+ rule.push_back({type, token_pair.first});
565
+ pos = parse_space(token_end, is_nested);
566
+ } else if (is_word_char(*pos)) { // rule reference
567
+ const char * name_end = parse_name(pos);
568
+ uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
569
+ pos = parse_space(name_end, is_nested);
570
+ last_sym_start = rule.size();
571
+ rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
572
+ } else if (*pos == '(') { // grouping
573
+ // parse nested alternates into synthesized rule
574
+ pos = parse_space(pos + 1, true);
575
+ uint32_t sub_rule_id = generate_symbol_id(rule_name);
576
+ pos = parse_alternates(pos, rule_name, sub_rule_id, true);
577
+ last_sym_start = rule.size();
578
+ // output reference to synthesized rule
579
+ rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
580
+ if (*pos != ')') {
581
+ throw std::runtime_error(std::string("expecting ')' at ") + pos);
582
+ }
583
+ pos = parse_space(pos + 1, is_nested);
584
+ } else if (*pos == '.') { // any char
585
+ last_sym_start = rule.size();
586
+ rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
587
+ pos = parse_space(pos + 1, is_nested);
588
+ } else if (*pos == '*') {
589
+ pos = parse_space(pos + 1, is_nested);
590
+ handle_repetitions(0, -1);
591
+ } else if (*pos == '+') {
592
+ pos = parse_space(pos + 1, is_nested);
593
+ handle_repetitions(1, -1);
594
+ } else if (*pos == '?') {
595
+ pos = parse_space(pos + 1, is_nested);
596
+ handle_repetitions(0, 1);
597
+ } else if (*pos == '{') {
598
+ pos = parse_space(pos + 1, is_nested);
599
+
600
+ if (!is_digit_char(*pos)) {
601
+ throw std::runtime_error(std::string("expecting an int at ") + pos);
602
+ }
603
+ const char * int_end = parse_int(pos);
604
+ uint64_t min_times = std::stoul(std::string(pos, int_end - pos));
605
+ pos = parse_space(int_end, is_nested);
606
+
607
+ uint64_t max_times = UINT64_MAX; // default: no max limit
608
+
609
+ if (*pos == '}') {
610
+ max_times = min_times;
611
+ pos = parse_space(pos + 1, is_nested);
612
+ } else if (*pos == ',') {
613
+ pos = parse_space(pos + 1, is_nested);
614
+
615
+ if (is_digit_char(*pos)) {
616
+ const char * int_end = parse_int(pos);
617
+ max_times = std::stoul(std::string(pos, int_end - pos));
618
+ pos = parse_space(int_end, is_nested);
619
+ }
620
+
621
+ if (*pos != '}') {
622
+ throw std::runtime_error(std::string("expecting '}' at ") + pos);
623
+ }
624
+ pos = parse_space(pos + 1, is_nested);
625
+ } else {
626
+ throw std::runtime_error(std::string("expecting ',' at ") + pos);
627
+ }
628
+ bool has_max = max_times != UINT64_MAX;
629
+ if (min_times > MAX_REPETITION_THRESHOLD || (has_max && max_times > MAX_REPETITION_THRESHOLD)) {
630
+ throw std::runtime_error(std::string("number of repetitions exceeds sane defaults, please reduce the number of repetitions"));
631
+ }
632
+ handle_repetitions(min_times, max_times);
633
+ } else {
634
+ break;
635
+ }
636
+ }
637
+ return pos;
638
+ }
639
+
640
+ const char * llama_grammar_parser::parse_rule(const char * src) {
641
+ const char * name_end = parse_name(src);
642
+ const char * pos = parse_space(name_end, false);
643
+ size_t name_len = name_end - src;
644
+ uint32_t rule_id = get_symbol_id(src, name_len);
645
+ const std::string name(src, name_len);
646
+
647
+ if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
648
+ throw std::runtime_error(std::string("expecting ::= at ") + pos);
649
+ }
650
+ pos = parse_space(pos + 3, true);
651
+
652
+ pos = parse_alternates(pos, name, rule_id, false);
653
+
654
+ if (*pos == '\r') {
655
+ pos += pos[1] == '\n' ? 2 : 1;
656
+ } else if (*pos == '\n') {
657
+ pos++;
658
+ } else if (*pos) {
659
+ throw std::runtime_error(std::string("expecting newline or end at ") + pos);
660
+ }
661
+ return parse_space(pos, true);
662
+ }
663
+
664
+ bool llama_grammar_parser::parse(const char * src) {
665
+ try {
666
+ const char * pos = parse_space(src, true);
667
+ while (*pos) {
668
+ pos = parse_rule(pos);
669
+ }
670
+ // Validate the state to ensure that all rules are defined
671
+ for (const auto & rule : rules) {
672
+ if (rule.empty()) {
673
+ throw std::runtime_error("Undefined rule");
674
+ }
675
+ for (const auto & elem : rule) {
676
+ if (elem.type == LLAMA_GRETYPE_RULE_REF) {
677
+ // Ensure that the rule at that location exists
678
+ if (elem.value >= rules.size() || rules[elem.value].empty()) {
679
+ // Get the name of the rule that is missing
680
+ for (const auto & kv : symbol_ids) {
681
+ if (kv.second == elem.value) {
682
+ throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
683
+ }
684
+ }
685
+ }
686
+ }
687
+ }
688
+ }
689
+ } catch (const std::exception & err) {
690
+ fprintf(stderr, "%s: error parsing grammar: %s\n\n%s\n", __func__, err.what(), src);
691
+ rules.clear();
692
+ return false;
693
+ }
694
+
695
+ return true;
696
+ }
697
+
698
+ void llama_grammar_parser::print(FILE * file) {
699
+ try {
700
+ std::map<uint32_t, std::string> symbol_id_names;
701
+ for (const auto & kv : symbol_ids) {
702
+ symbol_id_names[kv.second] = kv.first;
703
+ }
704
+ for (size_t i = 0, end = rules.size(); i < end; i++) {
705
+ // fprintf(file, "%zu: ", i);
706
+ // print_rule_binary(file, rules[i]);
707
+ print_rule(file, uint32_t(i), rules[i], symbol_id_names);
708
+ // fprintf(file, "\n");
709
+ }
710
+ } catch (const std::exception & err) {
711
+ fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
712
+ }
713
+ }
714
+
715
+ llama_grammar_stack llama_grammar_parser::c_rules() const {
716
+ llama_grammar_stack ret;
717
+ ret.reserve(rules.size());
718
+ for (const auto & rule : rules) {
719
+ ret.push_back(rule.data());
720
+ }
721
+ return ret;
722
+ }
723
+
724
+ // returns true iff pos points to the end of one of the definitions of a rule
725
+ static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
726
+ switch (pos->type) {
727
+ case LLAMA_GRETYPE_END: return true; // NOLINT
728
+ case LLAMA_GRETYPE_ALT: return true; // NOLINT
729
+ default: return false;
730
+ }
731
+ }
732
+
733
+ // returns true iff chr satisfies the char range at pos (regular or inverse range)
734
+ // asserts that pos is pointing to a char range element
735
+ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
736
+ const llama_grammar_element * pos,
737
+ const uint32_t chr) {
738
+ bool found = false;
739
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
740
+
741
+ GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
742
+
743
+ do {
744
+ if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
745
+ // inclusive range, e.g. [a-z]
746
+ found = found || (pos->value <= chr && chr <= pos[1].value);
747
+ pos += 2;
748
+ } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
749
+ // Any character matches "."
750
+ found = true;
751
+ pos += 1;
752
+ } else {
753
+ // exact char match, e.g. [a] or "a"
754
+ found = found || pos->value == chr;
755
+ pos += 1;
756
+ }
757
+ } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
758
+
759
+ return std::make_pair(found == is_positive_char, pos);
760
+ }
761
+
762
+ // returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
763
+ // range at pos (regular or inverse range)
764
+ // asserts that pos is pointing to a char range element
765
+ static bool llama_grammar_match_partial_char(
766
+ const llama_grammar_element * pos,
767
+ const llama_partial_utf8 partial_utf8) {
768
+ bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
769
+ GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
770
+
771
+ uint32_t partial_value = partial_utf8.value;
772
+ int n_remain = partial_utf8.n_remain;
773
+
774
+ // invalid sequence or 7-bit char split across 2 bytes (overlong)
775
+ if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
776
+ return false;
777
+ }
778
+
779
+ // range of possible code points this partial UTF-8 sequence could complete to
780
+ uint32_t low = partial_value << (n_remain * 6);
781
+ uint32_t high = low | ((1 << (n_remain * 6)) - 1);
782
+
783
+ if (low == 0) {
784
+ if (n_remain == 2) {
785
+ low = 1 << 11;
786
+ } else if (n_remain == 3) {
787
+ low = 1 << 16;
788
+ }
789
+ }
790
+
791
+ do {
792
+ if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
793
+ // inclusive range, e.g. [a-z]
794
+ if (pos->value <= high && low <= pos[1].value) {
795
+ return is_positive_char;
796
+ }
797
+ pos += 2;
798
+ } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
799
+ // Any character matches "."
800
+ return true;
801
+ } else {
802
+ // exact char match, e.g. [a] or "a"
803
+ if (low <= pos->value && pos->value <= high) {
804
+ return is_positive_char;
805
+ }
806
+ pos += 1;
807
+ }
808
+ } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
809
+
810
+ return !is_positive_char;
811
+ }
812
+
813
+ // returns true iff token matches the rule at pos (regular or inverse)
814
+ // asserts that pos is pointing to a token element
815
+ static bool llama_grammar_match_token(
816
+ const llama_grammar_element * pos,
817
+ const llama_token token) {
818
+ GGML_ASSERT(pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT);
819
+ if (pos->type == LLAMA_GRETYPE_TOKEN) {
820
+ return pos->value == static_cast<uint32_t>(token);
821
+ }
822
+ if (pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
823
+ return pos->value != static_cast<uint32_t>(token);
824
+ }
825
+ return false;
826
+ }
827
+
828
+ // transforms a grammar pushdown stack into N possible stacks, all ending
829
+ // at a character range (terminal element)
830
+ static void llama_grammar_advance_stack(
831
+ const llama_grammar_rules & rules,
832
+ const llama_grammar_stack & stack,
833
+ llama_grammar_stacks & new_stacks) {
834
+ if (stack.empty()) {
835
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
836
+ new_stacks.emplace_back(stack);
837
+ }
838
+ return;
839
+ }
840
+
841
+ const llama_grammar_element * pos = stack.back();
842
+
843
+ switch (pos->type) {
844
+ case LLAMA_GRETYPE_RULE_REF: {
845
+ const size_t rule_id = static_cast<size_t>(pos->value);
846
+ const llama_grammar_element * subpos = rules[rule_id].data();
847
+ do {
848
+ // init new stack without the top (pos)
849
+ llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
850
+ if (!llama_grammar_is_end_of_sequence(pos + 1)) {
851
+ // if this rule ref is followed by another element, add that to stack
852
+ new_stack.push_back(pos + 1);
853
+ }
854
+ if (!llama_grammar_is_end_of_sequence(subpos)) {
855
+ // if alternate is nonempty, add to stack
856
+ new_stack.push_back(subpos);
857
+ }
858
+ llama_grammar_advance_stack(rules, new_stack, new_stacks);
859
+ while (!llama_grammar_is_end_of_sequence(subpos)) {
860
+ // scan to end of alternate def
861
+ subpos++;
862
+ }
863
+ if (subpos->type == LLAMA_GRETYPE_ALT) {
864
+ // there's another alternate def of this rule to process
865
+ subpos++;
866
+ } else {
867
+ break;
868
+ }
869
+ } while (true);
870
+ break;
871
+ }
872
+ case LLAMA_GRETYPE_CHAR:
873
+ case LLAMA_GRETYPE_CHAR_NOT:
874
+ case LLAMA_GRETYPE_CHAR_ANY:
875
+ case LLAMA_GRETYPE_TOKEN:
876
+ case LLAMA_GRETYPE_TOKEN_NOT:
877
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
878
+ // only add the stack if it's not a duplicate of one we already have
879
+ new_stacks.emplace_back(stack);
880
+ }
881
+ break;
882
+ default:
883
+ // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
884
+ // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
885
+ // those
886
+ GGML_ABORT("fatal error");
887
+ }
888
+ }
889
+
890
+ static llama_grammar_candidates llama_grammar_reject_candidates(
891
+ const llama_grammar_rules & rules,
892
+ const llama_grammar_stacks & stacks,
893
+ const llama_grammar_candidates & candidates) {
894
+ GGML_ASSERT(!stacks.empty()); // REVIEW
895
+
896
+ if (candidates.empty()) {
897
+ return {};
898
+ }
899
+
900
+ auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
901
+
902
+ for (size_t i = 1, size = stacks.size(); i < size; ++i) {
903
+ rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
904
+ }
905
+
906
+ return rejects;
907
+ }
908
+
909
+ static bool llama_grammar_detect_left_recursion(
910
+ const llama_grammar_rules & rules,
911
+ size_t rule_index,
912
+ std::vector<bool> * rules_visited,
913
+ std::vector<bool> * rules_in_progress,
914
+ std::vector<bool> * rules_may_be_empty) {
915
+ if ((*rules_in_progress)[rule_index]) {
916
+ return true;
917
+ }
918
+
919
+ (*rules_in_progress)[rule_index] = true;
920
+
921
+ const llama_grammar_rule & rule = rules[rule_index];
922
+
923
+ // First check if the rule might produce the empty string. This could be done combined with the second
924
+ // step but it's more readable as two steps.
925
+ bool at_rule_start = true;
926
+ for (size_t i = 0; i < rule.size(); i++) {
927
+ if (llama_grammar_is_end_of_sequence(&rule[i])) {
928
+ if (at_rule_start) {
929
+ (*rules_may_be_empty)[rule_index] = true;
930
+ break;
931
+ }
932
+ at_rule_start = true;
933
+ } else {
934
+ at_rule_start = false;
935
+ }
936
+ }
937
+
938
+ // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
939
+ // be empty)
940
+ bool recurse_into_nonterminal = true;
941
+ for (size_t i = 0; i < rule.size(); i++) {
942
+ if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
943
+ if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
944
+ return true;
945
+ }
946
+ if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
947
+ recurse_into_nonterminal = false;
948
+ }
949
+ } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
950
+ recurse_into_nonterminal = true;
951
+ } else {
952
+ recurse_into_nonterminal = false;
953
+ }
954
+ }
955
+
956
+ (*rules_in_progress)[rule_index] = false;
957
+ (*rules_visited)[rule_index] = true;
958
+
959
+ return false;
960
+ }
961
+
962
+ const llama_grammar_rules & llama_grammar_get_rules(const struct llama_grammar * grammar) {
963
+ return grammar->rules;
964
+ }
965
+
966
+ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar) {
967
+ return grammar->stacks;
968
+ }
969
+
970
+ static void llama_grammar_accept_chr(
971
+ struct llama_grammar & grammar,
972
+ const llama_grammar_stack & stack,
973
+ uint32_t chr,
974
+ llama_grammar_stacks & new_stacks) {
975
+ if (stack.empty()) {
976
+ return;
977
+ }
978
+
979
+ const llama_grammar_element * pos = stack.back();
980
+
981
+ // ignore if this turns into a token
982
+ if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
983
+ return;
984
+ }
985
+
986
+ auto match = llama_grammar_match_char(pos, chr);
987
+ if (match.first) {
988
+ llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
989
+ if (!llama_grammar_is_end_of_sequence(match.second)) {
990
+ new_stack.push_back(match.second);
991
+ }
992
+ llama_grammar_advance_stack(grammar.rules, new_stack, new_stacks);
993
+ }
994
+ }
995
+
996
+ void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
997
+ llama_grammar_stacks stacks_new;
998
+ stacks_new.reserve(grammar->stacks.size());
999
+
1000
+ for (const auto & stack : grammar->stacks) {
1001
+ llama_grammar_accept_chr(*grammar, stack, chr, stacks_new);
1002
+ }
1003
+
1004
+ grammar->stacks = std::move(stacks_new);
1005
+ }
1006
+
1007
+ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
1008
+ const llama_grammar_rules & rules,
1009
+ const llama_grammar_stack & stack,
1010
+ const llama_grammar_candidates & candidates) {
1011
+
1012
+ llama_grammar_candidates rejects;
1013
+ rejects.reserve(candidates.size());
1014
+
1015
+ if (stack.empty()) {
1016
+ for (const auto & tok : candidates) {
1017
+ if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
1018
+ rejects.push_back(tok);
1019
+ }
1020
+ }
1021
+ return rejects;
1022
+ }
1023
+
1024
+ const llama_grammar_element * stack_pos = stack.back();
1025
+
1026
+ // if the top of the stack is a token rule, then we only need to check the token id
1027
+ if (stack_pos->type == LLAMA_GRETYPE_TOKEN || stack_pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
1028
+ for (const auto & tok : candidates) {
1029
+ if (*tok.code_points == 0) {
1030
+ // reached the end of a token consumed by char rules, reject iff it ended
1031
+ // in a partial response
1032
+ if (tok.partial_utf8.n_remain != 0) {
1033
+ rejects.push_back(tok);
1034
+ }
1035
+ } else if (!llama_grammar_match_token(stack_pos, tok.id)) {
1036
+ rejects.push_back(tok);
1037
+ }
1038
+ }
1039
+ return rejects;
1040
+ }
1041
+
1042
+ llama_grammar_candidates next_candidates;
1043
+ next_candidates.reserve(candidates.size());
1044
+
1045
+ for (const auto & tok : candidates) {
1046
+ if (*tok.code_points == 0) {
1047
+ // reached end of full codepoints in token, reject iff it ended in a partial sequence
1048
+ // that cannot satisfy this position in grammar
1049
+ if (tok.partial_utf8.n_remain != 0 &&
1050
+ !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
1051
+ rejects.push_back(tok);
1052
+ }
1053
+ } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
1054
+ next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8, tok.id });
1055
+ } else {
1056
+ rejects.push_back(tok);
1057
+ }
1058
+ }
1059
+
1060
+ const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
1061
+
1062
+ // update top of stack to next element, if any
1063
+ llama_grammar_stack stack_after(stack.begin(), stack.end() - 1);
1064
+ if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
1065
+ stack_after.push_back(stack_pos_after);
1066
+ }
1067
+ llama_grammar_stacks next_stacks;
1068
+ llama_grammar_advance_stack(rules, stack_after, next_stacks);
1069
+
1070
+ auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
1071
+ for (const auto & tok : next_rejects) {
1072
+ rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8, tok.id });
1073
+ }
1074
+
1075
+ return rejects;
1076
+ }
1077
+
1078
+ ////////////////////
1079
+
1080
+ struct llama_grammar * llama_grammar_init_impl(
1081
+ const struct llama_vocab * vocab,
1082
+ const llama_grammar_element ** rules,
1083
+ size_t n_rules,
1084
+ size_t start_rule_index) {
1085
+ const llama_grammar_element * pos;
1086
+
1087
+ // copy rule definitions into vectors
1088
+ llama_grammar_rules vec_rules(n_rules);
1089
+ for (size_t i = 0; i < n_rules; i++) {
1090
+ for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
1091
+ vec_rules[i].push_back(*pos);
1092
+ }
1093
+ vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
1094
+ }
1095
+
1096
+ // Check for left recursion
1097
+ std::vector<bool> rules_visited(n_rules);
1098
+ std::vector<bool> rules_in_progress(n_rules);
1099
+ std::vector<bool> rules_may_be_empty(n_rules);
1100
+ for (size_t i = 0; i < n_rules; i++) {
1101
+ if (rules_visited[i]) {
1102
+ continue;
1103
+ }
1104
+ if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
1105
+ LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
1106
+ return nullptr;
1107
+ }
1108
+ }
1109
+
1110
+ // loop over alternates of start rule to build initial stacks
1111
+ llama_grammar_stacks stacks;
1112
+ pos = vec_rules[start_rule_index].data();
1113
+ do {
1114
+ llama_grammar_stack stack;
1115
+ if (!llama_grammar_is_end_of_sequence(pos)) {
1116
+ // if alternate is nonempty, add to stack
1117
+ stack.push_back(pos);
1118
+ }
1119
+ llama_grammar_advance_stack(vec_rules, stack, stacks);
1120
+ while (!llama_grammar_is_end_of_sequence(pos)) {
1121
+ // scan to end of alternate def
1122
+ pos++;
1123
+ }
1124
+ if (pos->type == LLAMA_GRETYPE_ALT) {
1125
+ // there's another alternate def of this rule to process
1126
+ pos++;
1127
+ } else {
1128
+ break;
1129
+ }
1130
+ } while (true);
1131
+
1132
+ // Important: vec_rules has to be moved here, not copied, because stacks contains
1133
+ // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
1134
+ // then the pointers would be invalidated when the local vec_rules goes out of scope.
1135
+ return new llama_grammar {
1136
+ vocab,
1137
+ std::move(vec_rules),
1138
+ std::move(stacks),
1139
+ /* .partial_utf8 = */ {},
1140
+ /* .lazy = */ false,
1141
+ /* .awaiting_trigger = */ false,
1142
+ /* .trigger_buffer = */ "",
1143
+ /* .trigger_buffer_positions = */ {},
1144
+ /* .trigger_tokens = */ {},
1145
+ /* .trigger_patterns = */ {},
1146
+ };
1147
+ }
1148
+
1149
+ struct llama_grammar * llama_grammar_init_impl(
1150
+ const struct llama_vocab * vocab,
1151
+ const char * grammar_str,
1152
+ const char * grammar_root,
1153
+ bool lazy,
1154
+ const char ** trigger_patterns,
1155
+ size_t num_trigger_patterns,
1156
+ const llama_token * trigger_tokens,
1157
+ size_t num_trigger_tokens) {
1158
+ llama_grammar_parser parser(vocab);
1159
+
1160
+ // if there is a grammar, parse it
1161
+ // rules will be empty (default) if there are parse errors
1162
+ if (!parser.parse(grammar_str) || parser.rules.empty()) {
1163
+ fprintf(stderr, "%s: failed to parse grammar\n", __func__);
1164
+ return nullptr;
1165
+ }
1166
+
1167
+ // Ensure that there is a "root" node.
1168
+ if (parser.symbol_ids.find("root") == parser.symbol_ids.end()) {
1169
+ fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
1170
+ return nullptr;
1171
+ }
1172
+
1173
+ std::vector<const llama_grammar_element *> grammar_rules(parser.c_rules());
1174
+
1175
+ const size_t n_rules = grammar_rules.size();
1176
+ const size_t start_rule_index = parser.symbol_ids.at(grammar_root);
1177
+
1178
+ const llama_grammar_element * pos;
1179
+
1180
+ // copy rule definitions into vectors
1181
+ llama_grammar_rules vec_rules(n_rules);
1182
+ for (size_t i = 0; i < n_rules; i++) {
1183
+ for (pos = grammar_rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
1184
+ vec_rules[i].push_back(*pos);
1185
+ }
1186
+ vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
1187
+ }
1188
+
1189
+ // Check for left recursion
1190
+ std::vector<bool> rules_visited(n_rules);
1191
+ std::vector<bool> rules_in_progress(n_rules);
1192
+ std::vector<bool> rules_may_be_empty(n_rules);
1193
+ for (size_t i = 0; i < n_rules; i++) {
1194
+ if (rules_visited[i]) {
1195
+ continue;
1196
+ }
1197
+ if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
1198
+ LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
1199
+ return nullptr;
1200
+ }
1201
+ }
1202
+
1203
+ // loop over alternates of start rule to build initial stacks
1204
+ llama_grammar_stacks stacks;
1205
+ pos = vec_rules[start_rule_index].data();
1206
+ do {
1207
+ llama_grammar_stack stack;
1208
+ if (!llama_grammar_is_end_of_sequence(pos)) {
1209
+ // if alternate is nonempty, add to stack
1210
+ stack.push_back(pos);
1211
+ }
1212
+ llama_grammar_advance_stack(vec_rules, stack, stacks);
1213
+ while (!llama_grammar_is_end_of_sequence(pos)) {
1214
+ // scan to end of alternate def
1215
+ pos++;
1216
+ }
1217
+ if (pos->type == LLAMA_GRETYPE_ALT) {
1218
+ // there's another alternate def of this rule to process
1219
+ pos++;
1220
+ } else {
1221
+ break;
1222
+ }
1223
+ } while (true);
1224
+
1225
+ std::vector<llama_token> vec_trigger_tokens;
1226
+ std::vector<llama_grammar_trigger_pattern> vec_trigger_patterns;
1227
+ for (size_t i = 0; i < num_trigger_tokens; i++) {
1228
+ GGML_ASSERT(trigger_tokens != nullptr);
1229
+ vec_trigger_tokens.push_back(trigger_tokens[i]);
1230
+ }
1231
+ for (size_t i = 0; i < num_trigger_patterns; i++) {
1232
+ GGML_ASSERT(trigger_patterns != nullptr);
1233
+ auto & trigger = vec_trigger_patterns.emplace_back();
1234
+ trigger.pattern = trigger_patterns[i];
1235
+ trigger.regex = std::regex(trigger.pattern);
1236
+ }
1237
+
1238
+ // Important: vec_rules has to be moved here, not copied, because stacks contains
1239
+ // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
1240
+ // then the pointers would be invalidated when the local vec_rules goes out of scope.
1241
+ return new llama_grammar {
1242
+ vocab,
1243
+ std::move(vec_rules),
1244
+ std::move(stacks),
1245
+ /* .partial_utf8 = */ {},
1246
+ /* .lazy = */ lazy,
1247
+ /* .awaiting_trigger = */ lazy,
1248
+ /* .trigger_buffer = */ "",
1249
+ /* .trigger_buffer_positions = */ {},
1250
+ std::move(vec_trigger_tokens),
1251
+ std::move(vec_trigger_patterns),
1252
+ };
1253
+ }
1254
+
1255
// Destroys a grammar created by llama_grammar_init_impl or llama_grammar_clone_impl.
// Passing nullptr is allowed and does nothing.
void llama_grammar_free_impl(struct llama_grammar * grammar) {
    // deleting a null pointer is a no-op, so no separate null check is needed
    delete grammar;
}
1262
+
1263
+ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
1264
+ auto * result = new llama_grammar {
1265
+ grammar.vocab,
1266
+ grammar.rules,
1267
+ grammar.stacks,
1268
+ grammar.partial_utf8,
1269
+ grammar.lazy,
1270
+ grammar.awaiting_trigger,
1271
+ grammar.trigger_buffer,
1272
+ grammar.trigger_buffer_positions,
1273
+ grammar.trigger_tokens,
1274
+ grammar.trigger_patterns,
1275
+ };
1276
+
1277
+ // redirect elements in stacks to point to new rules
1278
+ for (size_t is = 0; is < result->stacks.size(); is++) {
1279
+ for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
1280
+ for (size_t ir0 = 0; ir0 < grammar.rules.size(); ir0++) {
1281
+ for (size_t ir1 = 0; ir1 < grammar.rules[ir0].size(); ir1++) {
1282
+ if (grammar.stacks[is][ie] == &grammar.rules[ir0][ir1]) {
1283
+ result->stacks[is][ie] = &result->rules[ir0][ir1];
1284
+ }
1285
+ }
1286
+ }
1287
+ }
1288
+ }
1289
+
1290
+ return result;
1291
+ }
1292
+
1293
+ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
1294
+ GGML_ASSERT(grammar.vocab != nullptr);
1295
+
1296
+ if (grammar.awaiting_trigger) {
1297
+ return;
1298
+ }
1299
+
1300
+ bool allow_eog = false;
1301
+ for (const auto & stack : grammar.stacks) {
1302
+ if (stack.empty()) {
1303
+ allow_eog = true;
1304
+ break;
1305
+ }
1306
+ }
1307
+
1308
+ std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
1309
+ candidates_decoded.reserve(cur_p->size);
1310
+
1311
+ llama_grammar_candidates candidates_grammar;
1312
+ candidates_grammar.reserve(cur_p->size);
1313
+
1314
+ for (size_t i = 0; i < cur_p->size; ++i) {
1315
+ const llama_token id = cur_p->data[i].id;
1316
+ const std::string & piece = grammar.vocab->token_to_piece(id);
1317
+
1318
+ if (grammar.vocab->is_eog(id)) {
1319
+ if (!allow_eog) {
1320
+ cur_p->data[i].logit = -INFINITY;
1321
+ }
1322
+ } else if (piece.empty() || piece[0] == 0) {
1323
+ cur_p->data[i].logit = -INFINITY;
1324
+ } else {
1325
+ candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
1326
+ candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second, id });
1327
+ }
1328
+ }
1329
+
1330
+ const auto rejects = llama_grammar_reject_candidates(grammar.rules, grammar.stacks, candidates_grammar);
1331
+ for (const auto & reject : rejects) {
1332
+ cur_p->data[reject.index].logit = -INFINITY;
1333
+ }
1334
+ }
1335
+
1336
+ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
1337
+ GGML_ASSERT(grammar.vocab != nullptr);
1338
+
1339
+ const auto & piece = grammar.vocab->token_to_piece(token);
1340
+
1341
+ if (grammar.awaiting_trigger) {
1342
+ if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
1343
+ grammar.awaiting_trigger = false;
1344
+ grammar.trigger_buffer.clear();
1345
+ llama_grammar_accept_token(grammar, token, piece);
1346
+ LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
1347
+ return;
1348
+ } else {
1349
+ auto position = std::make_pair(grammar.trigger_buffer.size(), grammar.trigger_buffer.size() + piece.size());
1350
+ grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
1351
+ grammar.trigger_buffer += piece;
1352
+
1353
+ for (const auto & trigger_pattern : grammar.trigger_patterns) {
1354
+ auto start = trigger_pattern.find(grammar.trigger_buffer);
1355
+ if (start != std::string::npos) {
1356
+ grammar.awaiting_trigger = false;
1357
+
1358
+ // replay tokens that overlap with [start, end)
1359
+ for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
1360
+ auto [tok_start, tok_end] = tok_pos;
1361
+ if (tok_end <= start) {
1362
+ continue;
1363
+ }
1364
+
1365
+ size_t piece_start = (tok_start < start) ? start : tok_start; // allow for partial token pieces
1366
+ size_t piece_len = tok_end - piece_start;
1367
+ auto tok_piece = grammar.trigger_buffer.substr(piece_start, piece_len);
1368
+ llama_grammar_accept_token(grammar, tok, tok_piece);
1369
+ }
1370
+
1371
+ auto constrained_str = grammar.trigger_buffer.substr(start);
1372
+ grammar.trigger_buffer.clear();
1373
+ grammar.trigger_buffer_positions.clear();
1374
+ LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
1375
+ return;
1376
+ }
1377
+ }
1378
+ LLAMA_LOG_DEBUG("Grammar still awaiting trigger after token %d (`%s`)\n", token, piece.c_str());
1379
+ return;
1380
+ }
1381
+ }
1382
+
1383
+ if (grammar.vocab->is_eog(token)) {
1384
+ for (const auto & stack : grammar.stacks) {
1385
+ if (stack.empty()) {
1386
+ return;
1387
+ }
1388
+ }
1389
+ GGML_ABORT("fatal error");
1390
+ }
1391
+
1392
+ llama_grammar_accept_token(grammar, token, piece);
1393
+ }
1394
+
1395
+ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
1396
+ // Note terminating 0 in decoded string
1397
+ const auto decoded = decode_utf8(piece, grammar.partial_utf8);
1398
+ const auto & code_points = decoded.first;
1399
+
1400
+ for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
1401
+ llama_grammar_accept(&grammar, *it);
1402
+ }
1403
+
1404
+ grammar.partial_utf8 = decoded.second;
1405
+ if (grammar.stacks.empty()) {
1406
+ throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
1407
+ }
1408
+ }
1409
+
1410
+ void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token token, const std::string & piece) {
1411
+ // Note terminating 0 in decoded string
1412
+ const auto decoded = decode_utf8(piece, grammar.partial_utf8);
1413
+ const auto & code_points = decoded.first;
1414
+
1415
+ llama_grammar_stacks stacks_new;
1416
+ stacks_new.reserve(grammar.stacks.size());
1417
+
1418
+ for (const auto & stack : grammar.stacks) {
1419
+ if (stack.empty()) {
1420
+ continue;
1421
+ }
1422
+
1423
+ const llama_grammar_element * pos = stack.back();
1424
+
1425
+ if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
1426
+ if (llama_grammar_match_token(pos, token)) {
1427
+ llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
1428
+ if (!llama_grammar_is_end_of_sequence(pos + 1)) {
1429
+ new_stack.push_back(pos + 1);
1430
+ }
1431
+ llama_grammar_advance_stack(grammar.rules, new_stack, stacks_new);
1432
+ }
1433
+ } else {
1434
+ llama_grammar_stacks current_stacks = {stack};
1435
+
1436
+ for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
1437
+ llama_grammar_stacks next_stacks;
1438
+
1439
+ for (const auto & cur_stack : current_stacks) {
1440
+ llama_grammar_accept_chr(grammar, cur_stack, *it, next_stacks);
1441
+ }
1442
+
1443
+ current_stacks = std::move(next_stacks);
1444
+ if (current_stacks.empty()) {
1445
+ break;
1446
+ }
1447
+ }
1448
+
1449
+ for (auto & surviving_stack : current_stacks) {
1450
+ if (std::find(stacks_new.begin(), stacks_new.end(), surviving_stack) == stacks_new.end()) {
1451
+ stacks_new.emplace_back(surviving_stack);
1452
+ }
1453
+ }
1454
+ }
1455
+ }
1456
+
1457
+ grammar.stacks = std::move(stacks_new);
1458
+ grammar.partial_utf8 = decoded.second;
1459
+
1460
+ if (grammar.stacks.empty()) {
1461
+ throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece + " (" + std::to_string(token) + ")");
1462
+ }
1463
+ }
1464
+