cui-llama.rn 1.7.3 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +94 -8
  4. package/android/src/main/java/com/rnllama/RNLlama.java +247 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/chat.cpp CHANGED
@@ -1,8 +1,18 @@
1
1
  #include "chat.h"
2
+ #include "chat-parser.h"
3
+ #include "common.h"
4
+ #include "json-partial.h"
2
5
  #include "json-schema-to-grammar.h"
3
6
  #include "log.h"
7
+ #include "regex-partial.h"
4
8
 
9
+ #include <cstdio>
10
+ #include <exception>
11
+ #include <iostream>
5
12
  #include <optional>
13
+ #include <stdexcept>
14
+ #include <string>
15
+ #include <vector>
6
16
 
7
17
  static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
8
18
  auto time = std::chrono::system_clock::to_time_t(now);
@@ -13,6 +23,101 @@ static std::string format_time(const std::chrono::system_clock::time_point & now
13
23
  return res;
14
24
  }
15
25
 
26
+ static std::string string_diff(const std::string & last, const std::string & current) {
27
+ if (last.empty()) {
28
+ return current;
29
+ }
30
+ if (!string_starts_with(current, last)) {
31
+ if (string_starts_with(last, current)) {
32
+ // This happens if the last generation ended on a partial stop word (not erased),
33
+ // and the current ended on a stop word (erased).
34
+ return "";
35
+ }
36
+ throw std::runtime_error("Invalid diff: '" + last + "' not found at start of '" + current + "'");
37
+ }
38
+ return current.substr(last.size());
39
+ }
40
+
41
+ static bool has_content_or_tool_calls(const common_chat_msg & msg) {
42
+ return !msg.content.empty() || !msg.tool_calls.empty();
43
+ }
44
+
45
+ template <>
46
+ json common_chat_msg::to_json_oaicompat() const
47
+ {
48
+ json message {
49
+ {"role", "assistant"},
50
+ };
51
+ if (!reasoning_content.empty()) {
52
+ message["reasoning_content"] = reasoning_content;
53
+ }
54
+ if (content.empty() && !tool_calls.empty()) {
55
+ message["content"] = json();
56
+ } else {
57
+ message["content"] = content;
58
+ }
59
+ if (!tool_calls.empty()) {
60
+ auto arr = json::array();
61
+ for (const auto & tc : tool_calls) {
62
+ arr.push_back({
63
+ {"type", "function"},
64
+ {"function", {
65
+ {"name", tc.name},
66
+ {"arguments", tc.arguments},
67
+ }},
68
+ {"id", tc.id},
69
+ // // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
70
+ // // We only generate a random id for the ones that don't generate one by themselves
71
+ // // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
72
+ // {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
73
+ });
74
+ }
75
+ message["tool_calls"] = arr;
76
+ }
77
+ return message;
78
+ }
79
+
80
+ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
81
+ std::vector<common_chat_msg_diff> diffs;
82
+ if (previous_msg.reasoning_content != new_msg.reasoning_content) {
83
+ auto & diff = diffs.emplace_back();
84
+ diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
85
+ }
86
+ if (previous_msg.content != new_msg.content) {
87
+ auto & diff = diffs.emplace_back();
88
+ diff.content_delta = string_diff(previous_msg.content, new_msg.content);
89
+ }
90
+
91
+ if (new_msg.tool_calls.size() < previous_msg.tool_calls.size()) {
92
+ throw std::runtime_error("Invalid diff: now finding less tool calls!");
93
+ }
94
+
95
+ if (!previous_msg.tool_calls.empty()) {
96
+ auto idx = previous_msg.tool_calls.size() - 1;
97
+ const auto & pref = previous_msg.tool_calls[idx];
98
+ const auto & newf = new_msg.tool_calls[idx];
99
+ if (pref.name != newf.name) {
100
+ throw std::runtime_error("Invalid diff: tool call mismatch!");
101
+ }
102
+ auto args_diff = string_diff(pref.arguments, newf.arguments);
103
+ if (!args_diff.empty() || pref.id != newf.id) {
104
+ auto & diff = diffs.emplace_back();
105
+ diff.tool_call_index = idx;
106
+ if (pref.id != newf.id) {
107
+ diff.tool_call_delta.id = newf.id;
108
+ diff.tool_call_delta.name = newf.name;
109
+ }
110
+ diff.tool_call_delta.arguments = args_diff;
111
+ }
112
+ }
113
+ for (size_t idx = previous_msg.tool_calls.size(); idx < new_msg.tool_calls.size(); ++idx) {
114
+ auto & diff = diffs.emplace_back();
115
+ diff.tool_call_index = idx;
116
+ diff.tool_call_delta = new_msg.tool_calls[idx];
117
+ }
118
+ return diffs;
119
+ }
120
+
16
121
  struct templates_params {
17
122
  json messages;
18
123
  json tools;
@@ -22,7 +127,7 @@ struct templates_params {
22
127
  bool stream;
23
128
  std::string grammar;
24
129
  bool add_generation_prompt = true;
25
- bool extract_reasoning = true;
130
+ bool enable_thinking = true;
26
131
  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
27
132
  };
28
133
 
@@ -267,6 +372,32 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t
267
372
  return result;
268
373
  }
269
374
 
375
+ template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
376
+ json delta = json::object();
377
+ if (!diff.reasoning_content_delta.empty()) {
378
+ delta["reasoning_content"] = diff.reasoning_content_delta;
379
+ }
380
+ if (!diff.content_delta.empty()) {
381
+ delta["content"] = diff.content_delta;
382
+ }
383
+ if (diff.tool_call_index != std::string::npos) {
384
+ json tool_call;
385
+ tool_call["index"] = diff.tool_call_index;
386
+ if (!diff.tool_call_delta.id.empty()) {
387
+ tool_call["id"] = diff.tool_call_delta.id;
388
+ tool_call["type"] = "function";
389
+ }
390
+ json function = json::object();
391
+ if (!diff.tool_call_delta.name.empty()) {
392
+ function["name"] = diff.tool_call_delta.name;
393
+ }
394
+ function["arguments"] = diff.tool_call_delta.arguments;
395
+ tool_call["function"] = function;
396
+ delta["tool_calls"] = json::array({tool_call});
397
+ }
398
+ return delta;
399
+ }
400
+
270
401
  bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
271
402
  if (use_jinja) {
272
403
  try {
@@ -434,7 +565,7 @@ common_chat_templates_ptr common_chat_templates_init(
434
565
  return tmpls;
435
566
  }
436
567
 
437
- std::string common_chat_format_name(common_chat_format format) {
568
+ const char * common_chat_format_name(common_chat_format format) {
438
569
  switch (format) {
439
570
  case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
440
571
  case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
@@ -442,182 +573,128 @@ std::string common_chat_format_name(common_chat_format format) {
442
573
  case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
443
574
  case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
444
575
  case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
445
- case COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING: return "DeepSeek R1 (extract reasoning)";
446
576
  case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
447
577
  case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
448
578
  case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
449
579
  case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
450
- case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING: return "Hermes 2 Pro (extract reasoning)";
451
580
  case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
452
- case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return "Command R7B (extract reasoning)";
453
581
  default:
454
582
  throw std::runtime_error("Unknown chat format");
455
583
  }
456
584
  }
457
585
 
458
- static bool parse_json(std::string::const_iterator & it, const std::string::const_iterator & end, json & out) {
459
- // // https://json.nlohmann.me/features/parsing/sax_interface/
460
- struct json_error_locator : public nlohmann::json_sax<json> {
461
- std::size_t position;
462
- bool found_error;
463
-
464
- json_error_locator() : position(0), found_error(false) {}
465
-
466
- bool parse_error(std::size_t position, const std::string &, const json::exception &) override { // NOLINT
467
- this->position = position - 1;
468
- this->found_error = true;
469
- return false;
470
- }
471
- bool null() override { return true; } // NOLINT
472
- bool boolean(bool) override { return true; } // NOLINT
473
- bool number_integer(number_integer_t) override { return true; } // NOLINT
474
- bool number_unsigned(number_unsigned_t) override { return true; } // NOLINT
475
- bool number_float(number_float_t, const string_t &) override { return true; } // NOLINT
476
- bool string(string_t &) override { return true; } // NOLINT
477
- bool binary(binary_t &) override { return true; } // NOLINT
478
- bool start_object(std::size_t) override { return true; } // NOLINT
479
- bool key(string_t &) override { return true; } // NOLINT
480
- bool end_object() override { return true; }
481
- bool start_array(std::size_t) override { return true; } // NOLINT
482
- bool end_array() override { return true; }
483
- };
484
- json_error_locator err_loc;
485
- json::sax_parse(it, end, &err_loc);
486
-
487
- std::string::const_iterator temptative_end;
488
- if (err_loc.found_error) {
489
- temptative_end = it + err_loc.position;
490
- } else {
491
- temptative_end = end;
492
- }
493
- std::string json_sub {it, temptative_end};
494
- try {
495
- out = json::parse(json_sub);
496
- it = temptative_end;
497
- return true;
498
- } catch (const std::exception &) {
499
- return false;
500
- }
501
- }
502
-
503
- static bool parse_literal(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) {
504
- auto expected_it = expected.begin();
505
- auto tmp_it = it;
506
- while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) {
507
- ++tmp_it;
508
- ++expected_it;
509
- }
510
- if (expected_it == expected.end()) {
511
- it = tmp_it;
512
- return true;
513
- }
514
- return false;
515
- }
516
-
517
- static std::optional<std::smatch> parse_pattern(std::string::const_iterator & it, const std::string::const_iterator & end, const std::regex & expected) {
518
- std::smatch match;
519
- if (std::regex_match(it, end, match, expected)) {
520
- it = match.suffix().first;
521
- return match;
586
+ const char * common_reasoning_format_name(common_reasoning_format format) {
587
+ switch (format) {
588
+ case COMMON_REASONING_FORMAT_NONE: return "none";
589
+ case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
590
+ case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
591
+ default:
592
+ throw std::runtime_error("Unknown reasoning format");
522
593
  }
523
- return std::nullopt;
524
594
  }
525
595
 
526
- static void consume_spaces(std::string::const_iterator & it, const std::string::const_iterator & end) {
527
- while (it != end && std::isspace(*it)) {
528
- ++it;
596
+ static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
597
+ std::string arguments;
598
+ if (builder.is_partial()) {
599
+ arguments = (json {{"code", code + builder.healing_marker()}}).dump();
600
+ auto idx = arguments.find(builder.healing_marker());
601
+ if (idx != std::string::npos) {
602
+ arguments.resize(idx);
603
+ }
604
+ } else {
605
+ arguments = (json {{"code", code}}).dump();
529
606
  }
607
+ return arguments;
530
608
  }
531
609
 
532
610
  /**
533
611
  * Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between.
534
612
  * Aggregates the prefix, suffix and in-between text into the content.
535
613
  */
536
- static common_chat_msg parse_json_tool_calls(
537
- const std::string& input,
538
- const std::optional<std::regex> & trigger_opt,
539
- const std::regex & function_regex,
540
- const std::regex & close_regex,
541
- bool allow_raw_python = false) {
542
- std::smatch match;
543
-
544
- common_chat_msg result;
545
- result.role = "assistant";
546
-
547
-
548
- auto end = input.end();
549
- auto it = input.begin();
550
-
551
- if (trigger_opt) {
552
- if (!std::regex_search(it, end, match, *trigger_opt)) {
553
- result.content = input;
554
- return result;
555
- }
556
- result.content = match.prefix().str();
557
- it = match.suffix().first;
558
- }
614
+ static void parse_json_tool_calls(
615
+ common_chat_msg_parser & builder,
616
+ const std::optional<common_regex> & block_open,
617
+ const std::optional<common_regex> & function_regex_start_only,
618
+ const std::optional<common_regex> & function_regex,
619
+ const common_regex & close_regex,
620
+ const std::optional<common_regex> & block_close,
621
+ bool allow_raw_python = false,
622
+ const std::function<std::string(const common_chat_msg_parser::find_regex_result & fres)> & get_function_name = nullptr) {
623
+
624
+ auto parse_tool_calls = [&]() {
625
+ size_t from = std::string::npos;
626
+ auto first = true;
627
+ while (true) {
628
+ auto res = function_regex_start_only && first
629
+ ? builder.try_consume_regex(*function_regex_start_only)
630
+ : function_regex
631
+ ? builder.try_find_regex(*function_regex, from)
632
+ : std::nullopt;
633
+ if (res) {
634
+ std::string name;
635
+ if (get_function_name) {
636
+ name = get_function_name(*res);
637
+ } else {
638
+ LM_GGML_ASSERT(res->groups.size() == 2);
639
+ name = builder.str(res->groups[1]);
640
+ }
641
+ first = false;
642
+ if (name.empty()) {
643
+ // get_function_name signalled us that we should skip this match and treat it as content.
644
+ from = res->groups[0].begin + 1;
645
+ continue;
646
+ }
647
+ from = std::string::npos;
559
648
 
560
- while (it != end) {
561
- std::sregex_iterator rend;
562
- std::sregex_iterator rit(it, end, function_regex);
563
- if (rit == rend) {
564
- result.content += std::string(it, end);
649
+ auto maybe_raw_python = name == "python" && allow_raw_python;
650
+ if (builder.input()[builder.pos()] == '{' || !maybe_raw_python) {
651
+ if (auto arguments = builder.try_consume_json_with_dumped_args({{}})) {
652
+ if (!builder.add_tool_call(name, "", arguments->value) || arguments->is_partial) {
653
+ throw common_chat_msg_partial_exception("incomplete tool call");
654
+ }
655
+ builder.consume_regex(close_regex);
656
+ }
657
+ continue;
658
+ }
659
+ if (maybe_raw_python) {
660
+ auto arguments = wrap_code_as_arguments(builder, builder.consume_rest());
661
+ if (!builder.add_tool_call(name, "", arguments)) {
662
+ throw common_chat_msg_partial_exception("incomplete tool call");
663
+ }
664
+ return;
665
+ }
666
+ throw common_chat_msg_partial_exception("incomplete tool call");
667
+ }
565
668
  break;
566
669
  }
567
- auto name = rit->str(1);
568
- result.content += std::string(it, rit->prefix().second);
569
- it = rit->suffix().first;
570
-
571
- json arguments;
572
- if (parse_json(it, end, arguments)) {
573
- if (!std::regex_search(it, end, match, close_regex)) {
574
- throw std::runtime_error("Malformed input, missing closing pattern: " + input);
575
- }
576
- it = match.suffix().first;
577
- result.tool_calls.push_back({name, arguments.is_string() ? arguments.get<std::string>() : arguments.dump(), /* id= */ ""});
578
- } else {
579
- if (allow_raw_python && name == "python") {
580
- result.tool_calls.push_back({name, json({{"code", std::string(it, end)}}).dump(), /* id= */ ""});
581
- break;
582
- }
583
- throw std::runtime_error("Failed to parse json tool call arguments: " + input);
670
+ if (block_close) {
671
+ builder.consume_regex(*block_close);
584
672
  }
585
- }
586
-
587
- if (!result.tool_calls.empty()) {
588
- if (!string_strip(result.content).empty()) {
589
- LOG_WRN("Content found with tool calls: %s\n", result.content.c_str());
673
+ builder.consume_spaces();
674
+ builder.add_content(builder.consume_rest());
675
+ };
676
+ if (block_open) {
677
+ if (auto res = builder.try_find_regex(*block_open)) {
678
+ parse_tool_calls();
679
+ } else {
680
+ builder.add_content(builder.consume_rest());
590
681
  }
591
- result.content = "";
682
+ } else {
683
+ parse_tool_calls();
592
684
  }
593
- return result;
594
685
  }
595
686
 
596
- static common_chat_tool_call process_tool_call(const json & tool_call) {
597
- const auto & arguments = tool_call.at("arguments");
598
- return {
599
- /* .name = */ tool_call.at("name"),
600
- /* .arguments = */ arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
601
- /* .id = */ tool_call.contains("id") ? tool_call.at("id") : "",
602
- };
603
- }
604
- static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& input, const std::string & prefix, size_t rstrip_prefix = 0) {
605
- auto content_end = input.find(prefix);
606
- size_t tc_start = std::string::npos;
607
-
608
- common_chat_msg result;
609
- result.role = "assistant";
610
- if (content_end == std::string::npos) {
611
- result.content = input;
612
- } else {
613
- tc_start = content_end + prefix.size() - rstrip_prefix;
614
- result.content = input.substr(0, content_end);
615
- auto tool_calls = json::parse(input.substr(tc_start));
616
- for (const auto & tool_call : tool_calls) {
617
- result.tool_calls.emplace_back(process_tool_call(tool_call));
687
+ static void parse_prefixed_json_tool_call_array(common_chat_msg_parser & builder, const common_regex & prefix, size_t rstrip_prefix = 0) {
688
+ static const std::vector<std::vector<std::string>> args_paths = {{"arguments"}};
689
+ if (auto res = builder.try_find_regex(prefix)) {
690
+ builder.move_back(rstrip_prefix);
691
+ auto tool_calls = builder.consume_json_with_dumped_args(args_paths);
692
+ if (!builder.add_tool_calls(tool_calls.value) || tool_calls.is_partial) {
693
+ throw common_chat_msg_partial_exception("incomplete tool call array");
618
694
  }
695
+ } else {
696
+ builder.add_content(builder.consume_rest());
619
697
  }
620
- return result;
621
698
  }
622
699
 
623
700
  static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
@@ -744,29 +821,36 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
744
821
  data.format = COMMON_CHAT_FORMAT_GENERIC;
745
822
  return data;
746
823
  }
747
- static common_chat_msg common_chat_parse_generic(const std::string & input) {
748
- json data = json::parse(input);
749
- common_chat_msg result;
750
- result.role = "assistant";
751
- if (data.contains("tool_calls")) {
752
- for (const auto & tool_call : data.at("tool_calls")) {
753
- result.tool_calls.push_back({
754
- tool_call.at("name"),
755
- tool_call.at("arguments").dump(),
756
- tool_call.contains("id") ? tool_call.at("id") : "",
757
- });
824
+ static void common_chat_parse_generic(common_chat_msg_parser & builder) {
825
+ if (!builder.syntax().parse_tool_calls) {
826
+ builder.add_content(builder.consume_rest());
827
+ return;
828
+ }
829
+ static const std::vector<std::vector<std::string>> content_paths = {
830
+ {"response"},
831
+ };
832
+ static const std::vector<std::vector<std::string>> args_paths = {
833
+ {"tool_call", "arguments"},
834
+ {"tool_calls", "arguments"},
835
+ };
836
+ auto data = builder.consume_json_with_dumped_args(args_paths, content_paths);
837
+ if (data.value.contains("tool_calls")) {
838
+ if (!builder.add_tool_calls(data.value.at("tool_calls")) || data.is_partial) {
839
+ throw common_chat_msg_partial_exception("incomplete tool calls");
758
840
  }
759
- } else if (data.contains("tool_call")) {
760
- result.tool_calls.push_back({
761
- data.at("tool_call").at("name"),
762
- data.at("tool_call").at("arguments").dump(),
763
- /* id= */ "",
764
- });
765
- } else if (data.contains("response")) {
766
- const auto & response = data.at("response");
767
- result.content = response.is_string() ? response.get<std::string>() : response.dump(2);
841
+ } else if (data.value.contains("tool_call")) {
842
+ if (!builder.add_tool_call(data.value.at("tool_call")) || data.is_partial) {
843
+ throw common_chat_msg_partial_exception("incomplete tool call");
844
+ }
845
+ } else if (data.value.contains("response")) {
846
+ const auto & response = data.value.at("response");
847
+ builder.add_content(response.is_string() ? response.template get<std::string>() : response.dump(2));
848
+ if (data.is_partial) {
849
+ throw common_chat_msg_partial_exception("incomplete response");
850
+ }
851
+ } else {
852
+ throw common_chat_msg_partial_exception("Expected 'tool_call', 'tool_calls' or 'response' in JSON");
768
853
  }
769
- return result;
770
854
  }
771
855
 
772
856
  static common_chat_params common_chat_params_init_mistral_nemo(const common_chat_template & tmpl, const struct templates_params & inputs) {
@@ -813,12 +897,44 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
813
897
  data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
814
898
  return data;
815
899
  }
816
- static common_chat_msg common_chat_parse_mistral_nemo(const std::string & input) {
817
- return parse_prefixed_json_tool_call_array(input, "[TOOL_CALLS]");
900
+ static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
901
+ if (!builder.syntax().parse_tool_calls) {
902
+ builder.add_content(builder.consume_rest());
903
+ return;
904
+ }
905
+
906
+ static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
907
+ parse_prefixed_json_tool_call_array(builder, prefix);
818
908
  }
819
909
 
820
910
  static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct templates_params & inputs) {
821
911
  common_chat_params data;
912
+
913
+ auto adjusted_messages = json::array();
914
+ for (const auto & msg : inputs.messages) {
915
+ auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
916
+ auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();
917
+ if (has_reasoning_content && has_tool_calls) {
918
+ auto adjusted_message = msg;
919
+ adjusted_message["tool_plan"] = msg.at("reasoning_content");
920
+ adjusted_message.erase("reasoning_content");
921
+ adjusted_messages.push_back(adjusted_message);
922
+ } else {
923
+ adjusted_messages.push_back(msg);
924
+ }
925
+ }
926
+ data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
927
+ data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
928
+ if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
929
+ if (!inputs.enable_thinking) {
930
+ data.prompt += "<|END_THINKING|>";
931
+ } else {
932
+ data.thinking_forced_open = true;
933
+ }
934
+ } else if (!inputs.enable_thinking && string_ends_with(data.prompt, "<|CHATBOT_TOKEN|>")) {
935
+ data.prompt += "<|START_THINKING|><|END_THINKING|>";
936
+ }
937
+
822
938
  data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
823
939
  data.grammar = build_grammar([&](const common_grammar_builder & builder) {
824
940
  auto schemas = json::array();
@@ -849,11 +965,16 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
849
965
  if (!inputs.parallel_tool_calls) {
850
966
  schema["maxItems"] = 1;
851
967
  }
852
- builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\"");
968
+ builder.add_rule("root",
969
+ std::string(data.thinking_forced_open ? "( \"<|END_THINKING|>\" space )? " : "") +
970
+ "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\"");
853
971
  });
854
972
  data.grammar_triggers.push_back({
855
- COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
856
- "<|START_ACTION|>",
973
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
974
+ // If thinking_forced_open, then we capture the </think> tag in the grammar,
975
+ // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
976
+ std::string(data.thinking_forced_open ? "[\\s\\S]*?(<\\|END_THINKING\\|>\\s*)" : "(?:<\\|START_THINKING\\|>[\\s\\S]*?<\\|END_THINKING\\|>\\s*)?") +
977
+ "(<\\|START_ACTION\\|>)[\\s\\S]*"
857
978
  });
858
979
  data.preserved_tokens = {
859
980
  "<|START_ACTION|>",
@@ -863,61 +984,40 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
863
984
  "<|START_THINKING|>",
864
985
  "<|END_THINKING|>",
865
986
  };
866
- auto adjusted_messages = json::array();
867
- for (const auto & msg : inputs.messages) {
868
- auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
869
- auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();
870
- if (has_reasoning_content && has_tool_calls) {
871
- auto adjusted_message = msg;
872
- adjusted_message["tool_plan"] = msg.at("reasoning_content");
873
- adjusted_message.erase("reasoning_content");
874
- adjusted_messages.push_back(adjusted_message);
875
- } else {
876
- adjusted_messages.push_back(msg);
877
- }
878
- }
879
- data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
880
- data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING : COMMON_CHAT_FORMAT_COMMAND_R7B;
881
987
  return data;
882
988
  }
883
- static common_chat_msg common_chat_parse_command_r7b(const std::string & input, bool extract_reasoning) {
884
- static const std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S]*?)<\\|END_THINKING\\|>)([\\s\\S]*)");
885
- static const std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S]*?)<\\|END_ACTION\\|>");
886
- static const std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S]*?)<\\|END_RESPONSE\\|>");
887
-
888
- std::smatch match;
889
-
890
- common_chat_msg result;
891
- result.role = "assistant";
892
989
 
893
- std::string rest = input;
894
-
895
- if (std::regex_match(rest, match, thought_regex)) {
896
- if (extract_reasoning) {
897
- result.reasoning_content = match[2].str();
898
- } else if (!match[2].str().empty()) {
899
- // Let the unparsed thinking tags through in content only if their insides aren't empty.
900
- result.content = match[1].str();
990
+ static void common_chat_parse_command_r7b(common_chat_msg_parser & builder) {
991
+ builder.try_parse_reasoning("<|START_THINKING|>", "<|END_THINKING|>");
992
+
993
+ static const common_regex start_action_regex("<\\|START_ACTION\\|>");
994
+ static const common_regex end_action_regex("<\\|END_ACTION\\|>");
995
+ static const common_regex start_response_regex("<\\|START_RESPONSE\\|>");
996
+ static const common_regex end_response_regex("<\\|END_RESPONSE\\|>");
997
+
998
+ if (auto res = builder.try_find_regex(start_action_regex)) {
999
+ // If we didn't extract thoughts, prelude includes them.
1000
+ auto tool_calls = builder.consume_json_with_dumped_args({{"parameters"}});
1001
+ for (const auto & tool_call : tool_calls.value) {
1002
+ std::string name = tool_call.contains("tool_name") ? tool_call.at("tool_name") : "";
1003
+ std::string id = tool_call.contains("tool_call_id") ? tool_call.at("tool_call_id") : "";
1004
+ std::string arguments = tool_call.contains("parameters") ? tool_call.at("parameters") : "";
1005
+ if (!builder.add_tool_call(name, id, arguments) || tool_calls.is_partial) {
1006
+ throw common_chat_msg_partial_exception("incomplete tool call");
1007
+ }
901
1008
  }
902
- rest = match[3].str();
903
- }
904
- if (std::regex_match(rest, match, action_regex)) {
905
- auto actions_str = match[1].str();
906
- auto actions = json::parse(actions_str);
907
- for (const auto & action : actions) {
908
- result.tool_calls.push_back({
909
- /* .name = */ action.at("tool_name"),
910
- /* .arguments = */ action.at("parameters").dump(),
911
- /* .id = */ action.at("tool_call_id"),
912
- });
1009
+ if (tool_calls.is_partial) {
1010
+ throw common_chat_msg_partial_exception("incomplete tool call");
1011
+ }
1012
+ builder.consume_regex(end_action_regex);
1013
+ } else if (auto res = builder.try_find_regex(start_response_regex)) {
1014
+ if (!builder.try_find_regex(end_response_regex)) {
1015
+ builder.add_content(builder.consume_rest());
1016
+ throw common_chat_msg_partial_exception(end_response_regex.str());
913
1017
  }
914
- } else if (std::regex_match(rest, match, response_regex)) {
915
- auto response = match[1].str();
916
- result.content += response;
917
1018
  } else {
918
- result.content += rest;
1019
+ builder.add_content(builder.consume_rest());
919
1020
  }
920
- return result;
921
1021
  }
922
1022
 
923
1023
  static void expect_tool_parameters(const std::string & name, const json & parameters, const std::vector<std::string> & expected_properties) {
@@ -994,8 +1094,8 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
994
1094
  });
995
1095
  // Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name.
996
1096
  data.grammar_triggers.push_back({
997
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
998
- "\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*",
1097
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
1098
+ "(\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\")[\\s\\S]*", // + name + "\"[\\s\\S]*",
999
1099
  });
1000
1100
  if (!builtin_tools.empty()) {
1001
1101
  data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
@@ -1018,42 +1118,93 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
1018
1118
  });
1019
1119
  return data;
1020
1120
  }
1021
- static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
1022
- // TODO: tighten & simplify the parser, don't accept leading text context.
1023
- static const std::regex function_regex(
1121
+ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
1122
+ if (!builder.syntax().parse_tool_calls) {
1123
+ builder.add_content(builder.consume_rest());
1124
+ return;
1125
+ }
1126
+
1127
+ static const common_regex function_regex(
1024
1128
  "\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: ");
1025
- static const std::regex close_regex("\\}\\s*");
1026
- static const std::regex builtin_call_regex("<\\|python_tag\\|>\\s*([^.(]+)\\s*\\.\\s*call\\s*\\(\\s*([\\w]+)\\s*=\\s*([\\s\\S]*?)\\)");
1129
+ static const common_regex close_regex("\\}\\s*");
1130
+
1131
+ static const common_regex function_name_regex("\\s*(\\w+)\\s*\\.\\s*call\\(");
1132
+ static const common_regex arg_name_regex("\\s*(\\w+)\\s*=\\s*");
1027
1133
 
1028
1134
  if (with_builtin_tools) {
1029
- std::smatch match;
1030
- if (std::regex_match(input, match, builtin_call_regex)) {
1031
- try {
1032
- auto name = match[1].str();
1033
- auto arg_name = match[2].str();
1034
- auto arg_value_str = match[3].str();
1035
- auto arg_value = json::parse(arg_value_str);
1036
-
1037
- common_chat_msg msg;
1038
- msg.role = "assistant";
1039
- msg.tool_calls.push_back({
1040
- /* .name = */ name,
1041
- /* .arguments = */ (json {
1042
- {arg_name, arg_value},
1043
- }).dump(),
1044
- /* .id = */ "",
1045
- });
1046
- return msg;
1047
- } catch (const std::exception & e) {
1048
- LOG_WRN("Failed to parse builtin tool call arguments (%s): %s", e.what(), input.c_str());
1135
+ static const common_regex builtin_call_regex("<\\|python_tag\\|>");
1136
+ if (auto res = builder.try_find_regex(builtin_call_regex)) {
1137
+ auto fun_res = builder.consume_regex(function_name_regex);
1138
+ auto function_name = builder.str(fun_res.groups[1]);
1139
+
1140
+ common_healing_marker healing_marker;
1141
+ json args = json::object();
1142
+ while (true) {
1143
+ if (auto arg_res = builder.try_consume_regex(arg_name_regex)) {
1144
+ auto arg_name = builder.str(arg_res->groups[1]);
1145
+ auto partial = builder.consume_json();
1146
+ args[arg_name] = partial.json;
1147
+ healing_marker.marker = partial.healing_marker.marker;
1148
+ healing_marker.json_dump_marker = partial.healing_marker.json_dump_marker;
1149
+ builder.consume_spaces();
1150
+ if (!builder.try_consume_literal(",")) {
1151
+ break;
1152
+ }
1153
+ } else {
1154
+ break;
1155
+ }
1156
+ }
1157
+ builder.consume_literal(")");
1158
+ builder.consume_spaces();
1159
+
1160
+ auto arguments = args.dump();
1161
+ if (!builder.add_tool_call(function_name, "", arguments)) {
1162
+ throw common_chat_msg_partial_exception("Incomplete tool call");
1049
1163
  }
1164
+ return;
1050
1165
  }
1051
1166
  }
1052
- return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
1167
+ parse_json_tool_calls(
1168
+ builder,
1169
+ /* block_open= */ std::nullopt,
1170
+ /* function_regex_start_only= */ function_regex,
1171
+ /* function_regex= */ std::nullopt,
1172
+ close_regex,
1173
+ std::nullopt);
1174
+
1053
1175
  }
1054
1176
 
1055
1177
  static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
1056
1178
  common_chat_params data;
1179
+ auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
1180
+
1181
+ // Hacks to fix the official (broken) prompt.
1182
+ // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
1183
+ // until the official template is fixed.
1184
+ if (tmpl.source().find("{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}") != std::string::npos) {
1185
+ // Don't leave the chat dangling after tool results
1186
+ if (string_ends_with(prompt, "<|tool▁outputs▁end|>")) {
1187
+ prompt += "<|end▁of▁sentence|>";
1188
+ if (inputs.add_generation_prompt) {
1189
+ prompt += "<|Assistant|>";
1190
+ }
1191
+ }
1192
+ // Fix up tool call delta example added by Minja
1193
+ prompt = std::regex_replace(
1194
+ prompt,
1195
+ std::regex("(<|tool▁call▁end|>)[\\s\\r\\n]*(<|tool▁outputs▁begin|>|<|User|>)"),
1196
+ "$1<|tool▁calls▁end|><|end▁of▁sentence|>$2");
1197
+ }
1198
+ data.prompt = prompt;
1199
+ data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
1200
+ if (string_ends_with(data.prompt, "<think>\n")) {
1201
+ if (!inputs.enable_thinking) {
1202
+ data.prompt += "</think>";
1203
+ } else {
1204
+ data.thinking_forced_open = true;
1205
+ }
1206
+ }
1207
+
1057
1208
  if (inputs.tools.is_array() && !inputs.tools.empty()) {
1058
1209
  data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
1059
1210
  data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -1064,21 +1215,25 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
1064
1215
  auto parameters = function.at("parameters");
1065
1216
  builder.resolve_refs(parameters);
1066
1217
  tool_rules.push_back(builder.add_rule(name + "-call",
1067
- "\"<|tool▁call▁begin|>function<|tool▁sep|>" + name + "\\n"
1218
+ "( \"<|tool▁call▁begin|>\" )? \"function<|tool▁sep|>" + name + "\\n"
1068
1219
  "```json\\n\" " + builder.add_schema(name + "-args", parameters) + " "
1069
1220
  "\"```<|tool▁call▁end|>\""));
1070
1221
  });
1071
1222
  // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
1072
1223
  // so we accept common variants (then it's all constrained)
1073
1224
  builder.add_rule("root",
1074
- "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" ) "
1225
+ std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
1226
+ "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" | \"<|tool▁calls|>\" ) "
1075
1227
  "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
1076
1228
  "\"<|tool▁calls▁end|>\""
1077
1229
  " space");
1078
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool▁calls▁begin|>"});
1079
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_calls_begin|>"});
1080
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool calls begin|>"});
1081
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool\\_calls\\_begin|>"});
1230
+ data.grammar_triggers.push_back({
1231
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
1232
+ // If thinking_forced_open, then we capture the </think> tag in the grammar,
1233
+ // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
1234
+ std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
1235
+ "(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*"
1236
+ });
1082
1237
  data.preserved_tokens = {
1083
1238
  "<think>",
1084
1239
  "</think>",
@@ -1090,65 +1245,27 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
1090
1245
  };
1091
1246
  });
1092
1247
  }
1093
- auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
1094
-
1095
- // Hacks to fix the official (broken) prompt.
1096
- // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
1097
- // until the official template is fixed.
1098
- if (tmpl.source().find("{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}") != std::string::npos) {
1099
- // Don't leave the chat dangling after tool results
1100
- if (string_ends_with(prompt, "<|tool▁outputs▁end|>")) {
1101
- prompt += "<|end▁of▁sentence|>";
1102
- if (inputs.add_generation_prompt) {
1103
- prompt += "<|Assistant|>";
1104
- }
1105
- }
1106
- // Fix up tool call delta example added by Minja
1107
- prompt = std::regex_replace(
1108
- prompt,
1109
- std::regex("(<|tool▁call▁end|>)[\\s\\r\\n]*(<|tool▁outputs▁begin|>|<|User|>)"),
1110
- "$1<|tool▁calls▁end|><|end▁of▁sentence|>$2");
1111
- }
1112
- data.prompt = prompt;
1113
- data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING : COMMON_CHAT_FORMAT_DEEPSEEK_R1;
1114
1248
  return data;
1115
1249
  }
1116
- static common_chat_msg handle_think_tag_prelude(const std::string & input, bool extract_reasoning, const std::function<common_chat_msg(const std::string &)> & rest_parser) {
1117
- std::smatch match;
1118
- static const std::regex reasoning_content_regex("((?:<think>)?([\\s\\S\\r\\n]*?)</think>)?([\\s\\S\\r\\n]*)");
1119
- if (std::regex_match(input, match, reasoning_content_regex)) {
1120
- auto rest = match[3].str();
1121
- auto msg = rest_parser(rest);
1122
- auto reasoning_content = string_strip(match[2].str());
1123
- if (extract_reasoning) {
1124
- msg.reasoning_content = reasoning_content;
1125
- } else if (!reasoning_content.empty()) {
1126
- std::ostringstream content;
1127
- content << "<think>" << reasoning_content << "</think>" << msg.content;
1128
- msg.content = content.str();
1129
- }
1130
- return msg;
1131
- }
1132
- return rest_parser(input);
1133
- }
1134
- static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) {
1135
- return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) {
1136
- static const std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n");
1137
- static const std::regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>");
1138
- static const std::regex tool_calls_regex("[\\s\\r\\n]*(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>)([\\s\\S\\r\\n]*?)<|tool▁calls▁end|>");
1139
-
1140
- common_chat_msg msg;
1141
- msg.role = "assistant";
1142
- std::smatch match;
1143
- if (std::regex_search(input, match, tool_calls_regex)) {
1144
- auto tool_calls = match[1].str();
1145
- auto msg2 = parse_json_tool_calls(tool_calls, std::nullopt, function_regex, close_regex);
1146
- msg.tool_calls = std::move(msg2.tool_calls);
1147
- } else {
1148
- msg.content = input;
1149
- }
1150
- return msg;
1151
- });
1250
+ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
1251
+ builder.try_parse_reasoning("<think>", "</think>");
1252
+ if (!builder.syntax().parse_tool_calls) {
1253
+ builder.add_content(builder.consume_rest());
1254
+ return;
1255
+ }
1256
+
1257
+ static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)");
1258
+ static const common_regex tool_calls_end("<|tool▁calls▁end|>");
1259
+ static const common_regex function_regex("(?:<|tool▁call▁begin|>)?function<|tool▁sep|>([^\n]+)\n```json\n");
1260
+ static const common_regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>");
1261
+
1262
+ parse_json_tool_calls(
1263
+ builder,
1264
+ /* block_open= */ tool_calls_begin,
1265
+ /* function_regex_start_only= */ std::nullopt,
1266
+ function_regex,
1267
+ close_regex,
1268
+ tool_calls_end);
1152
1269
  }
1153
1270
 
1154
1271
  static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
@@ -1196,13 +1313,19 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c
1196
1313
  }
1197
1314
  return data;
1198
1315
  }
1199
- static common_chat_msg common_chat_parse_firefunction_v2(const std::string & input) {
1200
- return parse_prefixed_json_tool_call_array(input, " functools[", /* rstrip_prefix= */ 1);
1316
+ static void common_chat_parse_firefunction_v2(common_chat_msg_parser & builder) {
1317
+ if (!builder.syntax().parse_tool_calls) {
1318
+ builder.add_content(builder.consume_rest());
1319
+ return;
1320
+ }
1321
+ static const common_regex prefix(regex_escape(" functools["));
1322
+ parse_prefixed_json_tool_call_array(builder, prefix, /* rstrip_prefix= */ 1);
1201
1323
  }
1202
1324
 
1203
1325
  static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl, const struct templates_params & inputs) {
1204
1326
  // >>>all\nlet's call functions>>>fn1\n{"arg1": 1...}\n>>>fn2\n{"arg1": 1...}...
1205
1327
  // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
1328
+ // If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
1206
1329
  common_chat_params data;
1207
1330
  data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
1208
1331
  data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
@@ -1216,24 +1339,21 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
1216
1339
  std::string name = function.at("name");
1217
1340
  auto parameters = function.at("parameters");
1218
1341
  builder.resolve_refs(parameters);
1342
+ std::string args_pattern = "[\\s\\S]*";
1219
1343
  auto args_rule = builder.add_schema(name + "-args", parameters);
1220
- first_tool_rules.push_back(builder.add_rule(name + "-call", "( \"assistant<|end_header_id|>\\n\" )? \"" + name + "\\n\" " + args_rule));
1221
- subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule));
1222
- data.grammar_triggers.push_back({
1223
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
1224
- regex_escape(name + "\n"),
1225
- });
1226
- data.grammar_triggers.push_back({
1227
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
1228
- regex_escape("assistant<|end_header_id|>\n" + name + "\n"),
1229
- });
1230
- data.grammar_triggers.push_back({
1231
- COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
1232
- regex_escape(">>>" + name + "\n"),
1233
- });
1344
+ if (name == "python") {
1345
+ args_rule = builder.add_rule(name + "-maybe-raw-args", args_rule + " | [^{] .*");
1346
+ } else {
1347
+ args_pattern = "\\{" + args_pattern;
1348
+ }
1349
+ auto call_rule = builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule);
1350
+ first_tool_rules.push_back(call_rule);
1351
+ if (inputs.parallel_tool_calls) {
1352
+ subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>\" " + call_rule));
1353
+ }
1234
1354
  data.grammar_triggers.push_back({
1235
- COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
1236
- ">>>assistant<|end_header_id|>\n" + name,
1355
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
1356
+ "((?:[\\s\\S]+?>>>)?" + regex_escape(name) + "\n)" + args_pattern,
1237
1357
  });
1238
1358
  });
1239
1359
  data.preserved_tokens = {
@@ -1251,40 +1371,33 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
1251
1371
  }
1252
1372
  return data;
1253
1373
  }
1254
-
1255
- static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) {
1256
- static const std::regex function_regex(R"((?:>>>)?(?:assistant<|end_header_id|>\n)?(\w+)\n)");
1257
- static const std::regex close_regex(R"($|(?=>>>))");
1258
-
1259
- std::string content;
1260
- auto it = input.begin();
1261
- const auto end = input.end();
1262
-
1263
- if (parse_literal(it, end, "all\n")) {
1264
- std::smatch match;
1265
- if (std::regex_search(it, end, match, function_regex)) {
1266
- auto fun_it = match.prefix().second;
1267
- content = std::string(it, fun_it);
1268
- it = fun_it;
1269
- } else {
1270
- common_chat_msg res;
1271
- res.role = "assistant";
1272
- res.content = std::string(it, end);
1273
- return res;
1274
- }
1275
- }
1276
- // TODO: tighten & simplify.
1277
- try {
1278
- auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex, /* allow_raw_python= */ true);
1279
- res.content = content + res.content;
1280
- return res;
1281
- } catch (const std::exception & e) {
1282
- LOG_ERR("Failed to parse functionary v3.2 input: %s\n", e.what());
1283
- common_chat_msg res;
1284
- res.role = "assistant";
1285
- res.content = input;
1286
- return res;
1287
- }
1374
+ static void common_chat_parse_functionary_v3_2(common_chat_msg_parser & builder) {
1375
+ static const common_regex function_regex_start_only(R"((\w+\n\{|python\n|all\n))");
1376
+ static const common_regex function_regex(R"(>>>(\w+\n\{|python\n|all\n))");
1377
+ static const common_regex close_regex(R"(\s*)");
1378
+
1379
+ parse_json_tool_calls(
1380
+ builder,
1381
+ std::nullopt,
1382
+ function_regex_start_only,
1383
+ function_regex,
1384
+ close_regex,
1385
+ std::nullopt,
1386
+ /* allow_raw_python= */ true,
1387
+ /* get_function_name= */ [&](const auto & res) -> std::string {
1388
+ auto at_start = res.groups[0].begin == 0;
1389
+ auto name = builder.str(res.groups[1]);
1390
+ if (!name.empty() && name.back() == '{') {
1391
+ // Unconsume the opening brace '{' to ensure the JSON parsing goes well.
1392
+ builder.move_back(1);
1393
+ }
1394
+ auto idx = name.find_last_not_of("\n{");
1395
+ name = name.substr(0, idx + 1);
1396
+ if (at_start && name == "all") {
1397
+ return "";
1398
+ }
1399
+ return name;
1400
+ });
1288
1401
  }
1289
1402
 
1290
1403
  static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
@@ -1345,229 +1458,224 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
1345
1458
  // TODO: if (has_raw_python)
1346
1459
  return data;
1347
1460
  }
1348
- static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
1461
+ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser & builder) {
1462
+ if (!builder.syntax().parse_tool_calls) {
1463
+ builder.add_content(builder.consume_rest());
1464
+ return;
1465
+ }
1349
1466
  // This version of Functionary still supports the llama 3.1 tool call format for the python tool.
1350
- static const std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
1351
- std::smatch match;
1352
- if (std::regex_search(input, match, python_tag_regex)) {
1353
- auto code = match[1].str();
1354
- common_chat_msg msg;
1355
- msg.role = "assistant";
1356
- msg.content = match.prefix().str();
1357
- msg.tool_calls.push_back({
1358
- /* .name = */ "python",
1359
- /* .arguments = */ (json {{"code", code}}).dump(),
1360
- /* .id = */ "",
1361
- });
1362
- return msg;
1467
+ static const common_regex python_tag_regex(regex_escape("<|python_tag|>"));
1468
+
1469
+ static const common_regex function_regex(R"(<function=(\w+)>)");
1470
+ static const common_regex close_regex(R"(</function>)");
1471
+
1472
+ parse_json_tool_calls(
1473
+ builder,
1474
+ /* block_open= */ std::nullopt,
1475
+ /* function_regex_start_only= */ std::nullopt,
1476
+ function_regex,
1477
+ close_regex,
1478
+ std::nullopt);
1479
+
1480
+ if (auto res = builder.try_find_regex(python_tag_regex)) {
1481
+ auto arguments = wrap_code_as_arguments(builder, builder.consume_rest());
1482
+ builder.add_tool_call("python", "", arguments);
1483
+ return;
1363
1484
  }
1364
- static const std::regex function_regex(R"(<function=(\w+)>)");
1365
- static const std::regex close_regex(R"(</function>)");
1366
- // TODO: tighten & simplify.
1367
- return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
1368
1485
  }
1369
1486
 
1370
1487
  static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
1371
1488
  common_chat_params data;
1372
- // (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
1373
- data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
1374
- data.grammar = build_grammar([&](const common_grammar_builder & builder) {
1375
- std::vector<std::string> tool_rules;
1376
- std::vector<std::string> tool_call_alts;
1377
- foreach_function(inputs.tools, [&](const json & tool) {
1378
- const auto & function = tool.at("function");
1379
- std::string name = function.at("name");
1380
- auto parameters = function.at("parameters");
1381
- builder.resolve_refs(parameters);
1382
- tool_rules.push_back(builder.add_schema(name + "-call", {
1383
- {"type", "object"},
1384
- {"properties", json {
1385
- {"name", json {{"const", name}}},
1386
- {"arguments", parameters},
1387
- }},
1388
- {"required", json::array({"name", "arguments"})},
1389
- }));
1390
- tool_call_alts.push_back(builder.add_rule(
1391
- name + "-function-tag",
1392
- "\"<function\" ( \"=" + name + "\" | \" name=\\\"" + name + "\\\"\" ) \">\" space " +
1393
- builder.add_schema(name + "-args", parameters) + " "
1394
- "\"</function>\" space"));
1395
1489
 
1396
- data.grammar_triggers.push_back({
1397
- COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
1398
- "<function=" + name + ">",
1490
+ json additional_context = {
1491
+ {"enable_thinking", inputs.enable_thinking},
1492
+ };
1493
+
1494
+ data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
1495
+ data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
1496
+ if (string_ends_with(data.prompt, "<think>\n")) {
1497
+ if (!inputs.enable_thinking) {
1498
+ data.prompt += "</think>";
1499
+ } else {
1500
+ data.thinking_forced_open = true;
1501
+ }
1502
+ }
1503
+
1504
+ if (!inputs.tools.is_null()) {
1505
+ // (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
1506
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
1507
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
1508
+ std::vector<std::string> tool_rules;
1509
+ std::vector<std::string> tool_call_alts;
1510
+ std::vector<std::string> escaped_names;
1511
+ foreach_function(inputs.tools, [&](const json & tool) {
1512
+ const auto & function = tool.at("function");
1513
+ std::string name = function.at("name");
1514
+ auto parameters = function.at("parameters");
1515
+ builder.resolve_refs(parameters);
1516
+ tool_rules.push_back(builder.add_schema(name + "-call", {
1517
+ {"type", "object"},
1518
+ {"properties", json {
1519
+ {"name", json {{"const", name}}},
1520
+ {"arguments", parameters},
1521
+ }},
1522
+ {"required", json::array({"name", "arguments"})},
1523
+ }));
1524
+ tool_call_alts.push_back(builder.add_rule(
1525
+ name + "-function-tag",
1526
+ "\"<function\" ( \"=" + name + "\" | \" name=\\\"" + name + "\\\"\" ) \">\" space " +
1527
+ builder.add_schema(name + "-args", parameters) + " "
1528
+ "\"</function>\" space"));
1529
+
1530
+ data.grammar_triggers.push_back({
1531
+ COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
1532
+ "<function=" + name + ">",
1533
+ });
1534
+ auto escaped_name = regex_escape(name);
1535
+ data.grammar_triggers.push_back({
1536
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
1537
+ "<function\\s+name\\s*=\\s*\"" + escaped_name + "\"",
1538
+ });
1539
+ escaped_names.push_back(escaped_name);
1399
1540
  });
1400
- auto escaped_name = regex_escape(name);
1541
+ auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
1542
+ std::vector<std::string> alt_tags {
1543
+ any_tool_call,
1544
+ "\"<tool_call>\" space " + any_tool_call + " \"</tool_call>\"",
1545
+ // The rest is just to accommodate common "good bad" outputs.
1546
+ "\"<function_call>\" space " + any_tool_call + " \"</function_call>\"",
1547
+ "\"<response>\" space " + any_tool_call + " \"</response>\"",
1548
+ "\"<tools>\" space " + any_tool_call + " \"</tools>\"",
1549
+ "\"<json>\" space " + any_tool_call + " \"</json>\"",
1550
+ "\"<xml>\" space " + any_tool_call + " \"</xml>\"",
1551
+ "\"<JSON>\" space " + any_tool_call + " \"</JSON>\"",
1552
+ };
1553
+ auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space");
1554
+ tool_call_alts.push_back(wrappable_tool_call);
1555
+ tool_call_alts.push_back(
1556
+ "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space ");
1557
+ auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
1558
+ builder.add_rule("root",
1559
+ std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
1560
+ (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
1561
+ // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
1401
1562
  data.grammar_triggers.push_back({
1402
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
1403
- "<function\\s+name\\s*=\\s*\"" + escaped_name + "\"",
1563
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
1564
+ // If thinking_forced_open, then we capture the </think> tag in the grammar,
1565
+ // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
1566
+ std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
1567
+ "(\\s*"
1568
+ "(?:<tool_call>"
1569
+ "|<function"
1570
+ "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
1571
+ "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
1572
+ ")"
1573
+ ")[\\s\\S]*"
1574
+ ),
1404
1575
  });
1576
+ data.preserved_tokens = {
1577
+ "<think>",
1578
+ "</think>",
1579
+ "<tool_call>",
1580
+ "</tool_call>",
1581
+ "<function",
1582
+ "<tools>",
1583
+ "</tools>",
1584
+ "<response>",
1585
+ "</response>",
1586
+ "<function_call>",
1587
+ "</function_call>",
1588
+ "<json>",
1589
+ "</json>",
1590
+ "<JSON>",
1591
+ "</JSON>",
1592
+ "```",
1593
+ "```json",
1594
+ "```xml",
1595
+ };
1405
1596
  });
1406
- auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
1407
- std::vector<std::string> alt_tags {
1408
- any_tool_call,
1409
- "\"<tool_call>\" space " + any_tool_call + " \"</tool_call>\"",
1410
- // The rest is just to accommodate common "good bad" outputs.
1411
- "\"<function_call>\" space " + any_tool_call + " \"</function_call>\"",
1412
- "\"<response>\" space " + any_tool_call + " \"</response>\"",
1413
- "\"<tools>\" space " + any_tool_call + " \"</tools>\"",
1414
- "\"<json>\" space " + any_tool_call + " \"</json>\"",
1415
- "\"<xml>\" space " + any_tool_call + " \"</xml>\"",
1416
- "\"<JSON>\" space " + any_tool_call + " \"</JSON>\"",
1417
- };
1418
- auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space");
1419
- tool_call_alts.push_back(wrappable_tool_call);
1420
- tool_call_alts.push_back(
1421
- "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space ");
1422
- auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
1423
- builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
1424
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<tool_call>"});
1425
- data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function"});
1426
- // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
1427
- data.grammar_triggers.push_back({
1428
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
1429
- "(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?\\s*\\{\\s*\"", //name\"\\s*:\\s*\"" + escaped_name + "\"",
1430
- });
1431
- data.preserved_tokens = {
1432
- "<think>",
1433
- "</think>",
1434
- "<tool_call>",
1435
- "</tool_call>",
1436
- "<function",
1437
- "<tools>",
1438
- "</tools>",
1439
- "<response>",
1440
- "</response>",
1441
- "<function_call>",
1442
- "</function_call>",
1443
- "<json>",
1444
- "</json>",
1445
- "<JSON>",
1446
- "</JSON>",
1447
- "```",
1448
- "```json",
1449
- "```xml",
1450
- };
1451
- });
1597
+ }
1452
1598
 
1453
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
1454
- data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING : COMMON_CHAT_FORMAT_HERMES_2_PRO;
1455
1599
  return data;
1456
1600
  }
1457
- static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input, bool extract_reasoning) {
1458
- return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) {
1459
- static const std::regex open_regex(
1460
- "(?:"
1461
- "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
1462
- "(<tool_call>" // match 2 (open_tag)
1463
- "|<function_call>"
1464
- "|<tool>"
1465
- "|<tools>"
1466
- "|<response>"
1467
- "|<json>"
1468
- "|<xml>"
1469
- "|<JSON>"
1601
+ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
1602
+ builder.try_parse_reasoning("<think>", "</think>");
1603
+ if (!builder.syntax().parse_tool_calls) {
1604
+ builder.add_content(builder.consume_rest());
1605
+ return;
1606
+ }
1607
+
1608
+ static const common_regex open_regex(
1609
+ "(?:"
1610
+ "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
1611
+ "(" // match 2 (open_tag)
1612
+ "<tool_call>"
1613
+ "|<function_call>"
1614
+ "|<tool>"
1615
+ "|<tools>"
1616
+ "|<response>"
1617
+ "|<json>"
1618
+ "|<xml>"
1619
+ "|<JSON>"
1470
1620
  ")?"
1471
- "(\\s*\\{\\s*\"name\"\\s*:[\\s\\S]*)" // match 3 (named tool call + rest)
1472
- ")"
1473
- "|"
1474
- "(?:<function=([^>]+)>" // match 4 (function name)
1475
- "|<function name=\"([^\"]+)\">)" // match 5 (function name again)
1476
- "([\\s\\S]*)" // match 6 (function arguments + rest)})"
1477
- );
1478
-
1479
- try {
1480
- common_chat_msg msg;
1481
- msg.role = "assistant";
1482
-
1483
- std::string::const_iterator it = input.begin();
1484
- const std::string::const_iterator end = input.end();
1485
- std::smatch match;
1486
-
1487
- while (it != end) {
1488
- if (std::regex_search(it, end, match, open_regex)) {
1489
- // Add content before the match
1490
- msg.content += std::string(it, match[0].first);
1491
-
1492
- auto block_start = match[1].str();
1493
- std::string block_end = block_start.empty() ? "" : "```";
1494
-
1495
- auto open_tag = match[2].str();
1496
- std::string close_tag;
1497
-
1498
- if (match[3].matched) {
1499
- close_tag = open_tag.empty() ? "" : "</" + open_tag.substr(1);
1500
- auto json_it = match[3].first;
1501
- json tool_call;
1502
- if (parse_json(json_it, end, tool_call) && tool_call.contains("name") && tool_call.contains("arguments")) {
1621
+ "(\\s*\\{\\s*\"name\")" // match 3 (named tool call)
1622
+ ")"
1623
+ "|<function=([^>]+)>" // match 4 (function name)
1624
+ "|<function name=\"([^\"]+)\">" // match 5 (function name again)
1625
+ );
1626
+
1627
+ if (auto res = builder.try_find_regex(open_regex)) {
1628
+ const auto & block_start = res->groups[1];
1629
+ std::string block_end = block_start.empty() ? "" : "```";
1630
+
1631
+ const auto & open_tag = res->groups[2];
1632
+ std::string close_tag;
1633
+
1634
+ if (!res->groups[3].empty()) {
1635
+ builder.move_to(res->groups[3].begin);
1636
+ close_tag = open_tag.empty() ? "" : "</" + builder.str(open_tag).substr(1);
1637
+
1638
+ if (auto tool_call = builder.try_consume_json_with_dumped_args({{"arguments"}})) {
1639
+ if (!builder.add_tool_call(tool_call->value) || tool_call->is_partial) {
1640
+ throw common_chat_msg_partial_exception("incomplete tool call");
1641
+ }
1642
+ builder.consume_spaces();
1643
+ builder.consume_literal(close_tag);
1644
+ builder.consume_spaces();
1645
+ if (!block_end.empty()) {
1646
+ builder.consume_literal(block_end);
1647
+ builder.consume_spaces();
1648
+ }
1649
+ builder.add_content(builder.consume_rest());
1650
+ } else {
1651
+ throw common_chat_msg_partial_exception("failed to parse tool call");
1652
+ }
1653
+ } else {
1654
+ auto function_name = builder.str(res->groups[4]);
1655
+ if (function_name.empty()) {
1656
+ function_name = builder.str(res->groups[5]);
1657
+ }
1658
+ LM_GGML_ASSERT(!function_name.empty());
1503
1659
 
1504
- msg.tool_calls.emplace_back(process_tool_call(tool_call));
1505
- it = json_it; // Move iterator past parsed JSON
1660
+ close_tag = "</function>";
1506
1661
 
1507
- // Handle close tags
1508
- consume_spaces(it, end);
1509
- if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
1510
- throw std::runtime_error("Failed to parse closing tag");
1511
- }
1512
- consume_spaces(it, end);
1513
- if (!block_end.empty() && !parse_literal(it, end, block_end)) {
1514
- throw std::runtime_error("Failed to parse block end");
1515
- }
1516
- consume_spaces(it, end);
1517
- } else {
1518
- // Not a valid tool call, treat as content
1519
- msg.content += std::string(match[0].first, match[0].second);
1520
- it = match[0].second;
1521
- }
1522
- } else {
1523
- auto function_name = match[4].str();
1524
- if (function_name.empty()) {
1525
- function_name = match[5].str();
1526
- }
1527
- LM_GGML_ASSERT(!function_name.empty());
1528
-
1529
- close_tag = "</function>";
1530
- // Start parsing from after the opening tags
1531
- auto json_it = match[6].first;
1532
- json arguments;
1533
- if (parse_json(json_it, end, arguments)) {
1534
- msg.tool_calls.emplace_back(process_tool_call({
1535
- {"name", function_name},
1536
- {"arguments", arguments},
1537
- }));
1538
- it = json_it; // Move iterator past parsed JSON
1539
-
1540
- // Handle close tags
1541
- consume_spaces(it, end);
1542
- if (!close_tag.empty() && !parse_literal(it, end, close_tag)) {
1543
- throw std::runtime_error("Failed to parse closing tag");
1544
- }
1545
- consume_spaces(it, end);
1546
- if (!block_end.empty() && !parse_literal(it, end, block_end)) {
1547
- throw std::runtime_error("Failed to parse block end");
1548
- }
1549
- consume_spaces(it, end);
1550
- } else {
1551
- // Not a valid tool call, treat as content
1552
- msg.content += std::string(match[0].first, match[0].second);
1553
- it = match[0].second;
1554
- }
1555
- }
1556
- } else {
1557
- // Add remaining content
1558
- msg.content += std::string(it, end);
1559
- break;
1662
+ if (auto arguments = builder.try_consume_json_with_dumped_args({{}})) {
1663
+ if (!builder.add_tool_call(function_name, "", arguments->value) || arguments->is_partial) {
1664
+ throw common_chat_msg_partial_exception("incomplete tool call");
1665
+ }
1666
+ builder.consume_spaces();
1667
+ builder.consume_literal(close_tag);
1668
+ builder.consume_spaces();
1669
+ if (!block_end.empty()) {
1670
+ builder.consume_literal(block_end);
1671
+ builder.consume_spaces();
1560
1672
  }
1561
1673
  }
1562
- return msg;
1563
- } catch (const std::exception & e) {
1564
- LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what());
1565
- common_chat_msg msg;
1566
- msg.role = "assistant";
1567
- msg.content = input;
1568
- return msg;
1674
+ builder.add_content(builder.consume_rest());
1569
1675
  }
1570
- });
1676
+ } else {
1677
+ builder.add_content(builder.consume_rest());
1678
+ }
1571
1679
  }
1572
1680
 
1573
1681
  static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
@@ -1599,8 +1707,8 @@ static common_chat_params common_chat_templates_apply_jinja(
1599
1707
  const auto & caps = tmpl.original_caps();
1600
1708
  params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
1601
1709
  params.add_generation_prompt = inputs.add_generation_prompt;
1602
- params.extract_reasoning = inputs.extract_reasoning;
1603
1710
  params.tool_choice = inputs.tool_choice;
1711
+ params.enable_thinking = inputs.enable_thinking;
1604
1712
  params.grammar = inputs.grammar;
1605
1713
  params.now = inputs.now;
1606
1714
  if (!inputs.json_schema.empty()) {
@@ -1634,7 +1742,7 @@ static common_chat_params common_chat_templates_apply_jinja(
1634
1742
  }
1635
1743
 
1636
1744
  // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
1637
- if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null() && params.tools.is_array() && params.json_schema.is_null()) {
1745
+ if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
1638
1746
  return common_chat_params_init_hermes_2_pro(tmpl, params);
1639
1747
  }
1640
1748
 
@@ -1719,7 +1827,7 @@ static common_chat_params common_chat_templates_apply_legacy(
1719
1827
  if (res < 0) {
1720
1828
  // if the custom "tmpl" is not supported, we throw an error
1721
1829
  // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
1722
- throw std::runtime_error("this custom template is not supported");
1830
+ throw std::runtime_error("this custom template is not supported, try using --jinja");
1723
1831
  }
1724
1832
 
1725
1833
  // if it turns out that our buffer is too small, we resize it
@@ -1748,44 +1856,66 @@ common_chat_params common_chat_templates_apply(
1748
1856
  : common_chat_templates_apply_legacy(tmpls, inputs);
1749
1857
  }
1750
1858
 
1751
- static common_chat_msg common_chat_parse_content_only(const std::string & input) {
1752
- common_chat_msg msg;
1753
- msg.role = "assistant";
1754
- msg.content = input;
1755
- return msg;
1859
+ static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
1860
+ builder.add_content(builder.consume_rest());
1756
1861
  }
1757
1862
 
1758
- common_chat_msg common_chat_parse(const std::string & input, common_chat_format format) {
1759
- switch (format) {
1863
+ static void common_chat_parse(common_chat_msg_parser & builder) {
1864
+ LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(builder.syntax().format), builder.input().c_str());
1865
+
1866
+ switch (builder.syntax().format) {
1760
1867
  case COMMON_CHAT_FORMAT_CONTENT_ONLY:
1761
- return common_chat_parse_content_only(input);
1868
+ common_chat_parse_content_only(builder);
1869
+ break;
1762
1870
  case COMMON_CHAT_FORMAT_GENERIC:
1763
- return common_chat_parse_generic(input);
1871
+ common_chat_parse_generic(builder);
1872
+ break;
1764
1873
  case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
1765
- return common_chat_parse_mistral_nemo(input);
1874
+ common_chat_parse_mistral_nemo(builder);
1875
+ break;
1766
1876
  case COMMON_CHAT_FORMAT_LLAMA_3_X:
1767
- return common_chat_parse_llama_3_1(input);
1877
+ common_chat_parse_llama_3_1(builder);
1878
+ break;
1768
1879
  case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS:
1769
- return common_chat_parse_llama_3_1(input, /* with_builtin_tools= */ true);
1880
+ common_chat_parse_llama_3_1(builder, /* with_builtin_tools= */ true);
1881
+ break;
1770
1882
  case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
1771
- return common_chat_parse_deepseek_r1(input, /* extract_reasoning= */ false);
1772
- case COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING:
1773
- return common_chat_parse_deepseek_r1(input, /* extract_reasoning= */ true);
1883
+ common_chat_parse_deepseek_r1(builder);
1884
+ break;
1774
1885
  case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
1775
- return common_chat_parse_functionary_v3_2(input);
1886
+ common_chat_parse_functionary_v3_2(builder);
1887
+ break;
1776
1888
  case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
1777
- return common_chat_parse_functionary_v3_1_llama_3_1(input);
1889
+ common_chat_parse_functionary_v3_1_llama_3_1(builder);
1890
+ break;
1778
1891
  case COMMON_CHAT_FORMAT_HERMES_2_PRO:
1779
- return common_chat_parse_hermes_2_pro(input, /* extract_reasoning= */ false);
1780
- case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING:
1781
- return common_chat_parse_hermes_2_pro(input, /* extract_reasoning= */ true);
1892
+ common_chat_parse_hermes_2_pro(builder);
1893
+ break;
1782
1894
  case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
1783
- return common_chat_parse_firefunction_v2(input);
1895
+ common_chat_parse_firefunction_v2(builder);
1896
+ break;
1784
1897
  case COMMON_CHAT_FORMAT_COMMAND_R7B:
1785
- return common_chat_parse_command_r7b(input, /* extract_reasoning= */ false);
1786
- case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING:
1787
- return common_chat_parse_command_r7b(input, /* extract_reasoning= */ true);
1898
+ common_chat_parse_command_r7b(builder);
1899
+ break;
1788
1900
  default:
1789
- throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
1901
+ throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
1902
+ }
1903
+ builder.finish();
1904
+ }
1905
+
1906
+ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
1907
+ common_chat_msg_parser builder(input, is_partial, syntax);
1908
+ try {
1909
+ common_chat_parse(builder);
1910
+ } catch (const common_chat_msg_partial_exception & ex) {
1911
+ LOG_DBG("Partial parse: %s\n", ex.what());
1912
+ if (!is_partial) {
1913
+ builder.clear_tools();
1914
+ builder.move_to(0);
1915
+ common_chat_parse_content_only(builder);
1916
+ }
1790
1917
  }
1918
+ auto msg = builder.result();
1919
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
1920
+ return msg;
1791
1921
  }