@novastera-oss/llamarn 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268) hide show
  1. package/README.md +80 -14
  2. package/RNLlamaCpp.podspec +10 -3
  3. package/android/CMakeLists.txt +8 -0
  4. package/android/src/main/cpp/include/llama.h +62 -125
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  13. package/cpp/PureCppImpl.cpp +9 -27
  14. package/cpp/SystemUtils.h +2 -2
  15. package/cpp/build-info.cpp +2 -2
  16. package/cpp/llama.cpp/README.md +11 -3
  17. package/cpp/llama.cpp/build-xcframework.sh +1 -0
  18. package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
  19. package/cpp/llama.cpp/common/arg.cpp +153 -113
  20. package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
  21. package/cpp/llama.cpp/common/chat-parser.h +117 -0
  22. package/cpp/llama.cpp/common/chat.cpp +847 -699
  23. package/cpp/llama.cpp/common/chat.h +73 -6
  24. package/cpp/llama.cpp/common/common.cpp +50 -82
  25. package/cpp/llama.cpp/common/common.h +21 -17
  26. package/cpp/llama.cpp/common/json-partial.cpp +255 -0
  27. package/cpp/llama.cpp/common/json-partial.h +37 -0
  28. package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
  29. package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
  30. package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
  31. package/cpp/llama.cpp/common/regex-partial.h +56 -0
  32. package/cpp/llama.cpp/common/sampling.cpp +7 -8
  33. package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
  34. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
  35. package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
  36. package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
  37. package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
  38. package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
  39. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
  40. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
  75. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
  76. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  86. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
  112. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
  113. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
  117. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  119. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
  120. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
  121. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
  122. package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
  123. package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
  124. package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
  125. package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
  126. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
  127. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
  128. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
  129. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
  130. package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
  131. package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
  132. package/cpp/llama.cpp/include/llama.h +62 -125
  133. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
  134. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
  135. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
  136. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
  137. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
  138. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
  139. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
  140. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
  141. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
  142. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
  143. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
  144. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
  145. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
  146. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
  147. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
  148. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
  149. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
  150. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
  151. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  152. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
  153. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
  154. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
  155. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
  156. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
  157. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
  158. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
  159. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
  160. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
  161. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
  162. package/cpp/llama.cpp/models/templates/README.md +2 -0
  163. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  164. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  165. package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  166. package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
  167. package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
  168. package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
  169. package/cpp/llama.cpp/src/llama-arch.h +2 -0
  170. package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
  171. package/cpp/llama.cpp/src/llama-context.cpp +340 -123
  172. package/cpp/llama.cpp/src/llama-context.h +30 -0
  173. package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
  174. package/cpp/llama.cpp/src/llama-cparams.h +2 -0
  175. package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
  176. package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
  177. package/cpp/llama.cpp/src/llama-graph.h +52 -7
  178. package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
  179. package/cpp/llama.cpp/src/llama-hparams.h +37 -5
  180. package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
  181. package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
  182. package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
  183. package/cpp/llama.cpp/src/llama-memory.h +4 -3
  184. package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
  185. package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
  186. package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
  187. package/cpp/llama.cpp/src/llama-model.cpp +529 -172
  188. package/cpp/llama.cpp/src/llama-model.h +6 -1
  189. package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
  190. package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
  191. package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
  192. package/cpp/llama.cpp/src/llama-vocab.h +6 -0
  193. package/cpp/llama.cpp/src/llama.cpp +14 -0
  194. package/cpp/rn-completion.cpp +60 -5
  195. package/ios/include/chat.h +73 -6
  196. package/ios/include/common/minja/chat-template.hpp +9 -5
  197. package/ios/include/common/minja/minja.hpp +69 -36
  198. package/ios/include/common.h +21 -17
  199. package/ios/include/llama.h +62 -125
  200. package/ios/libs/llama.xcframework/Info.plist +19 -19
  201. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  202. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
  203. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  204. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
  205. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
  206. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  207. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  208. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  209. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
  210. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  211. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  212. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  213. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
  219. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
  220. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
  221. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
  222. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
  223. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  224. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
  225. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
  226. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
  227. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  228. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
  231. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  232. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
  233. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
  234. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  235. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  236. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
  237. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
  238. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  239. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  240. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  241. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  242. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  243. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
  244. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  245. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
  246. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
  247. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  248. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  249. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
  250. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
  251. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  252. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  253. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  254. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  255. package/package.json +1 -1
  256. package/cpp/llama.cpp/common/stb_image.h +0 -7988
  257. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  258. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  259. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  260. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  261. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  262. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  263. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  264. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  265. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  266. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  267. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  268. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
@@ -96,6 +96,8 @@ enum llm_type {
96
96
  LLM_TYPE_235B_A22B,
97
97
  };
98
98
 
99
+ std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
100
+
99
101
  struct llama_layer_posnet {
100
102
  // resnet
101
103
  struct ggml_tensor * norm1 = nullptr;
@@ -396,7 +398,10 @@ struct llama_model {
396
398
 
397
399
  const struct ggml_tensor * get_tensor(const char * name) const;
398
400
 
399
- ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
401
+ float get_rope_freq_base (const llama_cparams & cparams, int il) const;
402
+ float get_rope_freq_scale(const llama_cparams & cparams, int il) const;
403
+
404
+ ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
400
405
 
401
406
  // note: can mutate `cparams`
402
407
  // TODO: move this to new llm_arch_model_i interface
@@ -14,6 +14,12 @@
14
14
  #include <thread>
15
15
  #include <unordered_map>
16
16
 
17
+ // Quantization types. Changes to this struct must be replicated in quantize.cpp
18
+ struct tensor_quantization {
19
+ std::string name;
20
+ ggml_type quant = GGML_TYPE_COUNT;
21
+ };
22
+
17
23
  static void zeros(std::ofstream & file, size_t n) {
18
24
  char zero = 0;
19
25
  for (size_t i = 0; i < n; ++i) {
@@ -48,12 +54,6 @@ struct quantize_state_impl {
48
54
  {}
49
55
  };
50
56
 
51
- // changes to this struct must be replicated in quantize.cpp
52
- struct tensor_quantization {
53
- std::string name;
54
- ggml_type quant = GGML_TYPE_COUNT;
55
- };
56
-
57
57
  static void llama_tensor_dequantize_impl(
58
58
  ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
59
59
  const size_t nelements, const int nthread
@@ -519,7 +519,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
519
519
  nthread = std::thread::hardware_concurrency();
520
520
  }
521
521
 
522
- // mmap consistently increases speed Linux, and also increases speed on Windows with
522
+ // mmap consistently increases speed on Linux, and also increases speed on Windows with
523
523
  // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
524
524
  #if defined(__linux__) || defined(_WIN32)
525
525
  constexpr bool use_mmap = true;
@@ -529,7 +529,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
529
529
 
530
530
  llama_model_kv_override * kv_overrides = nullptr;
531
531
  if (params->kv_overrides) {
532
- auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
532
+ auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
533
533
  kv_overrides = v->data();
534
534
  }
535
535
 
@@ -796,17 +796,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
796
796
  // unless the user specifies a type
797
797
  if (params->tensor_types) {
798
798
  const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
799
+ const std::string tensor_name(tensor->name);
799
800
  for (const auto & [tname, qtype] : tensor_types) {
800
- if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
801
- if (qtype != new_type) {
802
- LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
801
+ if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
802
+ if (qtype != new_type) {
803
+ LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
804
+ new_type = qtype;
805
+ break; // if two or more types are specified for the tensor, first match wins
803
806
  }
804
- new_type = qtype;
805
- break;
806
807
  }
807
808
  }
808
809
  }
809
810
  }
811
+
810
812
  if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
811
813
  new_type = params->token_embedding_type;
812
814
  }
@@ -798,7 +798,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
798
798
  }
799
799
 
800
800
  // if we have enough values the operation was a success
801
- if (filtered_tokens.size() >= ctx->min_keep) {
801
+ if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
802
802
  memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
803
803
  cur_p->size = filtered_tokens.size();
804
804
  min_p_applied = true;
@@ -909,7 +909,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
909
909
  cum_sum += cur_p->data[idx].p;
910
910
 
911
911
  // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
912
- if (cum_sum > ctx->p && i >= ctx->min_keep - 1) {
912
+ if (cum_sum > ctx->p && (ctx->min_keep == 0 || i >= ctx->min_keep - 1)) {
913
913
  last_idx = i + 1;
914
914
  break;
915
915
  }
@@ -1,5 +1,7 @@
1
1
  #include "llama-vocab.h"
2
2
 
3
+ #include "ggml.h"
4
+ #include "gguf.h"
3
5
  #include "llama-impl.h"
4
6
  #include "llama-model-loader.h"
5
7
 
@@ -833,7 +835,7 @@ struct llm_tokenizer_ugm_session {
833
835
  }
834
836
 
835
837
  // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
836
- std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
838
+ std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
837
839
  // at the beginning tokenization score is zero
838
840
  tokenization_results[0] = { vocab.token_unk(), 0, 0 };
839
841
 
@@ -865,7 +867,7 @@ struct llm_tokenizer_ugm_session {
865
867
  const double challenger_score = current_best.score_sum + token_score;
866
868
  struct best_tokenization & current_champ = tokenization_results[prefix_offset];
867
869
  if (challenger_score > current_champ.score_sum) {
868
- struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
870
+ struct best_tokenization challenger = { token_id, input_offset, challenger_score };
869
871
  current_champ = challenger;
870
872
  }
871
873
  }
@@ -879,7 +881,7 @@ struct llm_tokenizer_ugm_session {
879
881
  prefix_offset = input_offset + n_utf8_code_units;
880
882
  struct best_tokenization & current_champ = tokenization_results[prefix_offset];
881
883
  if (challenger_score > current_champ.score_sum) {
882
- struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
884
+ struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
883
885
  current_champ = challenger;
884
886
  }
885
887
  }
@@ -1005,7 +1007,7 @@ private:
1005
1007
  struct best_tokenization {
1006
1008
  llama_token token_id;
1007
1009
  size_t input_offset;
1008
- float score_sum;
1010
+ double score_sum;
1009
1011
  };
1010
1012
 
1011
1013
  struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
@@ -1234,6 +1236,9 @@ struct fragment_buffer_variant {
1234
1236
  struct llama_vocab::impl {
1235
1237
  uint32_t n_token_types = 0; // for BERT-style token types
1236
1238
 
1239
+ std::string tokenizer_model;
1240
+ std::string tokenizer_pre;
1241
+
1237
1242
  enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
1238
1243
  enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1239
1244
 
@@ -1369,9 +1374,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1369
1374
 
1370
1375
  // determine vocab type
1371
1376
  {
1372
- std::string tokenizer_model;
1373
- std::string tokenizer_pre;
1374
-
1375
1377
  ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
1376
1378
  ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
1377
1379
 
@@ -1466,7 +1468,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1466
1468
 
1467
1469
  const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
1468
1470
  if (precompiled_charsmap_keyidx != -1) {
1469
- size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
1471
+ const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
1472
+ GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
1473
+
1474
+ const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
1470
1475
  const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
1471
1476
  precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
1472
1477
  #ifdef IS_BIG_ENDIAN
@@ -2789,6 +2794,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
2789
2794
  pimpl->load(ml, kv);
2790
2795
  }
2791
2796
 
2797
+ std::string llama_vocab::get_tokenizer_model() const {
2798
+ return pimpl->tokenizer_model;
2799
+ }
2800
+
2801
+ std::string llama_vocab::get_tokenizer_pre() const {
2802
+ return pimpl->tokenizer_pre;
2803
+ }
2804
+
2792
2805
  enum llama_vocab_type llama_vocab::get_type() const {
2793
2806
  return pimpl->type;
2794
2807
  }
@@ -3011,6 +3024,20 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
3011
3024
  return it->second;
3012
3025
  }
3013
3026
 
3027
+ std::vector<std::string> llama_vocab::get_bpe_merges() const {
3028
+ std::vector<std::string> result(pimpl->bpe_ranks.size());
3029
+
3030
+ for (const auto & pair : pimpl->bpe_ranks) {
3031
+ result[pair.second] = pair.first.first + " " + pair.first.second;
3032
+ }
3033
+
3034
+ return result;
3035
+ }
3036
+
3037
+ std::vector<char> llama_vocab::get_precompiled_charsmap() const {
3038
+ return pimpl->precompiled_charsmap;
3039
+ }
3040
+
3014
3041
  int32_t llama_vocab::tokenize(
3015
3042
  const char * text,
3016
3043
  int32_t text_len,
@@ -21,6 +21,9 @@ struct llama_vocab {
21
21
 
22
22
  void load(llama_model_loader & ml, const LLM_KV & kv);
23
23
 
24
+ std::string get_tokenizer_model() const;
25
+ std::string get_tokenizer_pre() const;
26
+
24
27
  enum llama_vocab_type get_type() const;
25
28
  enum llama_vocab_pre_type get_pre_type() const;
26
29
 
@@ -80,6 +83,9 @@ struct llama_vocab {
80
83
  int max_token_len() const;
81
84
 
82
85
  int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
86
+ std::vector<std::string> get_bpe_merges() const;
87
+
88
+ std::vector<char> get_precompiled_charsmap() const;
83
89
 
84
90
  int32_t tokenize(
85
91
  const char * text,
@@ -4,6 +4,7 @@
4
4
  #include "llama-mmap.h"
5
5
  #include "llama-vocab.h"
6
6
  #include "llama-model-loader.h"
7
+ #include "llama-model-saver.h"
7
8
  #include "llama-model.h"
8
9
 
9
10
  #include "ggml.h"
@@ -139,6 +140,11 @@ static struct llama_model * llama_model_load_from_file_impl(
139
140
  struct llama_model_params params) {
140
141
  ggml_time_init();
141
142
 
143
+ if (!params.vocab_only && ggml_backend_reg_count() == 0) {
144
+ LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
145
+ return nullptr;
146
+ }
147
+
142
148
  unsigned cur_percentage = 0;
143
149
  if (params.progress_callback == NULL) {
144
150
  params.progress_callback_user_data = &cur_percentage;
@@ -253,6 +259,13 @@ struct llama_model * llama_model_load_from_splits(
253
259
  return llama_model_load_from_file_impl(splits.front(), splits, params);
254
260
  }
255
261
 
262
+ void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
263
+ llama_model_saver ms(*model);
264
+ ms.add_kv_from_model();
265
+ ms.add_tensors_from_model();
266
+ ms.save(path_model);
267
+ }
268
+
256
269
  //
257
270
  // chat templates
258
271
  //
@@ -338,3 +351,4 @@ const char * llama_print_system_info(void) {
338
351
 
339
352
  return s.c_str();
340
353
  }
354
+
@@ -298,7 +298,8 @@ CompletionResult run_completion(
298
298
  }
299
299
 
300
300
  const int64_t t_end_generation = ggml_time_us();
301
- const double generation_time_ms = (t_end_generation - t_start_generation) / 1000.0;
301
+ // Note: keeping generation_time_ms for future timing measurements
302
+ // const double generation_time_ms = (t_end_generation - t_start_generation) / 1000.0;
302
303
 
303
304
  // Set the result
304
305
  result.content = state.generated_text;
@@ -349,8 +350,9 @@ CompletionResult run_chat_completion(
349
350
  common_chat_templates_inputs template_inputs;
350
351
  template_inputs.messages = chat_msgs;
351
352
  template_inputs.add_generation_prompt = true;
352
- template_inputs.use_jinja = options.use_jinja;
353
- template_inputs.extract_reasoning = true; // Default to true to extract reasoning content if available
353
+ template_inputs.use_jinja = rn_ctx->params.use_jinja;
354
+ // Note: extract_reasoning field doesn't exist in current llama.cpp version
355
+ // template_inputs.extract_reasoning = true; // Default to true to extract reasoning content if available
354
356
 
355
357
  // Add grammar if present in options
356
358
  if (!options.grammar.empty()) {
@@ -389,6 +391,31 @@ CompletionResult run_chat_completion(
389
391
  result = run_completion(rn_ctx, cmpl_options, callback);
390
392
 
391
393
  if (result.success) {
394
+ // Parse the generated content for tool calls and structured responses
395
+ common_chat_msg parsed_msg;
396
+ bool has_parsed_content = false;
397
+
398
+ // Only parse if we have tools available and the response isn't empty
399
+ if (!template_inputs.tools.empty() && !result.content.empty()) {
400
+ try {
401
+ // Construct the chat syntax for parsing using the format from template application
402
+ common_chat_syntax syntax;
403
+ syntax.format = chat_params.format; // Use format from template, not from params
404
+ syntax.reasoning_format = rn_ctx->params.reasoning_format;
405
+ syntax.reasoning_in_content = true;
406
+ syntax.thinking_forced_open = false;
407
+ syntax.parse_tool_calls = true;
408
+
409
+ // Parse the generated content for tool calls
410
+ parsed_msg = common_chat_parse(result.content, false, syntax);
411
+ has_parsed_content = true;
412
+
413
+ } catch (const std::exception& e) {
414
+ // If parsing fails, treat as regular content
415
+ has_parsed_content = false;
416
+ }
417
+ }
418
+
392
419
  // Create OpenAI-compatible response
393
420
  json response = {
394
421
  {"id", gen_chatcmplid()},
@@ -401,11 +428,39 @@ CompletionResult run_chat_completion(
401
428
  json choice = {
402
429
  {"index", 0},
403
430
  {"message", {
404
- {"role", "assistant"},
405
- {"content", result.content}
431
+ {"role", "assistant"}
406
432
  }},
407
433
  {"finish_reason", "stop"}
408
434
  };
435
+
436
+ // Add parsed content and tool calls if available
437
+ if (has_parsed_content && !parsed_msg.tool_calls.empty()) {
438
+ // Set content to the parsed content (may be null for tool-only responses)
439
+ if (!parsed_msg.content.empty()) {
440
+ choice["message"]["content"] = parsed_msg.content;
441
+ } else {
442
+ choice["message"]["content"] = nullptr;
443
+ }
444
+
445
+ // Add tool calls to the message
446
+ json tool_calls = json::array();
447
+ for (const auto& tool_call : parsed_msg.tool_calls) {
448
+ json tc = {
449
+ {"id", tool_call.id.empty() ? ("call_" + std::to_string(std::rand())) : tool_call.id},
450
+ {"type", "function"},
451
+ {"function", {
452
+ {"name", tool_call.name},
453
+ {"arguments", tool_call.arguments}
454
+ }}
455
+ };
456
+ tool_calls.push_back(tc);
457
+ }
458
+ choice["message"]["tool_calls"] = tool_calls;
459
+ choice["finish_reason"] = "tool_calls";
460
+ } else {
461
+ // Regular text response
462
+ choice["message"]["content"] = has_parsed_content ? parsed_msg.content : result.content;
463
+ }
409
464
 
410
465
  choices.push_back(choice);
411
466
  response["choices"] = choices;
@@ -3,6 +3,8 @@
3
3
  #pragma once
4
4
 
5
5
  #include "common.h"
6
+ #include <functional>
7
+ #include <chrono>
6
8
  #include <string>
7
9
  #include <vector>
8
10
 
@@ -12,11 +14,19 @@ struct common_chat_tool_call {
12
14
  std::string name;
13
15
  std::string arguments;
14
16
  std::string id;
17
+
18
+ bool operator==(const common_chat_tool_call & other) const {
19
+ return name == other.name && arguments == other.arguments && id == other.id;
20
+ }
15
21
  };
16
22
 
17
23
  struct common_chat_msg_content_part {
18
24
  std::string type;
19
25
  std::string text;
26
+
27
+ bool operator==(const common_chat_msg_content_part & other) const {
28
+ return type == other.type && text == other.text;
29
+ }
20
30
  };
21
31
 
22
32
  struct common_chat_msg {
@@ -27,6 +37,51 @@ struct common_chat_msg {
27
37
  std::string reasoning_content;
28
38
  std::string tool_name;
29
39
  std::string tool_call_id;
40
+
41
+ template <class T> T to_json_oaicompat() const;
42
+
43
+ bool empty() const {
44
+ return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
45
+ }
46
+ void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
47
+ for (auto i = 0u; i < tool_calls.size(); i++) {
48
+ if (ids_cache.size() <= i) {
49
+ auto id = tool_calls[i].id;
50
+ if (id.empty()) {
51
+ id = gen_tool_call_id();
52
+ }
53
+ ids_cache.push_back(id);
54
+ }
55
+ tool_calls[i].id = ids_cache[i];
56
+ }
57
+ }
58
+ bool operator==(const common_chat_msg & other) const {
59
+ return role == other.role
60
+ && content == other.content
61
+ && content_parts == other.content_parts
62
+ && tool_calls == other.tool_calls
63
+ && reasoning_content == other.reasoning_content
64
+ && tool_name == other.tool_name
65
+ && tool_call_id == other.tool_call_id;
66
+ }
67
+ bool operator!=(const common_chat_msg & other) const {
68
+ return !(*this == other);
69
+ }
70
+ };
71
+
72
+ struct common_chat_msg_diff {
73
+ // std::string reasoning_content_delta;
74
+ std::string content_delta;
75
+ size_t tool_call_index = std::string::npos;
76
+ common_chat_tool_call tool_call_delta;
77
+
78
+ static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
79
+
80
+ bool operator==(const common_chat_msg_diff & other) const {
81
+ return content_delta == other.content_delta
82
+ && tool_call_index == other.tool_call_index
83
+ && tool_call_delta == other.tool_call_delta;
84
+ }
30
85
  };
31
86
 
32
87
  struct common_chat_tool {
@@ -48,14 +103,11 @@ enum common_chat_format {
48
103
  COMMON_CHAT_FORMAT_LLAMA_3_X,
49
104
  COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
50
105
  COMMON_CHAT_FORMAT_DEEPSEEK_R1,
51
- COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
52
106
  COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
53
107
  COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
54
108
  COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
55
109
  COMMON_CHAT_FORMAT_HERMES_2_PRO,
56
- COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
57
110
  COMMON_CHAT_FORMAT_COMMAND_R7B,
58
- COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
59
111
 
60
112
  COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
61
113
  };
@@ -70,7 +122,9 @@ struct common_chat_templates_inputs {
70
122
  std::vector<common_chat_tool> tools;
71
123
  common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
72
124
  bool parallel_tool_calls = false;
73
- bool extract_reasoning = true;
125
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
126
+ bool enable_thinking = true;
127
+ std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
74
128
  };
75
129
 
76
130
  struct common_chat_params {
@@ -78,11 +132,21 @@ struct common_chat_params {
78
132
  std::string prompt;
79
133
  std::string grammar;
80
134
  bool grammar_lazy = false;
135
+ bool thinking_forced_open = false;
81
136
  std::vector<common_grammar_trigger> grammar_triggers;
82
137
  std::vector<std::string> preserved_tokens;
83
138
  std::vector<std::string> additional_stops;
84
139
  };
85
140
 
141
+ struct common_chat_syntax {
142
+ common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
143
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
144
+ // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
145
+ bool reasoning_in_content = false;
146
+ bool thinking_forced_open = false;
147
+ bool parse_tool_calls = true;
148
+ };
149
+
86
150
  // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
87
151
  bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
88
152
 
@@ -119,8 +183,9 @@ std::string common_chat_format_example(
119
183
  const struct common_chat_templates * tmpls,
120
184
  bool use_jinja);
121
185
 
122
- std::string common_chat_format_name(common_chat_format format);
123
- common_chat_msg common_chat_parse( const std::string & input, common_chat_format format);
186
+ const char* common_chat_format_name(common_chat_format format);
187
+ const char* common_reasoning_format_name(common_reasoning_format format);
188
+ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
124
189
 
125
190
  common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
126
191
 
@@ -133,3 +198,5 @@ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common
133
198
  // T can be std::string containing JSON or nlohmann::ordered_json
134
199
  template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
135
200
  template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
201
+
202
+ template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
@@ -13,10 +13,12 @@
13
13
  #include <chrono>
14
14
  #include <cstddef>
15
15
  #include <cstdio>
16
+ #include <ctime>
16
17
  #include <exception>
17
18
  #include <iomanip>
18
19
  #include <memory>
19
20
  #include <sstream>
21
+ #include <stdexcept>
20
22
  #include <string>
21
23
  #include <vector>
22
24
 
@@ -393,8 +395,8 @@ class chat_template {
393
395
 
394
396
  for (const auto & message_ : adjusted_messages) {
395
397
  auto message = message_;
396
- if (!message.contains("role") || !message.contains("content")) {
397
- throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
398
+ if (!message.contains("role") || (!message.contains("content") && !message.contains("tool_calls"))) {
399
+ throw std::runtime_error("message must have 'role' and one of 'content' or 'tool_calls' fields: " + message.dump());
398
400
  }
399
401
  std::string role = message.at("role");
400
402
 
@@ -415,7 +417,6 @@ class chat_template {
415
417
  }
416
418
  }
417
419
  if (polyfill_tool_calls) {
418
- auto content = message.at("content");
419
420
  auto tool_calls = json::array();
420
421
  for (const auto & tool_call : message.at("tool_calls")) {
421
422
  if (tool_call.at("type") != "function") {
@@ -434,8 +435,11 @@ class chat_template {
434
435
  auto obj = json {
435
436
  {"tool_calls", tool_calls},
436
437
  };
437
- if (!content.is_null() && !content.empty()) {
438
- obj["content"] = content;
438
+ if (message.contains("content")) {
439
+ auto content = message.at("content");
440
+ if (!content.is_null() && !content.empty()) {
441
+ obj["content"] = content;
442
+ }
439
443
  }
440
444
  message["content"] = obj.dump(2);
441
445
  message.erase("tool_calls");