@novastera-oss/llamarn 0.2.5 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225) hide show
  1. package/RNLlamaCpp.podspec +3 -2
  2. package/android/CMakeLists.txt +6 -3
  3. package/android/src/main/cpp/include/llama.h +140 -38
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  12. package/cpp/LlamaCppModel.cpp +48 -67
  13. package/cpp/LlamaCppModel.h +8 -3
  14. package/cpp/PureCppImpl.cpp +1 -1
  15. package/cpp/PureCppImpl.h +2 -2
  16. package/cpp/build-info.cpp +2 -2
  17. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  18. package/cpp/llama.cpp/Makefile +2 -2
  19. package/cpp/llama.cpp/README.md +33 -13
  20. package/cpp/llama.cpp/common/CMakeLists.txt +15 -28
  21. package/cpp/llama.cpp/common/arg.cpp +38 -12
  22. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  23. package/cpp/llama.cpp/common/chat-parser.cpp +9 -3
  24. package/cpp/llama.cpp/common/chat-parser.h +4 -1
  25. package/cpp/llama.cpp/common/chat.cpp +16 -13
  26. package/cpp/llama.cpp/common/chat.h +1 -1
  27. package/cpp/llama.cpp/common/common.cpp +52 -40
  28. package/cpp/llama.cpp/common/common.h +5 -2
  29. package/cpp/llama.cpp/common/json-partial.cpp +5 -4
  30. package/cpp/llama.cpp/common/json-partial.h +2 -1
  31. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  32. package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
  33. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  34. package/cpp/llama.cpp/convert_hf_to_gguf.py +128 -84
  35. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
  36. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  37. package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
  38. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +49 -13
  39. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
  41. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  42. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  44. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +33 -2
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
  74. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +6 -8
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +25 -16
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  79. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  82. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
  83. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
  86. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  88. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  93. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  94. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -46
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  107. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
  108. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -248
  109. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  112. package/cpp/llama.cpp/ggml/src/ggml.c +9 -8
  113. package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
  114. package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
  115. package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
  116. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
  117. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
  118. package/cpp/llama.cpp/include/llama.h +140 -38
  119. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  120. package/cpp/llama.cpp/src/CMakeLists.txt +4 -1
  121. package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
  122. package/cpp/llama.cpp/src/llama-arch.h +7 -1
  123. package/cpp/llama.cpp/src/llama-batch.cpp +289 -31
  124. package/cpp/llama.cpp/src/llama-batch.h +47 -17
  125. package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
  126. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  127. package/cpp/llama.cpp/src/llama-context.cpp +488 -313
  128. package/cpp/llama.cpp/src/llama-context.h +38 -17
  129. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  130. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  131. package/cpp/llama.cpp/src/llama-graph.cpp +275 -152
  132. package/cpp/llama.cpp/src/llama-graph.h +109 -52
  133. package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
  134. package/cpp/llama.cpp/src/llama-hparams.h +8 -2
  135. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +281 -0
  136. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +133 -0
  137. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1835 -0
  138. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +308 -0
  139. package/cpp/llama.cpp/src/llama-kv-cells.h +53 -17
  140. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
  141. package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
  142. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +1116 -0
  143. package/cpp/llama.cpp/src/llama-memory-recurrent.h +188 -0
  144. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  145. package/cpp/llama.cpp/src/llama-memory.h +89 -4
  146. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  147. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  148. package/cpp/llama.cpp/src/llama-model.cpp +735 -143
  149. package/cpp/llama.cpp/src/llama-model.h +4 -0
  150. package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
  151. package/cpp/llama.cpp/src/llama-vocab.cpp +39 -25
  152. package/cpp/llama.cpp/src/llama.cpp +11 -7
  153. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  154. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
  155. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
  156. package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
  157. package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
  158. package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
  159. package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  160. package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
  161. package/cpp/rn-completion.cpp +65 -10
  162. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  163. package/cpp/{rn-utils.hpp → rn-utils.h} +8 -1
  164. package/ios/include/chat.h +1 -1
  165. package/ios/include/common/minja/chat-template.hpp +1 -1
  166. package/ios/include/common/minja/minja.hpp +1 -1
  167. package/ios/include/common.h +5 -2
  168. package/ios/include/json-schema-to-grammar.h +4 -4
  169. package/ios/include/llama.h +140 -38
  170. package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
  171. package/ios/libs/llama.xcframework/Info.plist +20 -20
  172. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  173. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4617
  174. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
  175. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +140 -38
  176. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  177. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  178. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
  179. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3557
  180. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  181. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
  182. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  183. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  184. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
  185. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3559
  186. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
  187. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +140 -38
  188. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
  189. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +140 -38
  190. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  191. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
  192. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +140 -38
  193. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  194. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  195. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  196. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4616
  197. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
  198. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +140 -38
  199. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  200. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  201. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4637
  202. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3556
  203. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  204. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
  205. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  206. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  207. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4653
  208. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
  209. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +140 -38
  210. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  211. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  212. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4674
  213. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3587
  214. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  215. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
  216. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  217. package/package.json +1 -2
  218. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  219. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  220. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  221. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2747
  222. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -502
  223. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  224. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  225. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -89,6 +89,14 @@ option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured
89
89
  include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
90
90
  include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
91
91
 
92
+ if (NOT DEFINED LLAMA_BUILD_NUMBER)
93
+ set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
94
+ endif()
95
+ if (NOT DEFINED LLAMA_BUILD_COMMIT)
96
+ set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
97
+ endif()
98
+ set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
99
+
92
100
  # override ggml options
93
101
  set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
94
102
  set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
@@ -155,10 +163,17 @@ if (LLAMA_USE_SYSTEM_GGML)
155
163
  endif()
156
164
 
157
165
  if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
166
+ set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
167
+ set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
158
168
  add_subdirectory(ggml)
159
169
  # ... otherwise assume ggml is added by a parent CMakeLists.txt
160
170
  endif()
161
171
 
172
+ if (MINGW)
173
+ # Target Windows 8 for PrefetchVirtualMemory
174
+ add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
175
+ endif()
176
+
162
177
  #
163
178
  # build the library
164
179
  #
@@ -199,10 +214,6 @@ endif()
199
214
  include(GNUInstallDirs)
200
215
  include(CMakePackageConfigHelpers)
201
216
 
202
- set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
203
- set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
204
- set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
205
-
206
217
  set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
207
218
  set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
208
219
  set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
@@ -367,7 +367,7 @@ ifdef LLAMA_SERVER_SSL
367
367
  endif
368
368
 
369
369
  ifndef GGML_NO_CPU_AARCH64
370
- MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
370
+ MK_CPPFLAGS += -DGGML_USE_CPU_REPACK
371
371
  endif
372
372
 
373
373
  # warnings
@@ -970,7 +970,7 @@ OBJ_GGML = \
970
970
  $(DIR_GGML)/src/ggml-threading.o \
971
971
  $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
972
972
  $(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
973
- $(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
973
+ $(DIR_GGML)/src/ggml-cpu/repack.o \
974
974
  $(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
975
975
  $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
976
976
  $(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
@@ -3,9 +3,10 @@
3
3
  ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
4
4
 
5
5
  [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
6
+ [![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
6
7
  [![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
7
8
 
8
- [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
9
+ [Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
9
10
 
10
11
  Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
11
12
 
@@ -17,7 +18,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
17
18
  ## Hot topics
18
19
 
19
20
  - 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
20
- - **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
21
21
  - A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
22
22
  - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
23
23
  - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
@@ -28,6 +28,30 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
28
28
 
29
29
  ----
30
30
 
31
+ ## Quick start
32
+
33
+ Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
34
+
35
+ - Install `llama.cpp` using [brew, nix or winget](docs/install.md)
36
+ - Run with Docker - see our [Docker documentation](docs/docker.md)
37
+ - Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
38
+ - Build from source by cloning this repository - check out [our build guide](docs/build.md)
39
+
40
+ Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more.
41
+
42
+ Example command:
43
+
44
+ ```sh
45
+ # Use a local model file
46
+ llama-cli -m my_model.gguf
47
+
48
+ # Or download and run a model directly from Hugging Face
49
+ llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
50
+
51
+ # Launch OpenAI-compatible API server
52
+ llama-server -hf ggml-org/gemma-3-1b-it-GGUF
53
+ ```
54
+
31
55
  ## Description
32
56
 
33
57
  The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
@@ -130,6 +154,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
130
154
  <details>
131
155
  <summary>Bindings</summary>
132
156
 
157
+ - Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
133
158
  - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
134
159
  - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
135
160
  - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
@@ -229,6 +254,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
229
254
 
230
255
  </details>
231
256
 
257
+
232
258
  ## Supported backends
233
259
 
234
260
  | Backend | Target devices |
@@ -245,16 +271,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
245
271
  | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
246
272
  | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
247
273
 
248
- ## Building the project
249
-
250
- The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
251
- The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
252
-
253
- - Clone this repository and build locally, see [how to build](docs/build.md)
254
- - On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
255
- - Use a Docker image, see [documentation for Docker](docs/docker.md)
256
- - Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
257
-
258
274
  ## Obtaining and quantizing models
259
275
 
260
276
  The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
@@ -262,7 +278,11 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
262
278
  - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
263
279
  - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
264
280
 
265
- You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
281
+ You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
282
+
283
+ ```sh
284
+ llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
285
+ ```
266
286
 
267
287
  By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
268
288
 
@@ -7,8 +7,8 @@ llama_add_compile_flags()
7
7
  # Build info header
8
8
  #
9
9
 
10
- if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
11
- set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
10
+ if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
11
+ set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
12
12
 
13
13
  # Is git submodule
14
14
  if(NOT IS_DIRECTORY "${GIT_DIR}")
@@ -18,36 +18,26 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
18
18
  if (SLASH_POS EQUAL 0)
19
19
  set(GIT_DIR "${REAL_GIT_DIR}")
20
20
  else()
21
- set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
21
+ set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
22
22
  endif()
23
23
  endif()
24
24
 
25
25
  if(EXISTS "${GIT_DIR}/index")
26
- set(GIT_INDEX "${GIT_DIR}/index")
26
+ # For build-info.cpp below
27
+ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
27
28
  else()
28
29
  message(WARNING "Git index not found in git repository.")
29
- set(GIT_INDEX "")
30
30
  endif()
31
31
  else()
32
32
  message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
33
- set(GIT_INDEX "")
34
33
  endif()
35
34
 
36
- # Add a custom command to rebuild build-info.cpp when .git/index changes
37
- add_custom_command(
38
- OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
39
- COMMENT "Generating build details from Git"
40
- COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
41
- -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
42
- -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
43
- -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
44
- -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
45
- WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
46
- DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
47
- VERBATIM
48
- )
35
+ set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
36
+ set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
37
+ configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
38
+
49
39
  set(TARGET build_info)
50
- add_library(${TARGET} OBJECT build-info.cpp)
40
+ add_library(${TARGET} OBJECT ${OUTPUT_FILE})
51
41
  if (BUILD_SHARED_LIBS)
52
42
  set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
53
43
  endif()
@@ -58,23 +48,20 @@ add_library(${TARGET} STATIC
58
48
  arg.cpp
59
49
  arg.h
60
50
  base64.hpp
61
- chat.cpp
62
- chat.h
63
51
  chat-parser.cpp
64
52
  chat-parser.h
53
+ chat.cpp
54
+ chat.h
65
55
  common.cpp
66
56
  common.h
67
57
  console.cpp
68
58
  console.h
69
- json-schema-to-grammar.cpp
70
- json.hpp
71
- json-partial.h
72
59
  json-partial.cpp
60
+ json-partial.h
61
+ json-schema-to-grammar.cpp
73
62
  llguidance.cpp
74
63
  log.cpp
75
64
  log.h
76
- minja/chat-template.hpp
77
- minja/minja.hpp
78
65
  ngram-cache.cpp
79
66
  ngram-cache.h
80
67
  regex-partial.cpp
@@ -147,7 +134,7 @@ if (LLAMA_LLGUIDANCE)
147
134
  set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
148
135
  endif ()
149
136
 
150
- target_include_directories(${TARGET} PUBLIC .)
137
+ target_include_directories(${TARGET} PUBLIC . ../vendor)
151
138
  target_compile_features (${TARGET} PUBLIC cxx_std_17)
152
139
  target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
153
140
 
@@ -1,10 +1,11 @@
1
- #include "gguf.h" // for reading GGUF splits
2
1
  #include "arg.h"
3
2
 
3
+ #include "chat.h"
4
4
  #include "common.h"
5
+ #include "gguf.h" // for reading GGUF splits
6
+ #include "json-schema-to-grammar.h"
5
7
  #include "log.h"
6
8
  #include "sampling.h"
7
- #include "chat.h"
8
9
 
9
10
  // fix problem with std::min and std::max
10
11
  #if defined(_WIN32)
@@ -15,6 +16,9 @@
15
16
  #include <windows.h>
16
17
  #endif
17
18
 
19
+ #define JSON_ASSERT GGML_ASSERT
20
+ #include <nlohmann/json.hpp>
21
+
18
22
  #include <algorithm>
19
23
  #include <climits>
20
24
  #include <cstdarg>
@@ -34,8 +38,6 @@
34
38
  #include <future>
35
39
  #endif
36
40
 
37
- #include "json-schema-to-grammar.h"
38
-
39
41
  using json = nlohmann::ordered_json;
40
42
 
41
43
  std::initializer_list<enum llama_example> mmproj_examples = {
@@ -986,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
986
988
  params.tensor_buft_overrides.push_back({nullptr, nullptr});
987
989
  }
988
990
 
989
- if (params.reranking && params.embedding) {
990
- throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
991
- }
992
-
993
991
  if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
994
992
  throw std::runtime_error(string_format(
995
993
  "error: the supplied chat template is not supported: %s%s\n",
@@ -1346,9 +1344,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1346
1344
  ));
1347
1345
  add_opt(common_arg(
1348
1346
  {"--prio"}, "N",
1349
- string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
1347
+ string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
1350
1348
  [](common_params & params, int prio) {
1351
- if (prio < 0 || prio > 3) {
1349
+ if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
1352
1350
  throw std::invalid_argument("invalid value");
1353
1351
  }
1354
1352
  params.cpuparams.priority = (enum ggml_sched_priority) prio;
@@ -2745,9 +2743,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2745
2743
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
2746
2744
  add_opt(common_arg(
2747
2745
  {"--reranking", "--rerank"},
2748
- string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
2746
+ string_format("enable reranking endpoint on server (default: %s)", "disabled"),
2749
2747
  [](common_params & params) {
2750
- params.reranking = true;
2748
+ params.embedding = true;
2749
+ params.pooling_type = LLAMA_POOLING_TYPE_RANK;
2751
2750
  }
2752
2751
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
2753
2752
  add_opt(common_arg(
@@ -2867,6 +2866,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2867
2866
  "(default: deepseek)",
2868
2867
  [](common_params & params, const std::string & value) {
2869
2868
  /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
2869
+ else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
2870
2870
  else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
2871
2871
  else { throw std::invalid_argument("invalid value"); }
2872
2872
  }
@@ -3210,6 +3210,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3210
3210
  params.speculative.model.path = value;
3211
3211
  }
3212
3212
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
3213
+ add_opt(common_arg(
3214
+ {"-ctkd", "--cache-type-k-draft"}, "TYPE",
3215
+ string_format(
3216
+ "KV cache data type for K for the draft model\n"
3217
+ "allowed values: %s\n"
3218
+ "(default: %s)",
3219
+ get_all_kv_cache_types().c_str(),
3220
+ ggml_type_name(params.speculative.cache_type_k)
3221
+ ),
3222
+ [](common_params & params, const std::string & value) {
3223
+ params.speculative.cache_type_k = kv_cache_type_from_str(value);
3224
+ }
3225
+ ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
3226
+ add_opt(common_arg(
3227
+ {"-ctvd", "--cache-type-v-draft"}, "TYPE",
3228
+ string_format(
3229
+ "KV cache data type for V for the draft model\n"
3230
+ "allowed values: %s\n"
3231
+ "(default: %s)",
3232
+ get_all_kv_cache_types().c_str(),
3233
+ ggml_type_name(params.speculative.cache_type_v)
3234
+ ),
3235
+ [](common_params & params, const std::string & value) {
3236
+ params.speculative.cache_type_v = kv_cache_type_from_str(value);
3237
+ }
3238
+ ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
3213
3239
 
3214
3240
  add_opt(common_arg(
3215
3241
  {"-mv", "--model-vocoder"}, "FNAME",
@@ -1,4 +1,4 @@
1
- int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
2
- char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
1
+ int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
2
+ char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
3
3
  char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
4
4
  char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
@@ -49,6 +49,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
49
49
 
50
50
  // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
51
51
  result_.tool_calls.emplace_back(tool_call);
52
+
52
53
  return true;
53
54
  }
54
55
  bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
@@ -154,9 +155,10 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
154
155
  if (!rest.empty()) {
155
156
  handle_reasoning(rest, /* closed */ !is_partial());
156
157
  }
157
- if (!syntax_.thinking_forced_open) {
158
- throw common_chat_msg_partial_exception(end_think);
159
- }
158
+ // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
159
+ // if (!syntax_.thinking_forced_open) {
160
+ // throw common_chat_msg_partial_exception(end_think);
161
+ // }
160
162
  return true;
161
163
  }
162
164
  }
@@ -377,3 +379,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
377
379
  /* .is_partial = */ found_healing_marker,
378
380
  };
379
381
  }
382
+
383
+ void common_chat_msg_parser::clear_tools() {
384
+ result_.tool_calls.clear();
385
+ }
@@ -2,9 +2,10 @@
2
2
 
3
3
  #include "chat.h"
4
4
  #include "json-partial.h"
5
- #include "json.hpp"
6
5
  #include "regex-partial.h"
7
6
 
7
+ #include <nlohmann/json.hpp>
8
+
8
9
  #include <optional>
9
10
  #include <string>
10
11
  #include <vector>
@@ -114,4 +115,6 @@ class common_chat_msg_parser {
114
115
  const std::vector<std::vector<std::string>> & args_paths = {},
115
116
  const std::vector<std::vector<std::string>> & content_paths = {}
116
117
  );
118
+
119
+ void clear_tools();
117
120
  };
@@ -1,13 +1,14 @@
1
1
  #include "chat.h"
2
2
  #include "chat-parser.h"
3
3
  #include "common.h"
4
+ #include "json-partial.h"
4
5
  #include "json-schema-to-grammar.h"
5
6
  #include "log.h"
6
- #include "json-partial.h"
7
- #include "minja/chat-template.hpp"
8
- #include "minja/minja.hpp"
9
7
  #include "regex-partial.h"
10
8
 
9
+ #include <minja/chat-template.hpp>
10
+ #include <minja/minja.hpp>
11
+
11
12
  #include <cstdio>
12
13
  #include <exception>
13
14
  #include <iostream>
@@ -16,7 +17,6 @@
16
17
  #include <string>
17
18
  #include <vector>
18
19
 
19
-
20
20
  static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
21
21
  auto time = std::chrono::system_clock::to_time_t(now);
22
22
  auto local_time = *std::localtime(&time);
@@ -82,10 +82,10 @@ json common_chat_msg::to_json_oaicompat() const
82
82
 
83
83
  std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
84
84
  std::vector<common_chat_msg_diff> diffs;
85
- // if (previous_msg.reasoning_content != current.reasoning_content) {
86
- // auto & diff = diffs.emplace_back();
87
- // diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, current.reasoning_content);
88
- // }
85
+ if (previous_msg.reasoning_content != new_msg.reasoning_content) {
86
+ auto & diff = diffs.emplace_back();
87
+ diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
88
+ }
89
89
  if (previous_msg.content != new_msg.content) {
90
90
  auto & diff = diffs.emplace_back();
91
91
  diff.content_delta = string_diff(previous_msg.content, new_msg.content);
@@ -385,9 +385,9 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t
385
385
 
386
386
  template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
387
387
  json delta = json::object();
388
- // if (!diff.reasoning_content_delta.empty()) {
389
- // delta["reasoning_content"] = msg.reasoning_content;
390
- // }
388
+ if (!diff.reasoning_content_delta.empty()) {
389
+ delta["reasoning_content"] = diff.reasoning_content_delta;
390
+ }
391
391
  if (!diff.content_delta.empty()) {
392
392
  delta["content"] = diff.content_delta;
393
393
  }
@@ -598,6 +598,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
598
598
  switch (format) {
599
599
  case COMMON_REASONING_FORMAT_NONE: return "none";
600
600
  case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
601
+ case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
601
602
  default:
602
603
  throw std::runtime_error("Unknown reasoning format");
603
604
  }
@@ -1837,7 +1838,7 @@ static common_chat_params common_chat_templates_apply_legacy(
1837
1838
  if (res < 0) {
1838
1839
  // if the custom "tmpl" is not supported, we throw an error
1839
1840
  // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
1840
- throw std::runtime_error("this custom template is not supported");
1841
+ throw std::runtime_error("this custom template is not supported, try using --jinja");
1841
1842
  }
1842
1843
 
1843
1844
  // if it turns out that our buffer is too small, we resize it
@@ -1920,7 +1921,9 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
1920
1921
  } catch (const common_chat_msg_partial_exception & ex) {
1921
1922
  LOG_DBG("Partial parse: %s\n", ex.what());
1922
1923
  if (!is_partial) {
1923
- throw std::runtime_error(ex.what());
1924
+ builder.clear_tools();
1925
+ builder.move_to(0);
1926
+ common_chat_parse_content_only(builder);
1924
1927
  }
1925
1928
  }
1926
1929
  auto msg = builder.result();
@@ -70,7 +70,7 @@ struct common_chat_msg {
70
70
  };
71
71
 
72
72
  struct common_chat_msg_diff {
73
- // std::string reasoning_content_delta;
73
+ std::string reasoning_content_delta;
74
74
  std::string content_delta;
75
75
  size_t tool_call_index = std::string::npos;
76
76
  common_chat_tool_call tool_call_delta;