@novastera-oss/llamarn 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192)
  1. package/android/src/main/cpp/include/llama.h +134 -36
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +2 -2
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +30 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +50 -40
  26. package/cpp/llama.cpp/common/common.h +5 -2
  27. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  28. package/cpp/llama.cpp/convert_hf_to_gguf.py +97 -56
  29. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
  30. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  31. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +47 -13
  32. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  33. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  34. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  35. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -2
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +5 -8
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +6 -8
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  70. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  72. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
  74. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
  76. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  84. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -38
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  96. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
  97. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +431 -247
  98. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
  99. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  101. package/cpp/llama.cpp/ggml/src/ggml.c +0 -6
  102. package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
  103. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
  104. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
  105. package/cpp/llama.cpp/include/llama.h +134 -36
  106. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  107. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  108. package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
  109. package/cpp/llama.cpp/src/llama-arch.h +7 -1
  110. package/cpp/llama.cpp/src/llama-batch.cpp +270 -19
  111. package/cpp/llama.cpp/src/llama-batch.h +36 -11
  112. package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
  113. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  114. package/cpp/llama.cpp/src/llama-context.cpp +313 -213
  115. package/cpp/llama.cpp/src/llama-context.h +16 -12
  116. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  117. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  118. package/cpp/llama.cpp/src/llama-graph.cpp +249 -129
  119. package/cpp/llama.cpp/src/llama-graph.h +90 -34
  120. package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
  121. package/cpp/llama.cpp/src/llama-hparams.h +8 -2
  122. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +82 -50
  123. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
  124. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +292 -174
  125. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +68 -38
  126. package/cpp/llama.cpp/src/llama-kv-cells.h +18 -13
  127. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
  128. package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
  129. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +266 -282
  130. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +54 -57
  131. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  132. package/cpp/llama.cpp/src/llama-memory.h +64 -23
  133. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  134. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  135. package/cpp/llama.cpp/src/llama-model.cpp +726 -141
  136. package/cpp/llama.cpp/src/llama-model.h +4 -0
  137. package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
  138. package/cpp/llama.cpp/src/llama-vocab.cpp +32 -23
  139. package/cpp/llama.cpp/src/llama.cpp +11 -7
  140. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  141. package/cpp/rn-completion.cpp +2 -2
  142. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  143. package/ios/include/chat.h +1 -1
  144. package/ios/include/common.h +5 -2
  145. package/ios/include/llama.h +134 -36
  146. package/ios/libs/llama.xcframework/Info.plist +18 -18
  147. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  148. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
  149. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +134 -36
  150. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  151. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  152. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  153. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
  154. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  155. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3624
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +134 -36
  160. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +134 -36
  161. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  162. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +134 -36
  163. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  164. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  165. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
  167. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +134 -36
  168. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  173. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  175. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4725
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +134 -36
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  178. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4746
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3652
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  183. package/package.json +1 -2
  184. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  185. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  186. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  187. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  188. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  189. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  190. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  191. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  192. /package/cpp/{rn-utils.hpp → rn-utils.h} +0 -0
@@ -988,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
988
988
  params.tensor_buft_overrides.push_back({nullptr, nullptr});
989
989
  }
990
990
 
991
- if (params.reranking && params.embedding) {
992
- throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
993
- }
994
-
995
991
  if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
996
992
  throw std::runtime_error(string_format(
997
993
  "error: the supplied chat template is not supported: %s%s\n",
@@ -2747,9 +2743,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2747
2743
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
2748
2744
  add_opt(common_arg(
2749
2745
  {"--reranking", "--rerank"},
2750
- string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
2746
+ string_format("enable reranking endpoint on server (default: %s)", "disabled"),
2751
2747
  [](common_params & params) {
2752
- params.reranking = true;
2748
+ params.embedding = true;
2749
+ params.pooling_type = LLAMA_POOLING_TYPE_RANK;
2753
2750
  }
2754
2751
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
2755
2752
  add_opt(common_arg(
@@ -2869,6 +2866,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2869
2866
  "(default: deepseek)",
2870
2867
  [](common_params & params, const std::string & value) {
2871
2868
  /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
2869
+ else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
2872
2870
  else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
2873
2871
  else { throw std::invalid_argument("invalid value"); }
2874
2872
  }
@@ -3212,6 +3210,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3212
3210
  params.speculative.model.path = value;
3213
3211
  }
3214
3212
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
3213
+ add_opt(common_arg(
3214
+ {"-ctkd", "--cache-type-k-draft"}, "TYPE",
3215
+ string_format(
3216
+ "KV cache data type for K for the draft model\n"
3217
+ "allowed values: %s\n"
3218
+ "(default: %s)",
3219
+ get_all_kv_cache_types().c_str(),
3220
+ ggml_type_name(params.speculative.cache_type_k)
3221
+ ),
3222
+ [](common_params & params, const std::string & value) {
3223
+ params.speculative.cache_type_k = kv_cache_type_from_str(value);
3224
+ }
3225
+ ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
3226
+ add_opt(common_arg(
3227
+ {"-ctvd", "--cache-type-v-draft"}, "TYPE",
3228
+ string_format(
3229
+ "KV cache data type for V for the draft model\n"
3230
+ "allowed values: %s\n"
3231
+ "(default: %s)",
3232
+ get_all_kv_cache_types().c_str(),
3233
+ ggml_type_name(params.speculative.cache_type_v)
3234
+ ),
3235
+ [](common_params & params, const std::string & value) {
3236
+ params.speculative.cache_type_v = kv_cache_type_from_str(value);
3237
+ }
3238
+ ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
3215
3239
 
3216
3240
  add_opt(common_arg(
3217
3241
  {"-mv", "--model-vocoder"}, "FNAME",
@@ -1,4 +1,4 @@
1
- int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
2
- char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
1
+ int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
2
+ char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
3
3
  char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
4
4
  char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
@@ -49,6 +49,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
49
49
 
50
50
  // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
51
51
  result_.tool_calls.emplace_back(tool_call);
52
+
52
53
  return true;
53
54
  }
54
55
  bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
@@ -378,3 +379,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
378
379
  /* .is_partial = */ found_healing_marker,
379
380
  };
380
381
  }
382
+
383
+ void common_chat_msg_parser::clear_tools() {
384
+ result_.tool_calls.clear();
385
+ }
@@ -115,4 +115,6 @@ class common_chat_msg_parser {
115
115
  const std::vector<std::vector<std::string>> & args_paths = {},
116
116
  const std::vector<std::vector<std::string>> & content_paths = {}
117
117
  );
118
+
119
+ void clear_tools();
118
120
  };
@@ -82,10 +82,10 @@ json common_chat_msg::to_json_oaicompat() const
82
82
 
83
83
  std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
84
84
  std::vector<common_chat_msg_diff> diffs;
85
- // if (previous_msg.reasoning_content != current.reasoning_content) {
86
- // auto & diff = diffs.emplace_back();
87
- // diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, current.reasoning_content);
88
- // }
85
+ if (previous_msg.reasoning_content != new_msg.reasoning_content) {
86
+ auto & diff = diffs.emplace_back();
87
+ diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
88
+ }
89
89
  if (previous_msg.content != new_msg.content) {
90
90
  auto & diff = diffs.emplace_back();
91
91
  diff.content_delta = string_diff(previous_msg.content, new_msg.content);
@@ -385,9 +385,9 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t
385
385
 
386
386
  template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
387
387
  json delta = json::object();
388
- // if (!diff.reasoning_content_delta.empty()) {
389
- // delta["reasoning_content"] = msg.reasoning_content;
390
- // }
388
+ if (!diff.reasoning_content_delta.empty()) {
389
+ delta["reasoning_content"] = diff.reasoning_content_delta;
390
+ }
391
391
  if (!diff.content_delta.empty()) {
392
392
  delta["content"] = diff.content_delta;
393
393
  }
@@ -598,6 +598,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
598
598
  switch (format) {
599
599
  case COMMON_REASONING_FORMAT_NONE: return "none";
600
600
  case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
601
+ case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
601
602
  default:
602
603
  throw std::runtime_error("Unknown reasoning format");
603
604
  }
@@ -1837,7 +1838,7 @@ static common_chat_params common_chat_templates_apply_legacy(
1837
1838
  if (res < 0) {
1838
1839
  // if the custom "tmpl" is not supported, we throw an error
1839
1840
  // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
1840
- throw std::runtime_error("this custom template is not supported");
1841
+ throw std::runtime_error("this custom template is not supported, try using --jinja");
1841
1842
  }
1842
1843
 
1843
1844
  // if it turns out that our buffer is too small, we resize it
@@ -1920,7 +1921,9 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
1920
1921
  } catch (const common_chat_msg_partial_exception & ex) {
1921
1922
  LOG_DBG("Partial parse: %s\n", ex.what());
1922
1923
  if (!is_partial) {
1923
- throw std::runtime_error(ex.what());
1924
+ builder.clear_tools();
1925
+ builder.move_to(0);
1926
+ common_chat_parse_content_only(builder);
1924
1927
  }
1925
1928
  }
1926
1929
  auto msg = builder.result();
@@ -70,7 +70,7 @@ struct common_chat_msg {
70
70
  };
71
71
 
72
72
  struct common_chat_msg_diff {
73
- // std::string reasoning_content_delta;
73
+ std::string reasoning_content_delta;
74
74
  std::string content_delta;
75
75
  size_t tool_call_index = std::string::npos;
76
76
  common_chat_tool_call tool_call_delta;
@@ -466,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
466
466
 
467
467
  std::string regex_escape(const std::string & s) {
468
468
  static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
469
- return std::regex_replace(s, special_chars, "\\$0");
469
+ return std::regex_replace(s, special_chars, "\\$&");
470
470
  }
471
471
 
472
472
  std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
@@ -706,11 +706,17 @@ bool fs_validate_filename(const std::string & filename) {
706
706
  // disable C++17 deprecation warning for std::codecvt_utf8
707
707
  # pragma clang diagnostic push
708
708
  # pragma clang diagnostic ignored "-Wdeprecated-declarations"
709
+ #elif defined(__GNUC__)
710
+ # pragma GCC diagnostic push
711
+ # pragma GCC diagnostic ignored "-Wdeprecated-declarations"
709
712
  #endif
713
+
710
714
  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
711
715
 
712
716
  #if defined(__clang__)
713
717
  # pragma clang diagnostic pop
718
+ #elif defined(__GNUC__)
719
+ # pragma GCC diagnostic pop
714
720
  #endif
715
721
 
716
722
  filename_utf32 = converter.from_bytes(filename);
@@ -767,6 +773,9 @@ bool fs_validate_filename(const std::string & filename) {
767
773
  return true;
768
774
  }
769
775
 
776
+ #include <iostream>
777
+
778
+
770
779
  // returns true if successful, false otherwise
771
780
  bool fs_create_directory_with_parents(const std::string & path) {
772
781
  #ifdef _WIN32
@@ -784,9 +793,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
784
793
  // process path from front to back, procedurally creating directories
785
794
  while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
786
795
  const std::wstring subpath = wpath.substr(0, pos_slash);
787
- const wchar_t * test = subpath.c_str();
788
796
 
789
- const bool success = CreateDirectoryW(test, NULL);
797
+ pos_slash += 1;
798
+
799
+ // skip the drive letter, in some systems it can return an access denied error
800
+ if (subpath.length() == 2 && subpath[1] == ':') {
801
+ continue;
802
+ }
803
+
804
+ const bool success = CreateDirectoryW(subpath.c_str(), NULL);
805
+
790
806
  if (!success) {
791
807
  const DWORD error = GetLastError();
792
808
 
@@ -800,8 +816,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
800
816
  return false;
801
817
  }
802
818
  }
803
-
804
- pos_slash += 1;
805
819
  }
806
820
 
807
821
  return true;
@@ -897,34 +911,6 @@ struct common_init_result common_init_from_params(common_params & params) {
897
911
 
898
912
  const llama_vocab * vocab = llama_model_get_vocab(model);
899
913
 
900
- if (params.reranking) {
901
- bool ok = true;
902
-
903
- if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
904
- LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
905
- ok = false;
906
- }
907
-
908
- bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
909
- bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
910
-
911
- if (!has_eos && !has_sep) {
912
- LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
913
- ok = false;
914
- } else if (!has_eos) {
915
- LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
916
- } else if (!has_sep) {
917
- LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
918
- ok = false;
919
- }
920
-
921
- if (!ok) {
922
- llama_model_free(model);
923
-
924
- return iparams;
925
- }
926
- }
927
-
928
914
  auto cparams = common_context_params_to_llama(params);
929
915
 
930
916
  llama_context * lctx = llama_init_from_model(model, cparams);
@@ -934,7 +920,7 @@ struct common_init_result common_init_from_params(common_params & params) {
934
920
  return iparams;
935
921
  }
936
922
 
937
- if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
923
+ if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
938
924
  LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
939
925
  params.ctx_shift = false;
940
926
  }
@@ -966,6 +952,35 @@ struct common_init_result common_init_from_params(common_params & params) {
966
952
  }
967
953
  }
968
954
 
955
+ if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
956
+ bool ok = true;
957
+
958
+ if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
959
+ LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
960
+ ok = false;
961
+ }
962
+
963
+ bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
964
+ bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
965
+
966
+ if (!has_eos && !has_sep) {
967
+ LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
968
+ ok = false;
969
+ } else if (!has_eos) {
970
+ LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
971
+ } else if (!has_sep) {
972
+ LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
973
+ ok = false;
974
+ }
975
+
976
+ if (!ok) {
977
+ llama_free(lctx);
978
+ llama_model_free(model);
979
+
980
+ return iparams;
981
+ }
982
+ }
983
+
969
984
  // load and optionally apply lora adapters
970
985
  for (auto & la : params.lora_adapters) {
971
986
  llama_adapter_lora_ptr lora;
@@ -1041,7 +1056,7 @@ struct common_init_result common_init_from_params(common_params & params) {
1041
1056
  if (llama_model_has_decoder(model)) {
1042
1057
  llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
1043
1058
  }
1044
- llama_kv_self_clear(lctx);
1059
+ llama_memory_clear(llama_get_memory(lctx), true);
1045
1060
  llama_synchronize(lctx);
1046
1061
  llama_perf_context_reset(lctx);
1047
1062
  llama_set_warmup(lctx, false);
@@ -1143,11 +1158,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
1143
1158
  cparams.op_offload = !params.no_op_offload;
1144
1159
  cparams.swa_full = params.swa_full;
1145
1160
 
1146
- if (params.reranking) {
1147
- cparams.embeddings = true;
1148
- cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
1149
- }
1150
-
1151
1161
  cparams.type_k = params.cache_type_k;
1152
1162
  cparams.type_v = params.cache_type_v;
1153
1163
 
@@ -199,6 +199,9 @@ struct common_params_speculative {
199
199
  float p_split = 0.1f; // speculative decoding split probability
200
200
  float p_min = 0.75f; // minimum speculative decoding probability (greedy)
201
201
 
202
+ ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
203
+ ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
204
+
202
205
  struct cpu_params cpuparams;
203
206
  struct cpu_params cpuparams_batch;
204
207
 
@@ -215,7 +218,8 @@ struct common_params_vocoder {
215
218
 
216
219
  enum common_reasoning_format {
217
220
  COMMON_REASONING_FORMAT_NONE,
218
- COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
221
+ COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
222
+ COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
219
223
  };
220
224
 
221
225
  struct common_params {
@@ -354,7 +358,6 @@ struct common_params {
354
358
  int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
355
359
  std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
356
360
  std::string embd_sep = "\n"; // separator of embeddings
357
- bool reranking = false; // enable reranking support on server
358
361
 
359
362
  // server params
360
363
  int32_t port = 8080; // server listens on this network port
@@ -144,6 +144,8 @@ llama_tokens common_speculative_gen_draft(
144
144
  auto & smpl = spec->smpl;
145
145
  auto & prompt = spec->prompt;
146
146
 
147
+ auto * mem = llama_get_memory(ctx);
148
+
147
149
  int reuse_i = 0;
148
150
  int reuse_n = 0;
149
151
 
@@ -173,7 +175,7 @@ llama_tokens common_speculative_gen_draft(
173
175
  result.reserve(params.n_draft);
174
176
 
175
177
  if (reuse_n == 0) {
176
- llama_kv_self_clear(ctx);
178
+ llama_memory_clear(mem, false);
177
179
 
178
180
  prompt.clear();
179
181
  } else {
@@ -192,14 +194,14 @@ llama_tokens common_speculative_gen_draft(
192
194
  }
193
195
 
194
196
  if (reuse_i > 0) {
195
- llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
196
- llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
197
+ llama_memory_seq_rm (mem, 0, 0, reuse_i);
198
+ llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);
197
199
 
198
200
  prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
199
201
  }
200
202
 
201
203
  if (reuse_n < (int) prompt.size()) {
202
- llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
204
+ llama_memory_seq_rm (mem, 0, reuse_n, -1);
203
205
 
204
206
  prompt.erase(prompt.begin() + reuse_n, prompt.end());
205
207
  }