cui-llama.rn 1.4.2 → 1.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/README.md +93 -114
  2. package/android/src/main/CMakeLists.txt +5 -0
  3. package/android/src/main/build-arm64/CMakeCache.txt +429 -0
  4. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +81 -0
  5. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +101 -0
  6. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  7. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  8. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +15 -0
  9. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +904 -0
  10. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  11. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +919 -0
  12. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  13. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +431 -0
  14. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  15. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +165 -0
  16. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +297 -0
  17. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +1 -0
  18. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +1 -0
  19. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +1 -0
  20. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +1 -0
  21. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +1 -0
  22. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +1 -0
  23. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +1 -0
  24. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +8 -0
  25. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +1 -0
  26. package/android/src/main/build-arm64/CMakeFiles/progress.marks +1 -0
  27. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  28. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +58 -0
  29. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  30. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +756 -0
  31. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  32. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +709 -0
  33. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  34. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +714 -0
  35. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  36. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +62 -0
  37. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  38. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +708 -0
  39. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  40. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +113 -0
  41. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  42. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +713 -0
  43. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  44. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +763 -0
  45. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  46. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +61 -0
  47. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  48. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +707 -0
  49. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  50. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +104 -0
  51. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  52. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +714 -0
  53. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  54. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +723 -0
  55. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +62 -0
  56. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +722 -0
  57. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +89 -0
  58. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +2 -0
  59. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +2 -0
  60. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +2 -0
  61. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +17 -0
  62. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +41 -0
  63. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +62 -0
  64. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +722 -0
  65. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +89 -0
  66. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +2 -0
  67. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +2 -0
  68. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +2 -0
  69. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +17 -0
  70. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +41 -0
  71. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +62 -0
  72. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +722 -0
  73. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +89 -0
  74. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +2 -0
  75. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +2 -0
  76. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +2 -0
  77. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +17 -0
  78. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +41 -0
  79. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +62 -0
  80. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +722 -0
  81. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +89 -0
  82. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +2 -0
  83. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +2 -0
  84. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +2 -0
  85. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +17 -0
  86. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +41 -0
  87. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +62 -0
  88. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +722 -0
  89. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +89 -0
  90. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +2 -0
  91. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +2 -0
  92. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +2 -0
  93. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +17 -0
  94. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +41 -0
  95. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +62 -0
  96. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +722 -0
  97. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +89 -0
  98. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +2 -0
  99. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +2 -0
  100. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +2 -0
  101. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +17 -0
  102. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +41 -0
  103. package/android/src/main/build-arm64/Makefile +1862 -0
  104. package/android/src/main/build-arm64/cmake_install.cmake +66 -0
  105. package/android/src/main/java/com/rnllama/LlamaContext.java +92 -18
  106. package/android/src/main/java/com/rnllama/RNLlama.java +37 -4
  107. package/android/src/main/jni-utils.h +6 -0
  108. package/android/src/main/jni.cpp +287 -31
  109. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  110. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  111. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  112. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  113. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  114. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  115. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  116. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  117. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +7 -2
  118. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +7 -2
  119. package/cpp/chat-template.hpp +529 -0
  120. package/cpp/chat.cpp +1085 -0
  121. package/cpp/chat.hpp +55 -0
  122. package/cpp/common.cpp +159 -36
  123. package/cpp/common.h +64 -19
  124. package/cpp/ggml-alloc.c +1 -13
  125. package/cpp/ggml-common.h +0 -2
  126. package/cpp/ggml-cpu-impl.h +6 -12
  127. package/cpp/ggml-cpu-quants.c +937 -340
  128. package/cpp/ggml-cpu.c +207 -113
  129. package/cpp/ggml-cpu.cpp +4 -6
  130. package/cpp/ggml-cpu.h +1 -1
  131. package/cpp/ggml-metal.h +66 -66
  132. package/cpp/ggml-metal.m +141 -23
  133. package/cpp/ggml.c +24 -14
  134. package/cpp/ggml.h +2 -2
  135. package/cpp/json-schema-to-grammar.cpp +46 -66
  136. package/cpp/json-schema-to-grammar.h +15 -1
  137. package/cpp/llama-arch.cpp +7 -2
  138. package/cpp/llama-arch.h +3 -1
  139. package/cpp/llama-chat.cpp +10 -1
  140. package/cpp/llama-chat.h +1 -0
  141. package/cpp/llama-grammar.cpp +86 -6
  142. package/cpp/llama-grammar.h +22 -1
  143. package/cpp/llama-impl.h +6 -6
  144. package/cpp/llama-kv-cache.h +1 -1
  145. package/cpp/llama-mmap.h +1 -0
  146. package/cpp/llama-model-loader.cpp +1 -1
  147. package/cpp/llama-model.cpp +32 -6
  148. package/cpp/llama-sampling.cpp +178 -61
  149. package/cpp/llama-vocab.cpp +8 -3
  150. package/cpp/llama.cpp +188 -128
  151. package/cpp/llama.h +27 -10
  152. package/cpp/log.cpp +32 -10
  153. package/cpp/log.h +12 -1
  154. package/cpp/minja.hpp +2883 -0
  155. package/cpp/rn-llama.cpp +82 -5
  156. package/cpp/rn-llama.h +16 -1
  157. package/cpp/sampling.cpp +68 -41
  158. package/cpp/sampling.h +3 -0
  159. package/cpp/sgemm.cpp +9 -8
  160. package/cpp/unicode.cpp +9 -2
  161. package/ios/CMakeLists.txt +6 -0
  162. package/ios/RNLlama.h +0 -8
  163. package/ios/RNLlama.mm +27 -3
  164. package/ios/RNLlamaContext.h +10 -1
  165. package/ios/RNLlamaContext.mm +269 -57
  166. package/jest/mock.js +21 -2
  167. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  168. package/lib/commonjs/grammar.js +3 -0
  169. package/lib/commonjs/grammar.js.map +1 -1
  170. package/lib/commonjs/index.js +87 -13
  171. package/lib/commonjs/index.js.map +1 -1
  172. package/lib/module/NativeRNLlama.js.map +1 -1
  173. package/lib/module/grammar.js +3 -0
  174. package/lib/module/grammar.js.map +1 -1
  175. package/lib/module/index.js +86 -13
  176. package/lib/module/index.js.map +1 -1
  177. package/lib/typescript/NativeRNLlama.d.ts +107 -2
  178. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  179. package/lib/typescript/grammar.d.ts.map +1 -1
  180. package/lib/typescript/index.d.ts +32 -7
  181. package/lib/typescript/index.d.ts.map +1 -1
  182. package/llama-rn.podspec +1 -1
  183. package/package.json +2 -2
  184. package/src/NativeRNLlama.ts +115 -3
  185. package/src/grammar.ts +3 -0
  186. package/src/index.ts +138 -21
package/cpp/chat.hpp ADDED
@@ -0,0 +1,55 @@
+ // Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
+
+ #pragma once
+
+ #include "common.h"
+ #include "json.hpp"
+ #include <optional>
+ #include <string>
+ #include <vector>
+
+ using json = nlohmann::ordered_json;
+
+ struct common_chat_inputs {
+     json messages;
+     json tools;
+     json tool_choice;
+     json json_schema;
+     bool parallel_tool_calls;
+     bool stream;
+     std::string grammar;
+     bool add_generation_prompt = true;
+     bool extract_reasoning = true;
+ };
+
+ enum common_chat_format {
+     COMMON_CHAT_FORMAT_CONTENT_ONLY,
+     COMMON_CHAT_FORMAT_GENERIC,
+     COMMON_CHAT_FORMAT_MISTRAL_NEMO,
+     COMMON_CHAT_FORMAT_LLAMA_3_X,
+     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
+     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
+     COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
+     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
+     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
+     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+     COMMON_CHAT_FORMAT_HERMES_2_PRO,
+     COMMON_CHAT_FORMAT_COMMAND_R7B,
+     COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
+
+     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
+ };
+
+ struct common_chat_params {
+     common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+     json prompt;
+     std::string grammar;
+     bool grammar_lazy = false;
+     std::vector<common_grammar_trigger> grammar_triggers;
+     std::vector<std::string> preserved_tokens;
+     std::vector<std::string> additional_stops;
+ };
+
+ struct common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & params);
+ std::string common_chat_format_name(common_chat_format format);
+ common_chat_msg common_chat_parse(const std::string & input, common_chat_format format);
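For orientation, a minimal usage sketch of the API this header declares (illustrative only, not code shipped in the package; the helper name, template source and BOS/EOS strings are placeholders). The actual wiring is visible in the common.cpp hunks below.

```cpp
// Hypothetical example of driving the new chat API declared in chat.hpp.
#include "chat.hpp"           // common_chat_inputs, common_chat_params_init, ...
#include "chat-template.hpp"  // minja::chat_template (aka common_chat_template)

// Render a prompt (plus grammar constraints) for a set of messages and tools.
static std::string render_prompt(const std::string & template_src,
                                 const json & messages,
                                 const json & tools) {
    // Placeholder BOS/EOS strings; real callers take them from the model vocab.
    common_chat_template tmpl(template_src, /* bos */ "<s>", /* eos */ "</s>");

    common_chat_inputs inputs;
    inputs.messages              = messages; // OpenAI-style [{role, content}, ...]
    inputs.tools                 = tools;    // optional tool declarations
    inputs.add_generation_prompt = true;

    // params.format is later passed to common_chat_parse() to interpret the model output.
    common_chat_params params = common_chat_params_init(tmpl, inputs);
    return params.prompt.get<std::string>();
}
```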
package/cpp/common.cpp CHANGED
@@ -12,6 +12,8 @@
  #include "json.hpp"
  #include "json-schema-to-grammar.h"
  #include "llama.h"
+ #include "chat.hpp"
+ #include "chat-template.hpp"

  #include <algorithm>
  #include <cinttypes>
@@ -489,6 +491,48 @@ void string_replace_all(std::string & s, const std::string & search, const std::
      s = std::move(builder);
  }

+ std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
+     std::ostringstream result;
+     for (size_t i = 0; i < values.size(); ++i) {
+         if (i > 0) {
+             result << separator;
+         }
+         result << values[i];
+     }
+     return result.str();
+ }
+
+ std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
+     std::vector<std::string> parts;
+     size_t start = 0;
+     size_t end = str.find(delimiter);
+
+     while (end != std::string::npos) {
+         parts.push_back(str.substr(start, end - start));
+         start = end + delimiter.length();
+         end = str.find(delimiter, start);
+     }
+
+     parts.push_back(str.substr(start));
+
+     return parts;
+ }
+
+ std::string string_repeat(const std::string & str, size_t n) {
+     if (n == 0) {
+         return "";
+     }
+
+     std::string result;
+     result.reserve(str.length() * n);
+
+     for (size_t i = 0; i < n; ++i) {
+         result += str;
+     }
+
+     return result;
+ }
+
  std::string string_from(bool value) {
      return value ? "true" : "false";
  }
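A quick illustration of what the new helpers are expected to do (example only, not code from the package):

```cpp
// Expected behaviour of the string helpers added above.
#include <cassert>
#include "common.h"  // declares string_split, string_join, string_repeat

static void string_helpers_example() {
    std::vector<std::string> parts = string_split("a,b,c", ",");  // {"a", "b", "c"}
    assert(parts.size() == 3);

    assert(string_join(parts, " | ") == "a | b | c");
    assert(string_repeat("ab", 3)    == "ababab");
}
```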
@@ -1526,67 +1570,80 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
  // Chat template utils
  //

- std::string common_get_builtin_chat_template(const struct llama_model * model) {
-     const char * ptr_tmpl = llama_model_chat_template(model);
-     return ptr_tmpl == nullptr ? "" : ptr_tmpl;
- }
-
- bool common_chat_verify_template(const std::string & tmpl) {
+ bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
+     if (use_jinja) {
+         try {
+             auto chat_template = common_chat_template(tmpl, "<s>", "</s>");
+             common_chat_inputs inputs;
+             inputs.messages = json::array({{
+                 {"role", "user"},
+                 {"content", "test"},
+             }});
+             common_chat_params_init(chat_template, inputs);
+             return true;
+         } catch (const std::exception & e) {
+             LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
+             return false;
+         }
+     }
      llama_chat_message chat[] = {{"user", "test"}};
      const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
      return res >= 0;
  }

- std::string common_chat_apply_template(const struct llama_model * model,
-         const std::string & tmpl,
+ std::string common_chat_apply_template(
+         const common_chat_template & tmpl,
          const std::vector<common_chat_msg> & msgs,
-         bool add_ass) {
+         bool add_ass,
+         bool use_jinja) {
+     if (use_jinja) {
+         auto messages = json::array();
+         for (const auto & msg : msgs) {
+             messages.push_back({{"role", msg.role}, {"content", msg.content}});
+         }
+         common_chat_inputs inputs;
+         inputs.messages = messages;
+         inputs.add_generation_prompt = add_ass;
+         return common_chat_params_init(tmpl, inputs).prompt;
+     }
+
      int alloc_size = 0;
-     bool fallback = false; // indicate if we must fallback to default chatml
      std::vector<llama_chat_message> chat;
      for (const auto & msg : msgs) {
          chat.push_back({msg.role.c_str(), msg.content.c_str()});
          alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
      }

-     const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
      std::vector<char> buf(alloc_size);

      // run the first time to get the total output length
-     int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+     int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());

      // error: chat template is not supported
      if (res < 0) {
-         if (ptr_tmpl != nullptr) {
-             // if the custom "tmpl" is not supported, we throw an error
-             // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
-             throw std::runtime_error("this custom template is not supported");
-         }
-
-         // If the built-in template is not supported, we default to chatml
-         res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-         fallback = true;
+         // if the custom "tmpl" is not supported, we throw an error
+         // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+         throw std::runtime_error("this custom template is not supported");
      }

      // if it turns out that our buffer is too small, we resize it
      if ((size_t) res > buf.size()) {
          buf.resize(res);
-         res = llama_chat_apply_template(
-             fallback ? "chatml" : ptr_tmpl,
-             chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+         res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
      }

      std::string formatted_chat(buf.data(), res);
      return formatted_chat;
  }

- std::string common_chat_format_single(const struct llama_model * model,
-         const std::string & tmpl,
+ std::string common_chat_format_single(
+         const common_chat_template & tmpl,
          const std::vector<common_chat_msg> & past_msg,
          const common_chat_msg & new_msg,
-         bool add_ass) {
+         bool add_ass,
+         bool use_jinja) {
      std::ostringstream ss;
-     auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
+     auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja);
      std::vector<common_chat_msg> chat_new(past_msg);
      // if the past_msg ends with a newline, we must preserve it in the formatted version
      if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@@ -1594,21 +1651,87 @@ std::string common_chat_format_single(const struct llama_model * model,
      };
      // format chat with new_msg
      chat_new.push_back(new_msg);
-     auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
+     auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja);
      // get the diff part
      ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
      return ss.str();
  }

- std::string common_chat_format_example(const struct llama_model * model,
-         const std::string & tmpl) {
+ std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) {
      std::vector<common_chat_msg> msgs = {
-         {"system", "You are a helpful assistant"},
-         {"user", "Hello"},
-         {"assistant", "Hi there"},
-         {"user", "How are you?"},
+         {"system", "You are a helpful assistant", {}},
+         {"user", "Hello", {}},
+         {"assistant", "Hi there", {}},
+         {"user", "How are you?", {}},
      };
-     return common_chat_apply_template(model, tmpl, msgs, true);
+     return common_chat_apply_template(tmpl, msgs, true, use_jinja);
+ }
+
+ #define CHATML_TEMPLATE_SRC \
+     "{%- for message in messages -%}\n" \
+     "  {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
+     "{%- endfor -%}\n" \
+     "{%- if add_generation_prompt -%}\n" \
+     "  {{- '<|im_start|>assistant\n' -}}\n" \
+     "{%- endif -%}"
+
+ common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
+ {
+     std::string default_template_src;
+     std::string template_tool_use_src;
+
+     bool has_explicit_template = !chat_template_override.empty();
+     if (chat_template_override.empty()) {
+         auto str = llama_model_chat_template(model, /* name */ nullptr);
+         if (str) {
+             default_template_src = str;
+             has_explicit_template = true;
+         }
+         str = llama_model_chat_template(model, /* name */ "tool_use");
+         if (str) {
+             template_tool_use_src = str;
+             has_explicit_template = true;
+         }
+     } else {
+         default_template_src = chat_template_override;
+     }
+     if (default_template_src.empty() || default_template_src == "chatml") {
+         if (!template_tool_use_src.empty()) {
+             default_template_src = template_tool_use_src;
+         } else {
+             default_template_src = CHATML_TEMPLATE_SRC;
+         }
+     }
+     auto vocab = llama_model_get_vocab(model);
+     const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
+         if (token == LLAMA_TOKEN_NULL) {
+             if (default_template_src.find(jinja_variable_name) != std::string::npos
+                 || template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
+                 LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name);
+             }
+             return std::string();
+         } else {
+             return common_token_to_piece(vocab, token, true);
+         }
+     };
+     auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
+     auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+     try {
+         return {
+             has_explicit_template,
+             std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
+             template_tool_use_src.empty()
+                 ? nullptr
+                 : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos),
+         };
+     } catch (const std::exception & e) {
+         LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what());
+         return {
+             has_explicit_template,
+             std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos),
+             nullptr,
+         };
+     }
  }

  //
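A hypothetical sketch of how the pieces above fit together once a model is loaded (`model` and `use_jinja` are assumed inputs; not code from the package):

```cpp
// Resolve the model's chat template(s) and render the canned example chat.
static std::string render_example(const struct llama_model * model, bool use_jinja) {
    // Falls back to the built-in ChatML template when the GGUF ships none.
    common_chat_templates templates = common_chat_templates_from_model(model, /* override */ "");

    // template_default is always set; template_tool_use only if the model provides one.
    const common_chat_template & tmpl = *templates.template_default;

    // use_jinja = true renders through minja, otherwise via llama_chat_apply_template().
    return common_chat_format_example(tmpl, use_jinja);
}
```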
package/cpp/common.h CHANGED
@@ -4,6 +4,7 @@

  #include "llama-cpp.h"

+ #include <set>
  #include <string>
  #include <vector>
  #include <sstream>
@@ -120,6 +121,11 @@ enum common_conversation_mode {
      COMMON_CONVERSATION_MODE_AUTO = 2,
  };

+ struct common_grammar_trigger {
+     std::string word;
+     bool at_start;
+ };
+
  // sampling parameters
  struct common_params_sampling {
      uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -145,6 +151,7 @@ struct common_params_sampling {
      int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
      int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
      int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+     float top_n_sigma = -1.00f; // -1.0 = disabled
      float mirostat_tau = 5.00f; // target entropy
      float mirostat_eta = 0.10f; // learning rate
      bool ignore_eos = false;
@@ -165,7 +172,11 @@ struct common_params_sampling {
      COMMON_SAMPLER_TYPE_TEMPERATURE,
  };

-     std::string grammar; // optional BNF-like grammar to constrain sampling
+     std::string                         grammar; // optional BNF-like grammar to constrain sampling
+     bool                                grammar_lazy = false;
+     std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
+     std::vector<llama_token>            grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
+     std::set<llama_token>               preserved_tokens;

      std::vector<llama_logit_bias> logit_bias; // logit biases to apply

@@ -203,6 +214,11 @@ struct common_params_vocoder {
      bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
  };

+ enum common_reasoning_format {
+     COMMON_REASONING_FORMAT_NONE,
+     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+ };
+
  struct common_params {

      void * progress_callback_user_data = nullptr;
@@ -297,6 +313,7 @@ struct common_params {
      bool kl_divergence = false; // compute KL divergence

      bool usage = false; // print usage
+     bool completion = false; // print source-able completion script
      bool use_color = false; // use color to distinguish generations and inputs
      bool special = false; // enable special token output
      bool interactive = false; // interactive mode
@@ -349,7 +366,9 @@ struct common_params {
      std::string hostname = "127.0.0.1";
      std::string public_path = ""; // NOLINT
      std::string chat_template = ""; // NOLINT
+     bool use_jinja = false; // NOLINT
      bool enable_chat_template = true;
+     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;

      std::vector<std::string> api_keys;

@@ -428,13 +447,13 @@ bool set_process_priority(enum lm_ggml_sched_priority prio);
  //

  #ifdef __GNUC__
- #ifdef __MINGW32__
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+ #    if defined(__MINGW32__) && !defined(__clang__)
+ #        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+ #    else
+ #        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+ #    endif
  #else
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
- #endif
- #else
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+ #    define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
  #endif

  LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
@@ -443,6 +462,10 @@ std::string string_format(const char * fmt, ...);
  std::string string_strip(const std::string & str);
  std::string string_get_sortable_timestamp();

+ std::string string_join(const std::vector<std::string> & values, const std::string & separator);
+ std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
+ std::string string_repeat(const std::string & str, size_t n);
+
  void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

  template<class T>
@@ -534,6 +557,7 @@ struct llama_model * common_load_model_from_hf(
      const std::string & local_path,
      const std::string & hf_token,
      const struct llama_model_params & params);
+
  std::pair<std::string, std::string> common_get_hf_file(
      const std::string & hf_repo_with_tag,
      const std::string & hf_token);
@@ -615,36 +639,57 @@ std::string common_detokenize(
  // Chat template utils
  //

+ struct common_tool_call {
+     std::string name;
+     std::string arguments;
+     std::string id;
+ };
+
  // same with llama_chat_message, but uses std::string
  struct common_chat_msg {
      std::string role;
      std::string content;
+     std::vector<common_tool_call> tool_calls;
+     std::string reasoning_content = "";
  };

- // Get the built-in chat template for the model. Return empty string if not present.
- std::string common_get_builtin_chat_template(const struct llama_model * model);
-
  // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
- bool common_chat_verify_template(const std::string & tmpl);
+ bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
+
+ namespace minja {
+     class chat_template;
+ }
+
+ typedef minja::chat_template common_chat_template;
+
+ struct common_chat_templates {
+     bool has_explicit_template; // Model had builtin template or template overridde was specified.
+     std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
+     std::unique_ptr<common_chat_template> template_tool_use;
+ };

  // CPP wrapper for llama_chat_apply_template
  // If the built-in template is not supported, we default to chatml
  // If the custom "tmpl" is not supported, we throw an error
- std::string common_chat_apply_template(const struct llama_model * model,
-         const std::string & tmpl,
+ std::string common_chat_apply_template(
+         const common_chat_template & tmpl,
          const std::vector<common_chat_msg> & chat,
-         bool add_ass);
+         bool add_ass,
+         bool use_jinja);

  // Format single message, while taking into account the position of that message in chat history
- std::string common_chat_format_single(const struct llama_model * model,
-         const std::string & tmpl,
+ std::string common_chat_format_single(
+         const common_chat_template & tmpl,
          const std::vector<common_chat_msg> & past_msg,
          const common_chat_msg & new_msg,
-         bool add_ass);
+         bool add_ass,
+         bool use_jinja);

  // Returns an example of formatted chat
- std::string common_chat_format_example(const struct llama_model * model,
-         const std::string & tmpl);
+ std::string common_chat_format_example(
+         const common_chat_template & tmpl, bool use_jinja);
+
+ common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);

  //
  // KV cache utils
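An illustrative sketch of how a caller might populate the new lazy-grammar and tool-call fields declared above (placeholder values, not code from the package):

```cpp
// Lazy grammar: only constrain sampling once a trigger word has been produced.
static void configure_lazy_grammar(common_params_sampling & sparams) {
    sparams.grammar      = "root ::= \"yes\" | \"no\"";  // toy grammar; normally produced by common_chat_params_init()
    sparams.grammar_lazy = true;
    sparams.grammar_trigger_words.push_back({ /* word */ "<tool_call>", /* at_start */ false });
}

// A chat message carrying a tool call, of the kind common_chat_parse() fills in.
static common_chat_msg make_tool_call_msg() {
    common_chat_msg msg;
    msg.role = "assistant";
    msg.tool_calls.push_back({ /* name */ "get_weather",
                               /* arguments */ "{\"location\":\"Paris\"}",
                               /* id */ "call_0" });
    return msg;
}
```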
package/cpp/ggml-alloc.c CHANGED
@@ -989,19 +989,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_g
              this_size = LM_GGML_PAD(lm_ggml_backend_buft_get_alloc_size(buft, t), alignment);
          }

-         if (this_size > max_size) {
-             LM_GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
-                     __func__, t->name,
-                     lm_ggml_backend_buft_name(buft),
-                     this_size, max_size);
-             for (size_t i = 0; i < n_buffers; i++) {
-                 lm_ggml_backend_buffer_free(buffers[i]);
-             }
-             free(buffers);
-             return NULL;
-         }
-
-         if ((cur_buf_size + this_size) > max_size) {
+         if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
              // allocate tensors in the current buffer
              if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                  return NULL;
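The effect of this change, reduced to plain sizes (an illustrative sketch, not the ggml code): a tensor larger than max_size no longer aborts allocation up front; it now starts a buffer of its own, and the subsequent backend allocation call is presumably left to succeed or fail on its own.

```cpp
// Toy model of the revised grouping rule in lm_ggml_backend_alloc_ctx_tensors_from_buft.
#include <cstddef>
#include <vector>

static std::vector<std::vector<size_t>> group_into_buffers(const std::vector<size_t> & sizes,
                                                           size_t max_size) {
    std::vector<std::vector<size_t>> buffers(1);
    size_t cur = 0;
    for (size_t s : sizes) {
        // New condition: only flush when the current buffer is non-empty,
        // so an oversized tensor is allowed to occupy a buffer alone.
        if (cur > 0 && cur + s > max_size) {
            buffers.emplace_back();
            cur = 0;
        }
        buffers.back().push_back(s);
        cur += s;
    }
    return buffers;
}
```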
package/cpp/ggml-common.h CHANGED
@@ -473,7 +473,6 @@ LM_GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
      240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
  LM_GGML_TABLE_END()

- //#if __CUDA_ARCH__ >= LM_GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
  LM_GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
      0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
      0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
@@ -508,7 +507,6 @@ LM_GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
      0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
      0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
  LM_GGML_TABLE_END()
- //#endif


  LM_GGML_TABLE_BEGIN(uint64_t, iq2xxs_grid, 256)
package/cpp/ggml-cpu-impl.h CHANGED
@@ -360,21 +360,15 @@ inline static int32x4_t lm_ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t
  #endif

  #if defined(__loongarch_asx)
-
- typedef union {
-     int32_t i;
-     float f;
- } ft_union;
-
  /* float type data load instructions */
- static __m128 __lsx_vreplfr2vr_s(float val) {
-     ft_union fi_tmpval = {.f = val};
-     return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+ static __m128 __lsx_vreplfr2vr_s(const float val) {
+     v4f32 res = {val, val, val, val};
+     return (__m128)res;
  }

- static __m256 __lasx_xvreplfr2vr_s(float val) {
-     ft_union fi_tmpval = {.f = val};
-     return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+ static __m256 __lasx_xvreplfr2vr_s(const float val) {
+     v8f32 res = {val, val, val, val, val, val, val, val};
+     return (__m256)res;
  }
  #endif
