cui-llama.rn 1.6.1 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. package/android/src/main/CMakeLists.txt +6 -0
  2. package/android/src/main/java/com/rnllama/LlamaContext.java +38 -5
  3. package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
  4. package/android/src/main/jni.cpp +153 -14
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  13. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
  14. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
  15. package/cpp/chat.cpp +128 -106
  16. package/cpp/chat.h +2 -0
  17. package/cpp/common.cpp +41 -76
  18. package/cpp/common.h +23 -19
  19. package/cpp/ggml-backend.cpp +9 -5
  20. package/cpp/ggml-backend.h +4 -4
  21. package/cpp/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  22. package/cpp/ggml-cpu/ggml-cpu-quants.c +306 -6
  23. package/cpp/ggml-cpu/ggml-cpu.c +5 -13
  24. package/cpp/ggml-cpu/ggml-cpu.cpp +29 -16
  25. package/cpp/ggml-cpu/ops.cpp +107 -13
  26. package/cpp/ggml-cpu/vec.cpp +0 -6
  27. package/cpp/ggml-cpu/vec.h +16 -0
  28. package/cpp/ggml-llama-sim.metallib +0 -0
  29. package/cpp/ggml-llama.metallib +0 -0
  30. package/cpp/ggml-metal-impl.h +36 -11
  31. package/cpp/ggml-metal.m +321 -132
  32. package/cpp/ggml-opt.cpp +373 -190
  33. package/cpp/ggml-opt.h +49 -28
  34. package/cpp/ggml-quants.c +0 -6
  35. package/cpp/ggml.c +93 -38
  36. package/cpp/ggml.h +21 -7
  37. package/cpp/gguf.cpp +33 -33
  38. package/cpp/llama-adapter.cpp +6 -0
  39. package/cpp/llama-arch.cpp +3 -0
  40. package/cpp/llama-batch.cpp +3 -1
  41. package/cpp/llama-chat.cpp +8 -6
  42. package/cpp/llama-chat.h +1 -0
  43. package/cpp/llama-context.cpp +349 -135
  44. package/cpp/llama-context.h +30 -3
  45. package/cpp/llama-cparams.h +1 -0
  46. package/cpp/llama-graph.cpp +150 -234
  47. package/cpp/llama-graph.h +52 -7
  48. package/cpp/llama-hparams.cpp +17 -1
  49. package/cpp/llama-hparams.h +34 -5
  50. package/cpp/llama-kv-cache.cpp +662 -321
  51. package/cpp/llama-kv-cache.h +203 -93
  52. package/cpp/llama-memory.h +3 -2
  53. package/cpp/llama-model-loader.cpp +24 -15
  54. package/cpp/llama-model-saver.cpp +281 -0
  55. package/cpp/llama-model-saver.h +37 -0
  56. package/cpp/llama-model.cpp +536 -132
  57. package/cpp/llama-model.h +7 -1
  58. package/cpp/llama-sampling.cpp +18 -6
  59. package/cpp/llama-vocab.cpp +46 -8
  60. package/cpp/llama-vocab.h +6 -0
  61. package/cpp/llama.cpp +14 -0
  62. package/cpp/llama.h +72 -131
  63. package/cpp/minja/chat-template.hpp +9 -5
  64. package/cpp/minja/minja.hpp +69 -36
  65. package/cpp/rn-llama.cpp +611 -47
  66. package/cpp/rn-llama.h +33 -3
  67. package/cpp/sampling.cpp +57 -50
  68. package/cpp/tools/mtmd/clip-impl.h +462 -0
  69. package/cpp/tools/mtmd/clip.cpp +4024 -0
  70. package/cpp/tools/mtmd/clip.h +101 -0
  71. package/cpp/tools/mtmd/miniaudio.h +93468 -0
  72. package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  73. package/cpp/tools/mtmd/mtmd-audio.h +62 -0
  74. package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
  75. package/cpp/tools/mtmd/mtmd.cpp +942 -0
  76. package/cpp/tools/mtmd/mtmd.h +362 -0
  77. package/cpp/tools/mtmd/stb_image.h +7988 -0
  78. package/ios/CMakeLists.txt +7 -0
  79. package/ios/RNLlama.mm +77 -3
  80. package/ios/RNLlamaContext.h +5 -1
  81. package/ios/RNLlamaContext.mm +105 -10
  82. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
  83. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +23 -19
  84. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  85. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  86. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  87. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +21 -7
  88. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  89. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +30 -3
  90. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  91. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +52 -7
  92. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +34 -5
  93. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +203 -93
  94. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +3 -2
  95. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  96. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +7 -1
  97. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  98. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +72 -131
  99. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  100. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  101. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +33 -3
  102. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  103. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  104. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  105. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  106. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +23 -19
  107. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  108. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  109. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  110. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +21 -7
  111. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  112. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +30 -3
  113. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  114. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +52 -7
  115. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +34 -5
  116. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +203 -93
  117. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +3 -2
  118. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  119. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +7 -1
  120. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  121. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +72 -131
  122. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  123. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  124. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +33 -3
  125. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  126. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  127. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  128. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  129. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
  130. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +23 -19
  131. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  132. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  133. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  134. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +21 -7
  135. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  136. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +30 -3
  137. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  138. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +52 -7
  139. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +34 -5
  140. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +203 -93
  141. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +3 -2
  142. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  143. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +7 -1
  144. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  145. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +72 -131
  146. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  147. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  148. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +33 -3
  149. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  150. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  151. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  152. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  153. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +23 -19
  154. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  155. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  156. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  157. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +21 -7
  158. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  159. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +30 -3
  160. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  161. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +52 -7
  162. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +34 -5
  163. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +203 -93
  164. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +3 -2
  165. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  166. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +7 -1
  167. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  168. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +72 -131
  169. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  170. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  171. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +33 -3
  172. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  173. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  174. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  175. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  176. package/jest/mock.js +33 -7
  177. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  178. package/lib/commonjs/index.js +153 -21
  179. package/lib/commonjs/index.js.map +1 -1
  180. package/lib/module/NativeRNLlama.js.map +1 -1
  181. package/lib/module/index.js +152 -20
  182. package/lib/module/index.js.map +1 -1
  183. package/lib/typescript/NativeRNLlama.d.ts +50 -4
  184. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  185. package/lib/typescript/index.d.ts +72 -6
  186. package/lib/typescript/index.d.ts.map +1 -1
  187. package/package.json +1 -1
  188. package/src/NativeRNLlama.ts +67 -4
  189. package/src/index.ts +212 -38
  190. package/lib/commonjs/chat.js +0 -37
  191. package/lib/commonjs/chat.js.map +0 -1
  192. package/lib/module/chat.js +0 -33
  193. package/lib/module/chat.js.map +0 -1
  194. package/lib/typescript/chat.d.ts +0 -10
  195. package/lib/typescript/chat.d.ts.map +0 -1
  196. package/src/chat.ts +0 -44
package/cpp/chat.cpp CHANGED
@@ -4,6 +4,15 @@
 
 #include <optional>
 
+static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
+    auto time = std::chrono::system_clock::to_time_t(now);
+    auto local_time = *std::localtime(&time);
+    std::ostringstream ss;
+    ss << std::put_time(&local_time, format.c_str());
+    auto res = ss.str();
+    return res;
+}
+
 struct templates_params {
     json messages;
     json tools;
@@ -14,6 +23,7 @@ struct templates_params {
     std::string grammar;
     bool add_generation_prompt = true;
     bool extract_reasoning = true;
+    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -115,7 +125,9 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
             msgs.push_back(msg);
         }
     } catch (const std::exception & e) {
-        throw std::runtime_error("Failed to parse messages: " + std::string(e.what()) + "; messages = " + messages.dump(2));
+        // @ngxson : disable otherwise it's bloating the API response
+        // printf("%s\n", std::string("; messages = ") + messages.dump(2));
+        throw std::runtime_error("Failed to parse messages: " + std::string(e.what()));
     }
 
     return msgs;
@@ -927,78 +939,83 @@ static void expect_tool_parameters(const std::string & name, const json & parame
         }
     }
 
-static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const common_chat_template & tmpl, const struct templates_params & inputs, bool allow_python_tag_builtin_tools) {
+static common_chat_params common_chat_params_init_llama_3_x(const common_chat_template & tmpl, const struct templates_params & inputs, bool allow_python_tag_builtin_tools) {
     auto builtin_tools = json::array();
     common_chat_params data;
-    data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
-        std::vector<std::string> tool_rules;
+    if (!inputs.tools.is_null()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
 
-        auto handle_builtin_tool = [&](const std::string & name, const json & parameters) {
-            if (name == "wolfram_alpha" || name == "web_search" || name == "brave_search") {
-                // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
-                // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
-                expect_tool_parameters(name, parameters, {"query"});
-            } else if (name == "python" || name == "code_interpreter") {
-                // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
-                expect_tool_parameters(name, parameters, {"code"});
-            } else {
-                return false;
-            }
+            auto handle_builtin_tool = [&](const std::string & name, const json & parameters) {
+                if (name == "wolfram_alpha" || name == "web_search" || name == "brave_search") {
+                    // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
+                    // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
+                    expect_tool_parameters(name, parameters, {"query"});
+                } else if (name == "python" || name == "code_interpreter") {
+                    // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
+                    expect_tool_parameters(name, parameters, {"code"});
+                } else {
+                    return false;
+                }
 
-            std::vector<std::string> kvs;
-            for (const auto & [key, value] : parameters.at("properties").items()) {
-                kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value)); // NOLINT
-            }
+                std::vector<std::string> kvs;
+                for (const auto & [key, value] : parameters.at("properties").items()) {
+                    kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value)); // NOLINT
+                }
 
-            tool_rules.push_back(
-                builder.add_rule(
-                    name + "-call",
-                    "\"<|python_tag|>" + name + ".call(\" " + string_join(kvs, " \", \" ") + " \")\""));
-            builtin_tools.push_back(name);
+                tool_rules.push_back(
+                    builder.add_rule(
+                        name + "-call",
+                        "\"<|python_tag|>" + name + ".call(\" " + string_join(kvs, " \", \" ") + " \")\""));
+                builtin_tools.push_back(name);
 
-            return true;
-        };
+                return true;
+            };
 
-        foreach_function(inputs.tools, [&](const json & tool) {
-            const auto & function = tool.at("function");
-            std::string name = function.at("name");
-            auto parameters = function.at("parameters");
-            builder.resolve_refs(parameters);
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
 
-            // https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime
-            if (allow_python_tag_builtin_tools) {
-                handle_builtin_tool(name, parameters);
+                // https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime
+                if (allow_python_tag_builtin_tools) {
+                    handle_builtin_tool(name, parameters);
+                }
+                tool_rules.push_back(
+                    builder.add_rule(
+                        name + "-call",
+                        "\"{\" space "
+                        "( \"\\\"type\\\"\" space \":\" space \"\\\"function\\\"\" space \",\" space )? "
+                        " \"\\\"name\\\"\" space \":\" space \"\\\"" + name + "\\\"\" space \",\" space "
+                        " \"\\\"parameters\\\"\" space \":\" space " + builder.add_schema(name + "-args", parameters) + " "
+                        "\"}\" space"));
+            });
+            // Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name.
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
+                "\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*",
+            });
+            if (!builtin_tools.empty()) {
+                data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
+                data.preserved_tokens.push_back("<|python_tag|>");
             }
-            tool_rules.push_back(
-                builder.add_rule(
-                    name + "-call",
-                    "\"{\" space "
-                    "( \"\\\"type\\\"\" space \":\" space \"\\\"function\\\"\" space \",\" space )? "
-                    " \"\\\"name\\\"\" space \":\" space \"\\\"" + name + "\\\"\" space \",\" space "
-                    " \"\\\"parameters\\\"\" space \":\" space " + builder.add_schema(name + "-args", parameters) + " "
-                    "\"}\" space"));
+            // Allow a few empty lines on top of the usual constrained json schema space rule.
+            builder.add_rule("root", string_join(tool_rules, " | "));
+            data.additional_stops.push_back("<|eom_id|>");
         });
-        // Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name.
-        data.grammar_triggers.push_back({
-            COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
-            "\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*",
-        });
-        if (!builtin_tools.empty()) {
-            data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
-            data.preserved_tokens.push_back("<|python_tag|>");
-        }
-        // Allow a few empty lines on top of the usual constrained json schema space rule.
-        builder.add_rule("root", string_join(tool_rules, " | "));
-    });
-    data.additional_stops.push_back("<|eom_id|>");
+        data.format = allow_python_tag_builtin_tools && !builtin_tools.empty()
+            ? COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS
+            : COMMON_CHAT_FORMAT_LLAMA_3_X;
+    } else {
+        data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    }
     data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
+        {"date_string", format_time(inputs.now, "%d %b %Y")},
         {"tools_in_user_message", false},
         {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
     });
-    data.format = allow_python_tag_builtin_tools && !builtin_tools.empty()
-        ? COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS
-        : COMMON_CHAT_FORMAT_LLAMA_3_X;
     return data;
 }
 static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
@@ -1138,7 +1155,7 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
     data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
-        {"datetime", "Jan 29 2025 13:00:00 GMT"},
+        {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
         {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
     });
     if (inputs.tools.is_array() && !inputs.tools.empty()) {
@@ -1273,55 +1290,59 @@ static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & in
 static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
     // https://github.com/MeetKai/functionary/blob/main/tests/prompt_test_v3-llama3.1.txt
     common_chat_params data;
-    json tools = inputs.tools.is_null() ? inputs.tools : json::array();
-    std::string python_code_argument_name;
-    auto has_raw_python = false;
 
-    data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
-        std::vector<std::string> tool_rules;
-        foreach_function(inputs.tools, [&](const json & tool) {
-            const auto & function = tool.at("function");
-            const auto & parameters = function.at("parameters");
-            std::string name = function.at("name");
-            if (name == "python" || name == "ipython") {
-                if (!parameters.contains("type")) {
-                    throw std::runtime_error("Missing type in python tool");
-                }
-                has_raw_python = true;
-                const auto & type = parameters.at("type");
-                if (type == "object") {
-                    auto properties = parameters.at("properties");
-                    for (auto it = properties.begin(); it != properties.end(); ++it) {
-                        if (it.value().at("type") == "string") {
-                            if (!python_code_argument_name.empty()) {
-                                throw std::runtime_error("Multiple string arguments found in python tool");
+    if (!inputs.tools.is_null()) {
+        std::string python_code_argument_name;
+        auto has_raw_python = false;
+
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                const auto & parameters = function.at("parameters");
+                std::string name = function.at("name");
+                if (name == "python" || name == "ipython") {
+                    if (!parameters.contains("type")) {
+                        throw std::runtime_error("Missing type in python tool");
+                    }
+                    has_raw_python = true;
+                    const auto & type = parameters.at("type");
+                    if (type == "object") {
+                        auto properties = parameters.at("properties");
+                        for (auto it = properties.begin(); it != properties.end(); ++it) {
+                            if (it.value().at("type") == "string") {
+                                if (!python_code_argument_name.empty()) {
+                                    throw std::runtime_error("Multiple string arguments found in python tool");
+                                }
+                                python_code_argument_name = it.key();
                             }
-                            python_code_argument_name = it.key();
                         }
+                        if (python_code_argument_name.empty()) {
+                            throw std::runtime_error("No string argument found in python tool");
+                        }
+                    } else if (type != "string") {
+                        throw std::runtime_error("Invalid type in python tool: " + type.dump());
                     }
-                    if (python_code_argument_name.empty()) {
-                        throw std::runtime_error("No string argument found in python tool");
-                    }
-                } else if (type != "string") {
-                    throw std::runtime_error("Invalid type in python tool: " + type.dump());
                 }
+                tool_rules.push_back(builder.add_rule(name + "-call", "\"<function=" + name + ">\" " + builder.add_schema(name + "-args", parameters) + " \"</function>\" space"));
+            });
+            if (has_raw_python) {
+                tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
+                data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
+                data.preserved_tokens.push_back("<|python_tag|>");
            }
-            tool_rules.push_back(builder.add_rule(name + "-call", "\"<function=" + name + ">\" " + builder.add_schema(name + "-args", parameters) + " \"</function>\" space"));
+            auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
+            builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
+            data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function="});
         });
-        if (has_raw_python) {
-            tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
-            data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
-            data.preserved_tokens.push_back("<|python_tag|>");
-        }
-        auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
-        builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
-        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function="});
-    });
+        data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1;
+    } else {
+        data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    }
 
     data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
     // TODO: if (has_raw_python)
-    data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1;
     return data;
 }
 static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
@@ -1581,6 +1602,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.extract_reasoning = inputs.extract_reasoning;
     params.tool_choice = inputs.tool_choice;
     params.grammar = inputs.grammar;
+    params.now = inputs.now;
     if (!inputs.json_schema.empty()) {
         params.json_schema = json::parse(inputs.json_schema);
     }
@@ -1632,21 +1654,21 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_firefunction_v2(tmpl, params);
     }
 
-    // Plain handler (no tools)
-    if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
-        return common_chat_params_init_without_tools(tmpl, params);
-    }
-
     // Functionary v3.1 (w/ tools)
     if (src.find("<|start_header_id|>") != std::string::npos
         && src.find("<function=") != std::string::npos) {
         return common_chat_params_init_functionary_v3_1_llama_3_1(tmpl, params);
     }
 
-    // Llama 3.1, 3.2, 3.3 (w/ tools)
+    // Llama 3.1, 3.2, 3.3 (also requires date_string so using it even w/o tools)
    if (src.find("<|start_header_id|>ipython<|end_header_id|>") != std::string::npos) {
         auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos;
-        return common_chat_params_init_llama_3_1_tool_calls(tmpl, params, allow_python_tag_builtin_tools);
+        return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
+    }
+
+    // Plain handler (no tools)
+    if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
+        return common_chat_params_init_without_tools(tmpl, params);
     }
 
     // Mistral Nemo (w/ tools)
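The format_time() helper added above is a thin std::put_time wrapper; the two call sites use it to render the Llama 3.x date_string ("%d %b %Y") and the FireFunction v2 datetime ("%b %d %Y %H:%M:%S GMT"). A minimal standalone sketch of the same formatting (equivalent logic only; format_time itself is static to chat.cpp and not exported):

    #include <chrono>
    #include <ctime>
    #include <iomanip>
    #include <iostream>
    #include <sstream>

    int main() {
        // Same approach as format_time(): system_clock -> time_t -> std::put_time.
        auto now        = std::chrono::system_clock::now();
        auto time       = std::chrono::system_clock::to_time_t(now);
        auto local_time = *std::localtime(&time);

        std::ostringstream date_string; // Llama 3.x style, e.g. "01 Jan 2025"
        date_string << std::put_time(&local_time, "%d %b %Y");

        std::ostringstream datetime;    // FireFunction v2 style, e.g. "Jan 01 2025 13:00:00 GMT"
        datetime << std::put_time(&local_time, "%b %d %Y %H:%M:%S GMT");

        std::cout << date_string.str() << "\n" << datetime.str() << "\n";
        return 0;
    }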
package/cpp/chat.h CHANGED
@@ -3,6 +3,7 @@
 #pragma once
 
 #include "common.h"
+#include <chrono>
 #include <string>
 #include <vector>
 #include "minja/chat-template.hpp"
@@ -79,6 +80,7 @@ struct common_chat_templates_inputs {
     common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
     bool parallel_tool_calls = false;
     bool extract_reasoning = true;
+    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 };
 
 struct common_chat_params {
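The new `now` field on common_chat_templates_inputs lets callers pin the timestamp that flows into template variables such as date_string, which is useful for reproducible prompts in tests. A minimal sketch, assuming only the struct and field shown in the diff above:

    #include <chrono>
    #include "chat.h"

    int main() {
        common_chat_templates_inputs inputs;
        // Defaults to std::chrono::system_clock::now(); override it to pin the date
        // used when the chat template is applied (arbitrary example: 24 hours ago).
        inputs.now = std::chrono::system_clock::now() - std::chrono::hours(24);
        return 0;
    }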
package/cpp/common.cpp CHANGED
@@ -450,6 +450,25 @@ void string_replace_all(std::string & s, const std::string & search, const std::
     s = std::move(builder);
 }
 
+bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
+    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+}
+size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
+    if (!str.empty() && !stop.empty()) {
+        const char text_last_char = str.back();
+        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
+            if (stop[char_index] == text_last_char) {
+                const auto current_partial = stop.substr(0, char_index + 1);
+                if (string_ends_with(str, current_partial)) {
+                    return str.size() - char_index - 1;
+                }
+            }
+        }
+    }
+
+    return std::string::npos;
+}
+
 std::string regex_escape(const std::string & s) {
     static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
     return std::regex_replace(s, special_chars, "\\$0");
@@ -1093,6 +1112,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
     }
 
+    mparams.progress_callback = params.load_progress_callback;
+    mparams.progress_callback_user_data = params.load_progress_callback_user_data;
+
     return mparams;
 }
 
@@ -1106,7 +1128,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_threads = params.cpuparams.n_threads;
     cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
                               params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
-    cparams.logits_all = params.logits_all;
     cparams.embeddings = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base = params.rope_freq_base;
@@ -1124,6 +1145,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
+    cparams.op_offload = !params.no_op_offload;
+    cparams.swa_full = params.swa_full;
 
     if (params.reranking) {
         cparams.embeddings = true;
@@ -1316,81 +1339,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
     return text;
 }
 
-//
-// KV cache utils
-//
-
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        int seq_count = 0;
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) { seq_count++; }
-        }
-        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    std::unordered_map<llama_seq_id, size_t> seqs;
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] < 0) { continue; }
-            if (seqs.find(cs_curr[j]) == seqs.end()) {
-                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                const size_t sz = seqs.size();
-                seqs[cs_curr[j]] = sz;
-            }
-        }
-        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-    }
-
-    printf("=== Sequence legend: ");
-    for (const auto & it : seqs) {
-        printf("%zu=%d, ", it.second, it.first);
-    }
-    printf("'+'=other sequence ids");
-
-    c_curr = view.cells;
-    cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) {
-                const auto & it = seqs.find(cs_curr[j]);
-                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
-            } else {
-                putchar('.');
-            }
-        }
-        putchar(' ');
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
 //
 // Embedding utils
 //
@@ -1575,3 +1523,20 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 
     return result;
 }
+
+lm_ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
+    const int64_t ne_datapoint = llama_n_ctx(ctx);
+    const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride;
+    lm_ggml_opt_dataset_t result = lm_ggml_opt_dataset_init(
+        LM_GGML_TYPE_I32, LM_GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
+
+    llama_token * data = (llama_token *) lm_ggml_opt_dataset_data(result)->data;
+    llama_token * labels = (llama_token *) lm_ggml_opt_dataset_labels(result)->data;
+
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
+        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
+    }
+
+    return result;
+}
package/cpp/common.h CHANGED
@@ -6,6 +6,7 @@
 
 #include <set>
 #include <string>
+#include <string_view>
 #include <vector>
 #include <sstream>
 
@@ -77,7 +78,6 @@ enum llama_example {
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
     LLAMA_EXAMPLE_MAIN,
-    LLAMA_EXAMPLE_INFILL,
     LLAMA_EXAMPLE_EMBEDDING,
     LLAMA_EXAMPLE_PERPLEXITY,
     LLAMA_EXAMPLE_RETRIEVAL,
@@ -87,7 +87,7 @@ enum llama_example {
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
@@ -107,6 +107,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_XTC = 8,
     COMMON_SAMPLER_TYPE_INFILL = 9,
     COMMON_SAMPLER_TYPE_PENALTIES = 10,
+    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -172,6 +173,7 @@ struct common_params_sampling {
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
         COMMON_SAMPLER_TYPE_TOP_P,
@@ -336,17 +338,17 @@ struct common_params {
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = true; // context shift on inifinite text generation
+    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
+    bool no_op_offload = false; // globally disable offload host tensor operations to device
 
     bool single_turn = false; // single turn chat conversation
 
@@ -355,7 +357,7 @@
 
     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
 
-    // multimodal models (see tools/llava)
+    // multimodal models (see tools/mtmd)
     struct common_params_model mmproj;
     bool mmproj_use_gpu = true; // use GPU for multimodal model
     bool no_mmproj = false; // explicitly disable multimodal model
@@ -381,6 +383,7 @@ struct common_params {
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
     std::vector<std::string> api_keys;
 
@@ -424,6 +427,7 @@
 
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity
+    bool parse_special = false; // whether to parse special tokens during imatrix tokenization
 
     // cvector-generator params
     int n_pca_batch = 100;
@@ -439,6 +443,11 @@
 
     // common params
     std::string out_file; // output filename for all example programs
+    // optional callback for model loading progress and cancellation:
+    // called with a progress value between 0.0 and 1.0.
+    // return false from callback to abort model loading or true to continue
+    llama_progress_callback load_progress_callback = NULL;
+    void * load_progress_callback_user_data = NULL;
 };
 
 // call once at the start of a program if it uses libcommon
@@ -516,10 +525,9 @@ static bool string_starts_with(const std::string & str,
     return str.rfind(prefix, 0) == 0;
 }
 
-static bool string_ends_with(const std::string & str,
-                             const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
-    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
-}
+// While we wait for C++20's std::string::ends_with...
+bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
+size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
 
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
@@ -628,16 +636,6 @@ std::string common_detokenize(
     const std::vector<llama_token> & tokens,
     bool special = true);
 
-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
 //
 // Embedding utils
 //
@@ -679,3 +677,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 }
+
+//
+// training utils
+//
+
+lm_ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
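The new load_progress_callback fields give embedders a hook into model loading. A minimal sketch of wiring them up, assuming the llama_progress_callback signature bool(float, void *) from llama.h (consistent with the comment above: progress in 0.0..1.0, return false to abort); the forwarding into llama_model_params is what the common.cpp hunk shows:

    #include <cstdio>
    #include "common.h" // common_params with the new load_progress_callback fields

    // Called repeatedly during model loading; returning false aborts the load.
    static bool on_load_progress(float progress, void * /*user_data*/) {
        fprintf(stderr, "\rloading model: %3.0f%%", progress * 100.0f);
        return true;
    }

    int main() {
        common_params params;
        params.load_progress_callback           = on_load_progress;
        params.load_progress_callback_user_data = nullptr;

        // common_model_params_to_llama() copies these into
        // llama_model_params::progress_callback / progress_callback_user_data.
        llama_model_params mparams = common_model_params_to_llama(params);
        (void) mparams;
        return 0;
    }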