cui-llama.rn 1.4.3 → 1.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. package/README.md +93 -114
  2. package/android/src/main/CMakeLists.txt +5 -0
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +91 -17
  4. package/android/src/main/java/com/rnllama/RNLlama.java +37 -4
  5. package/android/src/main/jni-utils.h +6 -0
  6. package/android/src/main/jni.cpp +289 -31
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  15. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +7 -2
  16. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +7 -2
  17. package/cpp/chat-template.hpp +529 -0
  18. package/cpp/chat.cpp +1779 -0
  19. package/cpp/chat.h +135 -0
  20. package/cpp/common.cpp +2064 -1873
  21. package/cpp/common.h +700 -699
  22. package/cpp/ggml-alloc.c +1039 -1042
  23. package/cpp/ggml-alloc.h +1 -1
  24. package/cpp/ggml-backend-impl.h +255 -255
  25. package/cpp/ggml-backend-reg.cpp +586 -582
  26. package/cpp/ggml-backend.cpp +2004 -2002
  27. package/cpp/ggml-backend.h +354 -354
  28. package/cpp/ggml-common.h +1851 -1853
  29. package/cpp/ggml-cpp.h +39 -39
  30. package/cpp/ggml-cpu-aarch64.cpp +4248 -4247
  31. package/cpp/ggml-cpu-aarch64.h +8 -8
  32. package/cpp/ggml-cpu-impl.h +531 -386
  33. package/cpp/ggml-cpu-quants.c +12527 -10920
  34. package/cpp/ggml-cpu-traits.cpp +36 -36
  35. package/cpp/ggml-cpu-traits.h +38 -38
  36. package/cpp/ggml-cpu.c +15766 -14391
  37. package/cpp/ggml-cpu.cpp +655 -635
  38. package/cpp/ggml-cpu.h +138 -135
  39. package/cpp/ggml-impl.h +567 -567
  40. package/cpp/ggml-metal-impl.h +235 -0
  41. package/cpp/ggml-metal.h +1 -1
  42. package/cpp/ggml-metal.m +5146 -4884
  43. package/cpp/ggml-opt.cpp +854 -854
  44. package/cpp/ggml-opt.h +216 -216
  45. package/cpp/ggml-quants.c +5238 -5238
  46. package/cpp/ggml-threading.h +14 -14
  47. package/cpp/ggml.c +6529 -6514
  48. package/cpp/ggml.h +2198 -2194
  49. package/cpp/gguf.cpp +1329 -1329
  50. package/cpp/gguf.h +202 -202
  51. package/cpp/json-schema-to-grammar.cpp +1024 -1045
  52. package/cpp/json-schema-to-grammar.h +21 -8
  53. package/cpp/json.hpp +24766 -24766
  54. package/cpp/llama-adapter.cpp +347 -347
  55. package/cpp/llama-adapter.h +74 -74
  56. package/cpp/llama-arch.cpp +1513 -1487
  57. package/cpp/llama-arch.h +403 -400
  58. package/cpp/llama-batch.cpp +368 -368
  59. package/cpp/llama-batch.h +88 -88
  60. package/cpp/llama-chat.cpp +588 -578
  61. package/cpp/llama-chat.h +53 -52
  62. package/cpp/llama-context.cpp +1775 -1775
  63. package/cpp/llama-context.h +128 -128
  64. package/cpp/llama-cparams.cpp +1 -1
  65. package/cpp/llama-cparams.h +37 -37
  66. package/cpp/llama-cpp.h +30 -30
  67. package/cpp/llama-grammar.cpp +1219 -1139
  68. package/cpp/llama-grammar.h +173 -143
  69. package/cpp/llama-hparams.cpp +71 -71
  70. package/cpp/llama-hparams.h +139 -139
  71. package/cpp/llama-impl.cpp +167 -167
  72. package/cpp/llama-impl.h +61 -61
  73. package/cpp/llama-kv-cache.cpp +718 -718
  74. package/cpp/llama-kv-cache.h +219 -218
  75. package/cpp/llama-mmap.cpp +600 -590
  76. package/cpp/llama-mmap.h +68 -67
  77. package/cpp/llama-model-loader.cpp +1124 -1124
  78. package/cpp/llama-model-loader.h +167 -167
  79. package/cpp/llama-model.cpp +4087 -3997
  80. package/cpp/llama-model.h +370 -370
  81. package/cpp/llama-sampling.cpp +2558 -2408
  82. package/cpp/llama-sampling.h +32 -32
  83. package/cpp/llama-vocab.cpp +3264 -3247
  84. package/cpp/llama-vocab.h +125 -125
  85. package/cpp/llama.cpp +10284 -10077
  86. package/cpp/llama.h +1354 -1323
  87. package/cpp/log.cpp +393 -401
  88. package/cpp/log.h +132 -121
  89. package/cpp/minja/chat-template.hpp +529 -0
  90. package/cpp/minja/minja.hpp +2915 -0
  91. package/cpp/minja.hpp +2915 -0
  92. package/cpp/rn-llama.cpp +66 -6
  93. package/cpp/rn-llama.h +26 -1
  94. package/cpp/sampling.cpp +570 -505
  95. package/cpp/sampling.h +3 -0
  96. package/cpp/sgemm.cpp +2598 -2597
  97. package/cpp/sgemm.h +14 -14
  98. package/cpp/speculative.cpp +278 -277
  99. package/cpp/speculative.h +28 -28
  100. package/cpp/unicode.cpp +9 -2
  101. package/ios/CMakeLists.txt +6 -0
  102. package/ios/RNLlama.h +0 -8
  103. package/ios/RNLlama.mm +27 -3
  104. package/ios/RNLlamaContext.h +10 -1
  105. package/ios/RNLlamaContext.mm +269 -57
  106. package/jest/mock.js +21 -2
  107. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  108. package/lib/commonjs/grammar.js +3 -0
  109. package/lib/commonjs/grammar.js.map +1 -1
  110. package/lib/commonjs/index.js +87 -13
  111. package/lib/commonjs/index.js.map +1 -1
  112. package/lib/module/NativeRNLlama.js.map +1 -1
  113. package/lib/module/grammar.js +3 -0
  114. package/lib/module/grammar.js.map +1 -1
  115. package/lib/module/index.js +86 -13
  116. package/lib/module/index.js.map +1 -1
  117. package/lib/typescript/NativeRNLlama.d.ts +107 -2
  118. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  119. package/lib/typescript/grammar.d.ts.map +1 -1
  120. package/lib/typescript/index.d.ts +32 -7
  121. package/lib/typescript/index.d.ts.map +1 -1
  122. package/llama-rn.podspec +1 -1
  123. package/package.json +3 -2
  124. package/src/NativeRNLlama.ts +115 -3
  125. package/src/grammar.ts +3 -0
  126. package/src/index.ts +138 -21
  127. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +0 -81
  128. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +0 -15
  129. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +0 -904
  130. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  131. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +0 -919
  132. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  133. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +0 -55
  134. package/cpp/rn-llama.hpp +0 -913
package/cpp/llama-chat.cpp
@@ -5,5 +5,6 @@
 #include <map>
 #include <sstream>
+#include <algorithm>
 
 #if __cplusplus >= 202000L
 #define LU8(x) (const char*)(u8##x)
@@ -51,6 +52,7 @@
     { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
     { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
     { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
+    { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
     { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
     { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
     { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
@@ -115,7 +117,7 @@
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
-        return LLM_CHAT_TEMPLATE_FALCON_3;
+        return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -440,6 +442,14 @@
         if (add_ass) {
             ss << "<|assistant|>";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>" << "\n" << message->content;
+        }
+        if (add_ass) {
+            ss << "<|assistant|>";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         for (auto message : chat) {
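For orientation, here is a minimal, illustrative sketch (not part of the package) of how the helpers touched in this file are typically driven: `llm_chat_detect_template` maps a template name or Jinja source to an `llm_chat_template` id, and `llm_chat_apply_template` renders the prompt string from a list of messages. Only the function and struct signatures come from the sources above; the driver program, the message contents, and the choice of the new "glmedge" alias are assumptions made for the example.

```cpp
// Illustrative driver only; assumes the package's internal "llama-chat.h"
// and "llama.h" headers are on the include path.
#include "llama-chat.h"
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

int main() {
    // llama_chat_message (from llama.h) holds plain C strings for role/content.
    llama_chat_message msgs[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };
    std::vector<const llama_chat_message *> chat = { &msgs[0], &msgs[1] };

    // "glmedge" is the alias added in this release; strings that are not a
    // known alias fall back to the heuristic detection shown in the hunks above.
    llm_chat_template tmpl = llm_chat_detect_template("glmedge");

    std::string prompt;
    const int32_t res = llm_chat_apply_template(tmpl, chat, prompt, /*add_ass=*/true);
    if (res < 0) {
        std::fprintf(stderr, "template not supported\n");
        return 1;
    }
    std::printf("%s\n", prompt.c_str());
    return 0;
}
```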