cui-llama.rn 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/android/src/main/jni.cpp +9 -9
  2. package/cpp/common.cpp +163 -60
  3. package/cpp/common.h +43 -12
  4. package/cpp/ggml-alloc.c +1042 -1037
  5. package/cpp/ggml-backend-impl.h +255 -256
  6. package/cpp/ggml-backend-reg.cpp +582 -582
  7. package/cpp/ggml-backend.cpp +2002 -2002
  8. package/cpp/ggml-backend.h +354 -352
  9. package/cpp/ggml-common.h +1853 -1853
  10. package/cpp/ggml-cpp.h +39 -39
  11. package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
  12. package/cpp/ggml-cpu-aarch64.h +8 -8
  13. package/cpp/ggml-cpu-impl.h +386 -386
  14. package/cpp/ggml-cpu-quants.c +10920 -10839
  15. package/cpp/ggml-cpu-traits.cpp +36 -36
  16. package/cpp/ggml-cpu-traits.h +38 -38
  17. package/cpp/ggml-cpu.c +329 -60
  18. package/cpp/ggml-cpu.cpp +10 -2
  19. package/cpp/ggml-cpu.h +135 -135
  20. package/cpp/ggml-impl.h +567 -567
  21. package/cpp/ggml-metal-impl.h +17 -17
  22. package/cpp/ggml-metal.m +4884 -4884
  23. package/cpp/ggml-quants.c +5238 -5238
  24. package/cpp/ggml-threading.h +14 -14
  25. package/cpp/ggml.c +6514 -6448
  26. package/cpp/ggml.h +2194 -2163
  27. package/cpp/gguf.cpp +1329 -1325
  28. package/cpp/gguf.h +202 -202
  29. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  30. package/cpp/json-schema-to-grammar.h +8 -8
  31. package/cpp/json.hpp +24766 -24766
  32. package/cpp/llama-adapter.cpp +347 -346
  33. package/cpp/llama-adapter.h +74 -73
  34. package/cpp/llama-arch.cpp +1487 -1434
  35. package/cpp/llama-arch.h +400 -395
  36. package/cpp/llama-batch.cpp +368 -368
  37. package/cpp/llama-batch.h +88 -88
  38. package/cpp/llama-chat.cpp +578 -567
  39. package/cpp/llama-chat.h +52 -51
  40. package/cpp/llama-context.cpp +1775 -1771
  41. package/cpp/llama-context.h +128 -128
  42. package/cpp/llama-cparams.cpp +1 -1
  43. package/cpp/llama-cparams.h +37 -37
  44. package/cpp/llama-cpp.h +30 -30
  45. package/cpp/llama-grammar.cpp +1139 -1139
  46. package/cpp/llama-grammar.h +143 -143
  47. package/cpp/llama-hparams.cpp +71 -71
  48. package/cpp/llama-hparams.h +139 -140
  49. package/cpp/llama-impl.cpp +167 -167
  50. package/cpp/llama-impl.h +61 -61
  51. package/cpp/llama-kv-cache.cpp +718 -718
  52. package/cpp/llama-kv-cache.h +218 -218
  53. package/cpp/llama-mmap.cpp +2 -1
  54. package/cpp/llama-mmap.h +67 -67
  55. package/cpp/llama-model-loader.cpp +1124 -1011
  56. package/cpp/llama-model-loader.h +167 -158
  57. package/cpp/llama-model.cpp +3997 -2202
  58. package/cpp/llama-model.h +370 -391
  59. package/cpp/llama-sampling.cpp +2408 -2406
  60. package/cpp/llama-sampling.h +32 -48
  61. package/cpp/llama-vocab.cpp +3247 -1982
  62. package/cpp/llama-vocab.h +125 -182
  63. package/cpp/llama.cpp +416 -2886
  64. package/cpp/llama.h +1323 -1285
  65. package/cpp/log.cpp +401 -401
  66. package/cpp/log.h +121 -121
  67. package/cpp/rn-llama.hpp +18 -12
  68. package/cpp/sampling.cpp +505 -500
  69. package/cpp/sgemm.cpp +2597 -2597
  70. package/cpp/speculative.cpp +277 -274
  71. package/cpp/speculative.h +28 -28
  72. package/cpp/unicode.cpp +2 -3
  73. package/package.json +1 -1
@@ -1,567 +1,578 @@
1
- #include "llama-chat.h"
2
-
3
- #include "llama.h"
4
-
5
- #include <map>
6
- #include <sstream>
7
-
8
- #if __cplusplus >= 202000L
9
- #define LU8(x) (const char*)(u8##x)
10
- #else
11
- #define LU8(x) u8##x
12
- #endif
13
-
14
- // trim whitespace from the beginning and end of a string
15
- static std::string trim(const std::string & str) {
16
- size_t start = 0;
17
- size_t end = str.size();
18
- while (start < end && isspace(str[start])) {
19
- start += 1;
20
- }
21
- while (end > start && isspace(str[end - 1])) {
22
- end -= 1;
23
- }
24
- return str.substr(start, end - start);
25
- }
26
-
27
- static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
28
- { "chatml", LLM_CHAT_TEMPLATE_CHATML },
29
- { "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 },
30
- { "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS },
31
- { "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS },
32
- { "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
33
- { "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 },
34
- { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
35
- { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
36
- { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
37
- { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
38
- { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
39
- { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
40
- { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
41
- { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
42
- { "orion", LLM_CHAT_TEMPLATE_ORION },
43
- { "openchat", LLM_CHAT_TEMPLATE_OPENCHAT },
44
- { "vicuna", LLM_CHAT_TEMPLATE_VICUNA },
45
- { "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
46
- { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
47
- { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
48
- { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
49
- { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
50
- { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
51
- { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
52
- { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
53
- { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
54
- { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
55
- { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
56
- { "granite", LLM_CHAT_TEMPLATE_GRANITE },
57
- { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
58
- { "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
59
- };
60
-
61
- llm_chat_template llm_chat_template_from_str(const std::string & name) {
62
- return LLM_CHAT_TEMPLATES.at(name);
63
- }
64
-
65
- llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
66
- try {
67
- return llm_chat_template_from_str(tmpl);
68
- } catch (const std::out_of_range &) {
69
- // ignore
70
- }
71
-
72
- auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
73
- return tmpl.find(haystack) != std::string::npos;
74
- };
75
- if (tmpl_contains("<|im_start|>")) {
76
- return LLM_CHAT_TEMPLATE_CHATML;
77
- } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
78
- if (tmpl_contains("[SYSTEM_PROMPT]")) {
79
- return LLM_CHAT_TEMPLATE_MISTRAL_V7;
80
- } else if (
81
- // catches official 'v1' template
82
- tmpl_contains("' [INST] ' + system_message")
83
- // catches official 'v3' and 'v3-tekken' templates
84
- || tmpl_contains("[AVAILABLE_TOOLS]")
85
- ) {
86
- // Official mistral 'v1', 'v3' and 'v3-tekken' templates
87
- // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
88
- // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
89
- if (tmpl_contains(" [INST]")) {
90
- return LLM_CHAT_TEMPLATE_MISTRAL_V1;
91
- } else if (tmpl_contains("\"[INST]\"")) {
92
- return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
93
- }
94
- return LLM_CHAT_TEMPLATE_MISTRAL_V3;
95
- } else {
96
- // llama2 template and its variants
97
- // [variant] support system message
98
- // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
99
- bool support_system_message = tmpl_contains("<<SYS>>");
100
- bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
101
- bool strip_message = tmpl_contains("content.strip()");
102
- if (strip_message) {
103
- return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
104
- } else if (add_bos_inside_history) {
105
- return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
106
- } else if (support_system_message) {
107
- return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
108
- } else {
109
- return LLM_CHAT_TEMPLATE_LLAMA_2;
110
- }
111
- }
112
- } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
113
- return LLM_CHAT_TEMPLATE_PHI_3;
114
- } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
115
- return LLM_CHAT_TEMPLATE_FALCON_3;
116
- } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
117
- return LLM_CHAT_TEMPLATE_ZEPHYR;
118
- } else if (tmpl_contains("bos_token + message['role']")) {
119
- return LLM_CHAT_TEMPLATE_MONARCH;
120
- } else if (tmpl_contains("<start_of_turn>")) {
121
- return LLM_CHAT_TEMPLATE_GEMMA;
122
- } else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
123
- // OrionStarAI/Orion-14B-Chat
124
- return LLM_CHAT_TEMPLATE_ORION;
125
- } else if (tmpl_contains("GPT4 Correct ")) {
126
- // openchat/openchat-3.5-0106
127
- return LLM_CHAT_TEMPLATE_OPENCHAT;
128
- } else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
129
- // eachadea/vicuna-13b-1.1 (and Orca variant)
130
- if (tmpl_contains("SYSTEM: ")) {
131
- return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
132
- }
133
- return LLM_CHAT_TEMPLATE_VICUNA;
134
- } else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
135
- // deepseek-ai/deepseek-coder-33b-instruct
136
- return LLM_CHAT_TEMPLATE_DEEPSEEK;
137
- } else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
138
- // CohereForAI/c4ai-command-r-plus
139
- return LLM_CHAT_TEMPLATE_COMMAND_R;
140
- } else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
141
- return LLM_CHAT_TEMPLATE_LLAMA_3;
142
- } else if (tmpl_contains("[gMASK]sop")) {
143
- // chatglm3-6b
144
- return LLM_CHAT_TEMPLATE_CHATGML_3;
145
- } else if (tmpl_contains("[gMASK]<sop>")) {
146
- return LLM_CHAT_TEMPLATE_CHATGML_4;
147
- } else if (tmpl_contains(LU8("<用户>"))) {
148
- // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
149
- return LLM_CHAT_TEMPLATE_MINICPM;
150
- } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
151
- return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
152
- } else if (tmpl_contains(LU8("'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'"))) {
153
- return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
154
- } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
155
- // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
156
- // EXAONE-3.0-7.8B-Instruct
157
- return LLM_CHAT_TEMPLATE_EXAONE_3;
158
- } else if (tmpl_contains("rwkv-world")) {
159
- return LLM_CHAT_TEMPLATE_RWKV_WORLD;
160
- } else if (tmpl_contains("<|start_of_role|>")) {
161
- return LLM_CHAT_TEMPLATE_GRANITE;
162
- } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
163
- return LLM_CHAT_TEMPLATE_GIGACHAT;
164
- } else if (tmpl_contains("<|role_start|>")) {
165
- return LLM_CHAT_TEMPLATE_MEGREZ;
166
- }
167
- return LLM_CHAT_TEMPLATE_UNKNOWN;
168
- }
169
-
170
- // Simple version of "llama_apply_chat_template" that only works with strings
171
- // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
172
- int32_t llm_chat_apply_template(
173
- llm_chat_template tmpl,
174
- const std::vector<const llama_chat_message *> & chat,
175
- std::string & dest, bool add_ass) {
176
- // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
177
- std::stringstream ss;
178
- if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
179
- // chatml template
180
- for (auto message : chat) {
181
- ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
182
- }
183
- if (add_ass) {
184
- ss << "<|im_start|>assistant\n";
185
- }
186
- } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
187
- // Official mistral 'v7' template
188
- // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
189
- for (auto message : chat) {
190
- std::string role(message->role);
191
- std::string content(message->content);
192
- if (role == "system") {
193
- ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
194
- } else if (role == "user") {
195
- ss << "[INST] " << content << "[/INST]";
196
- }
197
- else {
198
- ss << " " << content << "</s>";
199
- }
200
- }
201
- } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
202
- || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
203
- || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
204
- // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
205
- // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
206
- std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
207
- std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
208
- bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
209
- bool is_inside_turn = false;
210
- for (auto message : chat) {
211
- if (!is_inside_turn) {
212
- ss << leading_space << "[INST]" << trailing_space;
213
- is_inside_turn = true;
214
- }
215
- std::string role(message->role);
216
- std::string content(message->content);
217
- if (role == "system") {
218
- ss << content << "\n\n";
219
- } else if (role == "user") {
220
- ss << content << leading_space << "[/INST]";
221
- } else {
222
- ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
223
- is_inside_turn = false;
224
- }
225
- }
226
- } else if (
227
- tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
228
- || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
229
- || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
230
- || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
231
- // llama2 template and its variants
232
- // [variant] support system message
233
- // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
234
- bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
235
- // [variant] add BOS inside history
236
- bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
237
- // [variant] trim spaces from the input message
238
- bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
239
- // construct the prompt
240
- bool is_inside_turn = true; // skip BOS at the beginning
241
- ss << "[INST] ";
242
- for (auto message : chat) {
243
- std::string content = strip_message ? trim(message->content) : message->content;
244
- std::string role(message->role);
245
- if (!is_inside_turn) {
246
- is_inside_turn = true;
247
- ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
248
- }
249
- if (role == "system") {
250
- if (support_system_message) {
251
- ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
252
- } else {
253
- // if the model does not support system message, we still include it in the first message, but without <<SYS>>
254
- ss << content << "\n";
255
- }
256
- } else if (role == "user") {
257
- ss << content << " [/INST]";
258
- } else {
259
- ss << content << "</s>";
260
- is_inside_turn = false;
261
- }
262
- }
263
- } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
264
- // Phi 3
265
- for (auto message : chat) {
266
- std::string role(message->role);
267
- ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
268
- }
269
- if (add_ass) {
270
- ss << "<|assistant|>\n";
271
- }
272
- } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
273
- // Falcon 3
274
- for (auto message : chat) {
275
- std::string role(message->role);
276
- ss << "<|" << role << "|>\n" << message->content << "\n";
277
- }
278
- if (add_ass) {
279
- ss << "<|assistant|>\n";
280
- }
281
- } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
282
- // zephyr template
283
- for (auto message : chat) {
284
- ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
285
- }
286
- if (add_ass) {
287
- ss << "<|assistant|>\n";
288
- }
289
- } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
290
- // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
291
- for (auto message : chat) {
292
- std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
293
- ss << bos << message->role << "\n" << message->content << "</s>\n";
294
- }
295
- if (add_ass) {
296
- ss << "<s>assistant\n";
297
- }
298
- } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
299
- // google/gemma-7b-it
300
- std::string system_prompt = "";
301
- for (auto message : chat) {
302
- std::string role(message->role);
303
- if (role == "system") {
304
- // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
305
- system_prompt = trim(message->content);
306
- continue;
307
- }
308
- // in gemma, "assistant" is "model"
309
- role = role == "assistant" ? "model" : message->role;
310
- ss << "<start_of_turn>" << role << "\n";
311
- if (!system_prompt.empty() && role != "model") {
312
- ss << system_prompt << "\n\n";
313
- system_prompt = "";
314
- }
315
- ss << trim(message->content) << "<end_of_turn>\n";
316
- }
317
- if (add_ass) {
318
- ss << "<start_of_turn>model\n";
319
- }
320
- } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
321
- // OrionStarAI/Orion-14B-Chat
322
- std::string system_prompt = "";
323
- for (auto message : chat) {
324
- std::string role(message->role);
325
- if (role == "system") {
326
- // there is no system message support, we will merge it with user prompt
327
- system_prompt = message->content;
328
- continue;
329
- } else if (role == "user") {
330
- ss << "Human: ";
331
- if (!system_prompt.empty()) {
332
- ss << system_prompt << "\n\n";
333
- system_prompt = "";
334
- }
335
- ss << message->content << "\n\nAssistant: </s>";
336
- } else {
337
- ss << message->content << "</s>";
338
- }
339
- }
340
- } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
341
- // openchat/openchat-3.5-0106,
342
- for (auto message : chat) {
343
- std::string role(message->role);
344
- if (role == "system") {
345
- ss << message->content << "<|end_of_turn|>";
346
- } else {
347
- role[0] = toupper(role[0]);
348
- ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
349
- }
350
- }
351
- if (add_ass) {
352
- ss << "GPT4 Correct Assistant:";
353
- }
354
- } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
355
- // eachadea/vicuna-13b-1.1 (and Orca variant)
356
- for (auto message : chat) {
357
- std::string role(message->role);
358
- if (role == "system") {
359
- // Orca-Vicuna variant uses a system prefix
360
- if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
361
- ss << "SYSTEM: " << message->content << "\n";
362
- } else {
363
- ss << message->content << "\n\n";
364
- }
365
- } else if (role == "user") {
366
- ss << "USER: " << message->content << "\n";
367
- } else if (role == "assistant") {
368
- ss << "ASSISTANT: " << message->content << "</s>\n";
369
- }
370
- }
371
- if (add_ass) {
372
- ss << "ASSISTANT:";
373
- }
374
- } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
375
- // deepseek-ai/deepseek-coder-33b-instruct
376
- for (auto message : chat) {
377
- std::string role(message->role);
378
- if (role == "system") {
379
- ss << message->content;
380
- } else if (role == "user") {
381
- ss << "### Instruction:\n" << message->content << "\n";
382
- } else if (role == "assistant") {
383
- ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
384
- }
385
- }
386
- if (add_ass) {
387
- ss << "### Response:\n";
388
- }
389
- } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
390
- // CohereForAI/c4ai-command-r-plus
391
- for (auto message : chat) {
392
- std::string role(message->role);
393
- if (role == "system") {
394
- ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
395
- } else if (role == "user") {
396
- ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
397
- } else if (role == "assistant") {
398
- ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
399
- }
400
- }
401
- if (add_ass) {
402
- ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
403
- }
404
- } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
405
- // Llama 3
406
- for (auto message : chat) {
407
- std::string role(message->role);
408
- ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
409
- }
410
- if (add_ass) {
411
- ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
412
- }
413
- } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
414
- // chatglm3-6b
415
- ss << "[gMASK]" << "sop";
416
- for (auto message : chat) {
417
- std::string role(message->role);
418
- ss << "<|" << role << "|>" << "\n " << message->content;
419
- }
420
- if (add_ass) {
421
- ss << "<|assistant|>";
422
- }
423
- } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
424
- ss << "[gMASK]" << "<sop>";
425
- for (auto message : chat) {
426
- std::string role(message->role);
427
- ss << "<|" << role << "|>" << "\n" << message->content;
428
- }
429
- if (add_ass) {
430
- ss << "<|assistant|>";
431
- }
432
- } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
433
- // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
434
- for (auto message : chat) {
435
- std::string role(message->role);
436
- if (role == "user") {
437
- ss << LU8("<用户>");
438
- ss << trim(message->content);
439
- ss << "<AI>";
440
- } else {
441
- ss << trim(message->content);
442
- }
443
- }
444
- } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
445
- // DeepSeek-V2
446
- for (auto message : chat) {
447
- std::string role(message->role);
448
- if (role == "system") {
449
- ss << message->content << "\n\n";
450
- } else if (role == "user") {
451
- ss << "User: " << message->content << "\n\n";
452
- } else if (role == "assistant") {
453
- ss << "Assistant: " << message->content << LU8("<|end▁of▁sentence|>");
454
- }
455
- }
456
- if (add_ass) {
457
- ss << "Assistant:";
458
- }
459
- } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_3) {
460
- // DeepSeek-V3
461
- for (auto message : chat) {
462
- std::string role(message->role);
463
- if (role == "system") {
464
- ss << message->content << "\n\n";
465
- } else if (role == "user") {
466
- ss << LU8("<|User|>") << message->content;
467
- } else if (role == "assistant") {
468
- ss << LU8("<|Assistant|>") << message->content << LU8("<|end▁of▁sentence|>");
469
- }
470
- }
471
- if (add_ass) {
472
- ss << LU8("<|Assistant|>");
473
- }
474
- } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
475
- // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
476
- // EXAONE-3.0-7.8B-Instruct
477
- for (auto message : chat) {
478
- std::string role(message->role);
479
- if (role == "system") {
480
- ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
481
- } else if (role == "user") {
482
- ss << "[|user|]" << trim(message->content) << "\n";
483
- } else if (role == "assistant") {
484
- ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
485
- }
486
- }
487
- if (add_ass) {
488
- ss << "[|assistant|]";
489
- }
490
- } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
491
- // this template requires the model to have "\n\n" as EOT token
492
- for (auto message : chat) {
493
- std::string role(message->role);
494
- if (role == "user") {
495
- ss << "User: " << message->content << "\n\nAssistant:";
496
- } else {
497
- ss << message->content << "\n\n";
498
- }
499
- }
500
- } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
501
- // IBM Granite template
502
- for (const auto & message : chat) {
503
- std::string role(message->role);
504
- ss << "<|start_of_role|>" << role << "<|end_of_role|>";
505
- if (role == "assistant_tool_call") {
506
- ss << "<|tool_call|>";
507
- }
508
- ss << message->content << "<|end_of_text|>\n";
509
- }
510
- if (add_ass) {
511
- ss << "<|start_of_role|>assistant<|end_of_role|>\n";
512
- }
513
- } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
514
- // GigaChat template
515
- bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
516
-
517
- // Handle system message if present
518
- if (has_system) {
519
- ss << "<s>" << chat[0]->content << "<|message_sep|>";
520
- } else {
521
- ss << "<s>";
522
- }
523
-
524
- // Process remaining messages
525
- for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
526
- std::string role(chat[i]->role);
527
- if (role == "user") {
528
- ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
529
- << "available functions<|role_sep|>[]<|message_sep|>";
530
- } else if (role == "assistant") {
531
- ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
532
- }
533
- }
534
-
535
- // Add generation prompt if needed
536
- if (add_ass) {
537
- ss << "assistant<|role_sep|>";
538
- }
539
- } else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
540
- // Megrez template
541
- for (auto message : chat) {
542
- std::string role(message->role);
543
- ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
544
- }
545
-
546
- if (add_ass) {
547
- ss << "<|role_start|>assistant<|role_end|>";
548
- }
549
- } else {
550
- // template not supported
551
- return -1;
552
- }
553
- dest = ss.str();
554
- return dest.size();
555
- }
556
-
557
- // public interface
558
-
559
- int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
560
- auto it = LLM_CHAT_TEMPLATES.begin();
561
- for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
562
- output[i] = it->first.c_str();
563
- std::advance(it, 1);
564
- }
565
- return (int32_t) LLM_CHAT_TEMPLATES.size();
566
- }
567
-
1
+ #include "llama-chat.h"
2
+
3
+ #include "llama.h"
4
+
5
+ #include <map>
6
+ #include <sstream>
7
+
8
+ #if __cplusplus >= 202000L
9
+ #define LU8(x) (const char*)(u8##x)
10
+ #else
11
+ #define LU8(x) u8##x
12
+ #endif
13
+
14
+ // trim whitespace from the beginning and end of a string
15
+ static std::string trim(const std::string & str) {
16
+ size_t start = 0;
17
+ size_t end = str.size();
18
+ while (start < end && isspace(str[start])) {
19
+ start += 1;
20
+ }
21
+ while (end > start && isspace(str[end - 1])) {
22
+ end -= 1;
23
+ }
24
+ return str.substr(start, end - start);
25
+ }
26
+
27
+ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
28
+ { "chatml", LLM_CHAT_TEMPLATE_CHATML },
29
+ { "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 },
30
+ { "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS },
31
+ { "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS },
32
+ { "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
33
+ { "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 },
34
+ { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
35
+ { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
36
+ { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
37
+ { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
38
+ { "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
39
+ { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
40
+ { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
41
+ { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
42
+ { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
43
+ { "orion", LLM_CHAT_TEMPLATE_ORION },
44
+ { "openchat", LLM_CHAT_TEMPLATE_OPENCHAT },
45
+ { "vicuna", LLM_CHAT_TEMPLATE_VICUNA },
46
+ { "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
47
+ { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
48
+ { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
49
+ { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
50
+ { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
51
+ { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
52
+ { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
53
+ { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
54
+ { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
55
+ { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
56
+ { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
57
+ { "granite", LLM_CHAT_TEMPLATE_GRANITE },
58
+ { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
59
+ { "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
60
+ };
61
+
62
+ llm_chat_template llm_chat_template_from_str(const std::string & name) {
63
+ return LLM_CHAT_TEMPLATES.at(name);
64
+ }
65
+
66
+ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
67
+ try {
68
+ return llm_chat_template_from_str(tmpl);
69
+ } catch (const std::out_of_range &) {
70
+ // ignore
71
+ }
72
+
73
+ auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
74
+ return tmpl.find(haystack) != std::string::npos;
75
+ };
76
+ if (tmpl_contains("<|im_start|>")) {
77
+ return tmpl_contains("<|im_sep|>")
78
+ ? LLM_CHAT_TEMPLATE_PHI_4
79
+ : LLM_CHAT_TEMPLATE_CHATML;
80
+ } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
81
+ if (tmpl_contains("[SYSTEM_PROMPT]")) {
82
+ return LLM_CHAT_TEMPLATE_MISTRAL_V7;
83
+ } else if (
84
+ // catches official 'v1' template
85
+ tmpl_contains("' [INST] ' + system_message")
86
+ // catches official 'v3' and 'v3-tekken' templates
87
+ || tmpl_contains("[AVAILABLE_TOOLS]")
88
+ ) {
89
+ // Official mistral 'v1', 'v3' and 'v3-tekken' templates
90
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
91
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
92
+ if (tmpl_contains(" [INST]")) {
93
+ return LLM_CHAT_TEMPLATE_MISTRAL_V1;
94
+ } else if (tmpl_contains("\"[INST]\"")) {
95
+ return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
96
+ }
97
+ return LLM_CHAT_TEMPLATE_MISTRAL_V3;
98
+ } else {
99
+ // llama2 template and its variants
100
+ // [variant] support system message
101
+ // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
102
+ bool support_system_message = tmpl_contains("<<SYS>>");
103
+ bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
104
+ bool strip_message = tmpl_contains("content.strip()");
105
+ if (strip_message) {
106
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
107
+ } else if (add_bos_inside_history) {
108
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
109
+ } else if (support_system_message) {
110
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
111
+ } else {
112
+ return LLM_CHAT_TEMPLATE_LLAMA_2;
113
+ }
114
+ }
115
+ } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
116
+ return LLM_CHAT_TEMPLATE_PHI_3;
117
+ } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
118
+ return LLM_CHAT_TEMPLATE_FALCON_3;
119
+ } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
120
+ return LLM_CHAT_TEMPLATE_ZEPHYR;
121
+ } else if (tmpl_contains("bos_token + message['role']")) {
122
+ return LLM_CHAT_TEMPLATE_MONARCH;
123
+ } else if (tmpl_contains("<start_of_turn>")) {
124
+ return LLM_CHAT_TEMPLATE_GEMMA;
125
+ } else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
126
+ // OrionStarAI/Orion-14B-Chat
127
+ return LLM_CHAT_TEMPLATE_ORION;
128
+ } else if (tmpl_contains("GPT4 Correct ")) {
129
+ // openchat/openchat-3.5-0106
130
+ return LLM_CHAT_TEMPLATE_OPENCHAT;
131
+ } else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
132
+ // eachadea/vicuna-13b-1.1 (and Orca variant)
133
+ if (tmpl_contains("SYSTEM: ")) {
134
+ return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
135
+ }
136
+ return LLM_CHAT_TEMPLATE_VICUNA;
137
+ } else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
138
+ // deepseek-ai/deepseek-coder-33b-instruct
139
+ return LLM_CHAT_TEMPLATE_DEEPSEEK;
140
+ } else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
141
+ // CohereForAI/c4ai-command-r-plus
142
+ return LLM_CHAT_TEMPLATE_COMMAND_R;
143
+ } else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
144
+ return LLM_CHAT_TEMPLATE_LLAMA_3;
145
+ } else if (tmpl_contains("[gMASK]sop")) {
146
+ // chatglm3-6b
147
+ return LLM_CHAT_TEMPLATE_CHATGML_3;
148
+ } else if (tmpl_contains("[gMASK]<sop>")) {
149
+ return LLM_CHAT_TEMPLATE_CHATGML_4;
150
+ } else if (tmpl_contains(LU8("<用户>"))) {
151
+ // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
152
+ return LLM_CHAT_TEMPLATE_MINICPM;
153
+ } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
154
+ return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
155
+ } else if (tmpl_contains(LU8("<|Assistant|>")) && tmpl_contains(LU8("<|User|>")) && tmpl_contains(LU8("<|end▁of▁sentence|>"))) {
156
+ return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
157
+ } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
158
+ // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
159
+ // EXAONE-3.0-7.8B-Instruct
160
+ return LLM_CHAT_TEMPLATE_EXAONE_3;
161
+ } else if (tmpl_contains("rwkv-world")) {
162
+ return LLM_CHAT_TEMPLATE_RWKV_WORLD;
163
+ } else if (tmpl_contains("<|start_of_role|>")) {
164
+ return LLM_CHAT_TEMPLATE_GRANITE;
165
+ } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
166
+ return LLM_CHAT_TEMPLATE_GIGACHAT;
167
+ } else if (tmpl_contains("<|role_start|>")) {
168
+ return LLM_CHAT_TEMPLATE_MEGREZ;
169
+ }
170
+ return LLM_CHAT_TEMPLATE_UNKNOWN;
171
+ }
172
+
173
+ // Simple version of "llama_apply_chat_template" that only works with strings
174
+ // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
175
+ int32_t llm_chat_apply_template(
176
+ llm_chat_template tmpl,
177
+ const std::vector<const llama_chat_message *> & chat,
178
+ std::string & dest, bool add_ass) {
179
+ // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
180
+ std::stringstream ss;
181
+ if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
182
+ // chatml template
183
+ for (auto message : chat) {
184
+ ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
185
+ }
186
+ if (add_ass) {
187
+ ss << "<|im_start|>assistant\n";
188
+ }
189
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
190
+ // Official mistral 'v7' template
191
+ // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
192
+ for (auto message : chat) {
193
+ std::string role(message->role);
194
+ std::string content(message->content);
195
+ if (role == "system") {
196
+ ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
197
+ } else if (role == "user") {
198
+ ss << "[INST] " << content << "[/INST]";
199
+ }
200
+ else {
201
+ ss << " " << content << "</s>";
202
+ }
203
+ }
204
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
205
+ || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
206
+ || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
207
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
208
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
209
+ std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
210
+ std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
211
+ bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
212
+ bool is_inside_turn = false;
213
+ for (auto message : chat) {
214
+ if (!is_inside_turn) {
215
+ ss << leading_space << "[INST]" << trailing_space;
216
+ is_inside_turn = true;
217
+ }
218
+ std::string role(message->role);
219
+ std::string content(message->content);
220
+ if (role == "system") {
221
+ ss << content << "\n\n";
222
+ } else if (role == "user") {
223
+ ss << content << leading_space << "[/INST]";
224
+ } else {
225
+ ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
226
+ is_inside_turn = false;
227
+ }
228
+ }
229
+ } else if (
230
+ tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
231
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
232
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
233
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
234
+ // llama2 template and its variants
235
+ // [variant] support system message
236
+ // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
237
+ bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
238
+ // [variant] add BOS inside history
239
+ bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
240
+ // [variant] trim spaces from the input message
241
+ bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
242
+ // construct the prompt
243
+ bool is_inside_turn = true; // skip BOS at the beginning
244
+ ss << "[INST] ";
245
+ for (auto message : chat) {
246
+ std::string content = strip_message ? trim(message->content) : message->content;
247
+ std::string role(message->role);
248
+ if (!is_inside_turn) {
249
+ is_inside_turn = true;
250
+ ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
251
+ }
252
+ if (role == "system") {
253
+ if (support_system_message) {
254
+ ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
255
+ } else {
256
+ // if the model does not support system message, we still include it in the first message, but without <<SYS>>
257
+ ss << content << "\n";
258
+ }
259
+ } else if (role == "user") {
260
+ ss << content << " [/INST]";
261
+ } else {
262
+ ss << content << "</s>";
263
+ is_inside_turn = false;
264
+ }
265
+ }
266
+ } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
267
+ // Phi 3
268
+ for (auto message : chat) {
269
+ std::string role(message->role);
270
+ ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
271
+ }
272
+ if (add_ass) {
273
+ ss << "<|assistant|>\n";
274
+ }
275
+ } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_4) {
276
+ // chatml template
277
+ for (auto message : chat) {
278
+ ss << "<|im_start|>" << message->role << "<|im_sep|>" << message->content << "<|im_end|>";
279
+ }
280
+ if (add_ass) {
281
+ ss << "<|im_start|>assistant<|im_sep|>";
282
+ }
283
+ } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
284
+ // Falcon 3
285
+ for (auto message : chat) {
286
+ std::string role(message->role);
287
+ ss << "<|" << role << "|>\n" << message->content << "\n";
288
+ }
289
+ if (add_ass) {
290
+ ss << "<|assistant|>\n";
291
+ }
292
+ } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
293
+ // zephyr template
294
+ for (auto message : chat) {
295
+ ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
296
+ }
297
+ if (add_ass) {
298
+ ss << "<|assistant|>\n";
299
+ }
300
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
301
+ // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
302
+ for (auto message : chat) {
303
+ std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
304
+ ss << bos << message->role << "\n" << message->content << "</s>\n";
305
+ }
306
+ if (add_ass) {
307
+ ss << "<s>assistant\n";
308
+ }
309
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
310
+ // google/gemma-7b-it
311
+ std::string system_prompt = "";
312
+ for (auto message : chat) {
313
+ std::string role(message->role);
314
+ if (role == "system") {
315
+ // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
316
+ system_prompt = trim(message->content);
317
+ continue;
318
+ }
319
+ // in gemma, "assistant" is "model"
320
+ role = role == "assistant" ? "model" : message->role;
321
+ ss << "<start_of_turn>" << role << "\n";
322
+ if (!system_prompt.empty() && role != "model") {
323
+ ss << system_prompt << "\n\n";
324
+ system_prompt = "";
325
+ }
326
+ ss << trim(message->content) << "<end_of_turn>\n";
327
+ }
328
+ if (add_ass) {
329
+ ss << "<start_of_turn>model\n";
330
+ }
331
+ } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
332
+ // OrionStarAI/Orion-14B-Chat
333
+ std::string system_prompt = "";
334
+ for (auto message : chat) {
335
+ std::string role(message->role);
336
+ if (role == "system") {
337
+ // there is no system message support, we will merge it with user prompt
338
+ system_prompt = message->content;
339
+ continue;
340
+ } else if (role == "user") {
341
+ ss << "Human: ";
342
+ if (!system_prompt.empty()) {
343
+ ss << system_prompt << "\n\n";
344
+ system_prompt = "";
345
+ }
346
+ ss << message->content << "\n\nAssistant: </s>";
347
+ } else {
348
+ ss << message->content << "</s>";
349
+ }
350
+ }
351
+ } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
352
+ // openchat/openchat-3.5-0106,
353
+ for (auto message : chat) {
354
+ std::string role(message->role);
355
+ if (role == "system") {
356
+ ss << message->content << "<|end_of_turn|>";
357
+ } else {
358
+ role[0] = toupper(role[0]);
359
+ ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
360
+ }
361
+ }
362
+ if (add_ass) {
363
+ ss << "GPT4 Correct Assistant:";
364
+ }
365
+ } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
366
+ // eachadea/vicuna-13b-1.1 (and Orca variant)
367
+ for (auto message : chat) {
368
+ std::string role(message->role);
369
+ if (role == "system") {
370
+ // Orca-Vicuna variant uses a system prefix
371
+ if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
372
+ ss << "SYSTEM: " << message->content << "\n";
373
+ } else {
374
+ ss << message->content << "\n\n";
375
+ }
376
+ } else if (role == "user") {
377
+ ss << "USER: " << message->content << "\n";
378
+ } else if (role == "assistant") {
379
+ ss << "ASSISTANT: " << message->content << "</s>\n";
380
+ }
381
+ }
382
+ if (add_ass) {
383
+ ss << "ASSISTANT:";
384
+ }
385
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
386
+ // deepseek-ai/deepseek-coder-33b-instruct
387
+ for (auto message : chat) {
388
+ std::string role(message->role);
389
+ if (role == "system") {
390
+ ss << message->content;
391
+ } else if (role == "user") {
392
+ ss << "### Instruction:\n" << message->content << "\n";
393
+ } else if (role == "assistant") {
394
+ ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
395
+ }
396
+ }
397
+ if (add_ass) {
398
+ ss << "### Response:\n";
399
+ }
400
+ } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
401
+ // CohereForAI/c4ai-command-r-plus
402
+ for (auto message : chat) {
403
+ std::string role(message->role);
404
+ if (role == "system") {
405
+ ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
406
+ } else if (role == "user") {
407
+ ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
408
+ } else if (role == "assistant") {
409
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
410
+ }
411
+ }
412
+ if (add_ass) {
413
+ ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
414
+ }
415
+ } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
416
+ // Llama 3
417
+ for (auto message : chat) {
418
+ std::string role(message->role);
419
+ ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
420
+ }
421
+ if (add_ass) {
422
+ ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
423
+ }
424
+ } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
425
+ // chatglm3-6b
426
+ ss << "[gMASK]" << "sop";
427
+ for (auto message : chat) {
428
+ std::string role(message->role);
429
+ ss << "<|" << role << "|>" << "\n " << message->content;
430
+ }
431
+ if (add_ass) {
432
+ ss << "<|assistant|>";
433
+ }
434
+ } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
435
+ ss << "[gMASK]" << "<sop>";
436
+ for (auto message : chat) {
437
+ std::string role(message->role);
438
+ ss << "<|" << role << "|>" << "\n" << message->content;
439
+ }
440
+ if (add_ass) {
441
+ ss << "<|assistant|>";
442
+ }
443
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
444
+ // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
445
+ for (auto message : chat) {
446
+ std::string role(message->role);
447
+ if (role == "user") {
448
+ ss << LU8("<用户>");
449
+ ss << trim(message->content);
450
+ ss << "<AI>";
451
+ } else {
452
+ ss << trim(message->content);
453
+ }
454
+ }
455
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
456
+ // DeepSeek-V2
457
+ for (auto message : chat) {
458
+ std::string role(message->role);
459
+ if (role == "system") {
460
+ ss << message->content << "\n\n";
461
+ } else if (role == "user") {
462
+ ss << "User: " << message->content << "\n\n";
463
+ } else if (role == "assistant") {
464
+ ss << "Assistant: " << message->content << LU8("<|end▁of▁sentence|>");
465
+ }
466
+ }
467
+ if (add_ass) {
468
+ ss << "Assistant:";
469
+ }
470
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_3) {
471
+ // DeepSeek-V3
472
+ for (auto message : chat) {
473
+ std::string role(message->role);
474
+ if (role == "system") {
475
+ ss << message->content << "\n\n";
476
+ } else if (role == "user") {
477
+ ss << LU8("<|User|>") << message->content;
478
+ } else if (role == "assistant") {
479
+ ss << LU8("<|Assistant|>") << message->content << LU8("<|end▁of▁sentence|>");
480
+ }
481
+ }
482
+ if (add_ass) {
483
+ ss << LU8("<|Assistant|>");
484
+ }
485
+ } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
486
+ // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
487
+ // EXAONE-3.0-7.8B-Instruct
488
+ for (auto message : chat) {
489
+ std::string role(message->role);
490
+ if (role == "system") {
491
+ ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
492
+ } else if (role == "user") {
493
+ ss << "[|user|]" << trim(message->content) << "\n";
494
+ } else if (role == "assistant") {
495
+ ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
496
+ }
497
+ }
498
+ if (add_ass) {
499
+ ss << "[|assistant|]";
500
+ }
501
+ } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
502
+ // this template requires the model to have "\n\n" as EOT token
503
+ for (auto message : chat) {
504
+ std::string role(message->role);
505
+ if (role == "user") {
506
+ ss << "User: " << message->content << "\n\nAssistant:";
507
+ } else {
508
+ ss << message->content << "\n\n";
509
+ }
510
+ }
511
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
512
+ // IBM Granite template
513
+ for (const auto & message : chat) {
514
+ std::string role(message->role);
515
+ ss << "<|start_of_role|>" << role << "<|end_of_role|>";
516
+ if (role == "assistant_tool_call") {
517
+ ss << "<|tool_call|>";
518
+ }
519
+ ss << message->content << "<|end_of_text|>\n";
520
+ }
521
+ if (add_ass) {
522
+ ss << "<|start_of_role|>assistant<|end_of_role|>\n";
523
+ }
524
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
525
+ // GigaChat template
526
+ bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
527
+
528
+ // Handle system message if present
529
+ if (has_system) {
530
+ ss << "<s>" << chat[0]->content << "<|message_sep|>";
531
+ } else {
532
+ ss << "<s>";
533
+ }
534
+
535
+ // Process remaining messages
536
+ for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
537
+ std::string role(chat[i]->role);
538
+ if (role == "user") {
539
+ ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
540
+ << "available functions<|role_sep|>[]<|message_sep|>";
541
+ } else if (role == "assistant") {
542
+ ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
543
+ }
544
+ }
545
+
546
+ // Add generation prompt if needed
547
+ if (add_ass) {
548
+ ss << "assistant<|role_sep|>";
549
+ }
550
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
551
+ // Megrez template
552
+ for (auto message : chat) {
553
+ std::string role(message->role);
554
+ ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
555
+ }
556
+
557
+ if (add_ass) {
558
+ ss << "<|role_start|>assistant<|role_end|>";
559
+ }
560
+ } else {
561
+ // template not supported
562
+ return -1;
563
+ }
564
+ dest = ss.str();
565
+ return dest.size();
566
+ }
567
+
568
+ // public interface
569
+
570
+ int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
571
+ auto it = LLM_CHAT_TEMPLATES.begin();
572
+ for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
573
+ output[i] = it->first.c_str();
574
+ std::advance(it, 1);
575
+ }
576
+ return (int32_t) LLM_CHAT_TEMPLATES.size();
577
+ }
578
+