cui-llama.rn 1.3.6 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. package/README.md +22 -1
  2. package/android/src/main/CMakeLists.txt +25 -26
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +31 -9
  4. package/android/src/main/java/com/rnllama/RNLlama.java +98 -0
  5. package/android/src/main/jni-utils.h +94 -0
  6. package/android/src/main/jni.cpp +133 -63
  7. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +15 -0
  8. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +15 -0
  9. package/cpp/common.cpp +2085 -1982
  10. package/cpp/common.h +696 -664
  11. package/cpp/ggml-alloc.c +1042 -1037
  12. package/cpp/ggml-backend-impl.h +255 -256
  13. package/cpp/ggml-backend-reg.cpp +582 -582
  14. package/cpp/ggml-backend.cpp +2002 -2002
  15. package/cpp/ggml-backend.h +354 -352
  16. package/cpp/ggml-common.h +1853 -1853
  17. package/cpp/ggml-cpp.h +39 -39
  18. package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
  19. package/cpp/ggml-cpu-aarch64.h +8 -8
  20. package/cpp/ggml-cpu-impl.h +386 -386
  21. package/cpp/ggml-cpu-quants.c +10920 -10839
  22. package/cpp/ggml-cpu-traits.cpp +36 -36
  23. package/cpp/ggml-cpu-traits.h +38 -38
  24. package/cpp/ggml-cpu.c +14391 -14122
  25. package/cpp/ggml-cpu.cpp +635 -627
  26. package/cpp/ggml-cpu.h +135 -135
  27. package/cpp/ggml-impl.h +567 -567
  28. package/cpp/ggml-metal-impl.h +288 -0
  29. package/cpp/ggml-metal.m +4884 -4884
  30. package/cpp/ggml-opt.cpp +854 -0
  31. package/cpp/ggml-opt.h +216 -0
  32. package/cpp/ggml-quants.c +5238 -5238
  33. package/cpp/ggml-threading.h +14 -14
  34. package/cpp/ggml.c +6514 -6448
  35. package/cpp/ggml.h +2194 -2163
  36. package/cpp/gguf.cpp +1329 -1325
  37. package/cpp/gguf.h +202 -202
  38. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  39. package/cpp/json-schema-to-grammar.h +8 -8
  40. package/cpp/json.hpp +24766 -24766
  41. package/cpp/llama-adapter.cpp +347 -346
  42. package/cpp/llama-adapter.h +74 -73
  43. package/cpp/llama-arch.cpp +1487 -1434
  44. package/cpp/llama-arch.h +400 -395
  45. package/cpp/llama-batch.cpp +368 -368
  46. package/cpp/llama-batch.h +88 -88
  47. package/cpp/llama-chat.cpp +578 -567
  48. package/cpp/llama-chat.h +52 -51
  49. package/cpp/llama-context.cpp +1775 -1771
  50. package/cpp/llama-context.h +128 -128
  51. package/cpp/llama-cparams.cpp +1 -1
  52. package/cpp/llama-cparams.h +37 -37
  53. package/cpp/llama-cpp.h +30 -30
  54. package/cpp/llama-grammar.cpp +1139 -1139
  55. package/cpp/llama-grammar.h +143 -143
  56. package/cpp/llama-hparams.cpp +71 -71
  57. package/cpp/llama-hparams.h +139 -140
  58. package/cpp/llama-impl.cpp +167 -167
  59. package/cpp/llama-impl.h +61 -61
  60. package/cpp/llama-kv-cache.cpp +718 -718
  61. package/cpp/llama-kv-cache.h +218 -218
  62. package/cpp/llama-mmap.cpp +590 -589
  63. package/cpp/llama-mmap.h +67 -67
  64. package/cpp/llama-model-loader.cpp +1124 -1011
  65. package/cpp/llama-model-loader.h +167 -158
  66. package/cpp/llama-model.cpp +3997 -2202
  67. package/cpp/llama-model.h +370 -391
  68. package/cpp/llama-sampling.cpp +2408 -2406
  69. package/cpp/llama-sampling.h +32 -48
  70. package/cpp/llama-vocab.cpp +3247 -1982
  71. package/cpp/llama-vocab.h +125 -182
  72. package/cpp/llama.cpp +10077 -12544
  73. package/cpp/llama.h +1323 -1285
  74. package/cpp/log.cpp +401 -401
  75. package/cpp/log.h +121 -121
  76. package/cpp/rn-llama.hpp +123 -116
  77. package/cpp/sampling.cpp +505 -500
  78. package/cpp/sgemm.cpp +2597 -2597
  79. package/cpp/sgemm.h +14 -14
  80. package/cpp/speculative.cpp +277 -274
  81. package/cpp/speculative.h +28 -28
  82. package/cpp/unicode.cpp +2 -3
  83. package/ios/RNLlama.mm +47 -0
  84. package/ios/RNLlamaContext.h +3 -1
  85. package/ios/RNLlamaContext.mm +71 -14
  86. package/jest/mock.js +15 -3
  87. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  88. package/lib/commonjs/index.js +33 -37
  89. package/lib/commonjs/index.js.map +1 -1
  90. package/lib/module/NativeRNLlama.js.map +1 -1
  91. package/lib/module/index.js +31 -35
  92. package/lib/module/index.js.map +1 -1
  93. package/lib/typescript/NativeRNLlama.d.ts +26 -6
  94. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  95. package/lib/typescript/index.d.ts +21 -36
  96. package/lib/typescript/index.d.ts.map +1 -1
  97. package/llama-rn.podspec +4 -18
  98. package/package.json +2 -3
  99. package/src/NativeRNLlama.ts +32 -13
  100. package/src/index.ts +52 -47
  101. package/cpp/llama.cpp.rej +0 -23
@@ -1,567 +1,578 @@
  #include "llama-chat.h"

  #include "llama.h"

  #include <map>
  #include <sstream>

  #if __cplusplus >= 202000L
  #define LU8(x) (const char*)(u8##x)
  #else
  #define LU8(x) u8##x
  #endif

  // trim whitespace from the beginning and end of a string
  static std::string trim(const std::string & str) {
      size_t start = 0;
      size_t end = str.size();
      while (start < end && isspace(str[start])) {
          start += 1;
      }
      while (end > start && isspace(str[end - 1])) {
          end -= 1;
      }
      return str.substr(start, end - start);
  }

  static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
      { "chatml", LLM_CHAT_TEMPLATE_CHATML },
      { "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 },
      { "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS },
      { "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS },
      { "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
      { "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 },
      { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
      { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
      { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
      { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
+     { "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
      { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
      { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
      { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
      { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
      { "orion", LLM_CHAT_TEMPLATE_ORION },
      { "openchat", LLM_CHAT_TEMPLATE_OPENCHAT },
      { "vicuna", LLM_CHAT_TEMPLATE_VICUNA },
      { "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
      { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
      { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
      { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
      { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
      { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
      { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
      { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
      { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
      { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
      { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
      { "granite", LLM_CHAT_TEMPLATE_GRANITE },
      { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
      { "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
  };

  llm_chat_template llm_chat_template_from_str(const std::string & name) {
      return LLM_CHAT_TEMPLATES.at(name);
  }

  llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
      try {
          return llm_chat_template_from_str(tmpl);
      } catch (const std::out_of_range &) {
          // ignore
      }

      auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
          return tmpl.find(haystack) != std::string::npos;
      };
      if (tmpl_contains("<|im_start|>")) {
-         return LLM_CHAT_TEMPLATE_CHATML;
+         return tmpl_contains("<|im_sep|>")
+             ? LLM_CHAT_TEMPLATE_PHI_4
+             : LLM_CHAT_TEMPLATE_CHATML;
      } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
          if (tmpl_contains("[SYSTEM_PROMPT]")) {
              return LLM_CHAT_TEMPLATE_MISTRAL_V7;
          } else if (
              // catches official 'v1' template
              tmpl_contains("' [INST] ' + system_message")
              // catches official 'v3' and 'v3-tekken' templates
              || tmpl_contains("[AVAILABLE_TOOLS]")
          ) {
              // Official mistral 'v1', 'v3' and 'v3-tekken' templates
              // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
              // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
              if (tmpl_contains(" [INST]")) {
                  return LLM_CHAT_TEMPLATE_MISTRAL_V1;
              } else if (tmpl_contains("\"[INST]\"")) {
                  return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
              }
              return LLM_CHAT_TEMPLATE_MISTRAL_V3;
          } else {
              // llama2 template and its variants
              // [variant] support system message
              // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
              bool support_system_message = tmpl_contains("<<SYS>>");
              bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
              bool strip_message = tmpl_contains("content.strip()");
              if (strip_message) {
                  return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
              } else if (add_bos_inside_history) {
                  return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
              } else if (support_system_message) {
                  return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
              } else {
                  return LLM_CHAT_TEMPLATE_LLAMA_2;
              }
          }
      } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
          return LLM_CHAT_TEMPLATE_PHI_3;
      } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
          return LLM_CHAT_TEMPLATE_FALCON_3;
      } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
          return LLM_CHAT_TEMPLATE_ZEPHYR;
      } else if (tmpl_contains("bos_token + message['role']")) {
          return LLM_CHAT_TEMPLATE_MONARCH;
      } else if (tmpl_contains("<start_of_turn>")) {
          return LLM_CHAT_TEMPLATE_GEMMA;
      } else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
          // OrionStarAI/Orion-14B-Chat
          return LLM_CHAT_TEMPLATE_ORION;
      } else if (tmpl_contains("GPT4 Correct ")) {
          // openchat/openchat-3.5-0106
          return LLM_CHAT_TEMPLATE_OPENCHAT;
      } else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
          // eachadea/vicuna-13b-1.1 (and Orca variant)
          if (tmpl_contains("SYSTEM: ")) {
              return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
          }
          return LLM_CHAT_TEMPLATE_VICUNA;
      } else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
          // deepseek-ai/deepseek-coder-33b-instruct
          return LLM_CHAT_TEMPLATE_DEEPSEEK;
      } else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
          // CohereForAI/c4ai-command-r-plus
          return LLM_CHAT_TEMPLATE_COMMAND_R;
      } else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
          return LLM_CHAT_TEMPLATE_LLAMA_3;
      } else if (tmpl_contains("[gMASK]sop")) {
          // chatglm3-6b
          return LLM_CHAT_TEMPLATE_CHATGML_3;
      } else if (tmpl_contains("[gMASK]<sop>")) {
          return LLM_CHAT_TEMPLATE_CHATGML_4;
      } else if (tmpl_contains(LU8("<用户>"))) {
          // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
          return LLM_CHAT_TEMPLATE_MINICPM;
      } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
          return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
-     } else if (tmpl_contains(LU8("'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'"))) {
+     } else if (tmpl_contains(LU8("<|Assistant|>")) && tmpl_contains(LU8("<|User|>")) && tmpl_contains(LU8("<|end▁of▁sentence|>"))) {
          return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
      } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
          // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
          // EXAONE-3.0-7.8B-Instruct
          return LLM_CHAT_TEMPLATE_EXAONE_3;
      } else if (tmpl_contains("rwkv-world")) {
          return LLM_CHAT_TEMPLATE_RWKV_WORLD;
      } else if (tmpl_contains("<|start_of_role|>")) {
          return LLM_CHAT_TEMPLATE_GRANITE;
      } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
          return LLM_CHAT_TEMPLATE_GIGACHAT;
      } else if (tmpl_contains("<|role_start|>")) {
          return LLM_CHAT_TEMPLATE_MEGREZ;
      }
      return LLM_CHAT_TEMPLATE_UNKNOWN;
  }

  // Simple version of "llama_apply_chat_template" that only works with strings
  // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
  int32_t llm_chat_apply_template(
      llm_chat_template tmpl,
      const std::vector<const llama_chat_message *> & chat,
      std::string & dest, bool add_ass) {
      // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
      std::stringstream ss;
      if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
          // chatml template
          for (auto message : chat) {
              ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
          }
          if (add_ass) {
              ss << "<|im_start|>assistant\n";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
          // Official mistral 'v7' template
          // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
          for (auto message : chat) {
              std::string role(message->role);
              std::string content(message->content);
              if (role == "system") {
                  ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
              } else if (role == "user") {
                  ss << "[INST] " << content << "[/INST]";
              }
              else {
                  ss << " " << content << "</s>";
              }
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
              || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
              || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
          // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
          // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
          std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
          std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
          bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
          bool is_inside_turn = false;
          for (auto message : chat) {
              if (!is_inside_turn) {
                  ss << leading_space << "[INST]" << trailing_space;
                  is_inside_turn = true;
              }
              std::string role(message->role);
              std::string content(message->content);
              if (role == "system") {
                  ss << content << "\n\n";
              } else if (role == "user") {
                  ss << content << leading_space << "[/INST]";
              } else {
                  ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
                  is_inside_turn = false;
              }
          }
      } else if (
              tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
              || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
              || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
              || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
          // llama2 template and its variants
          // [variant] support system message
          // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
          bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
          // [variant] add BOS inside history
          bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
          // [variant] trim spaces from the input message
          bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
          // construct the prompt
          bool is_inside_turn = true; // skip BOS at the beginning
          ss << "[INST] ";
          for (auto message : chat) {
              std::string content = strip_message ? trim(message->content) : message->content;
              std::string role(message->role);
              if (!is_inside_turn) {
                  is_inside_turn = true;
                  ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
              }
              if (role == "system") {
                  if (support_system_message) {
                      ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
                  } else {
                      // if the model does not support system message, we still include it in the first message, but without <<SYS>>
                      ss << content << "\n";
                  }
              } else if (role == "user") {
                  ss << content << " [/INST]";
              } else {
                  ss << content << "</s>";
                  is_inside_turn = false;
              }
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
          // Phi 3
          for (auto message : chat) {
              std::string role(message->role);
              ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
          }
          if (add_ass) {
              ss << "<|assistant|>\n";
          }
+     } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_4) {
+         // chatml template
+         for (auto message : chat) {
+             ss << "<|im_start|>" << message->role << "<|im_sep|>" << message->content << "<|im_end|>";
+         }
+         if (add_ass) {
+             ss << "<|im_start|>assistant<|im_sep|>";
+         }
      } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
          // Falcon 3
          for (auto message : chat) {
              std::string role(message->role);
              ss << "<|" << role << "|>\n" << message->content << "\n";
          }
          if (add_ass) {
              ss << "<|assistant|>\n";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
          // zephyr template
          for (auto message : chat) {
              ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
          }
          if (add_ass) {
              ss << "<|assistant|>\n";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
          // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
          for (auto message : chat) {
              std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
              ss << bos << message->role << "\n" << message->content << "</s>\n";
          }
          if (add_ass) {
              ss << "<s>assistant\n";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
          // google/gemma-7b-it
          std::string system_prompt = "";
          for (auto message : chat) {
              std::string role(message->role);
              if (role == "system") {
                  // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
                  system_prompt = trim(message->content);
                  continue;
              }
              // in gemma, "assistant" is "model"
              role = role == "assistant" ? "model" : message->role;
              ss << "<start_of_turn>" << role << "\n";
              if (!system_prompt.empty() && role != "model") {
                  ss << system_prompt << "\n\n";
                  system_prompt = "";
              }
              ss << trim(message->content) << "<end_of_turn>\n";
          }
          if (add_ass) {
              ss << "<start_of_turn>model\n";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
          // OrionStarAI/Orion-14B-Chat
          std::string system_prompt = "";
          for (auto message : chat) {
              std::string role(message->role);
              if (role == "system") {
                  // there is no system message support, we will merge it with user prompt
                  system_prompt = message->content;
                  continue;
              } else if (role == "user") {
                  ss << "Human: ";
                  if (!system_prompt.empty()) {
                      ss << system_prompt << "\n\n";
                      system_prompt = "";
                  }
                  ss << message->content << "\n\nAssistant: </s>";
              } else {
                  ss << message->content << "</s>";
              }
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
          // openchat/openchat-3.5-0106,
          for (auto message : chat) {
              std::string role(message->role);
              if (role == "system") {
                  ss << message->content << "<|end_of_turn|>";
              } else {
                  role[0] = toupper(role[0]);
                  ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
              }
          }
          if (add_ass) {
              ss << "GPT4 Correct Assistant:";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
          // eachadea/vicuna-13b-1.1 (and Orca variant)
          for (auto message : chat) {
              std::string role(message->role);
              if (role == "system") {
                  // Orca-Vicuna variant uses a system prefix
                  if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
                      ss << "SYSTEM: " << message->content << "\n";
                  } else {
                      ss << message->content << "\n\n";
                  }
              } else if (role == "user") {
                  ss << "USER: " << message->content << "\n";
              } else if (role == "assistant") {
                  ss << "ASSISTANT: " << message->content << "</s>\n";
              }
          }
          if (add_ass) {
              ss << "ASSISTANT:";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
          // deepseek-ai/deepseek-coder-33b-instruct
          for (auto message : chat) {
              std::string role(message->role);
              if (role == "system") {
                  ss << message->content;
              } else if (role == "user") {
                  ss << "### Instruction:\n" << message->content << "\n";
              } else if (role == "assistant") {
                  ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
              }
          }
          if (add_ass) {
              ss << "### Response:\n";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
          // CohereForAI/c4ai-command-r-plus
          for (auto message : chat) {
              std::string role(message->role);
              if (role == "system") {
                  ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
              } else if (role == "user") {
                  ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
              } else if (role == "assistant") {
                  ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
              }
          }
          if (add_ass) {
              ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
          // Llama 3
          for (auto message : chat) {
              std::string role(message->role);
              ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
          }
          if (add_ass) {
              ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
          // chatglm3-6b
          ss << "[gMASK]" << "sop";
          for (auto message : chat) {
              std::string role(message->role);
              ss << "<|" << role << "|>" << "\n " << message->content;
          }
          if (add_ass) {
              ss << "<|assistant|>";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
          ss << "[gMASK]" << "<sop>";
          for (auto message : chat) {
              std::string role(message->role);
              ss << "<|" << role << "|>" << "\n" << message->content;
          }
          if (add_ass) {
              ss << "<|assistant|>";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
          // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
          for (auto message : chat) {
              std::string role(message->role);
              if (role == "user") {
                  ss << LU8("<用户>");
                  ss << trim(message->content);
                  ss << "<AI>";
              } else {
                  ss << trim(message->content);
              }
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
          // DeepSeek-V2
          for (auto message : chat) {
              std::string role(message->role);
              if (role == "system") {
                  ss << message->content << "\n\n";
              } else if (role == "user") {
                  ss << "User: " << message->content << "\n\n";
              } else if (role == "assistant") {
                  ss << "Assistant: " << message->content << LU8("<|end▁of▁sentence|>");
              }
          }
          if (add_ass) {
              ss << "Assistant:";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_3) {
          // DeepSeek-V3
          for (auto message : chat) {
              std::string role(message->role);
              if (role == "system") {
                  ss << message->content << "\n\n";
              } else if (role == "user") {
                  ss << LU8("<|User|>") << message->content;
              } else if (role == "assistant") {
                  ss << LU8("<|Assistant|>") << message->content << LU8("<|end▁of▁sentence|>");
              }
          }
          if (add_ass) {
              ss << LU8("<|Assistant|>");
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
          // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
          // EXAONE-3.0-7.8B-Instruct
          for (auto message : chat) {
              std::string role(message->role);
              if (role == "system") {
                  ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
              } else if (role == "user") {
                  ss << "[|user|]" << trim(message->content) << "\n";
              } else if (role == "assistant") {
                  ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
              }
          }
          if (add_ass) {
              ss << "[|assistant|]";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
          // this template requires the model to have "\n\n" as EOT token
          for (auto message : chat) {
              std::string role(message->role);
              if (role == "user") {
                  ss << "User: " << message->content << "\n\nAssistant:";
              } else {
                  ss << message->content << "\n\n";
              }
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
          // IBM Granite template
          for (const auto & message : chat) {
              std::string role(message->role);
              ss << "<|start_of_role|>" << role << "<|end_of_role|>";
              if (role == "assistant_tool_call") {
                  ss << "<|tool_call|>";
              }
              ss << message->content << "<|end_of_text|>\n";
          }
          if (add_ass) {
              ss << "<|start_of_role|>assistant<|end_of_role|>\n";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
          // GigaChat template
          bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";

          // Handle system message if present
          if (has_system) {
              ss << "<s>" << chat[0]->content << "<|message_sep|>";
          } else {
              ss << "<s>";
          }

          // Process remaining messages
          for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
              std::string role(chat[i]->role);
              if (role == "user") {
                  ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
                      << "available functions<|role_sep|>[]<|message_sep|>";
              } else if (role == "assistant") {
                  ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
              }
          }

          // Add generation prompt if needed
          if (add_ass) {
              ss << "assistant<|role_sep|>";
          }
      } else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
          // Megrez template
          for (auto message : chat) {
              std::string role(message->role);
              ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
          }

          if (add_ass) {
              ss << "<|role_start|>assistant<|role_end|>";
          }
      } else {
          // template not supported
          return -1;
      }
      dest = ss.str();
      return dest.size();
  }

  // public interface

  int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
      auto it = LLM_CHAT_TEMPLATES.begin();
      for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
          output[i] = it->first.c_str();
          std::advance(it, 1);
      }
      return (int32_t) LLM_CHAT_TEMPLATES.size();
  }

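For reference, here is a minimal sketch of how the helpers changed in this diff fit together. It is illustrative only and not part of the package: the `main` function, the `jinja` string, and the `msgs` messages are invented for the example, the include paths assume the internal `llama-chat.h`/`llama.h` headers are available, and the function signatures are taken directly from the file above. A template string containing both `<|im_start|>` and the new `<|im_sep|>` marker is now detected as `LLM_CHAT_TEMPLATE_PHI_4` and rendered by the new Phi-4 branch, whereas 1.3.6 would have fallen back to ChatML.

// Hypothetical usage sketch (not part of cui-llama.rn): exercises the
// llm_chat_* helpers defined in llama-chat.cpp as shown in this diff.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

#include "llama.h"       // assumed include path; provides llama_chat_message
#include "llama-chat.h"  // assumed include path; internal header for the file above

int main() {
    // A Jinja-style template that contains both markers the new detection checks for.
    std::string jinja = "{{ '<|im_start|>' + role + '<|im_sep|>' + content + '<|im_end|>' }}";
    llm_chat_template tmpl = llm_chat_detect_template(jinja);
    // tmpl == LLM_CHAT_TEMPLATE_PHI_4 with 1.4.1; 1.3.6 returned LLM_CHAT_TEMPLATE_CHATML here.

    llama_chat_message msgs[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };
    std::vector<const llama_chat_message *> chat = { &msgs[0], &msgs[1] };

    std::string prompt;
    int32_t n = llm_chat_apply_template(tmpl, chat, prompt, /*add_ass=*/true);
    if (n < 0) {
        fprintf(stderr, "template not supported\n");
        return 1;
    }
    // The Phi-4 branch emits the turns back to back on a single line:
    // <|im_start|>system<|im_sep|>You are a helpful assistant.<|im_end|><|im_start|>user<|im_sep|>Hello!<|im_end|><|im_start|>assistant<|im_sep|>
    printf("%s\n", prompt.c_str());
    return 0;
}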