cui-llama.rn 1.3.3 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +5 -7
- package/android/src/main/java/com/rnllama/LlamaContext.java +4 -4
- package/android/src/main/jni.cpp +9 -9
- package/cpp/common.cpp +21 -40
- package/cpp/common.h +21 -12
- package/cpp/ggml-backend-impl.h +38 -20
- package/cpp/ggml-backend-reg.cpp +216 -87
- package/cpp/ggml-backend.h +1 -0
- package/cpp/ggml-common.h +42 -48
- package/cpp/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +591 -152
- package/cpp/ggml-cpu-aarch64.h +2 -26
- package/cpp/ggml-cpu-traits.cpp +36 -0
- package/cpp/ggml-cpu-traits.h +38 -0
- package/cpp/ggml-cpu.c +14122 -13971
- package/cpp/ggml-cpu.cpp +618 -715
- package/cpp/ggml-cpu.h +0 -17
- package/cpp/ggml-impl.h +6 -6
- package/cpp/ggml-metal.m +482 -24
- package/cpp/ggml-quants.c +0 -9
- package/cpp/ggml-threading.h +4 -2
- package/cpp/ggml.c +132 -43
- package/cpp/ggml.h +44 -13
- package/cpp/llama-sampling.cpp +35 -90
- package/cpp/llama-vocab.cpp +2 -1
- package/cpp/llama.cpp +737 -233
- package/cpp/llama.h +20 -16
- package/cpp/sampling.cpp +11 -16
- package/cpp/speculative.cpp +4 -0
- package/cpp/unicode.cpp +51 -51
- package/cpp/unicode.h +9 -10
- package/lib/commonjs/index.js +38 -1
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/index.js +36 -0
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +2 -3
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +36 -2
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +3 -3
- package/src/index.ts +46 -2
- package/cpp/amx/amx.cpp +0 -196
- package/cpp/amx/amx.h +0 -20
- package/cpp/amx/common.h +0 -101
- package/cpp/amx/mmq.cpp +0 -2524
- package/cpp/amx/mmq.h +0 -16
- package/cpp/ggml-aarch64.c +0 -129
- package/cpp/ggml-aarch64.h +0 -19
package/cpp/llama.cpp
CHANGED
@@ -174,6 +174,7 @@ enum llm_arch {
     LLM_ARCH_QWEN,
     LLM_ARCH_QWEN2,
     LLM_ARCH_QWEN2MOE,
+    LLM_ARCH_QWEN2VL,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
     LLM_ARCH_PLAMO,
@@ -194,6 +195,7 @@ enum llm_arch {
     LLM_ARCH_OLMOE,
     LLM_ARCH_OPENELM,
     LLM_ARCH_ARCTIC,
+    LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_BITNET,
@@ -228,6 +230,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN, "qwen" },
     { LLM_ARCH_QWEN2, "qwen2" },
     { LLM_ARCH_QWEN2MOE, "qwen2moe" },
+    { LLM_ARCH_QWEN2VL, "qwen2vl" },
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PHI3, "phi3" },
     { LLM_ARCH_PLAMO, "plamo" },
@@ -248,6 +251,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_OLMOE, "olmoe" },
     { LLM_ARCH_OPENELM, "openelm" },
     { LLM_ARCH_ARCTIC, "arctic" },
+    { LLM_ARCH_DEEPSEEK, "deepseek" },
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_CHATGLM, "chatglm" },
     { LLM_ARCH_BITNET, "bitnet" },
@@ -319,6 +323,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
+    LLM_KV_ROPE_DIMENSION_SECTIONS,
     LLM_KV_ROPE_FREQ_BASE,
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
@@ -435,6 +440,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
     { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
     { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
@@ -909,6 +915,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_QWEN2VL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_QWEN2MOE,
         {
@@ -1047,6 +1070,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
             { LLM_TENSOR_OUTPUT, "output" },
             { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
@@ -1297,6 +1322,33 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_DEEPSEEK,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+        },
+    },
     {
         LLM_ARCH_DEEPSEEK2,
         {
@@ -1560,6 +1612,69 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     },
 };
 
+enum llm_chat_template {
+    LLM_CHAT_TEMPLATE_CHATML,
+    LLM_CHAT_TEMPLATE_LLAMA_2,
+    LLM_CHAT_TEMPLATE_LLAMA_2_SYS,
+    LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS,
+    LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP,
+    LLM_CHAT_TEMPLATE_MISTRAL_V1,
+    LLM_CHAT_TEMPLATE_MISTRAL_V3,
+    LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
+    LLM_CHAT_TEMPLATE_MISTRAL_V7,
+    LLM_CHAT_TEMPLATE_PHI_3,
+    LLM_CHAT_TEMPLATE_ZEPHYR,
+    LLM_CHAT_TEMPLATE_MONARCH,
+    LLM_CHAT_TEMPLATE_GEMMA,
+    LLM_CHAT_TEMPLATE_ORION,
+    LLM_CHAT_TEMPLATE_OPENCHAT,
+    LLM_CHAT_TEMPLATE_VICUNA,
+    LLM_CHAT_TEMPLATE_VICUNA_ORCA,
+    LLM_CHAT_TEMPLATE_DEEPSEEK,
+    LLM_CHAT_TEMPLATE_DEEPSEEK_2,
+    LLM_CHAT_TEMPLATE_COMMAND_R,
+    LLM_CHAT_TEMPLATE_LLAMA_3,
+    LLM_CHAT_TEMPLATE_CHATGML_3,
+    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_MINICPM,
+    LLM_CHAT_TEMPLATE_EXAONE_3,
+    LLM_CHAT_TEMPLATE_RWKV_WORLD,
+    LLM_CHAT_TEMPLATE_GRANITE,
+    LLM_CHAT_TEMPLATE_GIGACHAT,
+    LLM_CHAT_TEMPLATE_UNKNOWN,
+};
+
+static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
+    { "chatml", LLM_CHAT_TEMPLATE_CHATML },
+    { "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 },
+    { "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS },
+    { "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS },
+    { "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
+    { "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 },
+    { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
+    { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
+    { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
+    { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
+    { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
+    { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
+    { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
+    { "orion", LLM_CHAT_TEMPLATE_ORION },
+    { "openchat", LLM_CHAT_TEMPLATE_OPENCHAT },
+    { "vicuna", LLM_CHAT_TEMPLATE_VICUNA },
+    { "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
+    { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
+    { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
+    { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
+    { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
+    { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
+    { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
+    { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
+    { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
+    { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
+    { "granite", LLM_CHAT_TEMPLATE_GRANITE },
+    { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
+};
+
 static llm_arch llm_arch_from_string(const std::string & name) {
     for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
         if (kv.second == name) {
@@ -1633,9 +1748,10 @@ struct LLM_TN {
 //
 
 static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
-    { LLAMA_ROPE_SCALING_TYPE_NONE,   "none"   },
-    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
-    { LLAMA_ROPE_SCALING_TYPE_YARN,   "yarn"   },
+    { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
+    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
+    { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
+    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
 };
 
 static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
@@ -1741,7 +1857,7 @@ private:
         DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                       NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
         if (!bufLen) {
-            ret = format("Win32 error code: %
+            ret = format("Win32 error code: %lx", error_code);
         } else {
             ret = lpMsgBuf;
             LocalFree(lpMsgBuf);
@@ -2079,7 +2195,7 @@ struct llama_mmap {
             HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
 
             // may fail on pre-Windows 8 systems
-            pPrefetchVirtualMemory =
+            pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
 
             if (pPrefetchVirtualMemory) {
                 // advise the kernel to preload the mapped memory
@@ -2421,11 +2537,12 @@ struct llama_hparams {
     uint32_t time_decay_extra_dim = 0;
     uint32_t wkv_head_size = 0;
 
-    float    rope_attn_factor = 1.0f;
-    float    rope_freq_base_train;
-    float    rope_freq_scale_train;
-    uint32_t n_ctx_orig_yarn;
-    float    rope_yarn_log_mul;
+    float rope_attn_factor = 1.0f;
+    float rope_freq_base_train;
+    float rope_freq_scale_train;
+    uint32_t n_ctx_orig_yarn;
+    float rope_yarn_log_mul;
+    int rope_sections[4];
 
     // for State Space Models
     uint32_t ssm_d_conv = 0;
@@ -2482,6 +2599,9 @@ struct llama_hparams {
 
     if (this->rope_finetuned != other.rope_finetuned) return true;
     if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
+    if (std::equal(std::begin(this->rope_sections),
+                   std::end(this->rope_sections),
+                   std::begin(other.rope_sections))) return true;
 
     if (this->ssm_d_conv != other.ssm_d_conv) return true;
     if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -3325,6 +3445,11 @@ struct llama_context {
     // whether we are computing encoder output or decoder output
     bool is_encoding = false;
 
+    // TODO: find a better way to accommodate mutli-dimension position encoding methods
+    // number of position id each token get, 1 for each token in most cases.
+    // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
+    int n_pos_per_token = 1;
+
     // output of the encoder part of the encoder-decoder models
     std::vector<float> embd_enc;
     std::vector<std::set<llama_seq_id>> seq_ids_enc;
@@ -4525,9 +4650,6 @@ struct llama_model_loader {
             case LM_GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
             case LM_GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
             case LM_GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
-            case LM_GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
-            case LM_GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
-            case LM_GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, lm_ggml_type_name(type_max));
@@ -5291,9 +5413,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
 
         default: return "unknown, may not work";
     }
@@ -5530,8 +5649,12 @@ static void llm_load_hparams(
         case LLM_ARCH_MINICPM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
 
                 switch (hparams.n_layer) {
+                    case 52: model.type = e_model::MODEL_1B; break;
                     case 40: model.type = e_model::MODEL_2B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
@@ -5696,6 +5819,13 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN2VL:
+            {
+                std::array<int, 4> section_dims;
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
+                std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
+            }
+            // fall through
         case LLM_ARCH_QWEN2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6006,6 +6136,19 @@ static void llm_load_hparams(
                         model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+
+                switch (hparams.n_layer) {
+                    case 28: model.type = e_model::MODEL_20B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_DEEPSEEK2:
             {
                 bool is_lite = (hparams.n_layer == 27);
@@ -6352,6 +6495,7 @@ static void llm_load_vocab(
                 tokenizer_pre == "phi-2" ||
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
+                tokenizer_pre == "gigachat" ||
                 tokenizer_pre == "jina-v1-en" ||
                 tokenizer_pre == "jina-v2-es" ||
                 tokenizer_pre == "jina-v2-de" ||
@@ -6422,6 +6566,9 @@ static void llm_load_vocab(
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
             vocab.tokenizer_add_bos = true;
             vocab.tokenizer_clean_spaces = false;
+        } else if (
+            tokenizer_pre == "minerva-7b") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -7000,6 +7147,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 
     LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
 
+    if (model.arch == LLM_ARCH_DEEPSEEK) {
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+    }
+
     if (model.arch == LLM_ARCH_DEEPSEEK2) {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
         LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -7015,7 +7169,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
 
-    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
+    if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale  = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7640,7 +7794,13 @@ static bool llm_load_tensors(
 
                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-                    layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+                        layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    }
+                    else {
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    }
 
                     if (n_expert == 0) {
                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
@@ -8107,6 +8267,7 @@ static bool llm_load_tensors(
                 }
             } break;
         case LLM_ARCH_QWEN2:
+        case LLM_ARCH_QWEN2VL:
             {
                 model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -8767,6 +8928,55 @@ static bool llm_load_tensors(
                     layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                 }
             } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+
+                const int64_t n_ff_exp = hparams.n_ff_exp;
+                const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                    if (i < (int) hparams.n_layer_dense_lead) {
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    } else {
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                        if (n_expert == 0) {
+                            throw std::runtime_error("n_expert must be > 0");
+                        }
+                        if (n_expert_used == 0) {
+                            throw std::runtime_error("n_expert_used must be > 0");
+                        }
+
+                        // MoE branch
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+                        // Shared expert branch
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
+                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                    }
+                }
+            } break;
         case LLM_ARCH_DEEPSEEK2:
             {
                 const bool is_lite = (hparams.n_layer == 27);
@@ -12496,6 +12706,124 @@ struct llm_build_context {
         return gf;
     }
 
+    struct lm_ggml_cgraph * build_qwen2vl() {
+        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        lctx.inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens * 4);
+        cb(lctx.inp_pos, "inp_pos", -1);
+        lm_ggml_set_input(lctx.inp_pos);
+        struct lm_ggml_tensor * inp_pos = lctx.inp_pos;
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        int sections[4];
+        std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = lm_ggml_rope_multi(
+                    ctx0,
+                    lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = lm_ggml_rope_multi(
+                    ctx0,
+                    lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        lm_ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct lm_ggml_cgraph * build_qwen2moe() {
         struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
@@ -13447,21 +13775,18 @@ struct llm_build_context {
         return gf;
     }
 
-
-    // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
-    // based on the original build_llama() function
-    struct lm_ggml_cgraph * build_minicpm() {
+    struct lm_ggml_cgraph * build_minicpm3() {
         struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        const int64_t n_embd = hparams.n_embd;
         //TODO: if the model varies, these parameters need to be read from the model
         const int64_t n_embd_base = 256;
         const float scale_embd = 12.0f;
         const float scale_depth = 1.4f;
+        const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
+
+        const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+        const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+        const uint32_t kv_lora_rank = hparams.n_lora_kv;
 
         struct lm_ggml_tensor * cur;
         struct lm_ggml_tensor * inpL;
@@ -13481,163 +13806,19 @@ struct llm_build_context {
         for (int il = 0; il < n_layer; ++il) {
             struct lm_ggml_tensor * inpSA = inpL;
 
+            struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm, NULL,
                     LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il);
 
-            // self-attention
+            // self_attention
             {
-                // compute Q and K and RoPE them
-                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-                if (model.layers[il].bq) {
-                    Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
-                    cb(Qcur, "Qcur", il);
-                }
-
-                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-                if (model.layers[il].bk) {
-                    Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
-                    cb(Kcur, "Kcur", il);
-                }
-
-                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-                if (model.layers[il].bv) {
-                    Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
-                    cb(Vcur, "Vcur", il);
-                }
-
-                Qcur = lm_ggml_rope_ext(
-                    ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
-
-                Kcur = lm_ggml_rope_ext(
-                    ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Kcur, "Kcur", il);
-
-                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-            }
-
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
-                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
-            }
-
-            // scale_res - scale the hidden states for residual connection
-            const float scale_res = scale_depth/sqrtf(float(n_layer));
-            cur = lm_ggml_scale(ctx0, cur, scale_res);
-            cb(cur, "hidden_scaled", -1);
-
-            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, lctx, cur,
-                        model.layers[il].ffn_up, NULL, NULL,
-                        model.layers[il].ffn_gate, NULL, NULL,
-                        model.layers[il].ffn_down, NULL, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            // scale the hidden states for residual connection
-            cur = lm_ggml_scale(ctx0, cur, scale_res);
-            cb(cur, "hidden_scaled_ffn", -1);
-
-            cur = lm_ggml_add(ctx0, cur, ffn_inp);
-            cur = lctx.cvec.apply_to(ctx0, cur, il);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head scaling
-        const float scale_lmhead = float(n_embd_base)/float(n_embd);
-        cur = lm_ggml_scale(ctx0, cur, scale_lmhead);
-        cb(cur, "lmhead_scaling", -1);
-
-        // lm_head
-        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        lm_ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct lm_ggml_cgraph * build_minicpm3() {
-        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
-
-        //TODO: if the model varies, these parameters need to be read from the model
-        const int64_t n_embd_base = 256;
-        const float scale_embd = 12.0f;
-        const float scale_depth = 1.4f;
-        const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
-
-        const uint32_t n_embd_head_qk_rope = hparams.n_rot;
-        const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
-        const uint32_t kv_lora_rank = hparams.n_lora_kv;
-
-        struct lm_ggml_tensor * cur;
-        struct lm_ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
-
-        // scale the input embeddings
-        inpL = lm_ggml_scale(ctx0, inpL, scale_embd);
-        cb(inpL, "inp_scaled", -1);
-
-        // inp_pos - contains the positions
-        struct lm_ggml_tensor * inp_pos = build_inp_pos();
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct lm_ggml_tensor * inpSA = inpL;
-
-            struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
-            // norm
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self_attention
-            {
-                struct lm_ggml_tensor * q = NULL;
-                // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
-                q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
-                cb(q, "q", il);
+                struct lm_ggml_tensor * q = NULL;
+                // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
+                q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+                cb(q, "q", il);
 
                 q = llm_build_norm(ctx0, q, hparams,
                         model.layers[il].attn_q_a_norm, NULL,
@@ -15150,6 +15331,161 @@ struct llm_build_context {
         return gf;
     }
 
+    struct lm_ggml_cgraph * build_deepseek() {
+        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
+
+                // compute Q and K and RoPE them
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = lm_ggml_rope_ext(
+                    ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = lm_ggml_rope_ext(
+                    ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+
+            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            if ((uint32_t) il < hparams.n_layer_dense_lead) {
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up, NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                // MoE branch
+                lm_ggml_tensor * moe_out =
+                        llm_build_moe_ffn(ctx0, lctx, cur,
+                            model.layers[il].ffn_gate_inp,
+                            model.layers[il].ffn_up_exps,
+                            model.layers[il].ffn_gate_exps,
+                            model.layers[il].ffn_down_exps,
+                            n_expert, n_expert_used,
+                            LLM_FFN_SILU, false,
+                            false, hparams.expert_weights_scale,
+                            cb, il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // FFN shared expert
+                {
+                    lm_ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
+                            model.layers[il].ffn_up_shexp, NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = lm_ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                }
+            }
+
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cb(cur, "result_output", -1);
+
+        lm_ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct lm_ggml_cgraph * build_deepseek2() {
         struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
@@ -16692,6 +17028,7 @@ static struct lm_ggml_cgraph * llama_build_graph(
 
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
             {
@@ -16743,6 +17080,11 @@ static struct lm_ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_qwen2();
             } break;
+        case LLM_ARCH_QWEN2VL:
+            {
+                lctx.n_pos_per_token = 4;
+                result = llm.build_qwen2vl();
+            } break;
         case LLM_ARCH_QWEN2MOE:
             {
                 result = llm.build_qwen2moe();
@@ -16775,10 +17117,6 @@ static struct lm_ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_internlm2();
             } break;
-        case LLM_ARCH_MINICPM:
-            {
-                result = llm.build_minicpm();
-            } break;
         case LLM_ARCH_MINICPM3:
             {
                 result = llm.build_minicpm3();
@@ -16835,6 +17173,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_arctic();
             } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+                result = llm.build_deepseek();
+            } break;
         case LLM_ARCH_DEEPSEEK2:
             {
                 result = llm.build_deepseek2();
@@ -16965,8 +17307,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
 
     if (ubatch.pos && lctx.inp_pos) {
         const int64_t n_tokens = ubatch.n_tokens;
-
-        lm_ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*lm_ggml_element_size(lctx.inp_pos));
+        auto n_pos = lctx.n_pos_per_token;
+        lm_ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*lm_ggml_element_size(lctx.inp_pos));
     }
 
     if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
@@ -18451,10 +18793,6 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
         new_type = LM_GGML_TYPE_IQ3_S;
     }
-    else if (new_type == LM_GGML_TYPE_Q4_0_4_4 || new_type == LM_GGML_TYPE_Q4_0_4_8 ||
-             new_type == LM_GGML_TYPE_Q4_0_8_8) {
-        new_type = LM_GGML_TYPE_Q4_0;
-    }
     else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
         new_type = LM_GGML_TYPE_Q4_K;
     }
@@ -18777,9 +19115,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = LM_GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = LM_GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = LM_GGML_TYPE_IQ3_S; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = LM_GGML_TYPE_Q4_0_4_4; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = LM_GGML_TYPE_Q4_0_4_8; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = LM_GGML_TYPE_Q4_0_8_8; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -19118,14 +19453,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             f32_data = (float *) f32_conv_buf.data();
         }
 
-        int chunk_size_multiplier = 1;
-        if (new_type == LM_GGML_TYPE_Q4_0_4_4 || new_type == LM_GGML_TYPE_Q4_0_4_8 || new_type == LM_GGML_TYPE_Q4_0_8_8) {
-            if ((new_type == LM_GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = LM_GGML_TYPE_Q4_0;
-            else if (tensor->ne[1] % 4 != 0) new_type = LM_GGML_TYPE_Q4_0;
-            if (new_type == LM_GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
-            else if (new_type == LM_GGML_TYPE_Q4_0_4_4 || new_type == LM_GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
-        }
-
         LLAMA_LOG_INFO("converting to %s .. ", lm_ggml_type_name(new_type));
         fflush(stdout);
 
@@ -19138,8 +19465,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         const int64_t nrows = tensor->ne[1];
 
         static const int64_t min_chunk_size = 32 * 512;
-        const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
-                                   chunk_size_multiplier;
+        const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
 
         const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
         const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
@@ -20082,6 +20408,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_COMMAND_R:
         case LLM_ARCH_OLMO:
         case LLM_ARCH_ARCTIC:
+        case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GRANITE:
@@ -20115,6 +20442,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_MINICPM3:
             return LLAMA_ROPE_TYPE_NEOX;
 
+        case LLM_ARCH_QWEN2VL:
+            return LLAMA_ROPE_TYPE_MROPE;
+
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
             LM_GGML_ABORT("unknown architecture");
@@ -21683,7 +22013,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
             throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
         }
     } else if ((size_t) i >= ctx->output_ids.size()) {
-        throw std::runtime_error(format("out of range [0, %
+        throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
     } else {
         j = ctx->output_ids[i];
     }
@@ -21854,18 +22184,111 @@ int32_t llama_detokenize(
|
|
21854
22184
|
// chat templates
|
21855
22185
|
//
|
21856
22186
|
|
22187
|
+
static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
|
22188
|
+
if (LLM_CHAT_TEMPLATES.find(tmpl) != LLM_CHAT_TEMPLATES.end()) {
|
22189
|
+
return LLM_CHAT_TEMPLATES.at(tmpl);
|
22190
|
+
}
|
22191
|
+
auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
|
22192
|
+
return tmpl.find(haystack) != std::string::npos;
|
22193
|
+
};
|
22194
|
+
if (tmpl_contains("<|im_start|>")) {
|
22195
|
+
return LLM_CHAT_TEMPLATE_CHATML;
|
22196
|
+
} else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
|
22197
|
+
if (tmpl_contains("[SYSTEM_PROMPT]")) {
|
22198
|
+
return LLM_CHAT_TEMPLATE_MISTRAL_V7;
|
22199
|
+
} else if (
|
22200
|
+
// catches official 'v1' template
|
22201
|
+
tmpl_contains("' [INST] ' + system_message")
|
22202
|
+
// catches official 'v3' and 'v3-tekken' templates
|
22203
|
+
|| tmpl_contains("[AVAILABLE_TOOLS]")
|
22204
|
+
) {
|
22205
|
+
// Official mistral 'v1', 'v3' and 'v3-tekken' templates
|
22206
|
+
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
|
22207
|
+
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
|
22208
|
+
if (tmpl_contains(" [INST]")) {
|
22209
|
+
return LLM_CHAT_TEMPLATE_MISTRAL_V1;
|
22210
|
+
} else if (tmpl_contains("\"[INST]\"")) {
|
22211
|
+
return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
|
22212
|
+
}
|
22213
|
+
return LLM_CHAT_TEMPLATE_MISTRAL_V3;
|
22214
|
+
} else {
|
22215
|
+
// llama2 template and its variants
|
22216
|
+
// [variant] support system message
|
22217
|
+
// See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
|
22218
|
+
bool support_system_message = tmpl_contains("<<SYS>>");
|
22219
|
+
bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
|
22220
|
+
bool strip_message = tmpl_contains("content.strip()");
|
22221
|
+
if (strip_message) {
|
22222
|
+
return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
|
22223
|
+
} else if (add_bos_inside_history) {
|
22224
|
+
return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
|
22225
|
+
} else if (support_system_message) {
|
22226
|
+
return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
|
22227
|
+
} else {
|
22228
|
+
return LLM_CHAT_TEMPLATE_LLAMA_2;
|
22229
|
+
}
|
22230
|
+
}
|
22231
|
+
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
|
22232
|
+
return LLM_CHAT_TEMPLATE_PHI_3;
|
22233
|
+
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
|
22234
|
+
return LLM_CHAT_TEMPLATE_ZEPHYR;
|
22235
|
+
} else if (tmpl_contains("bos_token + message['role']")) {
|
22236
|
+
return LLM_CHAT_TEMPLATE_MONARCH;
|
22237
|
+
} else if (tmpl_contains("<start_of_turn>")) {
|
22238
|
+
return LLM_CHAT_TEMPLATE_GEMMA;
|
22239
|
+
} else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
|
22240
|
+
// OrionStarAI/Orion-14B-Chat
|
22241
|
+
return LLM_CHAT_TEMPLATE_ORION;
|
22242
|
+
} else if (tmpl_contains("GPT4 Correct ")) {
|
22243
|
+
// openchat/openchat-3.5-0106
|
22244
|
+
return LLM_CHAT_TEMPLATE_OPENCHAT;
|
22245
|
+
} else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
|
22246
|
+
// eachadea/vicuna-13b-1.1 (and Orca variant)
|
22247
|
+
if (tmpl_contains("SYSTEM: ")) {
|
22248
|
+
return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
|
22249
|
+
}
|
22250
|
+
return LLM_CHAT_TEMPLATE_VICUNA;
|
22251
|
+
} else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
|
22252
|
+
// deepseek-ai/deepseek-coder-33b-instruct
|
22253
|
+
return LLM_CHAT_TEMPLATE_DEEPSEEK;
|
22254
|
+
} else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
|
22255
|
+
// CohereForAI/c4ai-command-r-plus
|
22256
|
+
return LLM_CHAT_TEMPLATE_COMMAND_R;
|
22257
|
+
} else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
|
22258
|
+
return LLM_CHAT_TEMPLATE_LLAMA_3;
|
22259
|
+
} else if (tmpl_contains("[gMASK]sop")) {
|
22260
|
+
// chatglm3-6b
|
22261
|
+
return LLM_CHAT_TEMPLATE_CHATGML_3;
|
22262
|
+
} else if (tmpl_contains("[gMASK]<sop>")) {
|
22263
|
+
return LLM_CHAT_TEMPLATE_CHATGML_4;
|
22264
|
+
} else if (tmpl_contains(LU8("<用户>"))) {
|
22265
|
+
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
22266
|
+
return LLM_CHAT_TEMPLATE_MINICPM;
|
22267
|
+
} else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
|
22268
|
+
return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
|
22269
|
+
} else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
|
22270
|
+
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
|
22271
|
+
// EXAONE-3.0-7.8B-Instruct
|
22272
|
+
return LLM_CHAT_TEMPLATE_EXAONE_3;
|
22273
|
+
} else if (tmpl_contains("rwkv-world")) {
|
22274
|
+
return LLM_CHAT_TEMPLATE_RWKV_WORLD;
|
22275
|
+
} else if (tmpl_contains("<|start_of_role|>")) {
|
22276
|
+
return LLM_CHAT_TEMPLATE_GRANITE;
|
22277
|
+
} else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
|
22278
|
+
return LLM_CHAT_TEMPLATE_GIGACHAT;
|
22279
|
+
}
|
22280
|
+
return LLM_CHAT_TEMPLATE_UNKNOWN;
|
22281
|
+
}
|
22282
|
+
 // Simple version of "llama_apply_chat_template" that only works with strings
 // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
 static int32_t llama_chat_apply_template_internal(
-    const std::string & tmpl,
+    const llm_chat_template tmpl,
     const std::vector<const llama_chat_message *> & chat,
     std::string & dest, bool add_ass) {
     // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
     std::stringstream ss;
-    auto tmpl_contains = [&tmpl](std::string haystack) -> bool {
-        return tmpl.find(haystack) != std::string::npos;
-    };
-    if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
+    if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
         // chatml template
         for (auto message : chat) {
             ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
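For reference, the ChatML branch renders every message as `<|im_start|>{role}\n{content}<|im_end|>\n`, and `add_ass` appends an open assistant turn. A two-message chat (contents invented for illustration) therefore formats as:

```
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Hello!<|im_end|>
<|im_start|>assistant
```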
@@ -21873,16 +22296,59 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+        // Official mistral 'v7' template
+        // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+        for (auto message : chat) {
+            std::string role(message->role);
+            std::string content(message->content);
+            if (role == "system") {
+                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+            } else if (role == "user") {
+                ss << "[INST] " << content << "[/INST]";
+            }
+            else {
+                ss << " " << content << "</s>";
+            }
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
+            || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
+            || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
+        // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
+        // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
+        std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
+        std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
+        bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
+        bool is_inside_turn = false;
+        for (auto message : chat) {
+            if (!is_inside_turn) {
+                ss << leading_space << "[INST]" << trailing_space;
+                is_inside_turn = true;
+            }
+            std::string role(message->role);
+            std::string content(message->content);
+            if (role == "system") {
+                ss << content << "\n\n";
+            } else if (role == "user") {
+                ss << content << leading_space << "[/INST]";
+            } else {
+                ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
+                is_inside_turn = false;
+            }
+        }
+    } else if (
+            tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
+            || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
+            || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
+            || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
         // llama2 template and its variants
         // [variant] support system message
-        bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "mistral";
-        // [variant] space around response
-        bool space_around_response = tmpl_contains("' ' + eos_token");
+        // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
+        bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
         // [variant] add BOS inside history
-        bool add_bos_inside_history =
+        bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
         // [variant] trim spaces from the input message
-        bool strip_message =
+        bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
         // construct the prompt
         bool is_inside_turn = true; // skip BOS at the beginning
         ss << "[INST] ";
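The Mistral variants above differ only in the spacing around `[INST]`/`[/INST]` and in whether the assistant message is trimmed. Deriving from the code, a system message `S`, user message `U`, and assistant reply `A` render as (`\n` literal, quotes added to make leading spaces visible):

```
v7:        "[SYSTEM_PROMPT] S[/SYSTEM_PROMPT][INST] U[/INST] A</s>"
v1:        " [INST] S\n\nU [/INST] A</s>"
v3:        "[INST] S\n\nU[/INST] A</s>"      (A trimmed)
v3-tekken: "[INST]S\n\nU[/INST]A</s>"
```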
@@ -21903,12 +22369,11 @@ static int32_t llama_chat_apply_template_internal(
             } else if (role == "user") {
                 ss << content << " [/INST]";
             } else {
-                ss << (space_around_response ? " " : "") << content << "</s>";
+                ss << content << "</s>";
                 is_inside_turn = false;
             }
         }
-        // llama2 templates seem to not care about "add_generation_prompt"
-    } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
         // Phi 3
         for (auto message : chat) {
             std::string role(message->role);
@@ -21917,7 +22382,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
         // zephyr template
         for (auto message : chat) {
             ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -21925,7 +22390,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
         // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
         for (auto message : chat) {
             std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -21934,7 +22399,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<s>assistant\n";
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
         // google/gemma-7b-it
         std::string system_prompt = "";
         for (auto message : chat) {
@@ -21956,7 +22421,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<start_of_turn>model\n";
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
         // OrionStarAI/Orion-14B-Chat
         std::string system_prompt = "";
         for (auto message : chat) {
@@ -21976,7 +22441,7 @@ static int32_t llama_chat_apply_template_internal(
                 ss << message->content << "</s>";
             }
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
         // openchat/openchat-3.5-0106,
         for (auto message : chat) {
             std::string role(message->role);
@@ -21990,13 +22455,13 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "GPT4 Correct Assistant:";
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
         // eachadea/vicuna-13b-1.1 (and Orca variant)
         for (auto message : chat) {
             std::string role(message->role);
             if (role == "system") {
                 // Orca-Vicuna variant uses a system prefix
-                if (tmpl ==
+                if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
                     ss << "SYSTEM: " << message->content << "\n";
                 } else {
                     ss << message->content << "\n\n";
@@ -22010,7 +22475,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "ASSISTANT:";
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
         // deepseek-ai/deepseek-coder-33b-instruct
         for (auto message : chat) {
             std::string role(message->role);
@@ -22025,7 +22490,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "### Response:\n";
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
         // CohereForAI/c4ai-command-r-plus
         for (auto message : chat) {
             std::string role(message->role);
@@ -22040,7 +22505,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
         // Llama 3
         for (auto message : chat) {
             std::string role(message->role);
@@ -22049,7 +22514,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
         // chatglm3-6b
         ss << "[gMASK]" << "sop";
         for (auto message : chat) {
@@ -22059,7 +22524,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
@@ -22068,7 +22533,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         for (auto message : chat) {
             std::string role(message->role);
@@ -22080,7 +22545,7 @@ static int32_t llama_chat_apply_template_internal(
                 ss << trim(message->content);
             }
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
         // DeepSeek-V2
         for (auto message : chat) {
             std::string role(message->role);
@@ -22095,7 +22560,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "Assistant:";
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
         // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
         // EXAONE-3.0-7.8B-Instruct
         for (auto message : chat) {
@@ -22111,7 +22576,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "[|assistant|]";
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
         // this template requires the model to have "\n\n" as EOT token
         for (auto message : chat) {
             std::string role(message->role);
@@ -22121,7 +22586,7 @@ static int32_t llama_chat_apply_template_internal(
                 ss << message->content << "\n\n";
             }
         }
-    } else if (tmpl ==
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
         // IBM Granite template
         for (const auto & message : chat) {
             std::string role(message->role);
@@ -22134,6 +22599,32 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_of_role|>assistant<|end_of_role|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
+        // GigaChat template
+        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+        // Handle system message if present
+        if (has_system) {
+            ss << "<s>" << chat[0]->content << "<|message_sep|>";
+        } else {
+            ss << "<s>";
+        }
+
+        // Process remaining messages
+        for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (role == "user") {
+                ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
+                   << "available functions<|role_sep|>[]<|message_sep|>";
+            } else if (role == "assistant") {
+                ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
+            }
+        }
+
+        // Add generation prompt if needed
+        if (add_ass) {
+            ss << "assistant<|role_sep|>";
+        }
     } else {
         // template not supported
         return -1;
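Derived from the GigaChat branch above: a system message `S` and user message `U` with `add_ass == true` render as one string, with an empty "available functions" list emitted after each user turn:

```
<s>S<|message_sep|>user<|role_sep|>U<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>
```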
@@ -22173,7 +22664,11 @@ int32_t llama_chat_apply_template(
     }

     std::string formatted_chat;
-    int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
+    llm_chat_template detected_tmpl = llama_chat_detect_template(curr_tmpl);
+    if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
+        return -1;
+    }
+    int32_t res = llama_chat_apply_template_internal(detected_tmpl, chat_vec, formatted_chat, add_ass);
     if (res < 0) {
         return res;
     }
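With this change, template detection happens once in `llama_chat_apply_template` and the internal formatter only ever sees a concrete `llm_chat_template` value; a template string that fails detection now returns -1 before any formatting work is done. A minimal caller-side sketch, assuming the `llama_chat_apply_template` signature from this version's `llama.h` (model pointer, optional template override, message array, output buffer):

```cpp
#include <string>
#include <vector>
#include "llama.h"

// Sketch only: format a chat with the model's built-in template (tmpl == nullptr).
std::string format_chat(const llama_model * model) {
    std::vector<llama_chat_message> msgs = {
        { "system", "You are a helpful assistant." }, // contents invented for illustration
        { "user",   "Hello!"                       },
    };
    std::vector<char> buf(1024);
    int32_t n = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(),
                                          /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n > (int32_t) buf.size()) {
        buf.resize(n); // the return value is the required length; grow and retry
        n = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(),
                                      /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    }
    if (n < 0) {
        return ""; // template unknown or unsupported
    }
    return std::string(buf.data(), n);
}
```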
@@ -22183,6 +22678,15 @@ int32_t llama_chat_apply_template(
     return res;
 }

+int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
+    auto it = LLM_CHAT_TEMPLATES.begin();
+    for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
+        output[i] = it->first.c_str();
+        std::advance(it, 1);
+    }
+    return (int32_t) LLM_CHAT_TEMPLATES.size();
+}
+
 //
 // sampling
 //
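The new `llama_chat_builtin_templates` fills at most `len` entries and always returns the total number of built-in templates, so callers can probe with a zero-length query first. A usage sketch (assuming the declaration is exported via `llama.h`, as this release's header diff suggests):

```cpp
#include <cstdio>
#include <vector>
#include "llama.h"

// Sketch only: print every built-in chat template name.
void list_builtin_templates() {
    int32_t count = llama_chat_builtin_templates(nullptr, 0); // probe: fills nothing, returns total
    std::vector<const char *> names(count);
    llama_chat_builtin_templates(names.data(), names.size());
    for (const char * name : names) {
        printf("%s\n", name);
    }
}
```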