@fugood/llama.node 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/common/CMakeLists.txt +4 -5
- package/src/llama.cpp/common/arg.cpp +37 -0
- package/src/llama.cpp/common/common.cpp +22 -6
- package/src/llama.cpp/common/common.h +14 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/src/llama.cpp/ggml/include/ggml.h +13 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +23 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +39 -0
- package/src/llama.cpp/include/llama.h +13 -48
- package/src/llama.cpp/src/llama-arch.cpp +222 -15
- package/src/llama.cpp/src/llama-arch.h +16 -1
- package/src/llama.cpp/src/llama-batch.cpp +76 -70
- package/src/llama.cpp/src/llama-batch.h +24 -18
- package/src/llama.cpp/src/llama-chat.cpp +44 -1
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +134 -95
- package/src/llama.cpp/src/llama-context.h +13 -16
- package/src/llama.cpp/src/llama-cparams.h +3 -2
- package/src/llama.cpp/src/llama-graph.cpp +239 -154
- package/src/llama.cpp/src/llama-graph.h +162 -126
- package/src/llama.cpp/src/llama-hparams.cpp +45 -0
- package/src/llama.cpp/src/llama-hparams.h +11 -1
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
- package/src/llama.cpp/src/llama-kv-cache-unified.h +89 -31
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -9
- package/src/llama.cpp/src/llama-model.cpp +2309 -665
- package/src/llama.cpp/src/llama-model.h +18 -4
- package/src/llama.cpp/src/llama-quant.cpp +2 -2
- package/src/llama.cpp/src/llama-vocab.cpp +368 -9
- package/src/llama.cpp/src/llama-vocab.h +43 -0
- package/src/llama.cpp/src/unicode.cpp +207 -0
- package/src/llama.cpp/src/unicode.h +2 -0
|
@@ -40,17 +40,21 @@ const char * llm_type_name(llm_type type) {
|
|
|
40
40
|
case LLM_TYPE_190M: return "190M";
|
|
41
41
|
case LLM_TYPE_220M: return "220M";
|
|
42
42
|
case LLM_TYPE_250M: return "250M";
|
|
43
|
+
case LLM_TYPE_256M: return "256M";
|
|
43
44
|
case LLM_TYPE_270M: return "270M";
|
|
44
45
|
case LLM_TYPE_335M: return "335M";
|
|
46
|
+
case LLM_TYPE_350M: return "350M";
|
|
45
47
|
case LLM_TYPE_410M: return "410M";
|
|
46
48
|
case LLM_TYPE_450M: return "450M";
|
|
47
49
|
case LLM_TYPE_475M: return "475M";
|
|
50
|
+
case LLM_TYPE_700M: return "700M";
|
|
48
51
|
case LLM_TYPE_770M: return "770M";
|
|
49
52
|
case LLM_TYPE_780M: return "780M";
|
|
50
53
|
case LLM_TYPE_0_3B: return "0.3B";
|
|
51
54
|
case LLM_TYPE_0_5B: return "0.5B";
|
|
52
55
|
case LLM_TYPE_0_6B: return "0.6B";
|
|
53
56
|
case LLM_TYPE_1B: return "1B";
|
|
57
|
+
case LLM_TYPE_1_2B: return "1.2B";
|
|
54
58
|
case LLM_TYPE_1_3B: return "1.3B";
|
|
55
59
|
case LLM_TYPE_1_4B: return "1.4B";
|
|
56
60
|
case LLM_TYPE_1_5B: return "1.5B";
|
|
@@ -103,8 +107,10 @@ const char * llm_type_name(llm_type type) {
|
|
|
103
107
|
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
|
|
104
108
|
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
|
|
105
109
|
case LLM_TYPE_A13B: return "A13B";
|
|
110
|
+
case LLM_TYPE_21B_A3B: return "21B.A3B";
|
|
106
111
|
case LLM_TYPE_30B_A3B: return "30B.A3B";
|
|
107
112
|
case LLM_TYPE_235B_A22B: return "235B.A22B";
|
|
113
|
+
case LLM_TYPE_300B_A47B: return "300B.A47B";
|
|
108
114
|
case LLM_TYPE_E2B: return "E2B";
|
|
109
115
|
case LLM_TYPE_E4B: return "E4B";
|
|
110
116
|
default: return "?B";
|
|
@@ -581,6 +587,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
581
587
|
case 22: type = LLM_TYPE_1B; break;
|
|
582
588
|
case 26: type = LLM_TYPE_3B; break;
|
|
583
589
|
case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
|
|
590
|
+
case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
|
|
584
591
|
// granite uses a vocab with len 49152
|
|
585
592
|
case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
|
|
586
593
|
case 36: type = LLM_TYPE_8B; break; // granite
|
|
@@ -844,6 +851,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
844
851
|
default: type = LLM_TYPE_UNKNOWN;
|
|
845
852
|
}
|
|
846
853
|
} break;
|
|
854
|
+
case LLM_ARCH_DREAM:
|
|
855
|
+
{
|
|
856
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
857
|
+
// Dream models are primarily 7B with 28 layers
|
|
858
|
+
switch (hparams.n_layer) {
|
|
859
|
+
case 28:
|
|
860
|
+
type = LLM_TYPE_7B;
|
|
861
|
+
break;
|
|
862
|
+
default:
|
|
863
|
+
type = LLM_TYPE_UNKNOWN;
|
|
864
|
+
}
|
|
865
|
+
// Set non-causal attention for diffusion models
|
|
866
|
+
hparams.causal_attn = false;
|
|
867
|
+
}
|
|
868
|
+
break;
|
|
847
869
|
case LLM_ARCH_QWEN2MOE:
|
|
848
870
|
{
|
|
849
871
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
|
@@ -930,6 +952,33 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
930
952
|
default: type = LLM_TYPE_UNKNOWN;
|
|
931
953
|
}
|
|
932
954
|
} break;
|
|
955
|
+
case LLM_ARCH_PLAMO2:
|
|
956
|
+
{
|
|
957
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
958
|
+
|
|
959
|
+
// Load Mamba SSM parameters
|
|
960
|
+
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
|
961
|
+
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
|
|
962
|
+
ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
|
963
|
+
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
|
964
|
+
ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
|
|
965
|
+
|
|
966
|
+
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
|
967
|
+
hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
|
|
968
|
+
}
|
|
969
|
+
|
|
970
|
+
switch (hparams.n_layer) {
|
|
971
|
+
case 16: type = LLM_TYPE_1B; break;
|
|
972
|
+
case 32:
|
|
973
|
+
if (hparams.n_embd == 2048) {
|
|
974
|
+
type = LLM_TYPE_2B;
|
|
975
|
+
} else if (hparams.n_embd == 4096) {
|
|
976
|
+
type = LLM_TYPE_8B;
|
|
977
|
+
}
|
|
978
|
+
break;
|
|
979
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
980
|
+
}
|
|
981
|
+
} break;
|
|
933
982
|
case LLM_ARCH_GPT2:
|
|
934
983
|
{
|
|
935
984
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
@@ -1118,6 +1167,26 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1118
1167
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1119
1168
|
}
|
|
1120
1169
|
} break;
|
|
1170
|
+
case LLM_ARCH_JAMBA:
|
|
1171
|
+
{
|
|
1172
|
+
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
|
1173
|
+
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
|
|
1174
|
+
ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
|
1175
|
+
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
|
1176
|
+
|
|
1177
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1178
|
+
|
|
1179
|
+
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
|
1180
|
+
hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
|
|
1181
|
+
}
|
|
1182
|
+
|
|
1183
|
+
switch (hparams.n_layer) {
|
|
1184
|
+
// TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
|
|
1185
|
+
case 12: // 900M 8x???M
|
|
1186
|
+
case 32: // 51B 16x?B
|
|
1187
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1188
|
+
}
|
|
1189
|
+
} break;
|
|
1121
1190
|
case LLM_ARCH_XVERSE:
|
|
1122
1191
|
{
|
|
1123
1192
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -1421,6 +1490,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1421
1490
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1422
1491
|
}
|
|
1423
1492
|
} break;
|
|
1493
|
+
case LLM_ARCH_EXAONE4:
|
|
1494
|
+
{
|
|
1495
|
+
if (hparams.n_layer == 64) { // 32B
|
|
1496
|
+
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
1497
|
+
hparams.n_swa = 4096;
|
|
1498
|
+
hparams.set_swa_pattern(4);
|
|
1499
|
+
}
|
|
1500
|
+
|
|
1501
|
+
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
|
1502
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1503
|
+
|
|
1504
|
+
switch (hparams.n_layer) {
|
|
1505
|
+
case 30: type = LLM_TYPE_1_2B; break;
|
|
1506
|
+
case 64: type = LLM_TYPE_32B; break;
|
|
1507
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1508
|
+
}
|
|
1509
|
+
} break;
|
|
1424
1510
|
case LLM_ARCH_RWKV6:
|
|
1425
1511
|
case LLM_ARCH_RWKV6QWEN2:
|
|
1426
1512
|
{
|
|
@@ -1484,6 +1570,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1484
1570
|
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
|
|
1485
1571
|
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
|
|
1486
1572
|
|
|
1573
|
+
// Granite uses rope_finetuned as a switch for rope, so default to true
|
|
1574
|
+
bool rope_finetuned = true;
|
|
1575
|
+
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
|
|
1576
|
+
hparams.rope_finetuned = rope_finetuned;
|
|
1577
|
+
|
|
1487
1578
|
switch (hparams.n_layer) {
|
|
1488
1579
|
case 32: type = LLM_TYPE_3B; break;
|
|
1489
1580
|
case 40: type = LLM_TYPE_3B; break;
|
|
@@ -1491,6 +1582,40 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1491
1582
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1492
1583
|
}
|
|
1493
1584
|
|
|
1585
|
+
// For Granite MoE Shared
|
|
1586
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
|
|
1587
|
+
} break;
|
|
1588
|
+
case LLM_ARCH_GRANITE_HYBRID:
|
|
1589
|
+
{
|
|
1590
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1591
|
+
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
|
|
1592
|
+
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /* required */ false);
|
|
1593
|
+
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /* required */ false);
|
|
1594
|
+
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, /* required */ false);
|
|
1595
|
+
|
|
1596
|
+
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
|
1597
|
+
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
|
|
1598
|
+
ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
|
1599
|
+
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
|
1600
|
+
ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
|
|
1601
|
+
|
|
1602
|
+
// Granite uses rope_finetuned as a switch for rope, so default to true
|
|
1603
|
+
bool rope_finetuned = true;
|
|
1604
|
+
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
|
|
1605
|
+
hparams.rope_finetuned = rope_finetuned;
|
|
1606
|
+
|
|
1607
|
+
// A layer is recurrent IFF the n_head_kv value is set to 0
|
|
1608
|
+
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
|
1609
|
+
hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
|
|
1610
|
+
}
|
|
1611
|
+
|
|
1612
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1613
|
+
|
|
1614
|
+
switch (hparams.n_layer) {
|
|
1615
|
+
// TODO: Add llm type label (not sure this is useful)
|
|
1616
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1617
|
+
}
|
|
1618
|
+
|
|
1494
1619
|
// For Granite MoE Shared
|
|
1495
1620
|
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
|
|
1496
1621
|
} break;
|
|
@@ -1543,10 +1668,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1543
1668
|
}
|
|
1544
1669
|
} break;
|
|
1545
1670
|
case LLM_ARCH_ERNIE4_5:
|
|
1671
|
+
case LLM_ARCH_ERNIE4_5_MOE:
|
|
1546
1672
|
{
|
|
1547
1673
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1674
|
+
if (arch == LLM_ARCH_ERNIE4_5_MOE) {
|
|
1675
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
1676
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
|
1677
|
+
ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
|
|
1678
|
+
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
|
1679
|
+
}
|
|
1680
|
+
|
|
1548
1681
|
switch (hparams.n_layer) {
|
|
1549
1682
|
case 18: type = LLM_TYPE_0_3B; break;
|
|
1683
|
+
case 28: type = LLM_TYPE_21B_A3B; break;
|
|
1684
|
+
case 54: type = LLM_TYPE_300B_A47B; break;
|
|
1550
1685
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1551
1686
|
}
|
|
1552
1687
|
} break;
|
|
@@ -1602,6 +1737,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1602
1737
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1603
1738
|
}
|
|
1604
1739
|
} break;
|
|
1740
|
+
case LLM_ARCH_LFM2:
|
|
1741
|
+
{
|
|
1742
|
+
ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
|
|
1743
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1744
|
+
for (uint32_t il = 0; il < hparams.n_layer; ++il) {
|
|
1745
|
+
hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
|
|
1746
|
+
}
|
|
1747
|
+
switch (hparams.n_embd) {
|
|
1748
|
+
case 1024: type = LLM_TYPE_350M; break;
|
|
1749
|
+
case 1536: type = LLM_TYPE_700M; break;
|
|
1750
|
+
case 2048: type = LLM_TYPE_1_2B; break;
|
|
1751
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1752
|
+
}
|
|
1753
|
+
} break;
|
|
1605
1754
|
default: throw std::runtime_error("unsupported model architecture");
|
|
1606
1755
|
}
|
|
1607
1756
|
|
|
@@ -2565,12 +2714,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2565
2714
|
} break;
|
|
2566
2715
|
case LLM_ARCH_QWEN2:
|
|
2567
2716
|
case LLM_ARCH_QWEN2VL:
|
|
2717
|
+
case LLM_ARCH_DREAM:
|
|
2568
2718
|
{
|
|
2569
2719
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2570
2720
|
|
|
2571
2721
|
// output
|
|
2572
2722
|
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2573
2723
|
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2724
|
+
output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2574
2725
|
// if output is NULL, init from the input tok embed
|
|
2575
2726
|
if (output == NULL) {
|
|
2576
2727
|
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
@@ -2860,6 +3011,73 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2860
3011
|
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2861
3012
|
}
|
|
2862
3013
|
} break;
|
|
3014
|
+
case LLM_ARCH_PLAMO2:
|
|
3015
|
+
{
|
|
3016
|
+
const uint32_t d_conv = hparams.ssm_d_conv;
|
|
3017
|
+
const uint32_t d_state = hparams.ssm_d_state;
|
|
3018
|
+
const uint32_t num_heads = hparams.ssm_dt_rank;
|
|
3019
|
+
const uint32_t intermediate_size = hparams.ssm_d_inner;
|
|
3020
|
+
const uint32_t head_dim = intermediate_size / num_heads;
|
|
3021
|
+
const uint32_t qk_dim = head_dim;
|
|
3022
|
+
const uint32_t v_dim = head_dim;
|
|
3023
|
+
const int64_t num_attention_heads = hparams.n_head();
|
|
3024
|
+
const int64_t q_num_heads = num_attention_heads;
|
|
3025
|
+
const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
|
|
3026
|
+
|
|
3027
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
3028
|
+
|
|
3029
|
+
// output
|
|
3030
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
3031
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
3032
|
+
// if output is NULL, init from the input tok embed
|
|
3033
|
+
if (output == NULL) {
|
|
3034
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
3035
|
+
}
|
|
3036
|
+
|
|
3037
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3038
|
+
auto & layer = layers[i];
|
|
3039
|
+
bool is_mamba_layer = hparams.is_recurrent(i);
|
|
3040
|
+
|
|
3041
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
3042
|
+
|
|
3043
|
+
if (is_mamba_layer) {
|
|
3044
|
+
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0);
|
|
3045
|
+
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
|
|
3046
|
+
|
|
3047
|
+
layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
|
|
3048
|
+
layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
|
|
3049
|
+
layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
|
|
3050
|
+
|
|
3051
|
+
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
|
|
3052
|
+
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
|
|
3053
|
+
|
|
3054
|
+
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
|
|
3055
|
+
|
|
3056
|
+
layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
|
|
3057
|
+
layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
|
|
3058
|
+
layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
|
|
3059
|
+
} else {
|
|
3060
|
+
const int64_t num_key_value_heads = hparams.n_head_kv(i);
|
|
3061
|
+
const int64_t k_num_heads = num_key_value_heads;
|
|
3062
|
+
const int64_t v_num_heads = num_key_value_heads;
|
|
3063
|
+
const int64_t q_proj_dim = q_num_heads * qk_dim;
|
|
3064
|
+
const int64_t k_proj_dim = k_num_heads * qk_dim;
|
|
3065
|
+
const int64_t v_proj_dim = v_num_heads * v_dim;
|
|
3066
|
+
|
|
3067
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
|
|
3068
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim, num_attention_heads}, 0);
|
|
3069
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim, k_num_heads}, 0);
|
|
3070
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
|
|
3071
|
+
}
|
|
3072
|
+
|
|
3073
|
+
// All layers have post-attention norm, FFN norm, and FFN tensors
|
|
3074
|
+
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
|
|
3075
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3076
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
3077
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
|
|
3078
|
+
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
|
|
3079
|
+
}
|
|
3080
|
+
} break;
|
|
2863
3081
|
case LLM_ARCH_GPT2:
|
|
2864
3082
|
{
|
|
2865
3083
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -3231,10 +3449,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3231
3449
|
{
|
|
3232
3450
|
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
3233
3451
|
|
|
3234
|
-
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
|
3452
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
3235
3453
|
// if output is NULL, init from the input tok embed, duplicated to allow offloading
|
|
3236
3454
|
if (output == NULL) {
|
|
3237
|
-
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab},
|
|
3455
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
3238
3456
|
}
|
|
3239
3457
|
}
|
|
3240
3458
|
|
|
@@ -3261,6 +3479,180 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3261
3479
|
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
|
|
3262
3480
|
}
|
|
3263
3481
|
} break;
|
|
3482
|
+
case LLM_ARCH_JAMBA:
|
|
3483
|
+
{
|
|
3484
|
+
const int64_t d_conv = hparams.ssm_d_conv;
|
|
3485
|
+
const int64_t d_inner = hparams.ssm_d_inner;
|
|
3486
|
+
const int64_t d_state = hparams.ssm_d_state;
|
|
3487
|
+
const int64_t dt_rank = hparams.ssm_dt_rank;
|
|
3488
|
+
|
|
3489
|
+
// only an expansion factor of 2 is supported for now
|
|
3490
|
+
GGML_ASSERT(2 * n_embd == d_inner);
|
|
3491
|
+
|
|
3492
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
3493
|
+
|
|
3494
|
+
// output
|
|
3495
|
+
{
|
|
3496
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
3497
|
+
|
|
3498
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
3499
|
+
// if output is NULL, init from the input tok embed, duplicated to allow offloading
|
|
3500
|
+
if (output == NULL) {
|
|
3501
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
3502
|
+
}
|
|
3503
|
+
}
|
|
3504
|
+
|
|
3505
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3506
|
+
const int64_t n_head_kv = hparams.n_head_kv(i);
|
|
3507
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
|
|
3508
|
+
|
|
3509
|
+
auto & layer = layers[i];
|
|
3510
|
+
|
|
3511
|
+
// norm
|
|
3512
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
3513
|
+
|
|
3514
|
+
if (n_head_kv == 0) {
|
|
3515
|
+
// Mamba layer
|
|
3516
|
+
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
|
|
3517
|
+
|
|
3518
|
+
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
|
|
3519
|
+
layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
|
|
3520
|
+
|
|
3521
|
+
layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
|
|
3522
|
+
|
|
3523
|
+
layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
|
|
3524
|
+
|
|
3525
|
+
layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
|
|
3526
|
+
layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
|
|
3527
|
+
|
|
3528
|
+
layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
|
|
3529
|
+
layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
|
|
3530
|
+
|
|
3531
|
+
// no "weight" suffix for these
|
|
3532
|
+
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
|
|
3533
|
+
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
|
|
3534
|
+
|
|
3535
|
+
// out_proj
|
|
3536
|
+
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
|
|
3537
|
+
} else {
|
|
3538
|
+
// Attention layers
|
|
3539
|
+
|
|
3540
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
3541
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
3542
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
3543
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
3544
|
+
}
|
|
3545
|
+
|
|
3546
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3547
|
+
|
|
3548
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
|
|
3549
|
+
|
|
3550
|
+
if (layer.ffn_gate_inp) {
|
|
3551
|
+
// MoE
|
|
3552
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
3553
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
|
|
3554
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
3555
|
+
} else {
|
|
3556
|
+
// FFN (no MoE)
|
|
3557
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
3558
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
3559
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
3560
|
+
}
|
|
3561
|
+
}
|
|
3562
|
+
} break;
|
|
3563
|
+
case LLM_ARCH_GRANITE_HYBRID:
|
|
3564
|
+
{
|
|
3565
|
+
// mamba2 Mixer SSM params
|
|
3566
|
+
// NOTE: int64_t for tensor dimensions
|
|
3567
|
+
const int64_t d_conv = hparams.ssm_d_conv;
|
|
3568
|
+
const int64_t d_inner = hparams.ssm_d_inner;
|
|
3569
|
+
const int64_t d_state = hparams.ssm_d_state;
|
|
3570
|
+
const int64_t n_ssm_head = hparams.ssm_dt_rank;
|
|
3571
|
+
const int64_t n_group = hparams.ssm_n_group;
|
|
3572
|
+
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
|
|
3573
|
+
|
|
3574
|
+
// only an expansion factor of 2 is supported for now
|
|
3575
|
+
GGML_ASSERT(2 * n_embd == d_inner);
|
|
3576
|
+
|
|
3577
|
+
// embeddings
|
|
3578
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
3579
|
+
|
|
3580
|
+
// output
|
|
3581
|
+
{
|
|
3582
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
3583
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
3584
|
+
// if output is NULL, init from the input tok embed, duplicated to allow offloading
|
|
3585
|
+
if (output == NULL) {
|
|
3586
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
3587
|
+
}
|
|
3588
|
+
}
|
|
3589
|
+
|
|
3590
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3591
|
+
auto & layer = layers[i];
|
|
3592
|
+
|
|
3593
|
+
// norm
|
|
3594
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
3595
|
+
|
|
3596
|
+
if (hparams.is_recurrent(i)) {
|
|
3597
|
+
// ssm layers
|
|
3598
|
+
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
|
|
3599
|
+
|
|
3600
|
+
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
|
|
3601
|
+
layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
|
|
3602
|
+
|
|
3603
|
+
layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
|
|
3604
|
+
|
|
3605
|
+
// no "weight" suffix for these
|
|
3606
|
+
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
|
|
3607
|
+
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
|
|
3608
|
+
|
|
3609
|
+
layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
|
|
3610
|
+
|
|
3611
|
+
// out_proj
|
|
3612
|
+
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
|
|
3613
|
+
} else {
|
|
3614
|
+
// attention layers (with optional bias)
|
|
3615
|
+
const int64_t n_head_i = hparams.n_head(i);
|
|
3616
|
+
const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
|
|
3617
|
+
const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
|
|
3618
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
|
|
3619
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
|
|
3620
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
|
|
3621
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
|
|
3622
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
3623
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
|
|
3624
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
|
|
3625
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
3626
|
+
}
|
|
3627
|
+
|
|
3628
|
+
// feed forward (w/ optional biases)
|
|
3629
|
+
if (n_expert > 0) {
|
|
3630
|
+
// MoE FFN
|
|
3631
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3632
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
3633
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
3634
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
|
|
3635
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
|
3636
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
3637
|
+
|
|
3638
|
+
// For Granite MoE Shared
|
|
3639
|
+
if (hparams.n_ff_shexp > 0) {
|
|
3640
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
|
|
3641
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
|
|
3642
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
|
|
3643
|
+
}
|
|
3644
|
+
} else {
|
|
3645
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3646
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
3647
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
3648
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
3649
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
3650
|
+
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
|
3651
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
3652
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
|
3653
|
+
}
|
|
3654
|
+
}
|
|
3655
|
+
} break;
|
|
3264
3656
|
case LLM_ARCH_XVERSE:
|
|
3265
3657
|
{
|
|
3266
3658
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -3980,6 +4372,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3980
4372
|
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
3981
4373
|
}
|
|
3982
4374
|
} break;
|
|
4375
|
+
case LLM_ARCH_EXAONE4:
|
|
4376
|
+
{
|
|
4377
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4378
|
+
|
|
4379
|
+
// output
|
|
4380
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4381
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
4382
|
+
|
|
4383
|
+
// if output is NULL, init from the input tok embed
|
|
4384
|
+
if (output == NULL) {
|
|
4385
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
4386
|
+
}
|
|
4387
|
+
|
|
4388
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4389
|
+
auto & layer = layers[i];
|
|
4390
|
+
|
|
4391
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4392
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
4393
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
4394
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
4395
|
+
|
|
4396
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
4397
|
+
|
|
4398
|
+
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
4399
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
4400
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
4401
|
+
|
|
4402
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
4403
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4404
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4405
|
+
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
4406
|
+
}
|
|
4407
|
+
} break;
|
|
3983
4408
|
case LLM_ARCH_RWKV6:
|
|
3984
4409
|
{
|
|
3985
4410
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -4495,6 +4920,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
4495
4920
|
}
|
|
4496
4921
|
} break;
|
|
4497
4922
|
case LLM_ARCH_ERNIE4_5:
|
|
4923
|
+
case LLM_ARCH_ERNIE4_5_MOE:
|
|
4498
4924
|
{
|
|
4499
4925
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4500
4926
|
|
|
@@ -4523,9 +4949,27 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
4523
4949
|
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
4524
4950
|
|
|
4525
4951
|
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
4526
|
-
|
|
4527
|
-
|
|
4528
|
-
|
|
4952
|
+
|
|
4953
|
+
if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
|
|
4954
|
+
int n_ff_exp = hparams.n_ff_exp;
|
|
4955
|
+
|
|
4956
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
4957
|
+
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
|
4958
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
|
|
4959
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
|
|
4960
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
|
|
4961
|
+
|
|
4962
|
+
// Shared expert (if present)
|
|
4963
|
+
if (hparams.n_ff_shexp > 0) {
|
|
4964
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
|
|
4965
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd }, 0);
|
|
4966
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
|
|
4967
|
+
}
|
|
4968
|
+
} else { // Dense layers
|
|
4969
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
4970
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4971
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4972
|
+
}
|
|
4529
4973
|
}
|
|
4530
4974
|
} break;
|
|
4531
4975
|
case LLM_ARCH_FALCON_H1:
|
|
@@ -4671,6 +5115,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
4671
5115
|
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4672
5116
|
}
|
|
4673
5117
|
} break;
|
|
5118
|
+
case LLM_ARCH_LFM2:
|
|
5119
|
+
{
|
|
5120
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
5121
|
+
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
|
|
5122
|
+
|
|
5123
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
5124
|
+
auto & layer = layers[i];
|
|
5125
|
+
// ffn is same for transformer and conv layers
|
|
5126
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
5127
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
5128
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
5129
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
5130
|
+
|
|
5131
|
+
// for operator_norm
|
|
5132
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
5133
|
+
|
|
5134
|
+
if (!hparams.is_recurrent(i)) {
|
|
5135
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
5136
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
5137
|
+
GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
|
|
5138
|
+
|
|
5139
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
5140
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
|
|
5141
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
|
|
5142
|
+
|
|
5143
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
5144
|
+
} else {
|
|
5145
|
+
layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
|
|
5146
|
+
layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
|
|
5147
|
+
layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
|
|
5148
|
+
}
|
|
5149
|
+
}
|
|
5150
|
+
} break;
|
|
4674
5151
|
default:
|
|
4675
5152
|
throw std::runtime_error("unknown architecture");
|
|
4676
5153
|
}
|
|
@@ -4910,16 +5387,6 @@ void llama_model::print_info() const {
|
|
|
4910
5387
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
|
4911
5388
|
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
|
4912
5389
|
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
|
|
4913
|
-
}
|
|
4914
|
-
|
|
4915
|
-
if (arch == LLM_ARCH_MAMBA || arch == LLM_ARCH_MAMBA2) {
|
|
4916
|
-
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
|
|
4917
|
-
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
|
|
4918
|
-
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
|
|
4919
|
-
LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
|
|
4920
|
-
LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
|
|
4921
|
-
LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
|
|
4922
|
-
|
|
4923
5390
|
if (!classifier_labels.empty()) {
|
|
4924
5391
|
LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
|
|
4925
5392
|
|
|
@@ -4930,6 +5397,20 @@ void llama_model::print_info() const {
|
|
|
4930
5397
|
}
|
|
4931
5398
|
}
|
|
4932
5399
|
|
|
5400
|
+
if (arch == LLM_ARCH_MAMBA ||
|
|
5401
|
+
arch == LLM_ARCH_MAMBA2 ||
|
|
5402
|
+
arch == LLM_ARCH_JAMBA ||
|
|
5403
|
+
arch == LLM_ARCH_FALCON_H1 ||
|
|
5404
|
+
arch == LLM_ARCH_PLAMO2 ||
|
|
5405
|
+
arch == LLM_ARCH_GRANITE_HYBRID) {
|
|
5406
|
+
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
|
|
5407
|
+
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
|
|
5408
|
+
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
|
|
5409
|
+
LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
|
|
5410
|
+
LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
|
|
5411
|
+
LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
|
|
5412
|
+
}
|
|
5413
|
+
|
|
4933
5414
|
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
|
|
4934
5415
|
if (pimpl->n_elements >= 1e12) {
|
|
4935
5416
|
LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
|
|
@@ -4976,7 +5457,8 @@ void llama_model::print_info() const {
|
|
|
4976
5457
|
|
|
4977
5458
|
if (arch == LLM_ARCH_MINICPM ||
|
|
4978
5459
|
arch == LLM_ARCH_GRANITE ||
|
|
4979
|
-
arch == LLM_ARCH_GRANITE_MOE
|
|
5460
|
+
arch == LLM_ARCH_GRANITE_MOE ||
|
|
5461
|
+
arch == LLM_ARCH_GRANITE_HYBRID) {
|
|
4980
5462
|
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
|
4981
5463
|
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
|
4982
5464
|
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
|
@@ -5092,7 +5574,7 @@ ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int i
|
|
|
5092
5574
|
}
|
|
5093
5575
|
|
|
5094
5576
|
struct llm_build_llama : public llm_graph_context {
|
|
5095
|
-
llm_build_llama(const llama_model & model, const llm_graph_params & params
|
|
5577
|
+
llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5096
5578
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5097
5579
|
|
|
5098
5580
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -5168,7 +5650,7 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
5168
5650
|
cb(Kcur, "Kcur", il);
|
|
5169
5651
|
cb(Vcur, "Vcur", il);
|
|
5170
5652
|
|
|
5171
|
-
cur = build_attn(inp_attn,
|
|
5653
|
+
cur = build_attn(inp_attn,
|
|
5172
5654
|
model.layers[il].wo, model.layers[il].bo,
|
|
5173
5655
|
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
5174
5656
|
cb(cur, "attn_out", il);
|
|
@@ -5248,7 +5730,7 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
5248
5730
|
};
|
|
5249
5731
|
|
|
5250
5732
|
struct llm_build_llama_iswa : public llm_graph_context {
|
|
5251
|
-
llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params
|
|
5733
|
+
llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5252
5734
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5253
5735
|
|
|
5254
5736
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -5342,7 +5824,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
|
|
|
5342
5824
|
cb(Kcur, "Kcur_normed", il);
|
|
5343
5825
|
}
|
|
5344
5826
|
|
|
5345
|
-
cur = build_attn(inp_attn,
|
|
5827
|
+
cur = build_attn(inp_attn,
|
|
5346
5828
|
model.layers[il].wo, model.layers[il].bo,
|
|
5347
5829
|
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
5348
5830
|
cb(cur, "attn_out", il);
|
|
@@ -5431,7 +5913,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
|
|
|
5431
5913
|
};
|
|
5432
5914
|
|
|
5433
5915
|
struct llm_build_deci : public llm_graph_context {
|
|
5434
|
-
llm_build_deci(const llama_model & model, const llm_graph_params & params
|
|
5916
|
+
llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5435
5917
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5436
5918
|
|
|
5437
5919
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -5519,7 +6001,7 @@ struct llm_build_deci : public llm_graph_context {
|
|
|
5519
6001
|
cb(Kcur, "Kcur", il);
|
|
5520
6002
|
cb(Vcur, "Vcur", il);
|
|
5521
6003
|
|
|
5522
|
-
cur = build_attn(inp_attn,
|
|
6004
|
+
cur = build_attn(inp_attn,
|
|
5523
6005
|
model.layers[il].wo, model.layers[il].bo,
|
|
5524
6006
|
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
5525
6007
|
}
|
|
@@ -5587,7 +6069,7 @@ struct llm_build_deci : public llm_graph_context {
|
|
|
5587
6069
|
};
|
|
5588
6070
|
|
|
5589
6071
|
struct llm_build_baichuan : public llm_graph_context {
|
|
5590
|
-
llm_build_baichuan(const llama_model & model, const llm_graph_params & params
|
|
6072
|
+
llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5591
6073
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5592
6074
|
|
|
5593
6075
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -5651,7 +6133,7 @@ struct llm_build_baichuan : public llm_graph_context {
|
|
|
5651
6133
|
cb(Kcur, "Kcur", il);
|
|
5652
6134
|
cb(Vcur, "Vcur", il);
|
|
5653
6135
|
|
|
5654
|
-
cur = build_attn(inp_attn,
|
|
6136
|
+
cur = build_attn(inp_attn,
|
|
5655
6137
|
model.layers[il].wo, NULL,
|
|
5656
6138
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5657
6139
|
}
|
|
@@ -5709,7 +6191,7 @@ struct llm_build_baichuan : public llm_graph_context {
|
|
|
5709
6191
|
};
|
|
5710
6192
|
|
|
5711
6193
|
struct llm_build_xverse : public llm_graph_context {
|
|
5712
|
-
llm_build_xverse(const llama_model & model, const llm_graph_params & params
|
|
6194
|
+
llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5713
6195
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5714
6196
|
|
|
5715
6197
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -5766,7 +6248,7 @@ struct llm_build_xverse : public llm_graph_context {
|
|
|
5766
6248
|
cb(Kcur, "Kcur", il);
|
|
5767
6249
|
cb(Vcur, "Vcur", il);
|
|
5768
6250
|
|
|
5769
|
-
cur = build_attn(inp_attn,
|
|
6251
|
+
cur = build_attn(inp_attn,
|
|
5770
6252
|
model.layers[il].wo, NULL,
|
|
5771
6253
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5772
6254
|
}
|
|
@@ -5822,7 +6304,7 @@ struct llm_build_xverse : public llm_graph_context {
|
|
|
5822
6304
|
};
|
|
5823
6305
|
|
|
5824
6306
|
struct llm_build_falcon : public llm_graph_context {
|
|
5825
|
-
llm_build_falcon(const llama_model & model, const llm_graph_params & params
|
|
6307
|
+
llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5826
6308
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5827
6309
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
5828
6310
|
|
|
@@ -5889,7 +6371,7 @@ struct llm_build_falcon : public llm_graph_context {
|
|
|
5889
6371
|
cb(Kcur, "Kcur", il);
|
|
5890
6372
|
cb(Vcur, "Vcur", il);
|
|
5891
6373
|
|
|
5892
|
-
cur = build_attn(inp_attn,
|
|
6374
|
+
cur = build_attn(inp_attn,
|
|
5893
6375
|
model.layers[il].wo, NULL,
|
|
5894
6376
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5895
6377
|
}
|
|
@@ -5944,7 +6426,7 @@ struct llm_build_falcon : public llm_graph_context {
|
|
|
5944
6426
|
};
|
|
5945
6427
|
|
|
5946
6428
|
struct llm_build_grok : public llm_graph_context {
|
|
5947
|
-
llm_build_grok(const llama_model & model, const llm_graph_params & params
|
|
6429
|
+
llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5948
6430
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5949
6431
|
|
|
5950
6432
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -6019,7 +6501,7 @@ struct llm_build_grok : public llm_graph_context {
|
|
|
6019
6501
|
cb(Kcur, "Kcur", il);
|
|
6020
6502
|
cb(Vcur, "Vcur", il);
|
|
6021
6503
|
|
|
6022
|
-
cur = build_attn(inp_attn,
|
|
6504
|
+
cur = build_attn(inp_attn,
|
|
6023
6505
|
model.layers[il].wo, model.layers[il].bo,
|
|
6024
6506
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
6025
6507
|
}
|
|
@@ -6106,7 +6588,7 @@ struct llm_build_grok : public llm_graph_context {
|
|
|
6106
6588
|
};
|
|
6107
6589
|
|
|
6108
6590
|
struct llm_build_dbrx : public llm_graph_context {
|
|
6109
|
-
llm_build_dbrx(const llama_model & model, const llm_graph_params & params
|
|
6591
|
+
llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
6110
6592
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6111
6593
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6112
6594
|
|
|
@@ -6168,7 +6650,7 @@ struct llm_build_dbrx : public llm_graph_context {
|
|
|
6168
6650
|
cb(Kcur, "Kcur", il);
|
|
6169
6651
|
cb(Vcur, "Vcur", il);
|
|
6170
6652
|
|
|
6171
|
-
cur = build_attn(inp_attn,
|
|
6653
|
+
cur = build_attn(inp_attn,
|
|
6172
6654
|
model.layers[il].wo, NULL,
|
|
6173
6655
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6174
6656
|
}
|
|
@@ -6231,7 +6713,7 @@ struct llm_build_dbrx : public llm_graph_context {
|
|
|
6231
6713
|
};
|
|
6232
6714
|
|
|
6233
6715
|
struct llm_build_starcoder : public llm_graph_context {
|
|
6234
|
-
llm_build_starcoder(const llama_model & model, const llm_graph_params & params
|
|
6716
|
+
llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
6235
6717
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6236
6718
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6237
6719
|
|
|
@@ -6282,7 +6764,7 @@ struct llm_build_starcoder : public llm_graph_context {
|
|
|
6282
6764
|
cb(Kcur, "Kcur", il);
|
|
6283
6765
|
cb(Vcur, "Vcur", il);
|
|
6284
6766
|
|
|
6285
|
-
cur = build_attn(inp_attn,
|
|
6767
|
+
cur = build_attn(inp_attn,
|
|
6286
6768
|
model.layers[il].wo, model.layers[il].bo,
|
|
6287
6769
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6288
6770
|
}
|
|
@@ -6340,7 +6822,7 @@ struct llm_build_starcoder : public llm_graph_context {
|
|
|
6340
6822
|
};
|
|
6341
6823
|
|
|
6342
6824
|
struct llm_build_refact : public llm_graph_context {
|
|
6343
|
-
llm_build_refact(const llama_model & model, const llm_graph_params & params
|
|
6825
|
+
llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
6344
6826
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6345
6827
|
|
|
6346
6828
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -6381,7 +6863,7 @@ struct llm_build_refact : public llm_graph_context {
|
|
|
6381
6863
|
cb(Kcur, "Kcur", il);
|
|
6382
6864
|
cb(Vcur, "Vcur", il);
|
|
6383
6865
|
|
|
6384
|
-
cur = build_attn(inp_attn,
|
|
6866
|
+
cur = build_attn(inp_attn,
|
|
6385
6867
|
model.layers[il].wo, NULL,
|
|
6386
6868
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6387
6869
|
}
|
|
@@ -6439,7 +6921,7 @@ struct llm_build_refact : public llm_graph_context {
|
|
|
6439
6921
|
};
|
|
6440
6922
|
|
|
6441
6923
|
struct llm_build_bert : public llm_graph_context {
|
|
6442
|
-
llm_build_bert(const llama_model & model, const llm_graph_params & params
|
|
6924
|
+
llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
6443
6925
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6444
6926
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6445
6927
|
|
|
@@ -6538,7 +7020,7 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
6538
7020
|
cb(Kcur, "Kcur", il);
|
|
6539
7021
|
cb(Vcur, "Vcur", il);
|
|
6540
7022
|
|
|
6541
|
-
cur = build_attn(inp_attn,
|
|
7023
|
+
cur = build_attn(inp_attn,
|
|
6542
7024
|
model.layers[il].wo, model.layers[il].bo,
|
|
6543
7025
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6544
7026
|
cb(cur, "kqv_out", il);
|
|
@@ -6625,7 +7107,7 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
6625
7107
|
};
|
|
6626
7108
|
|
|
6627
7109
|
struct llm_build_neo_bert : public llm_graph_context {
|
|
6628
|
-
llm_build_neo_bert(const llama_model & model, const llm_graph_params & params
|
|
7110
|
+
llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
6629
7111
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6630
7112
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6631
7113
|
|
|
@@ -6683,7 +7165,7 @@ struct llm_build_neo_bert : public llm_graph_context {
|
|
|
6683
7165
|
cb(Kcur, "Kcur", il);
|
|
6684
7166
|
cb(Vcur, "Vcur", il);
|
|
6685
7167
|
|
|
6686
|
-
cur = build_attn(inp_attn,
|
|
7168
|
+
cur = build_attn(inp_attn,
|
|
6687
7169
|
model.layers[il].wo, nullptr,
|
|
6688
7170
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6689
7171
|
cb(cur, "kqv_out", il);
|
|
@@ -6735,7 +7217,7 @@ struct llm_build_neo_bert : public llm_graph_context {
|
|
|
6735
7217
|
};
|
|
6736
7218
|
|
|
6737
7219
|
struct llm_build_bloom : public llm_graph_context {
|
|
6738
|
-
llm_build_bloom(const llama_model & model, const llm_graph_params & params
|
|
7220
|
+
llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
6739
7221
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6740
7222
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6741
7223
|
|
|
@@ -6783,7 +7265,7 @@ struct llm_build_bloom : public llm_graph_context {
|
|
|
6783
7265
|
cb(Kcur, "Kcur", il);
|
|
6784
7266
|
cb(Vcur, "Vcur", il);
|
|
6785
7267
|
|
|
6786
|
-
cur = build_attn(inp_attn,
|
|
7268
|
+
cur = build_attn(inp_attn,
|
|
6787
7269
|
model.layers[il].wo, model.layers[il].bo,
|
|
6788
7270
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6789
7271
|
}
|
|
@@ -6841,7 +7323,7 @@ struct llm_build_bloom : public llm_graph_context {
|
|
|
6841
7323
|
};
|
|
6842
7324
|
|
|
6843
7325
|
struct llm_build_mpt : public llm_graph_context {
|
|
6844
|
-
llm_build_mpt(const llama_model & model, const llm_graph_params & params
|
|
7326
|
+
llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
6845
7327
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6846
7328
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6847
7329
|
|
|
@@ -6930,7 +7412,7 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
6930
7412
|
cb(Kcur, "Kcur", il);
|
|
6931
7413
|
cb(Vcur, "Vcur", il);
|
|
6932
7414
|
|
|
6933
|
-
cur = build_attn(inp_attn,
|
|
7415
|
+
cur = build_attn(inp_attn,
|
|
6934
7416
|
model.layers[il].wo, model.layers[il].bo,
|
|
6935
7417
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6936
7418
|
}
|
|
@@ -6989,7 +7471,7 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
6989
7471
|
};
|
|
6990
7472
|
|
|
6991
7473
|
struct llm_build_stablelm : public llm_graph_context {
|
|
6992
|
-
llm_build_stablelm(const llama_model & model, const llm_graph_params & params
|
|
7474
|
+
llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
6993
7475
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6994
7476
|
|
|
6995
7477
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -7076,7 +7558,7 @@ struct llm_build_stablelm : public llm_graph_context {
|
|
|
7076
7558
|
cb(Kcur, "Kcur", il);
|
|
7077
7559
|
cb(Vcur, "Vcur", il);
|
|
7078
7560
|
|
|
7079
|
-
cur = build_attn(inp_attn,
|
|
7561
|
+
cur = build_attn(inp_attn,
|
|
7080
7562
|
model.layers[il].wo, NULL,
|
|
7081
7563
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7082
7564
|
}
|
|
@@ -7141,7 +7623,7 @@ struct llm_build_stablelm : public llm_graph_context {
|
|
|
7141
7623
|
};
|
|
7142
7624
|
|
|
7143
7625
|
struct llm_build_qwen : public llm_graph_context {
|
|
7144
|
-
llm_build_qwen(const llama_model & model, const llm_graph_params & params
|
|
7626
|
+
llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
7145
7627
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7146
7628
|
|
|
7147
7629
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -7197,7 +7679,7 @@ struct llm_build_qwen : public llm_graph_context {
|
|
|
7197
7679
|
cb(Kcur, "Kcur", il);
|
|
7198
7680
|
cb(Vcur, "Vcur", il);
|
|
7199
7681
|
|
|
7200
|
-
cur = build_attn(inp_attn,
|
|
7682
|
+
cur = build_attn(inp_attn,
|
|
7201
7683
|
model.layers[il].wo, NULL,
|
|
7202
7684
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7203
7685
|
}
|
|
@@ -7255,7 +7737,7 @@ struct llm_build_qwen : public llm_graph_context {
|
|
|
7255
7737
|
};
|
|
7256
7738
|
|
|
7257
7739
|
struct llm_build_qwen2 : public llm_graph_context {
|
|
7258
|
-
llm_build_qwen2(const llama_model & model, const llm_graph_params & params
|
|
7740
|
+
llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
7259
7741
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7260
7742
|
|
|
7261
7743
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -7317,7 +7799,7 @@ struct llm_build_qwen2 : public llm_graph_context {
|
|
|
7317
7799
|
cb(Kcur, "Kcur", il);
|
|
7318
7800
|
cb(Vcur, "Vcur", il);
|
|
7319
7801
|
|
|
7320
|
-
cur = build_attn(inp_attn,
|
|
7802
|
+
cur = build_attn(inp_attn,
|
|
7321
7803
|
model.layers[il].wo, model.layers[il].bo,
|
|
7322
7804
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7323
7805
|
}
|
|
@@ -7365,6 +7847,113 @@ struct llm_build_qwen2 : public llm_graph_context {
|
|
|
7365
7847
|
// lm_head
|
|
7366
7848
|
cur = build_lora_mm(model.output, cur);
|
|
7367
7849
|
|
|
7850
|
+
if (model.output_b != nullptr) {
|
|
7851
|
+
cur = ggml_add(ctx0, cur, model.output_b);
|
|
7852
|
+
}
|
|
7853
|
+
|
|
7854
|
+
cb(cur, "result_output", -1);
|
|
7855
|
+
res->t_logits = cur;
|
|
7856
|
+
|
|
7857
|
+
ggml_build_forward_expand(gf, cur);
|
|
7858
|
+
}
|
|
7859
|
+
};
|
|
7860
|
+
|
|
7861
|
+
struct llm_build_dream : public llm_graph_context {
|
|
7862
|
+
llm_build_dream(const llama_model & model, const llm_graph_params & params) :
|
|
7863
|
+
llm_graph_context(params) {
|
|
7864
|
+
//copied from qwen2
|
|
7865
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7866
|
+
|
|
7867
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
7868
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
7869
|
+
|
|
7870
|
+
ggml_tensor * cur;
|
|
7871
|
+
ggml_tensor * inpL;
|
|
7872
|
+
|
|
7873
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
7874
|
+
|
|
7875
|
+
// inp_pos - contains the positions
|
|
7876
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
7877
|
+
|
|
7878
|
+
auto * inp_attn = build_attn_inp_no_cache();
|
|
7879
|
+
|
|
7880
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7881
|
+
|
|
7882
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
7883
|
+
ggml_tensor * inpSA = inpL;
|
|
7884
|
+
|
|
7885
|
+
// norm
|
|
7886
|
+
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
7887
|
+
cb(cur, "attn_norm", il);
|
|
7888
|
+
|
|
7889
|
+
// self-attention
|
|
7890
|
+
{
|
|
7891
|
+
// compute Q and K and RoPE them
|
|
7892
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
7893
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
7894
|
+
cb(Qcur, "Qcur", il);
|
|
7895
|
+
|
|
7896
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
7897
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
7898
|
+
cb(Kcur, "Kcur", il);
|
|
7899
|
+
|
|
7900
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
7901
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
7902
|
+
cb(Vcur, "Vcur", il);
|
|
7903
|
+
|
|
7904
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7905
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7906
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7907
|
+
|
|
7908
|
+
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7909
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
7910
|
+
|
|
7911
|
+
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7912
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
7913
|
+
|
|
7914
|
+
cb(Qcur, "Qcur", il);
|
|
7915
|
+
cb(Kcur, "Kcur", il);
|
|
7916
|
+
cb(Vcur, "Vcur", il);
|
|
7917
|
+
|
|
7918
|
+
cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr,
|
|
7919
|
+
nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
|
|
7920
|
+
}
|
|
7921
|
+
|
|
7922
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
7923
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7924
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
7925
|
+
}
|
|
7926
|
+
|
|
7927
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
7928
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
7929
|
+
|
|
7930
|
+
// feed-forward network
|
|
7931
|
+
cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
|
|
7932
|
+
cb(cur, "ffn_norm", il);
|
|
7933
|
+
|
|
7934
|
+
cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
|
|
7935
|
+
model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
7936
|
+
cb(cur, "ffn_out", il);
|
|
7937
|
+
|
|
7938
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
7939
|
+
|
|
7940
|
+
cur = build_cvec(cur, il);
|
|
7941
|
+
cb(cur, "l_out", il);
|
|
7942
|
+
|
|
7943
|
+
// input for next layer
|
|
7944
|
+
inpL = cur;
|
|
7945
|
+
}
|
|
7946
|
+
|
|
7947
|
+
cur = inpL;
|
|
7948
|
+
|
|
7949
|
+
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
|
7950
|
+
|
|
7951
|
+
cb(cur, "result_norm", -1);
|
|
7952
|
+
res->t_embd = cur;
|
|
7953
|
+
|
|
7954
|
+
// lm_head
|
|
7955
|
+
cur = build_lora_mm(model.output, cur);
|
|
7956
|
+
|
|
7368
7957
|
cb(cur, "result_output", -1);
|
|
7369
7958
|
res->t_logits = cur;
|
|
7370
7959
|
|
|
@@ -7373,7 +7962,7 @@ struct llm_build_qwen2 : public llm_graph_context {
|
|
|
7373
7962
|
};
|
|
7374
7963
|
|
|
7375
7964
|
struct llm_build_qwen2vl : public llm_graph_context {
|
|
7376
|
-
llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params
|
|
7965
|
+
llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
7377
7966
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7378
7967
|
|
|
7379
7968
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -7438,7 +8027,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
|
7438
8027
|
cb(Kcur, "Kcur", il);
|
|
7439
8028
|
cb(Vcur, "Vcur", il);
|
|
7440
8029
|
|
|
7441
|
-
cur = build_attn(inp_attn,
|
|
8030
|
+
cur = build_attn(inp_attn,
|
|
7442
8031
|
model.layers[il].wo, model.layers[il].bo,
|
|
7443
8032
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7444
8033
|
}
|
|
@@ -7494,7 +8083,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
|
7494
8083
|
};
|
|
7495
8084
|
|
|
7496
8085
|
struct llm_build_qwen2moe : public llm_graph_context {
|
|
7497
|
-
llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params
|
|
8086
|
+
llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
7498
8087
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7499
8088
|
|
|
7500
8089
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -7565,7 +8154,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
|
|
7565
8154
|
cb(Kcur, "Kcur", il);
|
|
7566
8155
|
cb(Vcur, "Vcur", il);
|
|
7567
8156
|
|
|
7568
|
-
cur = build_attn(inp_attn,
|
|
8157
|
+
cur = build_attn(inp_attn,
|
|
7569
8158
|
model.layers[il].wo, model.layers[il].bo,
|
|
7570
8159
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7571
8160
|
}
|
|
@@ -7653,7 +8242,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
|
|
7653
8242
|
};
|
|
7654
8243
|
|
|
7655
8244
|
struct llm_build_qwen3 : public llm_graph_context {
|
|
7656
|
-
llm_build_qwen3(const llama_model & model, const llm_graph_params & params
|
|
8245
|
+
llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
7657
8246
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7658
8247
|
|
|
7659
8248
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -7718,7 +8307,7 @@ struct llm_build_qwen3 : public llm_graph_context {
|
|
|
7718
8307
|
cb(Kcur, "Kcur", il);
|
|
7719
8308
|
cb(Vcur, "Vcur", il);
|
|
7720
8309
|
|
|
7721
|
-
cur = build_attn(inp_attn,
|
|
8310
|
+
cur = build_attn(inp_attn,
|
|
7722
8311
|
model.layers[il].wo, model.layers[il].bo,
|
|
7723
8312
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7724
8313
|
}
|
|
@@ -7774,7 +8363,7 @@ struct llm_build_qwen3 : public llm_graph_context {
|
|
|
7774
8363
|
};
|
|
7775
8364
|
|
|
7776
8365
|
struct llm_build_qwen3moe : public llm_graph_context {
|
|
7777
|
-
llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params
|
|
8366
|
+
llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
7778
8367
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7779
8368
|
|
|
7780
8369
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -7839,7 +8428,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
|
|
|
7839
8428
|
cb(Kcur, "Kcur", il);
|
|
7840
8429
|
cb(Vcur, "Vcur", il);
|
|
7841
8430
|
|
|
7842
|
-
cur = build_attn(inp_attn,
|
|
8431
|
+
cur = build_attn(inp_attn,
|
|
7843
8432
|
model.layers[il].wo, model.layers[il].bo,
|
|
7844
8433
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7845
8434
|
}
|
|
@@ -7902,7 +8491,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
|
|
|
7902
8491
|
};
|
|
7903
8492
|
|
|
7904
8493
|
struct llm_build_phi2 : public llm_graph_context {
|
|
7905
|
-
llm_build_phi2(const llama_model & model, const llm_graph_params & params
|
|
8494
|
+
llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
7906
8495
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7907
8496
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
7908
8497
|
|
|
@@ -7979,7 +8568,7 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
7979
8568
|
// ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
|
|
7980
8569
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
|
|
7981
8570
|
|
|
7982
|
-
cur = build_attn(inp_attn,
|
|
8571
|
+
cur = build_attn(inp_attn,
|
|
7983
8572
|
model.layers[il].wo, model.layers[il].bo,
|
|
7984
8573
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
7985
8574
|
}
|
|
@@ -8033,7 +8622,7 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
8033
8622
|
|
|
8034
8623
|
template<bool iswa>
|
|
8035
8624
|
struct llm_build_phi3 : public llm_graph_context {
|
|
8036
|
-
llm_build_phi3(const llama_model & model, const llm_graph_params & params
|
|
8625
|
+
llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
8037
8626
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
8038
8627
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
8039
8628
|
|
|
@@ -8116,7 +8705,7 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
8116
8705
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
|
|
8117
8706
|
cb(Qcur, "Qcur", il);
|
|
8118
8707
|
|
|
8119
|
-
cur = build_attn(inp_attn,
|
|
8708
|
+
cur = build_attn(inp_attn,
|
|
8120
8709
|
model.layers[il].wo, model.layers[il].bo,
|
|
8121
8710
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
8122
8711
|
}
|
|
@@ -8191,7 +8780,7 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
8191
8780
|
};
|
|
8192
8781
|
|
|
8193
8782
|
struct llm_build_plamo : public llm_graph_context {
|
|
8194
|
-
llm_build_plamo(const llama_model & model, const llm_graph_params & params
|
|
8783
|
+
llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
8195
8784
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
8196
8785
|
|
|
8197
8786
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -8250,7 +8839,7 @@ struct llm_build_plamo : public llm_graph_context {
|
|
|
8250
8839
|
cb(Kcur, "Kcur", il);
|
|
8251
8840
|
cb(Vcur, "Vcur", il);
|
|
8252
8841
|
|
|
8253
|
-
cur = build_attn(inp_attn,
|
|
8842
|
+
cur = build_attn(inp_attn,
|
|
8254
8843
|
model.layers[il].wo, NULL,
|
|
8255
8844
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8256
8845
|
}
|
|
@@ -8306,7 +8895,7 @@ struct llm_build_plamo : public llm_graph_context {
|
|
|
8306
8895
|
};
|
|
8307
8896
|
|
|
8308
8897
|
struct llm_build_gpt2 : public llm_graph_context {
|
|
8309
|
-
llm_build_gpt2(const llama_model & model, const llm_graph_params & params
|
|
8898
|
+
llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
8310
8899
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
8311
8900
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
8312
8901
|
|
|
@@ -8358,7 +8947,7 @@ struct llm_build_gpt2 : public llm_graph_context {
|
|
|
8358
8947
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
8359
8948
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
8360
8949
|
|
|
8361
|
-
cur = build_attn(inp_attn,
|
|
8950
|
+
cur = build_attn(inp_attn,
|
|
8362
8951
|
model.layers[il].wo, model.layers[il].bo,
|
|
8363
8952
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8364
8953
|
}
|
|
@@ -8416,7 +9005,7 @@ struct llm_build_gpt2 : public llm_graph_context {
|
|
|
8416
9005
|
};
|
|
8417
9006
|
|
|
8418
9007
|
struct llm_build_codeshell : public llm_graph_context {
|
|
8419
|
-
llm_build_codeshell(const llama_model & model, const llm_graph_params & params
|
|
9008
|
+
llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
8420
9009
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
8421
9010
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
8422
9011
|
|
|
@@ -8472,7 +9061,7 @@ struct llm_build_codeshell : public llm_graph_context {
|
|
|
8472
9061
|
cb(Kcur, "Kcur", il);
|
|
8473
9062
|
cb(Vcur, "Vcur", il);
|
|
8474
9063
|
|
|
8475
|
-
cur = build_attn(inp_attn,
|
|
9064
|
+
cur = build_attn(inp_attn,
|
|
8476
9065
|
model.layers[il].wo, model.layers[il].bo,
|
|
8477
9066
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8478
9067
|
}
|
|
@@ -8530,7 +9119,7 @@ struct llm_build_codeshell : public llm_graph_context {
|
|
|
8530
9119
|
};
|
|
8531
9120
|
|
|
8532
9121
|
struct llm_build_orion : public llm_graph_context {
|
|
8533
|
-
llm_build_orion(const llama_model & model, const llm_graph_params & params
|
|
9122
|
+
llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
8534
9123
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
8535
9124
|
|
|
8536
9125
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -8601,7 +9190,7 @@ struct llm_build_orion : public llm_graph_context {
|
|
|
8601
9190
|
cb(Kcur, "Kcur", il);
|
|
8602
9191
|
cb(Vcur, "Vcur", il);
|
|
8603
9192
|
|
|
8604
|
-
cur = build_attn(inp_attn,
|
|
9193
|
+
cur = build_attn(inp_attn,
|
|
8605
9194
|
model.layers[il].wo, NULL,
|
|
8606
9195
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8607
9196
|
}
|
|
@@ -8657,7 +9246,7 @@ struct llm_build_orion : public llm_graph_context {
|
|
|
8657
9246
|
};
|
|
8658
9247
|
|
|
8659
9248
|
struct llm_build_internlm2 : public llm_graph_context {
|
|
8660
|
-
llm_build_internlm2(const llama_model & model, const llm_graph_params & params
|
|
9249
|
+
llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
8661
9250
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
8662
9251
|
|
|
8663
9252
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -8728,7 +9317,7 @@ struct llm_build_internlm2 : public llm_graph_context {
|
|
|
8728
9317
|
cb(Kcur, "Kcur", il);
|
|
8729
9318
|
cb(Vcur, "Vcur", il);
|
|
8730
9319
|
|
|
8731
|
-
cur = build_attn(inp_attn,
|
|
9320
|
+
cur = build_attn(inp_attn,
|
|
8732
9321
|
model.layers[il].wo, model.layers[il].bo,
|
|
8733
9322
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8734
9323
|
}
|
|
@@ -8784,7 +9373,7 @@ struct llm_build_internlm2 : public llm_graph_context {
|
|
|
8784
9373
|
};
|
|
8785
9374
|
|
|
8786
9375
|
struct llm_build_minicpm3 : public llm_graph_context {
|
|
8787
|
-
llm_build_minicpm3(const llama_model & model, const llm_graph_params & params
|
|
9376
|
+
llm_build_minicpm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
8788
9377
|
//TODO: if the model varies, these parameters need to be read from the model
|
|
8789
9378
|
const int64_t n_embd_base = 256;
|
|
8790
9379
|
const float scale_embd = 12.0f;
|
|
@@ -8916,7 +9505,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
|
|
8916
9505
|
ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
|
|
8917
9506
|
cb(k_states, "k_states", il);
|
|
8918
9507
|
|
|
8919
|
-
cur = build_attn(inp_attn,
|
|
9508
|
+
cur = build_attn(inp_attn,
|
|
8920
9509
|
model.layers[il].wo, NULL,
|
|
8921
9510
|
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
|
|
8922
9511
|
}
|
|
@@ -8988,7 +9577,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
|
|
8988
9577
|
};
|
|
8989
9578
|
|
|
8990
9579
|
struct llm_build_gemma : public llm_graph_context {
|
|
8991
|
-
llm_build_gemma(const llama_model & model, const llm_graph_params & params
|
|
9580
|
+
llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
8992
9581
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
8993
9582
|
|
|
8994
9583
|
ggml_tensor * cur;
|
|
@@ -9046,7 +9635,7 @@ struct llm_build_gemma : public llm_graph_context {
|
|
|
9046
9635
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
|
|
9047
9636
|
cb(Qcur, "Qcur_scaled", il);
|
|
9048
9637
|
|
|
9049
|
-
cur = build_attn(inp_attn,
|
|
9638
|
+
cur = build_attn(inp_attn,
|
|
9050
9639
|
model.layers[il].wo, NULL,
|
|
9051
9640
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
9052
9641
|
}
|
|
@@ -9104,7 +9693,7 @@ struct llm_build_gemma : public llm_graph_context {
|
|
|
9104
9693
|
};
|
|
9105
9694
|
|
|
9106
9695
|
struct llm_build_gemma2_iswa : public llm_graph_context {
|
|
9107
|
-
llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params
|
|
9696
|
+
llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
9108
9697
|
const int64_t n_embd_head = hparams.n_embd_head_k;
|
|
9109
9698
|
|
|
9110
9699
|
ggml_tensor * cur;
|
|
@@ -9161,7 +9750,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
|
|
|
9161
9750
|
|
|
9162
9751
|
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
|
|
9163
9752
|
|
|
9164
|
-
cur = build_attn(inp_attn,
|
|
9753
|
+
cur = build_attn(inp_attn,
|
|
9165
9754
|
model.layers[il].wo, NULL,
|
|
9166
9755
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
9167
9756
|
}
|
|
@@ -9234,7 +9823,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
|
|
|
9234
9823
|
};
|
|
9235
9824
|
|
|
9236
9825
|
struct llm_build_gemma3_iswa : public llm_graph_context {
|
|
9237
|
-
llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params
|
|
9826
|
+
llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
9238
9827
|
const int64_t n_embd_head = hparams.n_embd_head_k;
|
|
9239
9828
|
|
|
9240
9829
|
ggml_tensor * cur;
|
|
@@ -9303,7 +9892,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
|
|
|
9303
9892
|
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
|
|
9304
9893
|
Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
|
|
9305
9894
|
|
|
9306
|
-
cur = build_attn(inp_attn,
|
|
9895
|
+
cur = build_attn(inp_attn,
|
|
9307
9896
|
model.layers[il].wo, NULL,
|
|
9308
9897
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
9309
9898
|
}
|
|
@@ -9372,7 +9961,6 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
|
|
|
9372
9961
|
|
|
9373
9962
|
struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
9374
9963
|
const llama_model & model;
|
|
9375
|
-
ggml_cgraph * gf;
|
|
9376
9964
|
|
|
9377
9965
|
const int64_t n_embd_head;
|
|
9378
9966
|
const int64_t n_embd_altup;
|
|
@@ -9382,12 +9970,9 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
9382
9970
|
const int n_layer_sparsity = 10; // number of layers using activation sparsity
|
|
9383
9971
|
const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
|
|
9384
9972
|
|
|
9385
|
-
|
|
9386
|
-
|
|
9387
|
-
llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
|
|
9973
|
+
llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params)
|
|
9388
9974
|
: llm_graph_context(params),
|
|
9389
9975
|
model(model),
|
|
9390
|
-
gf(gf),
|
|
9391
9976
|
n_embd_head(model.hparams.n_embd_head_k),
|
|
9392
9977
|
n_embd_altup(model.hparams.n_embd_altup),
|
|
9393
9978
|
n_altup(model.hparams.n_altup),
|
|
@@ -9395,14 +9980,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
9395
9980
|
ggml_tensor * cur;
|
|
9396
9981
|
ggml_tensor * inpL;
|
|
9397
9982
|
|
|
9398
|
-
// TODO: remove this when ggml_scale_add is implemented
|
|
9399
|
-
one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
|
9400
|
-
{
|
|
9401
|
-
auto inp = std::make_unique<llm_graph_input_one>();
|
|
9402
|
-
inp->one = one;
|
|
9403
|
-
res->add_input(std::move(inp));
|
|
9404
|
-
}
|
|
9405
|
-
|
|
9406
9983
|
inpL = build_inp_embd(model.tok_embd);
|
|
9407
9984
|
|
|
9408
9985
|
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
|
|
@@ -9496,7 +10073,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
9496
10073
|
cb(Qcur, "Qcur_pos", il);
|
|
9497
10074
|
cb(Kcur, "Kcur_pos", il);
|
|
9498
10075
|
|
|
9499
|
-
cur = build_attn(inp_attn,
|
|
10076
|
+
cur = build_attn(inp_attn,
|
|
9500
10077
|
model.layers[il].wo, NULL,
|
|
9501
10078
|
Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
|
|
9502
10079
|
} else {
|
|
@@ -9514,7 +10091,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
9514
10091
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
9515
10092
|
cb(Qcur, "Qcur_pos", il);
|
|
9516
10093
|
|
|
9517
|
-
cur = build_attn(inp_attn,
|
|
10094
|
+
cur = build_attn(inp_attn,
|
|
9518
10095
|
model.layers[il].wo, NULL,
|
|
9519
10096
|
Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
|
|
9520
10097
|
}
|
|
@@ -9792,7 +10369,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
9792
10369
|
cb(innovation, "innovation", il);
|
|
9793
10370
|
|
|
9794
10371
|
ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
|
|
9795
|
-
all_coefs =
|
|
10372
|
+
all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
|
|
9796
10373
|
cb(all_coefs, "all_coefs", il);
|
|
9797
10374
|
all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
|
|
9798
10375
|
all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
|
|
@@ -9808,7 +10385,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
|
|
|
9808
10385
|
|
|
9809
10386
|
// TODO: move up next to build_starcoder
|
|
9810
10387
|
struct llm_build_starcoder2 : public llm_graph_context {
|
|
9811
|
-
llm_build_starcoder2(const llama_model & model, const llm_graph_params & params
|
|
10388
|
+
llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
9812
10389
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
9813
10390
|
|
|
9814
10391
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -9879,7 +10456,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
|
|
|
9879
10456
|
cb(Kcur, "Kcur", il);
|
|
9880
10457
|
cb(Vcur, "Vcur", il);
|
|
9881
10458
|
|
|
9882
|
-
cur = build_attn(inp_attn,
|
|
10459
|
+
cur = build_attn(inp_attn,
|
|
9883
10460
|
model.layers[il].wo, model.layers[il].bo,
|
|
9884
10461
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9885
10462
|
}
|
|
@@ -9935,74 +10512,22 @@ struct llm_build_starcoder2 : public llm_graph_context {
|
|
|
9935
10512
|
}
|
|
9936
10513
|
};
|
|
9937
10514
|
|
|
9938
|
-
struct
|
|
9939
|
-
|
|
9940
|
-
ggml_tensor * cur;
|
|
9941
|
-
ggml_tensor * inpL;
|
|
9942
|
-
|
|
9943
|
-
// {n_embd, n_tokens}
|
|
9944
|
-
inpL = build_inp_embd(model.tok_embd);
|
|
9945
|
-
|
|
9946
|
-
auto * rs_inp = build_rs_inp();
|
|
9947
|
-
|
|
9948
|
-
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
9949
|
-
|
|
9950
|
-
for (int il = 0; il < n_layer; ++il) {
|
|
9951
|
-
// norm
|
|
9952
|
-
cur = build_norm(inpL,
|
|
9953
|
-
model.layers[il].attn_norm, NULL,
|
|
9954
|
-
LLM_NORM_RMS, il);
|
|
9955
|
-
cb(cur, "attn_norm", il);
|
|
9956
|
-
|
|
9957
|
-
if (model.arch == LLM_ARCH_MAMBA2) {
|
|
9958
|
-
cur = build_mamba2_layer(rs_inp, gf, cur, model, ubatch, il);
|
|
9959
|
-
} else {
|
|
9960
|
-
cur = build_mamba_layer(rs_inp, gf, cur, model, ubatch, il);
|
|
9961
|
-
}
|
|
9962
|
-
|
|
9963
|
-
if (il == n_layer - 1 && inp_out_ids) {
|
|
9964
|
-
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
9965
|
-
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
9966
|
-
}
|
|
9967
|
-
|
|
9968
|
-
// residual
|
|
9969
|
-
cur = ggml_add(ctx0, cur, inpL);
|
|
9970
|
-
|
|
9971
|
-
cur = build_cvec(cur, il);
|
|
9972
|
-
cb(cur, "l_out", il);
|
|
9973
|
-
|
|
9974
|
-
// input for next layer
|
|
9975
|
-
inpL = cur;
|
|
9976
|
-
}
|
|
9977
|
-
|
|
9978
|
-
// final rmsnorm
|
|
9979
|
-
cur = build_norm(inpL,
|
|
9980
|
-
model.output_norm, NULL,
|
|
9981
|
-
LLM_NORM_RMS, -1);
|
|
9982
|
-
|
|
9983
|
-
cb(cur, "result_norm", -1);
|
|
9984
|
-
res->t_embd = cur;
|
|
9985
|
-
|
|
9986
|
-
// lm_head
|
|
9987
|
-
cur = build_lora_mm(model.output, cur);
|
|
9988
|
-
|
|
9989
|
-
cb(cur, "result_output", -1);
|
|
9990
|
-
res->t_logits = cur;
|
|
9991
|
-
|
|
9992
|
-
ggml_build_forward_expand(gf, cur);
|
|
9993
|
-
}
|
|
10515
|
+
struct llm_graph_context_mamba : public llm_graph_context {
|
|
10516
|
+
llm_graph_context_mamba(const llm_graph_params & params) : llm_graph_context(params) {}
|
|
9994
10517
|
|
|
9995
10518
|
ggml_tensor * build_mamba_layer(
|
|
9996
10519
|
llm_graph_input_rs * inp,
|
|
9997
|
-
ggml_cgraph * gf,
|
|
9998
10520
|
ggml_tensor * cur,
|
|
9999
10521
|
const llama_model & model,
|
|
10000
10522
|
const llama_ubatch & ubatch,
|
|
10001
|
-
int il)
|
|
10002
|
-
|
|
10523
|
+
int il) {
|
|
10524
|
+
|
|
10525
|
+
const auto * mctx_cur = inp->mctx;
|
|
10003
10526
|
|
|
10004
10527
|
const auto kv_head = mctx_cur->get_head();
|
|
10005
10528
|
|
|
10529
|
+
const auto & layer = model.layers[il];
|
|
10530
|
+
|
|
10006
10531
|
const int64_t d_conv = hparams.ssm_d_conv;
|
|
10007
10532
|
const int64_t d_inner = hparams.ssm_d_inner;
|
|
10008
10533
|
const int64_t d_state = hparams.ssm_d_state;
|
|
@@ -10012,26 +10537,24 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
10012
10537
|
const int64_t n_seqs = ubatch.n_seqs;
|
|
10013
10538
|
// Some variants of Mamba arch (e.g. FalconMamba) do apply layer norm on B and Dt layers
|
|
10014
10539
|
const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
|
|
10015
|
-
// Use the same RMS norm as the final layer norm
|
|
10016
|
-
const float norm_rms_eps = hparams.f_norm_rms_eps;
|
|
10017
10540
|
|
|
10018
10541
|
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
|
10019
10542
|
|
|
10020
10543
|
GGML_ASSERT(n_seqs != 0);
|
|
10021
|
-
GGML_ASSERT(ubatch.equal_seqs);
|
|
10544
|
+
GGML_ASSERT(ubatch.equal_seqs());
|
|
10022
10545
|
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
|
10023
10546
|
|
|
10024
10547
|
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
|
10025
10548
|
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
|
10026
10549
|
|
|
10027
|
-
ggml_tensor * conv = build_rs(inp,
|
|
10550
|
+
ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
|
|
10028
10551
|
conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
|
|
10029
10552
|
|
|
10030
10553
|
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
|
|
10031
10554
|
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
|
|
10032
10555
|
|
|
10033
10556
|
// {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
|
|
10034
|
-
ggml_tensor * xz = build_lora_mm(
|
|
10557
|
+
ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
|
|
10035
10558
|
// split the above in two
|
|
10036
10559
|
// => {d_inner, n_seq_tokens, n_seqs}
|
|
10037
10560
|
ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
|
|
@@ -10060,10 +10583,10 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
10060
10583
|
// then permute away the ne[0] dimension,
|
|
10061
10584
|
// and then you're left with the resulting x tensor.
|
|
10062
10585
|
// For simultaneous sequences, all sequences need to have the same length.
|
|
10063
|
-
x = ggml_ssm_conv(ctx0, conv_x,
|
|
10586
|
+
x = ggml_ssm_conv(ctx0, conv_x, layer.ssm_conv1d);
|
|
10064
10587
|
|
|
10065
10588
|
// bias
|
|
10066
|
-
x = ggml_add(ctx0, x,
|
|
10589
|
+
x = ggml_add(ctx0, x, layer.ssm_conv1d_b);
|
|
10067
10590
|
|
|
10068
10591
|
x = ggml_silu(ctx0, x);
|
|
10069
10592
|
}
|
|
@@ -10071,27 +10594,27 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
10071
10594
|
// ssm
|
|
10072
10595
|
{
|
|
10073
10596
|
// {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
|
|
10074
|
-
ggml_tensor * x_db = build_lora_mm(
|
|
10597
|
+
ggml_tensor * x_db = build_lora_mm(layer.ssm_x, x);
|
|
10075
10598
|
// split
|
|
10076
10599
|
ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
|
|
10077
10600
|
ggml_tensor * B = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
|
|
10078
10601
|
ggml_tensor * C = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
|
|
10079
10602
|
|
|
10080
|
-
// Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
|
|
10081
|
-
if (ssm_dt_b_c_rms) {
|
|
10082
|
-
dt =
|
|
10083
|
-
B
|
|
10084
|
-
C
|
|
10603
|
+
// Some Mamba variants (e.g. FalconMamba, Jamba) apply RMS norm in B, C & Dt layers
|
|
10604
|
+
if (ssm_dt_b_c_rms || (layer.ssm_dt_norm && layer.ssm_b_norm && layer.ssm_c_norm)) {
|
|
10605
|
+
dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il);
|
|
10606
|
+
B = build_norm(B, layer.ssm_b_norm, NULL, LLM_NORM_RMS, il);
|
|
10607
|
+
C = build_norm(C, layer.ssm_c_norm, NULL, LLM_NORM_RMS, il);
|
|
10085
10608
|
}
|
|
10086
10609
|
|
|
10087
10610
|
// {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
|
|
10088
|
-
dt = build_lora_mm(
|
|
10089
|
-
dt = ggml_add(ctx0, dt,
|
|
10611
|
+
dt = build_lora_mm(layer.ssm_dt, dt);
|
|
10612
|
+
dt = ggml_add(ctx0, dt, layer.ssm_dt_b);
|
|
10090
10613
|
|
|
10091
10614
|
cur = x;
|
|
10092
10615
|
x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
|
|
10093
10616
|
|
|
10094
|
-
ggml_tensor * A =
|
|
10617
|
+
ggml_tensor * A = layer.ssm_a;
|
|
10095
10618
|
|
|
10096
10619
|
// use the states and the indices provided by build_recurrent_state
|
|
10097
10620
|
// (this is necessary in order to properly use the states before they are overwritten,
|
|
@@ -10105,7 +10628,7 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
10105
10628
|
return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
|
|
10106
10629
|
};
|
|
10107
10630
|
|
|
10108
|
-
ggml_tensor * y_ssm = build_rs(inp,
|
|
10631
|
+
ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
|
|
10109
10632
|
|
|
10110
10633
|
// store last states
|
|
10111
10634
|
ggml_build_forward_expand(gf,
|
|
@@ -10117,28 +10640,27 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
10117
10640
|
|
|
10118
10641
|
// TODO: skip computing output earlier for unused tokens
|
|
10119
10642
|
|
|
10120
|
-
y = ggml_add(ctx0, y, ggml_mul(ctx0, cur,
|
|
10121
|
-
y =
|
|
10643
|
+
y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d));
|
|
10644
|
+
y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
|
|
10122
10645
|
|
|
10123
10646
|
// {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
|
|
10124
|
-
cur = build_lora_mm(
|
|
10647
|
+
cur = build_lora_mm(layer.ssm_out, y);
|
|
10125
10648
|
}
|
|
10126
10649
|
|
|
10127
10650
|
// {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
|
|
10128
10651
|
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
|
|
10129
|
-
// cb(cur, "mamba_out", il);
|
|
10130
10652
|
|
|
10131
10653
|
return cur;
|
|
10132
10654
|
}
|
|
10133
10655
|
|
|
10134
10656
|
ggml_tensor * build_mamba2_layer(
|
|
10135
10657
|
llm_graph_input_rs * inp,
|
|
10136
|
-
|
|
10137
|
-
|
|
10138
|
-
|
|
10139
|
-
|
|
10140
|
-
|
|
10141
|
-
const auto * mctx_cur =
|
|
10658
|
+
ggml_tensor * cur,
|
|
10659
|
+
const llama_model & model,
|
|
10660
|
+
const llama_ubatch & ubatch,
|
|
10661
|
+
int il) const {
|
|
10662
|
+
|
|
10663
|
+
const auto * mctx_cur = inp->mctx;
|
|
10142
10664
|
|
|
10143
10665
|
const auto kv_head = mctx_cur->get_head();
|
|
10144
10666
|
|
|
@@ -10153,13 +10675,13 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
10153
10675
|
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
|
10154
10676
|
|
|
10155
10677
|
GGML_ASSERT(n_seqs != 0);
|
|
10156
|
-
GGML_ASSERT(ubatch.equal_seqs);
|
|
10678
|
+
GGML_ASSERT(ubatch.equal_seqs());
|
|
10157
10679
|
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
|
10158
10680
|
|
|
10159
10681
|
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
|
10160
10682
|
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
|
10161
10683
|
|
|
10162
|
-
ggml_tensor * conv = build_rs(inp,
|
|
10684
|
+
ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
|
|
10163
10685
|
conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
|
|
10164
10686
|
|
|
10165
10687
|
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
|
|
@@ -10229,7 +10751,7 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
10229
10751
|
return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
|
|
10230
10752
|
};
|
|
10231
10753
|
|
|
10232
|
-
ggml_tensor * y_ssm = build_rs(inp,
|
|
10754
|
+
ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
|
|
10233
10755
|
|
|
10234
10756
|
// store last states
|
|
10235
10757
|
ggml_build_forward_expand(gf,
|
|
@@ -10242,11 +10764,14 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
10242
10764
|
// TODO: skip computing output earlier for unused tokens
|
|
10243
10765
|
|
|
10244
10766
|
y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
|
|
10245
|
-
y =
|
|
10767
|
+
y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
|
|
10246
10768
|
|
|
10247
10769
|
// grouped RMS norm
|
|
10248
|
-
|
|
10249
|
-
|
|
10770
|
+
if (model.layers[il].ssm_norm) {
|
|
10771
|
+
y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
|
|
10772
|
+
y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
|
|
10773
|
+
}
|
|
10774
|
+
|
|
10250
10775
|
y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
|
|
10251
10776
|
|
|
10252
10777
|
// {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
|
|
@@ -10261,23 +10786,15 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
10261
10786
|
}
|
|
10262
10787
|
};
|
|
10263
10788
|
|
|
10264
|
-
struct
|
|
10265
|
-
|
|
10266
|
-
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
10267
|
-
|
|
10268
|
-
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
10269
|
-
|
|
10270
|
-
const float f_logit_scale = hparams.f_logit_scale;
|
|
10271
|
-
|
|
10789
|
+
struct llm_build_mamba : public llm_graph_context_mamba {
|
|
10790
|
+
llm_build_mamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
|
|
10272
10791
|
ggml_tensor * cur;
|
|
10273
10792
|
ggml_tensor * inpL;
|
|
10274
10793
|
|
|
10794
|
+
// {n_embd, n_tokens}
|
|
10275
10795
|
inpL = build_inp_embd(model.tok_embd);
|
|
10276
10796
|
|
|
10277
|
-
|
|
10278
|
-
ggml_tensor * inp_pos = build_inp_pos();
|
|
10279
|
-
|
|
10280
|
-
auto * inp_attn = build_attn_inp_kv_unified();
|
|
10797
|
+
auto * rs_inp = build_rs_inp();
|
|
10281
10798
|
|
|
10282
10799
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10283
10800
|
|
|
@@ -10285,20 +10802,194 @@ struct llm_build_command_r : public llm_graph_context {
|
|
|
10285
10802
|
// norm
|
|
10286
10803
|
cur = build_norm(inpL,
|
|
10287
10804
|
model.layers[il].attn_norm, NULL,
|
|
10288
|
-
|
|
10805
|
+
LLM_NORM_RMS, il);
|
|
10289
10806
|
cb(cur, "attn_norm", il);
|
|
10290
10807
|
|
|
10291
|
-
|
|
10808
|
+
if (model.arch == LLM_ARCH_MAMBA2) {
|
|
10809
|
+
cur = build_mamba2_layer(rs_inp, cur, model, ubatch, il);
|
|
10810
|
+
} else {
|
|
10811
|
+
cur = build_mamba_layer(rs_inp, cur, model, ubatch, il);
|
|
10812
|
+
}
|
|
10292
10813
|
|
|
10293
|
-
|
|
10294
|
-
|
|
10295
|
-
|
|
10296
|
-
|
|
10297
|
-
|
|
10298
|
-
|
|
10299
|
-
|
|
10300
|
-
|
|
10301
|
-
|
|
10814
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10815
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10816
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
10817
|
+
}
|
|
10818
|
+
|
|
10819
|
+
// residual
|
|
10820
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
10821
|
+
|
|
10822
|
+
cur = build_cvec(cur, il);
|
|
10823
|
+
cb(cur, "l_out", il);
|
|
10824
|
+
|
|
10825
|
+
// input for next layer
|
|
10826
|
+
inpL = cur;
|
|
10827
|
+
}
|
|
10828
|
+
|
|
10829
|
+
// final rmsnorm
|
|
10830
|
+
cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
|
10831
|
+
|
|
10832
|
+
cb(cur, "result_norm", -1);
|
|
10833
|
+
res->t_embd = cur;
|
|
10834
|
+
|
|
10835
|
+
// lm_head
|
|
10836
|
+
cur = build_lora_mm(model.output, cur);
|
|
10837
|
+
|
|
10838
|
+
cb(cur, "result_output", -1);
|
|
10839
|
+
res->t_logits = cur;
|
|
10840
|
+
|
|
10841
|
+
ggml_build_forward_expand(gf, cur);
|
|
10842
|
+
}
|
|
10843
|
+
|
|
10844
|
+
};
|
|
10845
|
+
|
|
10846
|
+
struct llm_build_jamba : public llm_graph_context_mamba {
|
|
10847
|
+
llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
|
|
10848
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
10849
|
+
|
|
10850
|
+
ggml_tensor * cur;
|
|
10851
|
+
ggml_tensor * inpL;
|
|
10852
|
+
|
|
10853
|
+
// {n_embd, n_tokens}
|
|
10854
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
10855
|
+
|
|
10856
|
+
auto * inp_hybrid = build_inp_mem_hybrid();
|
|
10857
|
+
|
|
10858
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10859
|
+
|
|
10860
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
10861
|
+
const int64_t n_head_kv = hparams.n_head_kv(il);
|
|
10862
|
+
|
|
10863
|
+
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
10864
|
+
cb(cur, "attn_norm", il);
|
|
10865
|
+
|
|
10866
|
+
if (n_head_kv == 0) {
|
|
10867
|
+
cur = build_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
|
|
10868
|
+
} else {
|
|
10869
|
+
// Attention
|
|
10870
|
+
|
|
10871
|
+
struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
10872
|
+
struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
10873
|
+
struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
10874
|
+
|
|
10875
|
+
cb(Qcur, "Qcur", il);
|
|
10876
|
+
cb(Kcur, "Kcur", il);
|
|
10877
|
+
cb(Vcur, "Vcur", il);
|
|
10878
|
+
|
|
10879
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
10880
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
10881
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
10882
|
+
|
|
10883
|
+
cb(Qcur, "Qcur", il);
|
|
10884
|
+
cb(Kcur, "Kcur", il);
|
|
10885
|
+
cb(Vcur, "Vcur", il);
|
|
10886
|
+
|
|
10887
|
+
// No RoPE :)
|
|
10888
|
+
cur = build_attn(inp_hybrid->get_attn(), model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10889
|
+
}
|
|
10890
|
+
|
|
10891
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
10892
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10893
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
10894
|
+
}
|
|
10895
|
+
|
|
10896
|
+
// residual
|
|
10897
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, inpL, cur);
|
|
10898
|
+
cb(cur, "ffn_inp", il);
|
|
10899
|
+
|
|
10900
|
+
cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
|
|
10901
|
+
cb(cur, "ffn_norm", il);
|
|
10902
|
+
|
|
10903
|
+
// feed-forward network
|
|
10904
|
+
if (model.layers[il].ffn_gate_inp == nullptr) {
|
|
10905
|
+
// FFN
|
|
10906
|
+
cur = build_ffn(cur,
|
|
10907
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
10908
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
10909
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
10910
|
+
NULL,
|
|
10911
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
10912
|
+
cb(cur, "ffn_out", il);
|
|
10913
|
+
} else {
|
|
10914
|
+
// MoE branch
|
|
10915
|
+
cur = build_moe_ffn(cur,
|
|
10916
|
+
model.layers[il].ffn_gate_inp,
|
|
10917
|
+
model.layers[il].ffn_up_exps,
|
|
10918
|
+
model.layers[il].ffn_gate_exps,
|
|
10919
|
+
model.layers[il].ffn_down_exps,
|
|
10920
|
+
nullptr,
|
|
10921
|
+
n_expert, n_expert_used,
|
|
10922
|
+
LLM_FFN_SILU, false,
|
|
10923
|
+
false, 0.0,
|
|
10924
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
10925
|
+
il);
|
|
10926
|
+
cb(cur, "ffn_moe_out", il);
|
|
10927
|
+
}
|
|
10928
|
+
|
|
10929
|
+
// residual
|
|
10930
|
+
cur = ggml_add(ctx0, ffn_inp, cur);
|
|
10931
|
+
|
|
10932
|
+
cur = build_cvec(cur, il);
|
|
10933
|
+
cb(cur, "l_out", il);
|
|
10934
|
+
|
|
10935
|
+
// input for next layer
|
|
10936
|
+
inpL = cur;
|
|
10937
|
+
}
|
|
10938
|
+
|
|
10939
|
+
// final rmsnorm
|
|
10940
|
+
cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
|
10941
|
+
|
|
10942
|
+
cb(cur, "result_norm", -1);
|
|
10943
|
+
res->t_embd = cur;
|
|
10944
|
+
|
|
10945
|
+
// lm_head
|
|
10946
|
+
cur = build_lora_mm(model.output, cur);
|
|
10947
|
+
|
|
10948
|
+
cb(cur, "result_output", -1);
|
|
10949
|
+
res->t_logits = cur;
|
|
10950
|
+
|
|
10951
|
+
ggml_build_forward_expand(gf, cur);
|
|
10952
|
+
}
|
|
10953
|
+
};
|
|
10954
|
+
|
|
10955
|
+
struct llm_build_command_r : public llm_graph_context {
|
|
10956
|
+
llm_build_command_r(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
10957
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
10958
|
+
|
|
10959
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
10960
|
+
|
|
10961
|
+
const float f_logit_scale = hparams.f_logit_scale;
|
|
10962
|
+
|
|
10963
|
+
ggml_tensor * cur;
|
|
10964
|
+
ggml_tensor * inpL;
|
|
10965
|
+
|
|
10966
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
10967
|
+
|
|
10968
|
+
// inp_pos - contains the positions
|
|
10969
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
10970
|
+
|
|
10971
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
10972
|
+
|
|
10973
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10974
|
+
|
|
10975
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
10976
|
+
// norm
|
|
10977
|
+
cur = build_norm(inpL,
|
|
10978
|
+
model.layers[il].attn_norm, NULL,
|
|
10979
|
+
LLM_NORM, il);
|
|
10980
|
+
cb(cur, "attn_norm", il);
|
|
10981
|
+
|
|
10982
|
+
ggml_tensor * ffn_inp = cur;
|
|
10983
|
+
|
|
10984
|
+
// self-attention
|
|
10985
|
+
{
|
|
10986
|
+
// compute Q and K and RoPE them
|
|
10987
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
10988
|
+
cb(Qcur, "Qcur", il);
|
|
10989
|
+
if (model.layers[il].bq) {
|
|
10990
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
10991
|
+
cb(Qcur, "Qcur", il);
|
|
10992
|
+
}
|
|
10302
10993
|
|
|
10303
10994
|
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
10304
10995
|
cb(Kcur, "Kcur", il);
|
|
@@ -10350,7 +11041,7 @@ struct llm_build_command_r : public llm_graph_context {
|
|
|
10350
11041
|
cb(Kcur, "Kcur", il);
|
|
10351
11042
|
cb(Vcur, "Vcur", il);
|
|
10352
11043
|
|
|
10353
|
-
cur = build_attn(inp_attn,
|
|
11044
|
+
cur = build_attn(inp_attn,
|
|
10354
11045
|
model.layers[il].wo, model.layers[il].bo,
|
|
10355
11046
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10356
11047
|
}
|
|
@@ -10409,7 +11100,7 @@ struct llm_build_command_r : public llm_graph_context {
|
|
|
10409
11100
|
};
|
|
10410
11101
|
|
|
10411
11102
|
struct llm_build_cohere2_iswa : public llm_graph_context {
|
|
10412
|
-
llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params
|
|
11103
|
+
llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
10413
11104
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
10414
11105
|
|
|
10415
11106
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -10485,7 +11176,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
|
|
|
10485
11176
|
cb(Kcur, "Kcur", il);
|
|
10486
11177
|
cb(Vcur, "Vcur", il);
|
|
10487
11178
|
|
|
10488
|
-
cur = build_attn(inp_attn,
|
|
11179
|
+
cur = build_attn(inp_attn,
|
|
10489
11180
|
model.layers[il].wo, model.layers[il].bo,
|
|
10490
11181
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10491
11182
|
}
|
|
@@ -10545,7 +11236,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
|
|
|
10545
11236
|
// * removed bias
|
|
10546
11237
|
// * removed MoE
|
|
10547
11238
|
struct llm_build_olmo : public llm_graph_context {
|
|
10548
|
-
llm_build_olmo(const llama_model & model, const llm_graph_params & params
|
|
11239
|
+
llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
10549
11240
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
10550
11241
|
|
|
10551
11242
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -10616,7 +11307,7 @@ struct llm_build_olmo : public llm_graph_context {
|
|
|
10616
11307
|
cb(Kcur, "Kcur", il);
|
|
10617
11308
|
cb(Vcur, "Vcur", il);
|
|
10618
11309
|
|
|
10619
|
-
cur = build_attn(inp_attn,
|
|
11310
|
+
cur = build_attn(inp_attn,
|
|
10620
11311
|
model.layers[il].wo, nullptr,
|
|
10621
11312
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10622
11313
|
}
|
|
@@ -10673,7 +11364,7 @@ struct llm_build_olmo : public llm_graph_context {
|
|
|
10673
11364
|
};
|
|
10674
11365
|
|
|
10675
11366
|
struct llm_build_olmo2 : public llm_graph_context {
|
|
10676
|
-
llm_build_olmo2(const llama_model & model, const llm_graph_params & params
|
|
11367
|
+
llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
10677
11368
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
10678
11369
|
|
|
10679
11370
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -10736,7 +11427,7 @@ struct llm_build_olmo2 : public llm_graph_context {
|
|
|
10736
11427
|
cb(Kcur, "Kcur", il);
|
|
10737
11428
|
cb(Vcur, "Vcur", il);
|
|
10738
11429
|
|
|
10739
|
-
cur = build_attn(inp_attn,
|
|
11430
|
+
cur = build_attn(inp_attn,
|
|
10740
11431
|
model.layers[il].wo, NULL,
|
|
10741
11432
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10742
11433
|
}
|
|
@@ -10802,7 +11493,7 @@ struct llm_build_olmo2 : public llm_graph_context {
|
|
|
10802
11493
|
// * removed bias
|
|
10803
11494
|
// * added q, k norm
|
|
10804
11495
|
struct llm_build_olmoe : public llm_graph_context {
|
|
10805
|
-
llm_build_olmoe(const llama_model & model, const llm_graph_params & params
|
|
11496
|
+
llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
10806
11497
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
10807
11498
|
|
|
10808
11499
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -10869,7 +11560,7 @@ struct llm_build_olmoe : public llm_graph_context {
|
|
|
10869
11560
|
cb(Kcur, "Kcur", il);
|
|
10870
11561
|
cb(Vcur, "Vcur", il);
|
|
10871
11562
|
|
|
10872
|
-
cur = build_attn(inp_attn,
|
|
11563
|
+
cur = build_attn(inp_attn,
|
|
10873
11564
|
model.layers[il].wo, NULL,
|
|
10874
11565
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10875
11566
|
}
|
|
@@ -10930,7 +11621,7 @@ struct llm_build_olmoe : public llm_graph_context {
|
|
|
10930
11621
|
};
|
|
10931
11622
|
|
|
10932
11623
|
struct llm_build_openelm : public llm_graph_context {
|
|
10933
|
-
llm_build_openelm(const llama_model & model, const llm_graph_params & params
|
|
11624
|
+
llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
10934
11625
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
10935
11626
|
|
|
10936
11627
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -11002,7 +11693,7 @@ struct llm_build_openelm : public llm_graph_context {
|
|
|
11002
11693
|
cb(Kcur, "Kcur", il);
|
|
11003
11694
|
cb(Qcur, "Vcur", il);
|
|
11004
11695
|
|
|
11005
|
-
cur = build_attn(inp_attn,
|
|
11696
|
+
cur = build_attn(inp_attn,
|
|
11006
11697
|
model.layers[il].wo, NULL,
|
|
11007
11698
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11008
11699
|
}
|
|
@@ -11059,7 +11750,7 @@ struct llm_build_openelm : public llm_graph_context {
|
|
|
11059
11750
|
};
|
|
11060
11751
|
|
|
11061
11752
|
struct llm_build_gptneox : public llm_graph_context {
|
|
11062
|
-
llm_build_gptneox(const llama_model & model, const llm_graph_params & params
|
|
11753
|
+
llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
11063
11754
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
11064
11755
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
11065
11756
|
|
|
@@ -11114,7 +11805,7 @@ struct llm_build_gptneox : public llm_graph_context {
|
|
|
11114
11805
|
cb(Kcur, "Kcur", il);
|
|
11115
11806
|
cb(Vcur, "Vcur", il);
|
|
11116
11807
|
|
|
11117
|
-
cur = build_attn(inp_attn,
|
|
11808
|
+
cur = build_attn(inp_attn,
|
|
11118
11809
|
model.layers[il].wo, model.layers[il].bo,
|
|
11119
11810
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11120
11811
|
}
|
|
@@ -11205,7 +11896,7 @@ struct llm_build_gptneox : public llm_graph_context {
|
|
|
11205
11896
|
};
|
|
11206
11897
|
|
|
11207
11898
|
struct llm_build_arctic : public llm_graph_context {
|
|
11208
|
-
llm_build_arctic(const llama_model & model, const llm_graph_params & params
|
|
11899
|
+
llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
11209
11900
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
11210
11901
|
|
|
11211
11902
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -11264,7 +11955,7 @@ struct llm_build_arctic : public llm_graph_context {
|
|
|
11264
11955
|
cb(Kcur, "Kcur", il);
|
|
11265
11956
|
cb(Vcur, "Vcur", il);
|
|
11266
11957
|
|
|
11267
|
-
cur = build_attn(inp_attn,
|
|
11958
|
+
cur = build_attn(inp_attn,
|
|
11268
11959
|
model.layers[il].wo, NULL,
|
|
11269
11960
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11270
11961
|
}
|
|
@@ -11343,7 +12034,7 @@ struct llm_build_arctic : public llm_graph_context {
|
|
|
11343
12034
|
};
|
|
11344
12035
|
|
|
11345
12036
|
struct llm_build_deepseek : public llm_graph_context {
|
|
11346
|
-
llm_build_deepseek(const llama_model & model, const llm_graph_params & params
|
|
12037
|
+
llm_build_deepseek(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
11347
12038
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
11348
12039
|
|
|
11349
12040
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -11419,7 +12110,7 @@ struct llm_build_deepseek : public llm_graph_context {
|
|
|
11419
12110
|
cb(Kcur, "Kcur", il);
|
|
11420
12111
|
cb(Vcur, "Vcur", il);
|
|
11421
12112
|
|
|
11422
|
-
cur = build_attn(inp_attn,
|
|
12113
|
+
cur = build_attn(inp_attn,
|
|
11423
12114
|
model.layers[il].wo, model.layers[il].bo,
|
|
11424
12115
|
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
11425
12116
|
}
|
|
@@ -11505,7 +12196,7 @@ struct llm_build_deepseek : public llm_graph_context {
|
|
|
11505
12196
|
};
|
|
11506
12197
|
|
|
11507
12198
|
struct llm_build_deepseek2 : public llm_graph_context {
|
|
11508
|
-
llm_build_deepseek2(const llama_model & model, const llm_graph_params & params
|
|
12199
|
+
llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
11509
12200
|
bool is_lite = (hparams.n_layer == 27);
|
|
11510
12201
|
|
|
11511
12202
|
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
|
|
@@ -11647,7 +12338,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
11647
12338
|
cb(Vcur, "Vcur", il);
|
|
11648
12339
|
|
|
11649
12340
|
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
|
|
11650
|
-
cur = build_attn(inp_attn,
|
|
12341
|
+
cur = build_attn(inp_attn,
|
|
11651
12342
|
model.layers[il].wo, NULL,
|
|
11652
12343
|
Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
|
|
11653
12344
|
} else {
|
|
@@ -11681,7 +12372,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
11681
12372
|
cb(Kcur, "Kcur", il);
|
|
11682
12373
|
|
|
11683
12374
|
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
|
|
11684
|
-
cur = build_attn(inp_attn,
|
|
12375
|
+
cur = build_attn(inp_attn,
|
|
11685
12376
|
model.layers[il].wo, NULL,
|
|
11686
12377
|
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
11687
12378
|
}
|
|
@@ -11768,7 +12459,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
11768
12459
|
};
|
|
11769
12460
|
|
|
11770
12461
|
struct llm_build_bitnet : public llm_graph_context {
|
|
11771
|
-
llm_build_bitnet(const llama_model & model, const llm_graph_params & params
|
|
12462
|
+
llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
11772
12463
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
11773
12464
|
|
|
11774
12465
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -11848,7 +12539,7 @@ struct llm_build_bitnet : public llm_graph_context {
|
|
|
11848
12539
|
cb(Kcur, "Kcur", il);
|
|
11849
12540
|
cb(Vcur, "Vcur", il);
|
|
11850
12541
|
|
|
11851
|
-
cur = build_attn(inp_attn,
|
|
12542
|
+
cur = build_attn(inp_attn,
|
|
11852
12543
|
NULL, NULL,
|
|
11853
12544
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11854
12545
|
|
|
@@ -11928,7 +12619,7 @@ struct llm_build_bitnet : public llm_graph_context {
|
|
|
11928
12619
|
};
|
|
11929
12620
|
|
|
11930
12621
|
struct llm_build_t5_enc : public llm_graph_context {
|
|
11931
|
-
llm_build_t5_enc(const llama_model & model, const llm_graph_params & params
|
|
12622
|
+
llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
11932
12623
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
11933
12624
|
|
|
11934
12625
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -11971,7 +12662,7 @@ struct llm_build_t5_enc : public llm_graph_context {
|
|
|
11971
12662
|
ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
|
|
11972
12663
|
ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b);
|
|
11973
12664
|
|
|
11974
|
-
cur = build_attn(inp_attn,
|
|
12665
|
+
cur = build_attn(inp_attn,
|
|
11975
12666
|
model.layers[il].wo_enc, nullptr,
|
|
11976
12667
|
Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
|
|
11977
12668
|
cb(cur, "kqv_out", il);
|
|
@@ -12029,7 +12720,7 @@ struct llm_build_t5_enc : public llm_graph_context {
|
|
|
12029
12720
|
};
|
|
12030
12721
|
|
|
12031
12722
|
struct llm_build_t5_dec : public llm_graph_context {
|
|
12032
|
-
llm_build_t5_dec(const llama_model & model, const llm_graph_params & params
|
|
12723
|
+
llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
12033
12724
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
12034
12725
|
//const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
12035
12726
|
|
|
@@ -12077,7 +12768,7 @@ struct llm_build_t5_dec : public llm_graph_context {
|
|
|
12077
12768
|
ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
|
|
12078
12769
|
ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
|
|
12079
12770
|
|
|
12080
|
-
cur = build_attn(inp_attn_self,
|
|
12771
|
+
cur = build_attn(inp_attn_self,
|
|
12081
12772
|
model.layers[il].wo, model.layers[il].bo,
|
|
12082
12773
|
Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
|
|
12083
12774
|
cb(cur, "kqv_out", il);
|
|
@@ -12109,7 +12800,7 @@ struct llm_build_t5_dec : public llm_graph_context {
|
|
|
12109
12800
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
|
|
12110
12801
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc);
|
|
12111
12802
|
|
|
12112
|
-
cur = build_attn(inp_attn_cross,
|
|
12803
|
+
cur = build_attn(inp_attn_cross,
|
|
12113
12804
|
model.layers[il].wo_cross, nullptr,
|
|
12114
12805
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
12115
12806
|
cb(cur, "kqv_out", il);
|
|
@@ -12199,7 +12890,7 @@ struct llm_build_t5_dec : public llm_graph_context {
|
|
|
12199
12890
|
};
|
|
12200
12891
|
|
|
12201
12892
|
struct llm_build_jais : public llm_graph_context {
|
|
12202
|
-
llm_build_jais(const llama_model & model, const llm_graph_params & params
|
|
12893
|
+
llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
12203
12894
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
12204
12895
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
12205
12896
|
|
|
@@ -12241,7 +12932,7 @@ struct llm_build_jais : public llm_graph_context {
|
|
|
12241
12932
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
12242
12933
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
12243
12934
|
|
|
12244
|
-
cur = build_attn(inp_attn,
|
|
12935
|
+
cur = build_attn(inp_attn,
|
|
12245
12936
|
model.layers[il].wo, model.layers[il].bo,
|
|
12246
12937
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
|
|
12247
12938
|
}
|
|
@@ -12294,7 +12985,7 @@ struct llm_build_jais : public llm_graph_context {
|
|
|
12294
12985
|
};
|
|
12295
12986
|
|
|
12296
12987
|
struct llm_build_chatglm : public llm_graph_context {
|
|
12297
|
-
llm_build_chatglm(const llama_model & model, const llm_graph_params & params
|
|
12988
|
+
llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
12298
12989
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
12299
12990
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
12300
12991
|
|
|
@@ -12373,7 +13064,7 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
12373
13064
|
cb(Kcur, "Kcur", il);
|
|
12374
13065
|
cb(Vcur, "Vcur", il);
|
|
12375
13066
|
|
|
12376
|
-
cur = build_attn(inp_attn,
|
|
13067
|
+
cur = build_attn(inp_attn,
|
|
12377
13068
|
model.layers[il].wo, NULL,
|
|
12378
13069
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
12379
13070
|
}
|
|
@@ -12427,7 +13118,7 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
12427
13118
|
};
|
|
12428
13119
|
|
|
12429
13120
|
struct llm_build_glm4 : public llm_graph_context {
|
|
12430
|
-
llm_build_glm4(const llama_model & model, const llm_graph_params & params
|
|
13121
|
+
llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
12431
13122
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
12432
13123
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
12433
13124
|
|
|
@@ -12506,7 +13197,7 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
12506
13197
|
cb(Kcur, "Kcur", il);
|
|
12507
13198
|
cb(Vcur, "Vcur", il);
|
|
12508
13199
|
|
|
12509
|
-
cur = build_attn(inp_attn,
|
|
13200
|
+
cur = build_attn(inp_attn,
|
|
12510
13201
|
model.layers[il].wo, NULL,
|
|
12511
13202
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
12512
13203
|
}
|
|
@@ -12578,7 +13269,7 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
12578
13269
|
};
|
|
12579
13270
|
|
|
12580
13271
|
struct llm_build_nemotron : public llm_graph_context {
|
|
12581
|
-
llm_build_nemotron(const llama_model & model, const llm_graph_params & params
|
|
13272
|
+
llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
12582
13273
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
12583
13274
|
|
|
12584
13275
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -12650,7 +13341,7 @@ struct llm_build_nemotron : public llm_graph_context {
|
|
|
12650
13341
|
cb(Kcur, "Kcur", il);
|
|
12651
13342
|
cb(Vcur, "Vcur", il);
|
|
12652
13343
|
|
|
12653
|
-
cur = build_attn(inp_attn,
|
|
13344
|
+
cur = build_attn(inp_attn,
|
|
12654
13345
|
model.layers[il].wo, model.layers[il].bo,
|
|
12655
13346
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
12656
13347
|
}
|
|
@@ -12707,7 +13398,7 @@ struct llm_build_nemotron : public llm_graph_context {
|
|
|
12707
13398
|
};
|
|
12708
13399
|
|
|
12709
13400
|
struct llm_build_exaone : public llm_graph_context {
|
|
12710
|
-
llm_build_exaone(const llama_model & model, const llm_graph_params & params
|
|
13401
|
+
llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
12711
13402
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
12712
13403
|
|
|
12713
13404
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -12781,7 +13472,7 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
12781
13472
|
cb(Kcur, "Kcur", il);
|
|
12782
13473
|
cb(Vcur, "Vcur", il);
|
|
12783
13474
|
|
|
12784
|
-
cur = build_attn(inp_attn,
|
|
13475
|
+
cur = build_attn(inp_attn,
|
|
12785
13476
|
model.layers[il].wo, model.layers[il].bo,
|
|
12786
13477
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
12787
13478
|
}
|
|
@@ -12837,93 +13528,228 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
12837
13528
|
}
|
|
12838
13529
|
};
|
|
12839
13530
|
|
|
12840
|
-
|
|
12841
|
-
|
|
13531
|
+
template <bool iswa>
|
|
13532
|
+
struct llm_build_exaone4 : public llm_graph_context {
|
|
13533
|
+
llm_build_exaone4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
13534
|
+
const int64_t n_embd_head = hparams.n_embd_head_k;
|
|
12842
13535
|
|
|
12843
|
-
|
|
12844
|
-
|
|
13536
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
|
|
13537
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
12845
13538
|
|
|
12846
|
-
|
|
12847
|
-
|
|
12848
|
-
ggml_tensor * cur,
|
|
12849
|
-
ggml_tensor * x_prev,
|
|
12850
|
-
llm_arch arch) const {
|
|
12851
|
-
ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
|
|
12852
|
-
switch (arch) {
|
|
12853
|
-
case LLM_ARCH_RWKV6:
|
|
12854
|
-
{
|
|
12855
|
-
ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
|
|
12856
|
-
ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur);
|
|
13539
|
+
ggml_tensor * cur;
|
|
13540
|
+
ggml_tensor * inpL;
|
|
12857
13541
|
|
|
12858
|
-
|
|
12859
|
-
ggml_tensor * k = ggml_sqr(
|
|
12860
|
-
ctx0,
|
|
12861
|
-
ggml_relu(
|
|
12862
|
-
ctx0,
|
|
12863
|
-
build_lora_mm(layer->channel_mix_key, xk)
|
|
12864
|
-
)
|
|
12865
|
-
);
|
|
12866
|
-
cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k));
|
|
12867
|
-
} break;
|
|
12868
|
-
default:
|
|
12869
|
-
GGML_ABORT("fatal error");
|
|
12870
|
-
}
|
|
13542
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
12871
13543
|
|
|
12872
|
-
|
|
12873
|
-
|
|
13544
|
+
// inp_pos - contains the positions
|
|
13545
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
12874
13546
|
|
|
12875
|
-
|
|
12876
|
-
|
|
12877
|
-
ggml_cgraph * gf,
|
|
12878
|
-
ggml_tensor * cur,
|
|
12879
|
-
ggml_tensor * x_prev,
|
|
12880
|
-
const llama_ubatch & ubatch,
|
|
12881
|
-
int il) const {
|
|
12882
|
-
const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
|
|
13547
|
+
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
|
|
13548
|
+
inp_attn_type * inp_attn = nullptr;
|
|
12883
13549
|
|
|
12884
|
-
|
|
12885
|
-
|
|
12886
|
-
|
|
12887
|
-
|
|
12888
|
-
|
|
12889
|
-
const auto n_head = n_embd / head_size;
|
|
12890
|
-
const auto n_head_kv = hparams.n_head_kv(il);
|
|
13550
|
+
if constexpr (iswa) {
|
|
13551
|
+
inp_attn = build_attn_inp_kv_unified_iswa();
|
|
13552
|
+
} else {
|
|
13553
|
+
inp_attn = build_attn_inp_kv_unified();
|
|
13554
|
+
}
|
|
12891
13555
|
|
|
12892
|
-
|
|
13556
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12893
13557
|
|
|
12894
|
-
|
|
13558
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
13559
|
+
ggml_tensor * inpSA = inpL;
|
|
12895
13560
|
|
|
12896
|
-
|
|
13561
|
+
// use RoPE for SWA layers or non-SWA models
|
|
13562
|
+
const bool use_rope = hparams.is_swa(il) || hparams.swa_type == LLAMA_SWA_TYPE_NONE;
|
|
12897
13563
|
|
|
12898
|
-
|
|
13564
|
+
cur = inpL;
|
|
12899
13565
|
|
|
12900
|
-
|
|
12901
|
-
|
|
13566
|
+
// self-attention
|
|
13567
|
+
{
|
|
13568
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
12902
13569
|
|
|
12903
|
-
|
|
13570
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
13571
|
+
cb(Qcur, "Qcur", il);
|
|
12904
13572
|
|
|
12905
|
-
|
|
12906
|
-
|
|
12907
|
-
ggml_tanh(
|
|
12908
|
-
ctx0,
|
|
12909
|
-
ggml_mul_mat(ctx0, layer.time_mix_w1, xxx)
|
|
12910
|
-
),
|
|
12911
|
-
layer.time_mix_w1->ne[1] / 5, 1, 5, n_tokens
|
|
12912
|
-
);
|
|
13573
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
13574
|
+
cb(Kcur, "Kcur", il);
|
|
12913
13575
|
|
|
12914
|
-
|
|
13576
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
13577
|
+
cb(Vcur, "Vcur", il);
|
|
12915
13578
|
|
|
12916
|
-
|
|
12917
|
-
ctx0,
|
|
12918
|
-
|
|
12919
|
-
ctx0,
|
|
12920
|
-
layer.time_mix_w2,
|
|
12921
|
-
layer.time_mix_w2->ne[0], layer.time_mix_w2->ne[1], 1, 5
|
|
12922
|
-
),
|
|
12923
|
-
xxx
|
|
12924
|
-
);
|
|
13579
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
13580
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
13581
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
12925
13582
|
|
|
12926
|
-
|
|
13583
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
13584
|
+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
13585
|
+
cb(Qcur, "Qcur_normed", il);
|
|
13586
|
+
cb(Kcur, "Kcur_normed", il);
|
|
13587
|
+
|
|
13588
|
+
if (use_rope) {
|
|
13589
|
+
Qcur = ggml_rope_ext(
|
|
13590
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
13591
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
13592
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13593
|
+
);
|
|
13594
|
+
|
|
13595
|
+
Kcur = ggml_rope_ext(
|
|
13596
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
13597
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
13598
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13599
|
+
);
|
|
13600
|
+
}
|
|
13601
|
+
|
|
13602
|
+
cb(Qcur, "Qcur", il);
|
|
13603
|
+
cb(Kcur, "Kcur", il);
|
|
13604
|
+
cb(Vcur, "Vcur", il);
|
|
13605
|
+
|
|
13606
|
+
cur = build_attn(inp_attn,
|
|
13607
|
+
model.layers[il].wo, NULL,
|
|
13608
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13609
|
+
cb(cur, "attn_out", il);
|
|
13610
|
+
}
|
|
13611
|
+
|
|
13612
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
13613
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13614
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
13615
|
+
}
|
|
13616
|
+
|
|
13617
|
+
cur = build_norm(cur,
|
|
13618
|
+
model.layers[il].attn_post_norm, NULL,
|
|
13619
|
+
LLM_NORM_RMS, il);
|
|
13620
|
+
cb(cur, "attn_post_norm", il);
|
|
13621
|
+
|
|
13622
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
13623
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
13624
|
+
|
|
13625
|
+
// feed-forward network
|
|
13626
|
+
cur = build_ffn(ffn_inp,
|
|
13627
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
13628
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
13629
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
13630
|
+
NULL,
|
|
13631
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
13632
|
+
cb(cur, "ffn_out", il);
|
|
13633
|
+
|
|
13634
|
+
cur = build_norm(cur,
|
|
13635
|
+
model.layers[il].ffn_post_norm, NULL,
|
|
13636
|
+
LLM_NORM_RMS, -1);
|
|
13637
|
+
cb(cur, "ffn_post_norm", -1);
|
|
13638
|
+
|
|
13639
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
13640
|
+
|
|
13641
|
+
cur = build_cvec(cur, il);
|
|
13642
|
+
cb(cur, "l_out", il);
|
|
13643
|
+
|
|
13644
|
+
// input for next layer
|
|
13645
|
+
inpL = cur;
|
|
13646
|
+
}
|
|
13647
|
+
|
|
13648
|
+
cur = inpL;
|
|
13649
|
+
|
|
13650
|
+
cur = build_norm(cur,
|
|
13651
|
+
model.output_norm, NULL,
|
|
13652
|
+
LLM_NORM_RMS, -1);
|
|
13653
|
+
|
|
13654
|
+
cb(cur, "result_norm", -1);
|
|
13655
|
+
res->t_embd = cur;
|
|
13656
|
+
|
|
13657
|
+
// lm_head
|
|
13658
|
+
cur = build_lora_mm(model.output, cur);
|
|
13659
|
+
|
|
13660
|
+
cb(cur, "result_output", -1);
|
|
13661
|
+
res->t_logits = cur;
|
|
13662
|
+
|
|
13663
|
+
ggml_build_forward_expand(gf, cur);
|
|
13664
|
+
}
|
|
13665
|
+
};
|
|
13666
|
+
|
|
13667
|
+
struct llm_build_rwkv6_base : public llm_graph_context {
|
|
13668
|
+
const llama_model & model;
|
|
13669
|
+
|
|
13670
|
+
llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
|
|
13671
|
+
}
|
|
13672
|
+
|
|
13673
|
+
ggml_tensor * build_rwkv6_channel_mix(
|
|
13674
|
+
const llama_layer * layer,
|
|
13675
|
+
ggml_tensor * cur,
|
|
13676
|
+
ggml_tensor * x_prev,
|
|
13677
|
+
llm_arch arch) const {
|
|
13678
|
+
ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
|
|
13679
|
+
switch (arch) {
|
|
13680
|
+
case LLM_ARCH_RWKV6:
|
|
13681
|
+
{
|
|
13682
|
+
ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
|
|
13683
|
+
ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur);
|
|
13684
|
+
|
|
13685
|
+
ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr));
|
|
13686
|
+
ggml_tensor * k = ggml_sqr(
|
|
13687
|
+
ctx0,
|
|
13688
|
+
ggml_relu(
|
|
13689
|
+
ctx0,
|
|
13690
|
+
build_lora_mm(layer->channel_mix_key, xk)
|
|
13691
|
+
)
|
|
13692
|
+
);
|
|
13693
|
+
cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k));
|
|
13694
|
+
} break;
|
|
13695
|
+
default:
|
|
13696
|
+
GGML_ABORT("fatal error");
|
|
13697
|
+
}
|
|
13698
|
+
|
|
13699
|
+
return cur;
|
|
13700
|
+
}
|
|
13701
|
+
|
|
13702
|
+
ggml_tensor * build_rwkv6_time_mix(
|
|
13703
|
+
llm_graph_input_rs * inp,
|
|
13704
|
+
ggml_tensor * cur,
|
|
13705
|
+
ggml_tensor * x_prev,
|
|
13706
|
+
const llama_ubatch & ubatch,
|
|
13707
|
+
int il) const {
|
|
13708
|
+
const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
|
|
13709
|
+
|
|
13710
|
+
const auto n_tokens = ubatch.n_tokens;
|
|
13711
|
+
const auto n_seqs = ubatch.n_seqs;
|
|
13712
|
+
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
|
13713
|
+
const auto n_embd = hparams.n_embd;
|
|
13714
|
+
const auto head_size = hparams.wkv_head_size;
|
|
13715
|
+
const auto n_head = n_embd / head_size;
|
|
13716
|
+
const auto n_head_kv = hparams.n_head_kv(il);
|
|
13717
|
+
|
|
13718
|
+
const auto kv_head = mctx_cur->get_head();
|
|
13719
|
+
|
|
13720
|
+
const auto & layer = model.layers[il];
|
|
13721
|
+
|
|
13722
|
+
bool is_qrwkv = layer.time_mix_first == nullptr;
|
|
13723
|
+
|
|
13724
|
+
ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
|
|
13725
|
+
|
|
13726
|
+
sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens);
|
|
13727
|
+
cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
|
|
13728
|
+
|
|
13729
|
+
ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_x), cur);
|
|
13730
|
+
|
|
13731
|
+
xxx = ggml_reshape_4d(
|
|
13732
|
+
ctx0,
|
|
13733
|
+
ggml_tanh(
|
|
13734
|
+
ctx0,
|
|
13735
|
+
ggml_mul_mat(ctx0, layer.time_mix_w1, xxx)
|
|
13736
|
+
),
|
|
13737
|
+
layer.time_mix_w1->ne[1] / 5, 1, 5, n_tokens
|
|
13738
|
+
);
|
|
13739
|
+
|
|
13740
|
+
xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2));
|
|
13741
|
+
|
|
13742
|
+
xxx = ggml_mul_mat(
|
|
13743
|
+
ctx0,
|
|
13744
|
+
ggml_reshape_4d(
|
|
13745
|
+
ctx0,
|
|
13746
|
+
layer.time_mix_w2,
|
|
13747
|
+
layer.time_mix_w2->ne[0], layer.time_mix_w2->ne[1], 1, 5
|
|
13748
|
+
),
|
|
13749
|
+
xxx
|
|
13750
|
+
);
|
|
13751
|
+
|
|
13752
|
+
ggml_tensor *xw, *xk, *xv, *xr, *xg;
|
|
12927
13753
|
if (layer.time_mix_lerp_fused) {
|
|
12928
13754
|
// fusing these weights makes some performance improvement
|
|
12929
13755
|
sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens);
|
|
@@ -13001,7 +13827,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
13001
13827
|
}
|
|
13002
13828
|
|
|
13003
13829
|
ggml_tensor * wkv_state = build_rs(
|
|
13004
|
-
inp,
|
|
13830
|
+
inp, mctx_cur->get_s_l(il),
|
|
13005
13831
|
hparams.n_embd_s(), n_seqs);
|
|
13006
13832
|
|
|
13007
13833
|
ggml_tensor * wkv_output;
|
|
@@ -13047,7 +13873,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
|
|
13047
13873
|
};
|
|
13048
13874
|
|
|
13049
13875
|
struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
13050
|
-
llm_build_rwkv6(const llama_model & model, const llm_graph_params & params
|
|
13876
|
+
llm_build_rwkv6(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
|
|
13051
13877
|
GGML_ASSERT(hparams.token_shift_count == 2);
|
|
13052
13878
|
|
|
13053
13879
|
ggml_tensor * cur;
|
|
@@ -13068,7 +13894,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
13068
13894
|
const llama_layer * layer = &model.layers[il];
|
|
13069
13895
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
13070
13896
|
|
|
13071
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp,
|
|
13897
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
|
|
13072
13898
|
|
|
13073
13899
|
ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
|
|
13074
13900
|
ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
|
|
@@ -13083,7 +13909,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
13083
13909
|
1
|
|
13084
13910
|
);
|
|
13085
13911
|
|
|
13086
|
-
cur = build_rwkv6_time_mix(rs_inp,
|
|
13912
|
+
cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
|
|
13087
13913
|
|
|
13088
13914
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
13089
13915
|
cb(ffn_inp, "ffn_inp", il);
|
|
@@ -13148,7 +13974,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
|
13148
13974
|
|
|
13149
13975
|
// ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
|
|
13150
13976
|
struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
|
13151
|
-
llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params
|
|
13977
|
+
llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
|
|
13152
13978
|
GGML_ASSERT(n_embd == hparams.n_embd_r());
|
|
13153
13979
|
|
|
13154
13980
|
ggml_tensor * cur;
|
|
@@ -13168,7 +13994,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
|
|
13168
13994
|
const llama_layer * layer = &model.layers[il];
|
|
13169
13995
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
13170
13996
|
|
|
13171
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp,
|
|
13997
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
|
|
13172
13998
|
|
|
13173
13999
|
ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
|
|
13174
14000
|
cb(att_norm, "attn_norm", il);
|
|
@@ -13180,7 +14006,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
|
|
13180
14006
|
1
|
|
13181
14007
|
);
|
|
13182
14008
|
|
|
13183
|
-
cur = build_rwkv6_time_mix(rs_inp,
|
|
14009
|
+
cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
|
|
13184
14010
|
|
|
13185
14011
|
token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
|
|
13186
14012
|
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
|
|
@@ -13270,7 +14096,6 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
13270
14096
|
|
|
13271
14097
|
ggml_tensor * build_rwkv7_time_mix(
|
|
13272
14098
|
llm_graph_input_rs * inp,
|
|
13273
|
-
ggml_cgraph * gf,
|
|
13274
14099
|
ggml_tensor * cur,
|
|
13275
14100
|
ggml_tensor * x_prev,
|
|
13276
14101
|
ggml_tensor *& first_layer_value,
|
|
@@ -13356,7 +14181,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
13356
14181
|
a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
|
|
13357
14182
|
|
|
13358
14183
|
ggml_tensor * wkv_state = build_rs(
|
|
13359
|
-
inp,
|
|
14184
|
+
inp, mctx_cur->get_s_l(il),
|
|
13360
14185
|
hparams.n_embd_s(), n_seqs);
|
|
13361
14186
|
|
|
13362
14187
|
ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
|
|
@@ -13403,7 +14228,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
13403
14228
|
};
|
|
13404
14229
|
|
|
13405
14230
|
struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
13406
|
-
llm_build_rwkv7(const llama_model & model, const llm_graph_params & params
|
|
14231
|
+
llm_build_rwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
|
|
13407
14232
|
GGML_ASSERT(hparams.token_shift_count == 2);
|
|
13408
14233
|
|
|
13409
14234
|
ggml_tensor * cur;
|
|
@@ -13425,7 +14250,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
13425
14250
|
const llama_layer * layer = &model.layers[il];
|
|
13426
14251
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
13427
14252
|
|
|
13428
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp,
|
|
14253
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
|
|
13429
14254
|
|
|
13430
14255
|
ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
|
|
13431
14256
|
ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
|
|
@@ -13440,7 +14265,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
13440
14265
|
1
|
|
13441
14266
|
);
|
|
13442
14267
|
|
|
13443
|
-
cur = build_rwkv7_time_mix(rs_inp,
|
|
14268
|
+
cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
|
|
13444
14269
|
|
|
13445
14270
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
13446
14271
|
cb(ffn_inp, "ffn_inp", il);
|
|
@@ -13499,7 +14324,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
|
|
13499
14324
|
|
|
13500
14325
|
|
|
13501
14326
|
struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
13502
|
-
llm_build_arwkv7(const llama_model & model, const llm_graph_params & params
|
|
14327
|
+
llm_build_arwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
|
|
13503
14328
|
GGML_ASSERT(n_embd == hparams.n_embd_r());
|
|
13504
14329
|
|
|
13505
14330
|
ggml_tensor * cur;
|
|
@@ -13520,7 +14345,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
13520
14345
|
const llama_layer * layer = &model.layers[il];
|
|
13521
14346
|
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
13522
14347
|
|
|
13523
|
-
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp,
|
|
14348
|
+
ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
|
|
13524
14349
|
|
|
13525
14350
|
ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
|
|
13526
14351
|
cb(att_norm, "attn_norm", il);
|
|
@@ -13532,7 +14357,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
13532
14357
|
1
|
|
13533
14358
|
);
|
|
13534
14359
|
|
|
13535
|
-
cur = build_rwkv7_time_mix(rs_inp,
|
|
14360
|
+
cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
|
|
13536
14361
|
|
|
13537
14362
|
token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
|
|
13538
14363
|
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
|
|
@@ -13586,13 +14411,10 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
|
|
13586
14411
|
}
|
|
13587
14412
|
};
|
|
13588
14413
|
|
|
13589
|
-
|
|
13590
14414
|
struct llm_build_granite : public llm_graph_context {
|
|
13591
14415
|
llm_build_granite(
|
|
13592
14416
|
const llama_model & model,
|
|
13593
|
-
const llm_graph_params & params
|
|
13594
|
-
ggml_cgraph * gf,
|
|
13595
|
-
const bool use_rope = true)
|
|
14417
|
+
const llm_graph_params & params)
|
|
13596
14418
|
: llm_graph_context(params) {
|
|
13597
14419
|
|
|
13598
14420
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
@@ -13607,14 +14429,12 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
13607
14429
|
|
|
13608
14430
|
// inp_pos - built only if rope enabled
|
|
13609
14431
|
ggml_tensor * inp_pos = nullptr;
|
|
13610
|
-
if (
|
|
14432
|
+
if (hparams.rope_finetuned) {
|
|
13611
14433
|
inp_pos = build_inp_pos();
|
|
13612
14434
|
}
|
|
13613
14435
|
|
|
13614
14436
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
13615
14437
|
|
|
13616
|
-
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
13617
|
-
|
|
13618
14438
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13619
14439
|
|
|
13620
14440
|
for (int il = 0; il < n_layer; ++il) {
|
|
@@ -13627,128 +14447,234 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
13627
14447
|
cb(cur, "attn_norm", il);
|
|
13628
14448
|
|
|
13629
14449
|
// self-attention
|
|
13630
|
-
|
|
13631
|
-
|
|
13632
|
-
|
|
13633
|
-
cb(Qcur, "Qcur", il);
|
|
13634
|
-
if (model.layers[il].bq) {
|
|
13635
|
-
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
13636
|
-
cb(Qcur, "Qcur", il);
|
|
13637
|
-
}
|
|
14450
|
+
cur = build_attention_layer(
|
|
14451
|
+
cur, inp_pos, inp_attn,
|
|
14452
|
+
model, n_embd_head, il);
|
|
13638
14453
|
|
|
13639
|
-
|
|
13640
|
-
|
|
13641
|
-
|
|
13642
|
-
|
|
13643
|
-
cb(Kcur, "Kcur", il);
|
|
13644
|
-
}
|
|
14454
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
14455
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
14456
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
14457
|
+
}
|
|
13645
14458
|
|
|
13646
|
-
|
|
13647
|
-
|
|
13648
|
-
if (model.layers[il].bv) {
|
|
13649
|
-
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
13650
|
-
cb(Vcur, "Vcur", il);
|
|
13651
|
-
}
|
|
14459
|
+
// ffn
|
|
14460
|
+
cur = build_layer_ffn(cur, inpSA, model, il);
|
|
13652
14461
|
|
|
13653
|
-
|
|
13654
|
-
|
|
13655
|
-
|
|
14462
|
+
// input for next layer
|
|
14463
|
+
inpL = cur;
|
|
14464
|
+
}
|
|
13656
14465
|
|
|
13657
|
-
|
|
13658
|
-
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
13659
|
-
Qcur = ggml_rope_ext(
|
|
13660
|
-
ctx0, Qcur, inp_pos, rope_factors,
|
|
13661
|
-
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
13662
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13663
|
-
);
|
|
14466
|
+
cur = inpL;
|
|
13664
14467
|
|
|
13665
|
-
|
|
13666
|
-
|
|
13667
|
-
|
|
13668
|
-
ext_factor, attn_factor, beta_fast, beta_slow
|
|
13669
|
-
);
|
|
13670
|
-
}
|
|
14468
|
+
cur = build_norm(cur,
|
|
14469
|
+
model.output_norm, NULL,
|
|
14470
|
+
LLM_NORM_RMS, -1);
|
|
13671
14471
|
|
|
13672
|
-
|
|
13673
|
-
|
|
13674
|
-
cb(Vcur, "Vcur", il);
|
|
14472
|
+
cb(cur, "result_norm", -1);
|
|
14473
|
+
res->t_embd = cur;
|
|
13675
14474
|
|
|
13676
|
-
|
|
13677
|
-
|
|
13678
|
-
|
|
14475
|
+
// lm_head
|
|
14476
|
+
cur = build_lora_mm(model.output, cur);
|
|
14477
|
+
|
|
14478
|
+
// For Granite architectures - scale logits
|
|
14479
|
+
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
|
|
14480
|
+
cb(cur, "result_output", -1);
|
|
14481
|
+
res->t_logits = cur;
|
|
14482
|
+
|
|
14483
|
+
ggml_build_forward_expand(gf, cur);
|
|
14484
|
+
}
|
|
14485
|
+
|
|
14486
|
+
ggml_tensor * build_attention_layer(
|
|
14487
|
+
ggml_tensor * cur,
|
|
14488
|
+
ggml_tensor * inp_pos,
|
|
14489
|
+
llm_graph_input_attn_kv_unified * inp_attn,
|
|
14490
|
+
const llama_model & model,
|
|
14491
|
+
const int64_t n_embd_head,
|
|
14492
|
+
const int il) {
|
|
14493
|
+
|
|
14494
|
+
// compute Q and K and (optionally) RoPE them
|
|
14495
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
14496
|
+
cb(Qcur, "Qcur", il);
|
|
14497
|
+
if (model.layers[il].bq) {
|
|
14498
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
14499
|
+
cb(Qcur, "Qcur", il);
|
|
14500
|
+
}
|
|
14501
|
+
|
|
14502
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
14503
|
+
cb(Kcur, "Kcur", il);
|
|
14504
|
+
if (model.layers[il].bk) {
|
|
14505
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
14506
|
+
cb(Kcur, "Kcur", il);
|
|
14507
|
+
}
|
|
14508
|
+
|
|
14509
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
14510
|
+
cb(Vcur, "Vcur", il);
|
|
14511
|
+
if (model.layers[il].bv) {
|
|
14512
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
14513
|
+
cb(Vcur, "Vcur", il);
|
|
14514
|
+
}
|
|
14515
|
+
|
|
14516
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
|
|
14517
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
|
|
14518
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
|
|
14519
|
+
|
|
14520
|
+
const bool use_rope = hparams.rope_finetuned;
|
|
14521
|
+
if (use_rope) {
|
|
14522
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
14523
|
+
Qcur = ggml_rope_ext(
|
|
14524
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
14525
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14526
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14527
|
+
);
|
|
14528
|
+
|
|
14529
|
+
Kcur = ggml_rope_ext(
|
|
14530
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
14531
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14532
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14533
|
+
);
|
|
14534
|
+
}
|
|
14535
|
+
|
|
14536
|
+
cb(Qcur, "Qcur", il);
|
|
14537
|
+
cb(Kcur, "Kcur", il);
|
|
14538
|
+
cb(Vcur, "Vcur", il);
|
|
14539
|
+
|
|
14540
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
14541
|
+
cur = build_attn(inp_attn,
|
|
14542
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
14543
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
13679
14544
|
cb(cur, "attn_out", il);
|
|
13680
|
-
|
|
14545
|
+
return cur;
|
|
14546
|
+
}
|
|
13681
14547
|
|
|
13682
|
-
|
|
13683
|
-
|
|
13684
|
-
|
|
13685
|
-
|
|
14548
|
+
ggml_tensor * build_layer_ffn(
|
|
14549
|
+
ggml_tensor * cur,
|
|
14550
|
+
ggml_tensor * inpSA,
|
|
14551
|
+
const llama_model & model,
|
|
14552
|
+
const int il) {
|
|
13686
14553
|
|
|
13687
|
-
|
|
14554
|
+
// For Granite architectures - scale residual
|
|
14555
|
+
if (hparams.f_residual_scale) {
|
|
13688
14556
|
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
|
13689
|
-
|
|
13690
|
-
|
|
14557
|
+
}
|
|
14558
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
14559
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
13691
14560
|
|
|
13692
|
-
|
|
13693
|
-
|
|
14561
|
+
// feed-forward network (non-MoE)
|
|
14562
|
+
if (model.layers[il].ffn_gate_inp == nullptr) {
|
|
13694
14563
|
|
|
13695
|
-
|
|
13696
|
-
|
|
13697
|
-
|
|
13698
|
-
|
|
14564
|
+
cur = build_norm(ffn_inp,
|
|
14565
|
+
model.layers[il].ffn_norm, NULL,
|
|
14566
|
+
LLM_NORM_RMS, il);
|
|
14567
|
+
cb(cur, "ffn_norm", il);
|
|
13699
14568
|
|
|
13700
|
-
|
|
13701
|
-
|
|
13702
|
-
|
|
13703
|
-
|
|
13704
|
-
|
|
13705
|
-
|
|
13706
|
-
|
|
14569
|
+
cur = build_ffn(cur,
|
|
14570
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
14571
|
+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
|
14572
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
14573
|
+
NULL,
|
|
14574
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
14575
|
+
cb(cur, "ffn_out", il);
|
|
13707
14576
|
|
|
13708
|
-
|
|
13709
|
-
|
|
13710
|
-
|
|
13711
|
-
|
|
13712
|
-
|
|
13713
|
-
|
|
14577
|
+
} else {
|
|
14578
|
+
// MoE branch
|
|
14579
|
+
cur = build_norm(ffn_inp,
|
|
14580
|
+
model.layers[il].ffn_norm, NULL,
|
|
14581
|
+
LLM_NORM_RMS, il);
|
|
14582
|
+
cb(cur, "ffn_norm", il);
|
|
13714
14583
|
|
|
13715
|
-
|
|
13716
|
-
|
|
13717
|
-
|
|
13718
|
-
|
|
13719
|
-
|
|
13720
|
-
|
|
13721
|
-
|
|
13722
|
-
|
|
13723
|
-
|
|
13724
|
-
|
|
13725
|
-
|
|
13726
|
-
|
|
14584
|
+
ggml_tensor * moe_out = build_moe_ffn(cur,
|
|
14585
|
+
model.layers[il].ffn_gate_inp,
|
|
14586
|
+
model.layers[il].ffn_up_exps,
|
|
14587
|
+
model.layers[il].ffn_gate_exps,
|
|
14588
|
+
model.layers[il].ffn_down_exps,
|
|
14589
|
+
nullptr,
|
|
14590
|
+
n_expert, n_expert_used,
|
|
14591
|
+
LLM_FFN_SILU, true,
|
|
14592
|
+
false, 0.0,
|
|
14593
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
14594
|
+
il);
|
|
14595
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
13727
14596
|
|
|
13728
|
-
|
|
13729
|
-
|
|
13730
|
-
|
|
13731
|
-
|
|
13732
|
-
|
|
13733
|
-
|
|
13734
|
-
|
|
13735
|
-
|
|
13736
|
-
|
|
14597
|
+
// For Granite MoE Shared
|
|
14598
|
+
if (hparams.n_ff_shexp > 0) {
|
|
14599
|
+
ggml_tensor * ffn_shexp = build_ffn(cur,
|
|
14600
|
+
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
14601
|
+
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
14602
|
+
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
14603
|
+
NULL,
|
|
14604
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
14605
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
|
13737
14606
|
|
|
13738
|
-
|
|
13739
|
-
|
|
13740
|
-
|
|
13741
|
-
|
|
13742
|
-
}
|
|
14607
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
|
14608
|
+
cb(cur, "ffn_out", il);
|
|
14609
|
+
} else {
|
|
14610
|
+
cur = moe_out;
|
|
13743
14611
|
}
|
|
14612
|
+
}
|
|
13744
14613
|
|
|
13745
|
-
|
|
14614
|
+
// For Granite architectures - scale residual
|
|
14615
|
+
if (hparams.f_residual_scale) {
|
|
13746
14616
|
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
|
13747
|
-
|
|
13748
|
-
|
|
14617
|
+
}
|
|
14618
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
14619
|
+
cb(cur, "ffn_out", il);
|
|
13749
14620
|
|
|
13750
|
-
|
|
13751
|
-
|
|
14621
|
+
cur = build_cvec(cur, il);
|
|
14622
|
+
cb(cur, "l_out", il);
|
|
14623
|
+
|
|
14624
|
+
return cur;
|
|
14625
|
+
}
|
|
14626
|
+
};
|
|
14627
|
+
|
|
14628
|
+
struct llm_build_granite_hybrid : public llm_graph_context_mamba {
|
|
14629
|
+
llm_build_granite_hybrid(
|
|
14630
|
+
const llama_model & model,
|
|
14631
|
+
const llm_graph_params & params) :
|
|
14632
|
+
llm_graph_context_mamba(params) {
|
|
14633
|
+
|
|
14634
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
14635
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
14636
|
+
|
|
14637
|
+
ggml_tensor * cur;
|
|
14638
|
+
ggml_tensor * inpL;
|
|
14639
|
+
|
|
14640
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
14641
|
+
|
|
14642
|
+
auto * inp = build_inp_mem_hybrid();
|
|
14643
|
+
|
|
14644
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
14645
|
+
|
|
14646
|
+
// Positional embeddings populated if rope enabled
|
|
14647
|
+
ggml_tensor * inp_pos = nullptr;
|
|
14648
|
+
if (hparams.rope_finetuned) {
|
|
14649
|
+
inp_pos = build_inp_pos();
|
|
14650
|
+
}
|
|
14651
|
+
|
|
14652
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
14653
|
+
struct ggml_tensor * inpSA = inpL;
|
|
14654
|
+
|
|
14655
|
+
// norm
|
|
14656
|
+
cur = build_norm(inpL,
|
|
14657
|
+
model.layers[il].attn_norm, NULL,
|
|
14658
|
+
LLM_NORM_RMS, il);
|
|
14659
|
+
cb(cur, "attn_norm", il);
|
|
14660
|
+
|
|
14661
|
+
if (hparams.is_recurrent(il)) {
|
|
14662
|
+
// ssm layer //
|
|
14663
|
+
cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
|
|
14664
|
+
} else {
|
|
14665
|
+
// attention layer //
|
|
14666
|
+
cur = build_attention_layer(
|
|
14667
|
+
cur, inp_pos, inp->get_attn(), model,
|
|
14668
|
+
n_embd_head, il);
|
|
14669
|
+
}
|
|
14670
|
+
|
|
14671
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
14672
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
14673
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
14674
|
+
}
|
|
14675
|
+
|
|
14676
|
+
// ffn
|
|
14677
|
+
cur = build_layer_ffn(cur, inpSA, model, il);
|
|
13752
14678
|
|
|
13753
14679
|
// input for next layer
|
|
13754
14680
|
inpL = cur;
|
|
@@ -13760,18 +14686,161 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
13760
14686
|
model.output_norm, NULL,
|
|
13761
14687
|
LLM_NORM_RMS, -1);
|
|
13762
14688
|
|
|
13763
|
-
cb(cur, "result_norm", -1);
|
|
13764
|
-
res->t_embd = cur;
|
|
14689
|
+
cb(cur, "result_norm", -1);
|
|
14690
|
+
res->t_embd = cur;
|
|
14691
|
+
|
|
14692
|
+
// lm_head
|
|
14693
|
+
cur = build_lora_mm(model.output, cur);
|
|
14694
|
+
|
|
14695
|
+
// For Granite architectures - scale logits
|
|
14696
|
+
if (hparams.f_logit_scale) {
|
|
14697
|
+
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
|
|
14698
|
+
}
|
|
14699
|
+
cb(cur, "result_output", -1);
|
|
14700
|
+
res->t_logits = cur;
|
|
14701
|
+
|
|
14702
|
+
ggml_build_forward_expand(gf, cur);
|
|
14703
|
+
}
|
|
14704
|
+
|
|
14705
|
+
ggml_tensor * build_attention_layer(
|
|
14706
|
+
ggml_tensor * cur,
|
|
14707
|
+
ggml_tensor * inp_pos,
|
|
14708
|
+
llm_graph_input_attn_kv_unified * inp_attn,
|
|
14709
|
+
const llama_model & model,
|
|
14710
|
+
const int64_t n_embd_head,
|
|
14711
|
+
const int il) {
|
|
14712
|
+
|
|
14713
|
+
// compute Q and K and (optionally) RoPE them
|
|
14714
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
14715
|
+
cb(Qcur, "Qcur", il);
|
|
14716
|
+
if (model.layers[il].bq) {
|
|
14717
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
14718
|
+
cb(Qcur, "Qcur", il);
|
|
14719
|
+
}
|
|
14720
|
+
|
|
14721
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
14722
|
+
cb(Kcur, "Kcur", il);
|
|
14723
|
+
if (model.layers[il].bk) {
|
|
14724
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
14725
|
+
cb(Kcur, "Kcur", il);
|
|
14726
|
+
}
|
|
14727
|
+
|
|
14728
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
14729
|
+
cb(Vcur, "Vcur", il);
|
|
14730
|
+
if (model.layers[il].bv) {
|
|
14731
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
14732
|
+
cb(Vcur, "Vcur", il);
|
|
14733
|
+
}
|
|
14734
|
+
|
|
14735
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
|
|
14736
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
|
|
14737
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
|
|
14738
|
+
|
|
14739
|
+
const bool use_rope = hparams.rope_finetuned;
|
|
14740
|
+
if (use_rope) {
|
|
14741
|
+
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
|
14742
|
+
Qcur = ggml_rope_ext(
|
|
14743
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
14744
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14745
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14746
|
+
);
|
|
14747
|
+
|
|
14748
|
+
Kcur = ggml_rope_ext(
|
|
14749
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
14750
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14751
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14752
|
+
);
|
|
14753
|
+
}
|
|
14754
|
+
|
|
14755
|
+
cb(Qcur, "Qcur", il);
|
|
14756
|
+
cb(Kcur, "Kcur", il);
|
|
14757
|
+
cb(Vcur, "Vcur", il);
|
|
14758
|
+
|
|
14759
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
14760
|
+
cur = build_attn(inp_attn,
|
|
14761
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
14762
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
14763
|
+
cb(cur, "attn_out", il);
|
|
14764
|
+
return cur;
|
|
14765
|
+
}
|
|
14766
|
+
|
|
14767
|
+
ggml_tensor * build_layer_ffn(
|
|
14768
|
+
ggml_tensor * cur,
|
|
14769
|
+
ggml_tensor * inpSA,
|
|
14770
|
+
const llama_model & model,
|
|
14771
|
+
const int il) {
|
|
14772
|
+
|
|
14773
|
+
// For Granite architectures - scale residual
|
|
14774
|
+
if (hparams.f_residual_scale) {
|
|
14775
|
+
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
|
14776
|
+
}
|
|
14777
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
14778
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
14779
|
+
|
|
14780
|
+
// feed-forward network (non-MoE)
|
|
14781
|
+
if (model.layers[il].ffn_gate_inp == nullptr) {
|
|
14782
|
+
|
|
14783
|
+
cur = build_norm(ffn_inp,
|
|
14784
|
+
model.layers[il].ffn_norm, NULL,
|
|
14785
|
+
LLM_NORM_RMS, il);
|
|
14786
|
+
cb(cur, "ffn_norm", il);
|
|
14787
|
+
|
|
14788
|
+
cur = build_ffn(cur,
|
|
14789
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
14790
|
+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
|
14791
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
14792
|
+
NULL,
|
|
14793
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
14794
|
+
cb(cur, "ffn_out", il);
|
|
14795
|
+
|
|
14796
|
+
} else {
|
|
14797
|
+
// MoE branch
|
|
14798
|
+
cur = build_norm(ffn_inp,
|
|
14799
|
+
model.layers[il].ffn_norm, NULL,
|
|
14800
|
+
LLM_NORM_RMS, il);
|
|
14801
|
+
cb(cur, "ffn_norm", il);
|
|
14802
|
+
|
|
14803
|
+
ggml_tensor * moe_out = build_moe_ffn(cur,
|
|
14804
|
+
model.layers[il].ffn_gate_inp,
|
|
14805
|
+
model.layers[il].ffn_up_exps,
|
|
14806
|
+
model.layers[il].ffn_gate_exps,
|
|
14807
|
+
model.layers[il].ffn_down_exps,
|
|
14808
|
+
nullptr,
|
|
14809
|
+
n_expert, n_expert_used,
|
|
14810
|
+
LLM_FFN_SILU, true,
|
|
14811
|
+
false, 0.0,
|
|
14812
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
14813
|
+
il);
|
|
14814
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
14815
|
+
|
|
14816
|
+
// For Granite MoE Shared
|
|
14817
|
+
if (hparams.n_ff_shexp > 0) {
|
|
14818
|
+
ggml_tensor * ffn_shexp = build_ffn(cur,
|
|
14819
|
+
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
14820
|
+
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
14821
|
+
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
14822
|
+
NULL,
|
|
14823
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
14824
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
|
14825
|
+
|
|
14826
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
|
14827
|
+
cb(cur, "ffn_out", il);
|
|
14828
|
+
} else {
|
|
14829
|
+
cur = moe_out;
|
|
14830
|
+
}
|
|
14831
|
+
}
|
|
13765
14832
|
|
|
13766
|
-
//
|
|
13767
|
-
|
|
14833
|
+
// For Granite architectures - scale residual
|
|
14834
|
+
if (hparams.f_residual_scale) {
|
|
14835
|
+
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
|
|
14836
|
+
}
|
|
14837
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
14838
|
+
cb(cur, "ffn_out", il);
|
|
13768
14839
|
|
|
13769
|
-
|
|
13770
|
-
cur
|
|
13771
|
-
cb(cur, "result_output", -1);
|
|
13772
|
-
res->t_logits = cur;
|
|
14840
|
+
cur = build_cvec(cur, il);
|
|
14841
|
+
cb(cur, "l_out", il);
|
|
13773
14842
|
|
|
13774
|
-
|
|
14843
|
+
return cur;
|
|
13775
14844
|
}
|
|
13776
14845
|
};
|
|
13777
14846
|
|
|
@@ -13782,7 +14851,7 @@ struct llm_build_granite : public llm_graph_context {
|
|
|
13782
14851
|
// * removed bias
|
|
13783
14852
|
// * removed MoE
|
|
13784
14853
|
struct llm_build_chameleon : public llm_graph_context {
|
|
13785
|
-
llm_build_chameleon(const llama_model & model, const llm_graph_params & params
|
|
14854
|
+
llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
13786
14855
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
13787
14856
|
|
|
13788
14857
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -13873,7 +14942,7 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
13873
14942
|
cb(Kcur, "Kcur", il);
|
|
13874
14943
|
cb(Vcur, "Vcur", il);
|
|
13875
14944
|
|
|
13876
|
-
cur = build_attn(inp_attn,
|
|
14945
|
+
cur = build_attn(inp_attn,
|
|
13877
14946
|
model.layers[il].wo, nullptr,
|
|
13878
14947
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13879
14948
|
}
|
|
@@ -13959,7 +15028,7 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
13959
15028
|
};
|
|
13960
15029
|
|
|
13961
15030
|
struct llm_build_wavtokenizer_dec : public llm_graph_context {
|
|
13962
|
-
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params
|
|
15031
|
+
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
13963
15032
|
ggml_tensor * cur;
|
|
13964
15033
|
ggml_tensor * inpL;
|
|
13965
15034
|
|
|
@@ -14111,7 +15180,7 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {
|
|
|
14111
15180
|
};
|
|
14112
15181
|
|
|
14113
15182
|
struct llm_build_plm : public llm_graph_context {
|
|
14114
|
-
llm_build_plm(const llama_model & model, const llm_graph_params & params
|
|
15183
|
+
llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
14115
15184
|
const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
|
|
14116
15185
|
|
|
14117
15186
|
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
|
@@ -14229,7 +15298,7 @@ struct llm_build_plm : public llm_graph_context {
|
|
|
14229
15298
|
ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
|
|
14230
15299
|
cb(k_states, "k_states", il);
|
|
14231
15300
|
|
|
14232
|
-
cur = build_attn(inp_attn,
|
|
15301
|
+
cur = build_attn(inp_attn,
|
|
14233
15302
|
model.layers[il].wo, NULL,
|
|
14234
15303
|
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
|
|
14235
15304
|
}
|
|
@@ -14283,7 +15352,7 @@ struct llm_build_plm : public llm_graph_context {
|
|
|
14283
15352
|
};
|
|
14284
15353
|
|
|
14285
15354
|
struct llm_build_bailingmoe : public llm_graph_context {
|
|
14286
|
-
llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params
|
|
15355
|
+
llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
14287
15356
|
ggml_tensor * cur;
|
|
14288
15357
|
ggml_tensor * inpL;
|
|
14289
15358
|
|
|
@@ -14352,7 +15421,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
14352
15421
|
cb(Kcur, "Kcur", il);
|
|
14353
15422
|
cb(Vcur, "Vcur", il);
|
|
14354
15423
|
|
|
14355
|
-
cur = build_attn(inp_attn,
|
|
15424
|
+
cur = build_attn(inp_attn,
|
|
14356
15425
|
model.layers[il].wo, model.layers[il].bo,
|
|
14357
15426
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
|
14358
15427
|
}
|
|
@@ -14427,7 +15496,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
|
|
14427
15496
|
};
|
|
14428
15497
|
|
|
14429
15498
|
struct llm_build_dots1 : public llm_graph_context {
|
|
14430
|
-
llm_build_dots1(const llama_model & model, const llm_graph_params & params
|
|
15499
|
+
llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
14431
15500
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
14432
15501
|
|
|
14433
15502
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -14492,7 +15561,7 @@ struct llm_build_dots1 : public llm_graph_context {
|
|
|
14492
15561
|
cb(Kcur, "Kcur", il);
|
|
14493
15562
|
cb(Vcur, "Vcur", il);
|
|
14494
15563
|
|
|
14495
|
-
cur = build_attn(inp_attn,
|
|
15564
|
+
cur = build_attn(inp_attn,
|
|
14496
15565
|
model.layers[il].wo, model.layers[il].bo,
|
|
14497
15566
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
14498
15567
|
}
|
|
@@ -14577,7 +15646,7 @@ struct llm_build_dots1 : public llm_graph_context {
|
|
|
14577
15646
|
};
|
|
14578
15647
|
|
|
14579
15648
|
struct llm_build_ernie4_5 : public llm_graph_context {
|
|
14580
|
-
llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params
|
|
15649
|
+
llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
14581
15650
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
14582
15651
|
|
|
14583
15652
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -14647,7 +15716,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
|
|
|
14647
15716
|
cb(Kcur, "Kcur", il);
|
|
14648
15717
|
cb(Vcur, "Vcur", il);
|
|
14649
15718
|
|
|
14650
|
-
cur = build_attn(inp_attn,
|
|
15719
|
+
cur = build_attn(inp_attn,
|
|
14651
15720
|
model.layers[il].wo, NULL,
|
|
14652
15721
|
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
14653
15722
|
}
|
|
@@ -14706,10 +15775,178 @@ struct llm_build_ernie4_5 : public llm_graph_context {
|
|
|
14706
15775
|
}
|
|
14707
15776
|
};
|
|
14708
15777
|
|
|
14709
|
-
struct
|
|
14710
|
-
const llama_model & model
|
|
15778
|
+
struct llm_build_ernie4_5_moe : public llm_graph_context {
|
|
15779
|
+
llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
15780
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
15781
|
+
|
|
15782
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
15783
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
15784
|
+
|
|
15785
|
+
ggml_tensor * cur;
|
|
15786
|
+
ggml_tensor * inpL;
|
|
15787
|
+
|
|
15788
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
15789
|
+
|
|
15790
|
+
// inp_pos - contains the positions
|
|
15791
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
15792
|
+
|
|
15793
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
15794
|
+
|
|
15795
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
15796
|
+
|
|
15797
|
+
GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Ernie 4.5 MoE requires n_moe_layer_step > 0");
|
|
15798
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
15799
|
+
ggml_tensor * inpSA = inpL;
|
|
15800
|
+
// norm
|
|
15801
|
+
{
|
|
15802
|
+
cur = build_norm(inpL,
|
|
15803
|
+
model.layers[il].attn_norm, NULL,
|
|
15804
|
+
LLM_NORM_RMS, il);
|
|
15805
|
+
cb(cur, "attn_norm", il);
|
|
15806
|
+
}
|
|
15807
|
+
|
|
15808
|
+
// self-attention
|
|
15809
|
+
{
|
|
15810
|
+
// compute Q and K and RoPE them
|
|
15811
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
15812
|
+
cb(Qcur, "Qcur", il);
|
|
15813
|
+
if (model.layers[il].bq) {
|
|
15814
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
15815
|
+
cb(Qcur, "Qcur", il);
|
|
15816
|
+
}
|
|
15817
|
+
|
|
15818
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
15819
|
+
cb(Kcur, "Kcur", il);
|
|
15820
|
+
if (model.layers[il].bk) {
|
|
15821
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
15822
|
+
cb(Kcur, "Kcur", il);
|
|
15823
|
+
}
|
|
15824
|
+
|
|
15825
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
15826
|
+
cb(Vcur, "Vcur", il);
|
|
15827
|
+
if (model.layers[il].bv) {
|
|
15828
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
15829
|
+
cb(Vcur, "Vcur", il);
|
|
15830
|
+
}
|
|
15831
|
+
|
|
15832
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
15833
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
15834
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
15835
|
+
|
|
15836
|
+
Qcur = ggml_rope_ext(
|
|
15837
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
15838
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
15839
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
15840
|
+
);
|
|
15841
|
+
|
|
15842
|
+
Kcur = ggml_rope_ext(
|
|
15843
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
15844
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
15845
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
15846
|
+
);
|
|
15847
|
+
|
|
15848
|
+
cb(Qcur, "Qcur", il);
|
|
15849
|
+
cb(Kcur, "Kcur", il);
|
|
15850
|
+
cb(Vcur, "Vcur", il);
|
|
15851
|
+
|
|
15852
|
+
cur = build_attn(inp_attn,
|
|
15853
|
+
model.layers[il].wo, NULL,
|
|
15854
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
15855
|
+
cb(cur, "attn_out", il);
|
|
15856
|
+
}
|
|
15857
|
+
|
|
15858
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
15859
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
15860
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
15861
|
+
}
|
|
15862
|
+
|
|
15863
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
15864
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
15865
|
+
|
|
15866
|
+
// feed-forward network
|
|
15867
|
+
bool is_moe_layer = static_cast<uint32_t>(il) >= hparams.n_layer_dense_lead && (il + 1) % hparams.n_moe_layer_step == 0;
|
|
15868
|
+
|
|
15869
|
+
if (!is_moe_layer) {
|
|
15870
|
+
cur = build_norm(ffn_inp,
|
|
15871
|
+
model.layers[il].ffn_norm, NULL,
|
|
15872
|
+
LLM_NORM_RMS, il);
|
|
15873
|
+
cb(cur, "ffn_norm", il);
|
|
15874
|
+
|
|
15875
|
+
cur = build_ffn(cur,
|
|
15876
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
15877
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
15878
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
15879
|
+
NULL,
|
|
15880
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
15881
|
+
cb(cur, "ffn_out", il);
|
|
15882
|
+
} else {
|
|
15883
|
+
// MoE branch
|
|
15884
|
+
cur = build_norm(ffn_inp,
|
|
15885
|
+
model.layers[il].ffn_norm, NULL,
|
|
15886
|
+
LLM_NORM_RMS, il);
|
|
15887
|
+
cb(cur, "ffn_norm", il);
|
|
15888
|
+
|
|
15889
|
+
ggml_tensor * moe_out = build_moe_ffn(cur,
|
|
15890
|
+
model.layers[il].ffn_gate_inp,
|
|
15891
|
+
model.layers[il].ffn_up_exps,
|
|
15892
|
+
model.layers[il].ffn_gate_exps,
|
|
15893
|
+
model.layers[il].ffn_down_exps,
|
|
15894
|
+
model.layers[il].ffn_exp_probs_b,
|
|
15895
|
+
n_expert, n_expert_used,
|
|
15896
|
+
LLM_FFN_SILU, true,
|
|
15897
|
+
false, 0.0,
|
|
15898
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
15899
|
+
il);
|
|
15900
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
15901
|
+
|
|
15902
|
+
// Shared expert (if present)
|
|
15903
|
+
if (hparams.n_ff_shexp > 0) {
|
|
15904
|
+
ggml_tensor * ffn_shexp = build_ffn(cur,
|
|
15905
|
+
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
15906
|
+
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
15907
|
+
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
15908
|
+
NULL,
|
|
15909
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
15910
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
|
15911
|
+
|
|
15912
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
|
15913
|
+
} else {
|
|
15914
|
+
cur = moe_out;
|
|
15915
|
+
}
|
|
15916
|
+
cb(cur, "ffn_out", il);
|
|
15917
|
+
}
|
|
15918
|
+
|
|
15919
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
15920
|
+
cb(cur, "ffn_out", il);
|
|
15921
|
+
|
|
15922
|
+
cur = build_cvec(cur, il);
|
|
15923
|
+
cb(cur, "l_out", il);
|
|
15924
|
+
|
|
15925
|
+
// input for next layer
|
|
15926
|
+
inpL = cur;
|
|
15927
|
+
}
|
|
15928
|
+
|
|
15929
|
+
cur = inpL;
|
|
15930
|
+
|
|
15931
|
+
cur = build_norm(cur,
|
|
15932
|
+
model.output_norm, NULL,
|
|
15933
|
+
LLM_NORM_RMS, -1);
|
|
15934
|
+
|
|
15935
|
+
cb(cur, "result_norm", -1);
|
|
15936
|
+
res->t_embd = cur;
|
|
15937
|
+
|
|
15938
|
+
// lm_head
|
|
15939
|
+
cur = build_lora_mm(model.output, cur);
|
|
15940
|
+
|
|
15941
|
+
cb(cur, "result_output", -1);
|
|
15942
|
+
res->t_logits = cur;
|
|
14711
15943
|
|
|
14712
|
-
|
|
15944
|
+
ggml_build_forward_expand(gf, cur);
|
|
15945
|
+
}
|
|
15946
|
+
};
|
|
15947
|
+
|
|
15948
|
+
struct llm_build_falcon_h1 : public llm_graph_context_mamba {
|
|
15949
|
+
llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
|
|
14713
15950
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
14714
15951
|
|
|
14715
15952
|
ggml_tensor * cur;
|
|
@@ -14765,210 +16002,389 @@ struct llm_build_falcon_h1 : public llm_graph_context {
|
|
|
14765
16002
|
cb(Kcur, "Kcur-post-rope", il);
|
|
14766
16003
|
cb(Vcur, "Vcur-post-rope", il);
|
|
14767
16004
|
|
|
14768
|
-
ggml_tensor * attn_out = build_attn(inp,
|
|
16005
|
+
ggml_tensor * attn_out = build_attn(inp->get_attn(),
|
|
14769
16006
|
model.layers[il].wo, NULL,
|
|
14770
16007
|
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
14771
16008
|
cb(attn_out, "attn_out", il);
|
|
14772
16009
|
|
|
14773
|
-
cur = build_norm(inpL,
|
|
14774
|
-
model.layers[il].attn_norm, NULL,
|
|
14775
|
-
LLM_NORM_RMS, il);
|
|
14776
|
-
// Mamba2 layer
|
|
14777
|
-
cb(cur, "ssm_in", il);
|
|
16010
|
+
cur = build_norm(inpL,
|
|
16011
|
+
model.layers[il].attn_norm, NULL,
|
|
16012
|
+
LLM_NORM_RMS, il);
|
|
16013
|
+
// Mamba2 layer
|
|
16014
|
+
cb(cur, "ssm_in", il);
|
|
16015
|
+
|
|
16016
|
+
ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
|
|
16017
|
+
cb(ssm_out, "ssm_out", il);
|
|
16018
|
+
|
|
16019
|
+
// // Aggregation
|
|
16020
|
+
cur = ggml_add(ctx0, attn_out, ssm_out);
|
|
16021
|
+
inpSA = ggml_add(ctx0, cur, inpSA);
|
|
16022
|
+
cb(cur, "layer_out", il);
|
|
16023
|
+
|
|
16024
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
16025
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
16026
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
16027
|
+
}
|
|
16028
|
+
|
|
16029
|
+
ggml_tensor * ffn_inp = inpSA;
|
|
16030
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
16031
|
+
|
|
16032
|
+
// feed-forward network
|
|
16033
|
+
cur = build_norm(ffn_inp,
|
|
16034
|
+
model.layers[il].ffn_norm, NULL,
|
|
16035
|
+
LLM_NORM_RMS, il);
|
|
16036
|
+
cb(cur, "ffn_norm", il);
|
|
16037
|
+
|
|
16038
|
+
cur = build_ffn(cur,
|
|
16039
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
16040
|
+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
|
16041
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
16042
|
+
NULL,
|
|
16043
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
16044
|
+
cb(cur, "ffn_out", il);
|
|
16045
|
+
|
|
16046
|
+
cur = ggml_add(ctx0, cur, inpSA);
|
|
16047
|
+
|
|
16048
|
+
cur = build_cvec(cur, il);
|
|
16049
|
+
cb(cur, "l_out", il);
|
|
16050
|
+
|
|
16051
|
+
// input for next layer
|
|
16052
|
+
inpL = cur;
|
|
16053
|
+
}
|
|
16054
|
+
|
|
16055
|
+
cur = inpL;
|
|
16056
|
+
|
|
16057
|
+
cur = build_norm(cur,
|
|
16058
|
+
model.output_norm, NULL,
|
|
16059
|
+
LLM_NORM_RMS, -1);
|
|
16060
|
+
|
|
16061
|
+
cb(cur, "result_norm", -1);
|
|
16062
|
+
res->t_embd = cur;
|
|
16063
|
+
|
|
16064
|
+
// lm_head
|
|
16065
|
+
cur = build_lora_mm(model.output, cur);
|
|
16066
|
+
|
|
16067
|
+
cb(cur, "result_output", -1);
|
|
16068
|
+
res->t_logits = cur;
|
|
16069
|
+
|
|
16070
|
+
ggml_build_forward_expand(gf, cur);
|
|
16071
|
+
}
|
|
16072
|
+
};
|
|
16073
|
+
|
|
16074
|
+
struct llm_build_plamo2 : public llm_graph_context_mamba {
|
|
16075
|
+
llm_build_plamo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
|
|
16076
|
+
ggml_tensor * cur;
|
|
16077
|
+
ggml_tensor * inpL;
|
|
16078
|
+
|
|
16079
|
+
// {n_embd, n_tokens}
|
|
16080
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
16081
|
+
cb(inpL, "embedding_output", -1);
|
|
16082
|
+
|
|
16083
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
16084
|
+
|
|
16085
|
+
auto * inp_hybrid = build_inp_mem_hybrid();
|
|
16086
|
+
|
|
16087
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
16088
|
+
|
|
16089
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
16090
|
+
ggml_tensor * residual = inpL;
|
|
16091
|
+
|
|
16092
|
+
// ggml_graph_add_node(gf, model.layers[il].attn_norm);
|
|
16093
|
+
// cb(model.layers[il].attn_norm, "attn_norm", il);
|
|
14778
16094
|
|
|
14779
|
-
|
|
14780
|
-
|
|
16095
|
+
// pre_mixer_norm
|
|
16096
|
+
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
14781
16097
|
|
|
14782
|
-
//
|
|
14783
|
-
|
|
14784
|
-
inpSA = ggml_add(ctx0, cur, inpSA);
|
|
14785
|
-
cb(cur, "layer_out", il);
|
|
16098
|
+
// check if this layer is Mamba or Attention
|
|
16099
|
+
bool is_mamba_layer = hparams.is_recurrent(il);
|
|
14786
16100
|
|
|
14787
|
-
if (
|
|
14788
|
-
|
|
14789
|
-
|
|
16101
|
+
if (is_mamba_layer) {
|
|
16102
|
+
// PLaMo-2 Mamba layer
|
|
16103
|
+
cur = build_plamo2_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
|
|
16104
|
+
} else {
|
|
16105
|
+
// PLaMo-2 Attention layer
|
|
16106
|
+
cur = build_plamo2_attn_layer(inp_hybrid->get_attn(), inp_pos, cur, model, il);
|
|
14790
16107
|
}
|
|
14791
16108
|
|
|
14792
|
-
|
|
14793
|
-
|
|
16109
|
+
// post_mixer_norm
|
|
16110
|
+
cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
|
|
16111
|
+
cb(cur, "attn_post_norm", il);
|
|
14794
16112
|
|
|
14795
|
-
//
|
|
14796
|
-
cur =
|
|
14797
|
-
|
|
14798
|
-
|
|
14799
|
-
|
|
16113
|
+
// residual connection
|
|
16114
|
+
cur = ggml_add(ctx0, cur, residual);
|
|
16115
|
+
cb(cur, "attn_residual", il);
|
|
16116
|
+
residual = cur;
|
|
16117
|
+
|
|
16118
|
+
// pre-ffn norm
|
|
16119
|
+
cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
|
|
16120
|
+
cb(cur, "ffn_pre_norm", il);
|
|
14800
16121
|
|
|
16122
|
+
// feed-forward network
|
|
14801
16123
|
cur = build_ffn(cur,
|
|
14802
|
-
model.layers[il].ffn_up,
|
|
14803
|
-
|
|
14804
|
-
model.layers[il].ffn_down,
|
|
16124
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
16125
|
+
NULL, NULL, NULL,
|
|
16126
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
14805
16127
|
NULL,
|
|
14806
|
-
|
|
16128
|
+
LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
|
|
14807
16129
|
cb(cur, "ffn_out", il);
|
|
14808
16130
|
|
|
14809
|
-
|
|
16131
|
+
// post ffn norm
|
|
16132
|
+
cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
|
|
16133
|
+
cb(cur, "ffn_post_norm", il);
|
|
14810
16134
|
|
|
14811
|
-
|
|
14812
|
-
|
|
16135
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
16136
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
16137
|
+
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
|
16138
|
+
}
|
|
16139
|
+
|
|
16140
|
+
// residual connection
|
|
16141
|
+
cur = ggml_add(ctx0, cur, residual);
|
|
16142
|
+
cb(cur, "ffn_residual", il);
|
|
14813
16143
|
|
|
14814
|
-
// input for next layer
|
|
14815
16144
|
inpL = cur;
|
|
14816
16145
|
}
|
|
14817
16146
|
|
|
14818
16147
|
cur = inpL;
|
|
14819
16148
|
|
|
14820
|
-
|
|
14821
|
-
|
|
14822
|
-
LLM_NORM_RMS, -1);
|
|
14823
|
-
|
|
16149
|
+
// final norm
|
|
16150
|
+
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
|
14824
16151
|
cb(cur, "result_norm", -1);
|
|
14825
|
-
res->t_embd = cur;
|
|
14826
16152
|
|
|
14827
16153
|
// lm_head
|
|
14828
16154
|
cur = build_lora_mm(model.output, cur);
|
|
14829
|
-
|
|
14830
16155
|
cb(cur, "result_output", -1);
|
|
16156
|
+
|
|
16157
|
+
// Explicitly mark as output tensor to ensure proper backend assignment
|
|
16158
|
+
ggml_set_output(cur);
|
|
16159
|
+
|
|
14831
16160
|
res->t_logits = cur;
|
|
14832
16161
|
|
|
14833
16162
|
ggml_build_forward_expand(gf, cur);
|
|
14834
16163
|
}
|
|
14835
16164
|
|
|
14836
|
-
|
|
14837
|
-
|
|
14838
|
-
|
|
14839
|
-
|
|
14840
|
-
|
|
14841
|
-
|
|
14842
|
-
|
|
16165
|
+
private:
|
|
16166
|
+
ggml_tensor * build_plamo2_attn_layer(
|
|
16167
|
+
llm_graph_input_attn_kv_unified * inp,
|
|
16168
|
+
ggml_tensor * inp_pos,
|
|
16169
|
+
ggml_tensor * cur,
|
|
16170
|
+
const llama_model & model,
|
|
16171
|
+
int il) {
|
|
14843
16172
|
|
|
14844
|
-
|
|
16173
|
+
// self-attention
|
|
16174
|
+
{
|
|
16175
|
+
// PLaMo-2 uses combined QKV tensor
|
|
16176
|
+
ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
|
|
16177
|
+
cb(qkv, "qkv", il);
|
|
14845
16178
|
|
|
14846
|
-
|
|
14847
|
-
|
|
14848
|
-
|
|
14849
|
-
|
|
14850
|
-
|
|
14851
|
-
|
|
14852
|
-
|
|
16179
|
+
// split QKV tensor into Q, K, V
|
|
16180
|
+
const int64_t n_embd_head_q = hparams.n_embd_head_k;
|
|
16181
|
+
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
|
16182
|
+
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
|
16183
|
+
int32_t n_head_kv = hparams.n_head_kv(il);
|
|
16184
|
+
|
|
16185
|
+
const int64_t q_offset = 0;
|
|
16186
|
+
const int64_t k_offset = n_embd_head_q * n_head;
|
|
16187
|
+
const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;
|
|
16188
|
+
|
|
16189
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
|
|
16190
|
+
ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
|
|
16191
|
+
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv)));
|
|
16192
|
+
|
|
16193
|
+
cb(Qcur, "Qcur", il);
|
|
16194
|
+
cb(Kcur, "Kcur", il);
|
|
16195
|
+
cb(Vcur, "Vcur", il);
|
|
16196
|
+
|
|
16197
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
|
|
16198
|
+
|
|
16199
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
16200
|
+
cb(Qcur, "Qcur_normed", il);
|
|
16201
|
+
|
|
16202
|
+
Qcur = ggml_rope_ext(
|
|
16203
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
16204
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
16205
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
16206
|
+
);
|
|
16207
|
+
|
|
16208
|
+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
16209
|
+
cb(Kcur, "Kcur_normed", il);
|
|
16210
|
+
|
|
16211
|
+
Kcur = ggml_rope_ext(
|
|
16212
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
16213
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
16214
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
16215
|
+
);
|
|
16216
|
+
|
|
16217
|
+
cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f, il);
|
|
16218
|
+
}
|
|
16219
|
+
|
|
16220
|
+
cb(cur, "attn_out", il);
|
|
16221
|
+
|
|
16222
|
+
return cur;
|
|
16223
|
+
}
|
|
16224
|
+
|
|
16225
|
+
ggml_tensor * build_plamo2_mamba_layer(
|
|
16226
|
+
llm_graph_input_rs * inp,
|
|
16227
|
+
ggml_tensor * cur,
|
|
16228
|
+
const llama_model & model,
|
|
16229
|
+
const llama_ubatch & ubatch,
|
|
16230
|
+
int il) {
|
|
16231
|
+
|
|
16232
|
+
const auto * mctx_cur = inp->mctx;
|
|
16233
|
+
|
|
16234
|
+
const auto kv_head = mctx_cur->get_head();
|
|
16235
|
+
|
|
16236
|
+
const int64_t d_conv = hparams.ssm_d_conv;
|
|
16237
|
+
const int64_t d_inner = hparams.ssm_d_inner;
|
|
16238
|
+
const int64_t d_state = hparams.ssm_d_state;
|
|
16239
|
+
const int64_t n_heads = hparams.ssm_dt_rank;
|
|
16240
|
+
const int64_t head_dim = d_inner / n_heads;
|
|
16241
|
+
const int64_t n_group = hparams.ssm_n_group;
|
|
16242
|
+
const int64_t n_seqs = ubatch.n_seqs;
|
|
14853
16243
|
|
|
14854
16244
|
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
|
14855
16245
|
|
|
14856
16246
|
GGML_ASSERT(n_seqs != 0);
|
|
14857
|
-
GGML_ASSERT(ubatch.equal_seqs);
|
|
16247
|
+
GGML_ASSERT(ubatch.equal_seqs());
|
|
14858
16248
|
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
|
14859
16249
|
|
|
14860
|
-
ggml_tensor * conv_states_all =
|
|
14861
|
-
ggml_tensor * ssm_states_all =
|
|
16250
|
+
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
|
16251
|
+
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
|
14862
16252
|
|
|
14863
|
-
ggml_tensor * conv = build_rs(inp,
|
|
16253
|
+
ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
|
|
14864
16254
|
conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
|
|
14865
16255
|
|
|
14866
16256
|
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
|
|
14867
16257
|
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
|
|
14868
16258
|
|
|
14869
|
-
//
|
|
14870
|
-
|
|
14871
|
-
|
|
14872
|
-
|
|
14873
|
-
|
|
14874
|
-
|
|
14875
|
-
|
|
14876
|
-
|
|
14877
|
-
|
|
14878
|
-
|
|
14879
|
-
|
|
14880
|
-
|
|
16259
|
+
// in_proj: {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
|
|
16260
|
+
ggml_tensor * zx = build_lora_mm(model.layers[il].ssm_in, cur);
|
|
16261
|
+
cb(zx, "mamba_in_proj", il);
|
|
16262
|
+
// {8192, 5, 1, 1} -> {8192, 1, 5, 1}
|
|
16263
|
+
zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
|
|
16264
|
+
zx = ggml_cont(ctx0, zx);
|
|
16265
|
+
zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
|
|
16266
|
+
cb(zx, "mamba_in_proj_out", il);
|
|
16267
|
+
|
|
16268
|
+
// split into z and x
|
|
16269
|
+
// => {head_dim * n_heads, n_seq_tokens, n_seqs}
|
|
16270
|
+
ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
|
|
16271
|
+
x = ggml_cont(ctx0, x);
|
|
16272
|
+
x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
|
|
16273
|
+
// x = ggml_permute(ctx0, x, 0, 2, 1, 3);
|
|
16274
|
+
cb(x, "mamba_x_split", il);
|
|
16275
|
+
|
|
16276
|
+
ggml_tensor * z = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], 0);
|
|
16277
|
+
cb(z, "mamba_z_split", il);
|
|
16278
|
+
|
|
16279
|
+
// conv1d
|
|
14881
16280
|
{
|
|
14882
|
-
// => {d_conv - 1 + n_seq_tokens, d_inner
|
|
14883
|
-
ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0,
|
|
16281
|
+
// => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
|
|
16282
|
+
ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
|
|
16283
|
+
cb(conv_x, "mamba_conv1d_input", il);
|
|
14884
16284
|
|
|
14885
16285
|
// copy last (d_conv - 1) columns back into the state cache
|
|
14886
|
-
ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner
|
|
16286
|
+
ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs,
|
|
16287
|
+
conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
|
|
14887
16288
|
|
|
14888
16289
|
ggml_build_forward_expand(gf,
|
|
14889
16290
|
ggml_cpy(ctx0, last_conv,
|
|
14890
16291
|
ggml_view_1d(ctx0, conv_states_all,
|
|
14891
|
-
(d_conv - 1)*(d_inner
|
|
14892
|
-
kv_head*(d_conv - 1)*(d_inner
|
|
16292
|
+
(d_conv - 1)*(d_inner)*(n_seqs),
|
|
16293
|
+
kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
|
|
14893
16294
|
|
|
14894
16295
|
// 1D convolution
|
|
14895
|
-
|
|
14896
|
-
|
|
14897
|
-
// then element-wise multiply that with the conv1d weight,
|
|
14898
|
-
// then sum the elements of each row,
|
|
14899
|
-
// (the last two steps are a dot product over rows (also doable with mul_mat))
|
|
14900
|
-
// then permute away the ne[0] dimension,
|
|
14901
|
-
// and then you're left with the resulting x tensor.
|
|
14902
|
-
// For simultaneous sequences, all sequences need to have the same length.
|
|
14903
|
-
xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
|
|
14904
|
-
|
|
14905
|
-
// bias
|
|
14906
|
-
xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
|
|
16296
|
+
x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
|
|
16297
|
+
cb(x, "mamba_conv1d", il);
|
|
14907
16298
|
|
|
14908
|
-
|
|
16299
|
+
x = ggml_silu(ctx0, x);
|
|
16300
|
+
cb(x, "mamba_conv1d_silu", il);
|
|
14909
16301
|
}
|
|
14910
16302
|
|
|
14911
|
-
//
|
|
16303
|
+
// SSM
|
|
14912
16304
|
{
|
|
14913
|
-
//
|
|
14914
|
-
ggml_tensor *
|
|
14915
|
-
|
|
14916
|
-
|
|
14917
|
-
|
|
14918
|
-
|
|
16305
|
+
// bcdt_proj: {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
|
|
16306
|
+
ggml_tensor * x_bcdt = build_lora_mm(model.layers[il].ssm_x, x);
|
|
16307
|
+
cb(x_bcdt, "mamba_bcdt_proj", il);
|
|
16308
|
+
|
|
16309
|
+
// split into dt, B, C
|
|
16310
|
+
const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
|
|
16311
|
+
ggml_tensor * B = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], 0);
|
|
16312
|
+
ggml_tensor * C = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*d_state);
|
|
16313
|
+
ggml_tensor * dt = ggml_view_3d(ctx0, x_bcdt, dt_dim, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*(2*d_state));
|
|
16314
|
+
cb(B, "mamba_B_raw", il);
|
|
16315
|
+
cb(C, "mamba_C_raw", il);
|
|
16316
|
+
cb(dt, "mamba_dt_raw", il);
|
|
16317
|
+
|
|
16318
|
+
// Apply RMS norm to dt, B, C (PLaMo-2 specific)
|
|
16319
|
+
B = build_norm(B, model.layers[il].ssm_b_norm, NULL, LLM_NORM_RMS, il);
|
|
16320
|
+
C = build_norm(C, model.layers[il].ssm_c_norm, NULL, LLM_NORM_RMS, il);
|
|
16321
|
+
dt = build_norm(dt, model.layers[il].ssm_dt_norm, NULL, LLM_NORM_RMS, il);
|
|
16322
|
+
cb(B, "mamba_B_normed", il);
|
|
16323
|
+
cb(C, "mamba_C_normed", il);
|
|
16324
|
+
cb(dt, "mamba_dt_normed", il);
|
|
16325
|
+
|
|
16326
|
+
// dt_proj: {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
|
|
16327
|
+
dt = build_lora_mm(model.layers[il].ssm_dt, dt);
|
|
16328
|
+
dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
|
|
16329
|
+
cb(dt, "mamba_dt_proj", il);
|
|
14919
16330
|
|
|
14920
|
-
|
|
14921
|
-
|
|
16331
|
+
ggml_tensor * A = ggml_reshape_2d(ctx0, model.layers[il].ssm_a, 1, n_heads);
|
|
16332
|
+
cb(A, "mamba_A", il);
|
|
14922
16333
|
|
|
14923
|
-
|
|
16334
|
+
x = ggml_view_4d(ctx0, x, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
|
|
16335
|
+
B = ggml_view_4d(ctx0, B, d_state, 1, n_seq_tokens, n_seqs, d_state * B->nb[0], B->nb[1], B->nb[2], 0);
|
|
16336
|
+
C = ggml_view_4d(ctx0, C, d_state, 1, n_seq_tokens, n_seqs, d_state * C->nb[0], C->nb[1], C->nb[2], 0);
|
|
14924
16337
|
|
|
14925
|
-
// use the states and the indices provided by
|
|
16338
|
+
// use the states and the indices provided by build_recurrent_state
|
|
14926
16339
|
// (this is necessary in order to properly use the states before they are overwritten,
|
|
14927
16340
|
// while avoiding to make unnecessary copies of the states)
|
|
14928
16341
|
auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
|
|
14929
|
-
ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim,
|
|
16342
|
+
ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_heads, mctx_cur->get_size());
|
|
14930
16343
|
|
|
14931
|
-
//
|
|
16344
|
+
// Custom operator to optimize the parallel associative scan
|
|
16345
|
+
// as described in the Annex D of the Mamba paper.
|
|
14932
16346
|
// => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
|
|
14933
16347
|
return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
|
|
14934
16348
|
};
|
|
14935
16349
|
|
|
14936
|
-
ggml_tensor * y_ssm = build_rs(inp,
|
|
16350
|
+
ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
|
|
16351
|
+
cb(y_ssm, "mamba_ssm_scan", il);
|
|
14937
16352
|
|
|
14938
16353
|
// store last states
|
|
14939
16354
|
ggml_build_forward_expand(gf,
|
|
14940
16355
|
ggml_cpy(ctx0,
|
|
14941
|
-
ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs,
|
|
14942
|
-
ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs,
|
|
16356
|
+
ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
|
|
16357
|
+
ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs,
|
|
16358
|
+
kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
|
|
14943
16359
|
|
|
14944
|
-
ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim,
|
|
16360
|
+
ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
|
|
16361
|
+
cb(y, "mamba_y_view", il);
|
|
14945
16362
|
|
|
14946
|
-
//
|
|
16363
|
+
// Add D parameter and apply gating with z
|
|
16364
|
+
// {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
|
|
16365
|
+
ggml_tensor * D = ggml_reshape_2d(ctx0, model.layers[il].ssm_d, 1, n_heads);
|
|
16366
|
+
y = ggml_add(ctx0, y, ggml_mul(ctx0, x, D));
|
|
16367
|
+
cb(y, "mamba_y_add_d", il);
|
|
14947
16368
|
|
|
14948
|
-
y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
|
|
14949
16369
|
y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
|
|
16370
|
+
cb(y, "mamba_y_swiglu_z", il);
|
|
14950
16371
|
|
|
14951
|
-
//
|
|
14952
|
-
|
|
14953
|
-
y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
|
|
14954
|
-
y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
|
|
14955
|
-
}
|
|
14956
|
-
|
|
14957
|
-
y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
|
|
14958
|
-
|
|
14959
|
-
// {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
|
|
16372
|
+
// out_proj: {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
|
|
16373
|
+
y = ggml_view_3d(ctx0, y, head_dim * n_heads, n_seq_tokens, n_seqs, y->nb[2], y->nb[3], 0);
|
|
14960
16374
|
cur = build_lora_mm(model.layers[il].ssm_out, y);
|
|
16375
|
+
cb(cur, "mamba_out_proj", il);
|
|
14961
16376
|
}
|
|
14962
16377
|
|
|
14963
16378
|
// {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
|
|
14964
16379
|
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
|
|
14965
16380
|
cb(cur, "mamba_out", il);
|
|
16381
|
+
|
|
14966
16382
|
return cur;
|
|
14967
16383
|
}
|
|
14968
16384
|
};
|
|
14969
16385
|
|
|
14970
16386
|
struct llm_build_arcee : public llm_graph_context {
|
|
14971
|
-
llm_build_arcee(const llama_model & model, const llm_graph_params & params
|
|
16387
|
+
llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
14972
16388
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
14973
16389
|
|
|
14974
16390
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -15044,7 +16460,7 @@ struct llm_build_arcee : public llm_graph_context {
|
|
|
15044
16460
|
cb(Kcur, "Kcur", il);
|
|
15045
16461
|
cb(Vcur, "Vcur", il);
|
|
15046
16462
|
|
|
15047
|
-
cur = build_attn(inp_attn,
|
|
16463
|
+
cur = build_attn(inp_attn,
|
|
15048
16464
|
model.layers[il].wo, model.layers[il].bo,
|
|
15049
16465
|
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
15050
16466
|
cb(cur, "attn_out", il);
|
|
@@ -15103,7 +16519,7 @@ struct llm_build_arcee : public llm_graph_context {
|
|
|
15103
16519
|
};
|
|
15104
16520
|
|
|
15105
16521
|
struct llm_build_hunyuan_moe : public llm_graph_context {
|
|
15106
|
-
llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params
|
|
16522
|
+
llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
15107
16523
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
15108
16524
|
|
|
15109
16525
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -15189,7 +16605,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
|
|
|
15189
16605
|
LLM_NORM_RMS, il);
|
|
15190
16606
|
cb(Qcur, "Qcur_norm", il);
|
|
15191
16607
|
|
|
15192
|
-
cur = build_attn(inp_attn,
|
|
16608
|
+
cur = build_attn(inp_attn,
|
|
15193
16609
|
model.layers[il].wo, model.layers[il].bo,
|
|
15194
16610
|
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
15195
16611
|
cb(cur, "attn_out", il);
|
|
@@ -15264,7 +16680,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
|
|
|
15264
16680
|
};
|
|
15265
16681
|
|
|
15266
16682
|
struct llm_build_smollm3 : public llm_graph_context {
|
|
15267
|
-
llm_build_smollm3(const llama_model & model, const llm_graph_params & params
|
|
16683
|
+
llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
15268
16684
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
15269
16685
|
|
|
15270
16686
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -15341,7 +16757,7 @@ struct llm_build_smollm3 : public llm_graph_context {
|
|
|
15341
16757
|
cb(Kcur, "Kcur", il);
|
|
15342
16758
|
cb(Vcur, "Vcur", il);
|
|
15343
16759
|
|
|
15344
|
-
cur = build_attn(inp_attn,
|
|
16760
|
+
cur = build_attn(inp_attn,
|
|
15345
16761
|
model.layers[il].wo, model.layers[il].bo,
|
|
15346
16762
|
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
15347
16763
|
cb(cur, "attn_out", il);
|
|
@@ -15400,6 +16816,183 @@ struct llm_build_smollm3 : public llm_graph_context {
|
|
|
15400
16816
|
}
|
|
15401
16817
|
};
|
|
15402
16818
|
|
|
16819
|
+
struct llm_build_lfm2 : public llm_graph_context {
|
|
16820
|
+
const llama_model & model;
|
|
16821
|
+
|
|
16822
|
+
llm_build_lfm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
|
|
16823
|
+
|
|
16824
|
+
ggml_tensor * cur = build_inp_embd(model.tok_embd);
|
|
16825
|
+
cb(cur, "model.embed_tokens", -1);
|
|
16826
|
+
|
|
16827
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
16828
|
+
auto * inp_hybrid = build_inp_mem_hybrid();
|
|
16829
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
16830
|
+
|
|
16831
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
16832
|
+
auto * prev_cur = cur;
|
|
16833
|
+
cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
16834
|
+
cb(cur, "model.layers.{}.operator_norm", il);
|
|
16835
|
+
|
|
16836
|
+
cur = hparams.is_recurrent(il) ?
|
|
16837
|
+
build_shortconv_block(cur, inp_hybrid->get_recr(), il) :
|
|
16838
|
+
build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il) ;
|
|
16839
|
+
|
|
16840
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
16841
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
16842
|
+
prev_cur = ggml_get_rows(ctx0, prev_cur, inp_out_ids);
|
|
16843
|
+
}
|
|
16844
|
+
|
|
16845
|
+
cur = ggml_add(ctx0, prev_cur, cur);
|
|
16846
|
+
cur = ggml_add(ctx0, cur, build_feed_forward(cur, il));
|
|
16847
|
+
}
|
|
16848
|
+
|
|
16849
|
+
cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
|
|
16850
|
+
cb(cur, "model.embedding_norm", -1);
|
|
16851
|
+
res->t_embd = cur;
|
|
16852
|
+
|
|
16853
|
+
// lm_head is tied with embeddings
|
|
16854
|
+
cur = build_lora_mm(model.tok_embd, cur);
|
|
16855
|
+
cb(cur, "lm_head", -1);
|
|
16856
|
+
|
|
16857
|
+
res->t_logits = cur;
|
|
16858
|
+
|
|
16859
|
+
ggml_build_forward_expand(gf, cur);
|
|
16860
|
+
}
|
|
16861
|
+
|
|
16862
|
+
ggml_tensor * build_feed_forward(ggml_tensor * cur,
|
|
16863
|
+
int il) const {
|
|
16864
|
+
cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
|
|
16865
|
+
cb(cur, "model.layers.{}.ffn_norm", il);
|
|
16866
|
+
|
|
16867
|
+
GGML_ASSERT(!model.layers[il].ffn_up_b);
|
|
16868
|
+
GGML_ASSERT(!model.layers[il].ffn_gate_b);
|
|
16869
|
+
GGML_ASSERT(!model.layers[il].ffn_down_b);
|
|
16870
|
+
cur = build_ffn(cur,
|
|
16871
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
16872
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
16873
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
16874
|
+
NULL,
|
|
16875
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
16876
|
+
cb(cur, "model.layers.{}.feed_forward.w2", il);
|
|
16877
|
+
|
|
16878
|
+
return cur;
|
|
16879
|
+
}
|
|
16880
|
+
|
|
16881
|
+
ggml_tensor * build_attn_block(ggml_tensor * cur,
|
|
16882
|
+
ggml_tensor * inp_pos,
|
|
16883
|
+
llm_graph_input_attn_kv_unified * inp_attn,
|
|
16884
|
+
int il) const {
|
|
16885
|
+
GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
|
|
16886
|
+
auto const n_embd_head = hparams.n_embd_head_v;
|
|
16887
|
+
auto const n_head_kv = hparams.n_head_kv(il);
|
|
16888
|
+
|
|
16889
|
+
auto * q = build_lora_mm(model.layers[il].wq, cur);
|
|
16890
|
+
cb(q, "model.layers.{}.self_attn.q_proj", il);
|
|
16891
|
+
auto * k = build_lora_mm(model.layers[il].wk, cur);
|
|
16892
|
+
cb(k, "model.layers.{}.self_attn.k_proj", il);
|
|
16893
|
+
auto * v = build_lora_mm(model.layers[il].wv, cur);
|
|
16894
|
+
cb(v, "model.layers.{}.self_attn.v_proj", il);
|
|
16895
|
+
|
|
16896
|
+
q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
|
|
16897
|
+
k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
|
|
16898
|
+
v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
|
|
16899
|
+
|
|
16900
|
+
// qk norm
|
|
16901
|
+
q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
16902
|
+
cb(q, "model.layers.{}.self_attn.q_layernorm", il);
|
|
16903
|
+
k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
16904
|
+
cb(k, "model.layers.{}.self_attn.k_layernorm", il);
|
|
16905
|
+
|
|
16906
|
+
// RoPE
|
|
16907
|
+
q = ggml_rope_ext(
|
|
16908
|
+
ctx0, q, inp_pos, nullptr,
|
|
16909
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
16910
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
16911
|
+
);
|
|
16912
|
+
k = ggml_rope_ext(
|
|
16913
|
+
ctx0, k, inp_pos, nullptr,
|
|
16914
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
16915
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
16916
|
+
);
|
|
16917
|
+
|
|
16918
|
+
cur = build_attn(inp_attn, model.layers[il].wo, NULL,
|
|
16919
|
+
q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
16920
|
+
|
|
16921
|
+
cb(cur, "model.layers.{}.self_attn.out_proj", il);
|
|
16922
|
+
|
|
16923
|
+
return cur;
|
|
16924
|
+
}
|
|
16925
|
+
|
|
16926
|
+
ggml_tensor * build_shortconv_block(ggml_tensor * cur,
|
|
16927
|
+
llm_graph_input_rs * inp_recr,
|
|
16928
|
+
int il) {
|
|
16929
|
+
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
|
|
16930
|
+
const uint32_t kv_head = mctx_cur->get_head();
|
|
16931
|
+
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
|
16932
|
+
const int64_t n_seqs = ubatch.n_seqs;
|
|
16933
|
+
GGML_ASSERT(n_seqs != 0);
|
|
16934
|
+
GGML_ASSERT(ubatch.equal_seqs());
|
|
16935
|
+
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
|
16936
|
+
|
|
16937
|
+
GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
|
|
16938
|
+
const uint32_t d_conv = hparams.n_shortconv_l_cache - 1;
|
|
16939
|
+
|
|
16940
|
+
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
|
|
16941
|
+
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
|
|
16942
|
+
|
|
16943
|
+
auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
|
|
16944
|
+
cb(bcx, "model.layers.{}.conv.in_proj", il);
|
|
16945
|
+
|
|
16946
|
+
constexpr auto n_chunks = 3;
|
|
16947
|
+
GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
|
|
16948
|
+
auto const chunk_size = bcx->ne[0] / n_chunks;
|
|
16949
|
+
auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 0*chunk_size*ggml_element_size(bcx));
|
|
16950
|
+
auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 1*chunk_size*ggml_element_size(bcx));
|
|
16951
|
+
auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 2*chunk_size*ggml_element_size(bcx));
|
|
16952
|
+
|
|
16953
|
+
auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
|
|
16954
|
+
|
|
16955
|
+
// read conv state
|
|
16956
|
+
auto * conv_state = mctx_cur->get_r_l(il);
|
|
16957
|
+
auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
|
|
16958
|
+
auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
|
|
16959
|
+
|
|
16960
|
+
bx = ggml_concat(ctx0, conv, bx, 0);
|
|
16961
|
+
GGML_ASSERT(bx->ne[0] > conv->ne[0]);
|
|
16962
|
+
|
|
16963
|
+
// last d_conv columns is a new conv state
|
|
16964
|
+
auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2], (bx->ne[0] - conv->ne[0])*ggml_element_size(bx));
|
|
16965
|
+
GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
|
|
16966
|
+
|
|
16967
|
+
// write new conv conv state
|
|
16968
|
+
ggml_build_forward_expand(
|
|
16969
|
+
gf,
|
|
16970
|
+
ggml_cpy(
|
|
16971
|
+
ctx0,
|
|
16972
|
+
new_conv,
|
|
16973
|
+
ggml_view_1d(
|
|
16974
|
+
ctx0,
|
|
16975
|
+
conv_state,
|
|
16976
|
+
ggml_nelements(new_conv),
|
|
16977
|
+
kv_head*d_conv*n_embd*ggml_element_size(new_conv)
|
|
16978
|
+
)
|
|
16979
|
+
)
|
|
16980
|
+
);
|
|
16981
|
+
|
|
16982
|
+
auto * conv_kernel = model.layers[il].shortconv.conv;
|
|
16983
|
+
auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
|
|
16984
|
+
cb(conv_out, "model.layers.{}.conv.conv", il);
|
|
16985
|
+
|
|
16986
|
+
auto * y = ggml_mul(ctx0, c, conv_out);
|
|
16987
|
+
y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
|
|
16988
|
+
cb(y, "model.layers.{}.conv.out_proj", il);
|
|
16989
|
+
// {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
|
|
16990
|
+
y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs);
|
|
16991
|
+
|
|
16992
|
+
return y;
|
|
16993
|
+
}
|
|
16994
|
+
};
|
|
16995
|
+
|
|
15403
16996
|
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
|
|
15404
16997
|
llama_memory_i * res;
|
|
15405
16998
|
|
|
@@ -15412,6 +17005,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
15412
17005
|
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
15413
17006
|
case LLM_ARCH_NEO_BERT:
|
|
15414
17007
|
case LLM_ARCH_WAVTOKENIZER_DEC:
|
|
17008
|
+
case LLM_ARCH_DREAM:
|
|
15415
17009
|
{
|
|
15416
17010
|
res = nullptr;
|
|
15417
17011
|
} break;
|
|
@@ -15452,7 +17046,18 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
15452
17046
|
} else {
|
|
15453
17047
|
const auto padding = llama_kv_cache_unified::get_padding(cparams);
|
|
15454
17048
|
|
|
15455
|
-
|
|
17049
|
+
uint32_t n_ctx_per_stream = cparams.n_ctx;
|
|
17050
|
+
|
|
17051
|
+
if (!cparams.kv_unified) {
|
|
17052
|
+
n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
|
|
17053
|
+
n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
|
|
17054
|
+
|
|
17055
|
+
cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max;
|
|
17056
|
+
} else {
|
|
17057
|
+
n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
|
|
17058
|
+
|
|
17059
|
+
cparams.n_ctx = n_ctx_per_stream;
|
|
17060
|
+
}
|
|
15456
17061
|
|
|
15457
17062
|
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
|
|
15458
17063
|
|
|
@@ -15466,7 +17071,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
15466
17071
|
!cparams.flash_attn,
|
|
15467
17072
|
cparams.offload_kqv,
|
|
15468
17073
|
params.swa_full,
|
|
15469
|
-
cparams.
|
|
17074
|
+
cparams.kv_unified,
|
|
17075
|
+
n_ctx_per_stream,
|
|
15470
17076
|
cparams.n_seq_max,
|
|
15471
17077
|
cparams.n_ubatch,
|
|
15472
17078
|
padding);
|
|
@@ -15480,7 +17086,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
15480
17086
|
params.type_v,
|
|
15481
17087
|
!cparams.flash_attn,
|
|
15482
17088
|
cparams.offload_kqv,
|
|
15483
|
-
cparams.
|
|
17089
|
+
cparams.kv_unified,
|
|
17090
|
+
n_ctx_per_stream,
|
|
15484
17091
|
cparams.n_seq_max,
|
|
15485
17092
|
padding,
|
|
15486
17093
|
hparams.n_swa,
|
|
@@ -15493,223 +17100,233 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
15493
17100
|
return res;
|
|
15494
17101
|
}
|
|
15495
17102
|
|
|
15496
|
-
|
|
15497
|
-
const llm_graph_params & params,
|
|
15498
|
-
ggml_cgraph * gf,
|
|
15499
|
-
llm_graph_type type) const {
|
|
17103
|
+
ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
15500
17104
|
std::unique_ptr<llm_graph_context> llm;
|
|
15501
17105
|
|
|
15502
17106
|
switch (arch) {
|
|
15503
17107
|
case LLM_ARCH_LLAMA:
|
|
15504
17108
|
{
|
|
15505
|
-
llm = std::make_unique<llm_build_llama>(*this, params
|
|
17109
|
+
llm = std::make_unique<llm_build_llama>(*this, params);
|
|
15506
17110
|
} break;
|
|
15507
17111
|
case LLM_ARCH_LLAMA4:
|
|
15508
17112
|
{
|
|
15509
|
-
llm = std::make_unique<llm_build_llama_iswa>(*this, params
|
|
17113
|
+
llm = std::make_unique<llm_build_llama_iswa>(*this, params);
|
|
15510
17114
|
} break;
|
|
15511
17115
|
case LLM_ARCH_DECI:
|
|
15512
17116
|
{
|
|
15513
|
-
llm = std::make_unique<llm_build_deci>(*this, params
|
|
17117
|
+
llm = std::make_unique<llm_build_deci>(*this, params);
|
|
15514
17118
|
} break;
|
|
15515
17119
|
case LLM_ARCH_BAICHUAN:
|
|
15516
17120
|
{
|
|
15517
|
-
llm = std::make_unique<llm_build_baichuan>(*this, params
|
|
17121
|
+
llm = std::make_unique<llm_build_baichuan>(*this, params);
|
|
15518
17122
|
} break;
|
|
15519
17123
|
case LLM_ARCH_FALCON:
|
|
15520
17124
|
{
|
|
15521
|
-
llm = std::make_unique<llm_build_falcon>(*this, params
|
|
17125
|
+
llm = std::make_unique<llm_build_falcon>(*this, params);
|
|
15522
17126
|
} break;
|
|
15523
17127
|
case LLM_ARCH_GROK:
|
|
15524
17128
|
{
|
|
15525
|
-
llm = std::make_unique<llm_build_grok>(*this, params
|
|
17129
|
+
llm = std::make_unique<llm_build_grok>(*this, params);
|
|
15526
17130
|
} break;
|
|
15527
17131
|
case LLM_ARCH_STARCODER:
|
|
15528
17132
|
{
|
|
15529
|
-
llm = std::make_unique<llm_build_starcoder>(*this, params
|
|
17133
|
+
llm = std::make_unique<llm_build_starcoder>(*this, params);
|
|
15530
17134
|
} break;
|
|
15531
17135
|
case LLM_ARCH_REFACT:
|
|
15532
17136
|
{
|
|
15533
|
-
llm = std::make_unique<llm_build_refact>(*this, params
|
|
17137
|
+
llm = std::make_unique<llm_build_refact>(*this, params);
|
|
15534
17138
|
} break;
|
|
15535
17139
|
case LLM_ARCH_BERT:
|
|
15536
17140
|
case LLM_ARCH_JINA_BERT_V2:
|
|
15537
17141
|
case LLM_ARCH_NOMIC_BERT:
|
|
15538
17142
|
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
15539
17143
|
{
|
|
15540
|
-
llm = std::make_unique<llm_build_bert>(*this, params
|
|
17144
|
+
llm = std::make_unique<llm_build_bert>(*this, params);
|
|
15541
17145
|
} break;
|
|
15542
17146
|
case LLM_ARCH_NEO_BERT:
|
|
15543
17147
|
{
|
|
15544
|
-
llm = std::make_unique<llm_build_neo_bert>(*this, params
|
|
17148
|
+
llm = std::make_unique<llm_build_neo_bert>(*this, params);
|
|
15545
17149
|
} break;
|
|
15546
17150
|
case LLM_ARCH_BLOOM:
|
|
15547
17151
|
{
|
|
15548
|
-
llm = std::make_unique<llm_build_bloom>(*this, params
|
|
17152
|
+
llm = std::make_unique<llm_build_bloom>(*this, params);
|
|
15549
17153
|
} break;
|
|
15550
17154
|
case LLM_ARCH_MPT:
|
|
15551
17155
|
{
|
|
15552
|
-
llm = std::make_unique<llm_build_mpt>(*this, params
|
|
17156
|
+
llm = std::make_unique<llm_build_mpt>(*this, params);
|
|
15553
17157
|
} break;
|
|
15554
17158
|
case LLM_ARCH_STABLELM:
|
|
15555
17159
|
{
|
|
15556
|
-
llm = std::make_unique<llm_build_stablelm>(*this, params
|
|
17160
|
+
llm = std::make_unique<llm_build_stablelm>(*this, params);
|
|
15557
17161
|
} break;
|
|
15558
17162
|
case LLM_ARCH_QWEN:
|
|
15559
17163
|
{
|
|
15560
|
-
llm = std::make_unique<llm_build_qwen>(*this, params
|
|
17164
|
+
llm = std::make_unique<llm_build_qwen>(*this, params);
|
|
15561
17165
|
} break;
|
|
15562
17166
|
case LLM_ARCH_QWEN2:
|
|
15563
17167
|
{
|
|
15564
|
-
llm = std::make_unique<llm_build_qwen2>(*this, params
|
|
17168
|
+
llm = std::make_unique<llm_build_qwen2>(*this, params);
|
|
15565
17169
|
} break;
|
|
17170
|
+
case LLM_ARCH_DREAM:
|
|
17171
|
+
{
|
|
17172
|
+
llm = std::make_unique<llm_build_dream>(*this, params);
|
|
17173
|
+
}
|
|
17174
|
+
break;
|
|
15566
17175
|
case LLM_ARCH_QWEN2VL:
|
|
15567
17176
|
{
|
|
15568
|
-
llm = std::make_unique<llm_build_qwen2vl>(*this, params
|
|
17177
|
+
llm = std::make_unique<llm_build_qwen2vl>(*this, params);
|
|
15569
17178
|
} break;
|
|
15570
17179
|
case LLM_ARCH_QWEN2MOE:
|
|
15571
17180
|
{
|
|
15572
|
-
llm = std::make_unique<llm_build_qwen2moe>(*this, params
|
|
17181
|
+
llm = std::make_unique<llm_build_qwen2moe>(*this, params);
|
|
15573
17182
|
} break;
|
|
15574
17183
|
case LLM_ARCH_QWEN3:
|
|
15575
17184
|
{
|
|
15576
|
-
llm = std::make_unique<llm_build_qwen3>(*this, params
|
|
17185
|
+
llm = std::make_unique<llm_build_qwen3>(*this, params);
|
|
15577
17186
|
} break;
|
|
15578
17187
|
case LLM_ARCH_QWEN3MOE:
|
|
15579
17188
|
{
|
|
15580
|
-
llm = std::make_unique<llm_build_qwen3moe>(*this, params
|
|
17189
|
+
llm = std::make_unique<llm_build_qwen3moe>(*this, params);
|
|
15581
17190
|
} break;
|
|
15582
17191
|
case LLM_ARCH_PHI2:
|
|
15583
17192
|
{
|
|
15584
|
-
llm = std::make_unique<llm_build_phi2>(*this, params
|
|
17193
|
+
llm = std::make_unique<llm_build_phi2>(*this, params);
|
|
15585
17194
|
} break;
|
|
15586
17195
|
case LLM_ARCH_PHI3:
|
|
15587
17196
|
case LLM_ARCH_PHIMOE:
|
|
15588
17197
|
{
|
|
15589
17198
|
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
|
15590
|
-
llm = std::make_unique<llm_build_phi3<true>> (*this, params
|
|
17199
|
+
llm = std::make_unique<llm_build_phi3<true>> (*this, params);
|
|
15591
17200
|
} else {
|
|
15592
|
-
llm = std::make_unique<llm_build_phi3<false>>(*this, params
|
|
17201
|
+
llm = std::make_unique<llm_build_phi3<false>>(*this, params);
|
|
15593
17202
|
}
|
|
15594
17203
|
} break;
|
|
15595
17204
|
case LLM_ARCH_PLAMO:
|
|
15596
17205
|
{
|
|
15597
|
-
llm = std::make_unique<llm_build_plamo>(*this, params
|
|
17206
|
+
llm = std::make_unique<llm_build_plamo>(*this, params);
|
|
17207
|
+
} break;
|
|
17208
|
+
case LLM_ARCH_PLAMO2:
|
|
17209
|
+
{
|
|
17210
|
+
llm = std::make_unique<llm_build_plamo2>(*this, params);
|
|
15598
17211
|
} break;
|
|
15599
17212
|
case LLM_ARCH_GPT2:
|
|
15600
17213
|
{
|
|
15601
|
-
llm = std::make_unique<llm_build_gpt2>(*this, params
|
|
17214
|
+
llm = std::make_unique<llm_build_gpt2>(*this, params);
|
|
15602
17215
|
} break;
|
|
15603
17216
|
case LLM_ARCH_CODESHELL:
|
|
15604
17217
|
{
|
|
15605
|
-
llm = std::make_unique<llm_build_codeshell>(*this, params
|
|
17218
|
+
llm = std::make_unique<llm_build_codeshell>(*this, params);
|
|
15606
17219
|
} break;
|
|
15607
17220
|
case LLM_ARCH_ORION:
|
|
15608
17221
|
{
|
|
15609
|
-
llm = std::make_unique<llm_build_orion>(*this, params
|
|
17222
|
+
llm = std::make_unique<llm_build_orion>(*this, params);
|
|
15610
17223
|
} break;
|
|
15611
17224
|
case LLM_ARCH_INTERNLM2:
|
|
15612
17225
|
{
|
|
15613
|
-
llm = std::make_unique<llm_build_internlm2>(*this, params
|
|
17226
|
+
llm = std::make_unique<llm_build_internlm2>(*this, params);
|
|
15614
17227
|
} break;
|
|
15615
17228
|
case LLM_ARCH_MINICPM3:
|
|
15616
17229
|
{
|
|
15617
|
-
llm = std::make_unique<llm_build_minicpm3>(*this, params
|
|
17230
|
+
llm = std::make_unique<llm_build_minicpm3>(*this, params);
|
|
15618
17231
|
} break;
|
|
15619
17232
|
case LLM_ARCH_GEMMA:
|
|
15620
17233
|
{
|
|
15621
|
-
llm = std::make_unique<llm_build_gemma>(*this, params
|
|
17234
|
+
llm = std::make_unique<llm_build_gemma>(*this, params);
|
|
15622
17235
|
} break;
|
|
15623
17236
|
case LLM_ARCH_GEMMA2:
|
|
15624
17237
|
{
|
|
15625
|
-
llm = std::make_unique<llm_build_gemma2_iswa>(*this, params
|
|
17238
|
+
llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
|
|
15626
17239
|
} break;
|
|
15627
17240
|
case LLM_ARCH_GEMMA3:
|
|
15628
17241
|
{
|
|
15629
|
-
llm = std::make_unique<llm_build_gemma3_iswa>(*this, params
|
|
17242
|
+
llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
|
|
15630
17243
|
} break;
|
|
15631
17244
|
case LLM_ARCH_GEMMA3N:
|
|
15632
17245
|
{
|
|
15633
|
-
llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params
|
|
17246
|
+
llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
|
|
15634
17247
|
} break;
|
|
15635
17248
|
case LLM_ARCH_STARCODER2:
|
|
15636
17249
|
{
|
|
15637
|
-
llm = std::make_unique<llm_build_starcoder2>(*this, params
|
|
17250
|
+
llm = std::make_unique<llm_build_starcoder2>(*this, params);
|
|
15638
17251
|
} break;
|
|
15639
17252
|
case LLM_ARCH_MAMBA:
|
|
15640
17253
|
case LLM_ARCH_MAMBA2:
|
|
15641
17254
|
{
|
|
15642
|
-
llm = std::make_unique<llm_build_mamba>(*this, params
|
|
17255
|
+
llm = std::make_unique<llm_build_mamba>(*this, params);
|
|
17256
|
+
} break;
|
|
17257
|
+
case LLM_ARCH_JAMBA:
|
|
17258
|
+
{
|
|
17259
|
+
llm = std::make_unique<llm_build_jamba>(*this, params);
|
|
15643
17260
|
} break;
|
|
15644
17261
|
case LLM_ARCH_XVERSE:
|
|
15645
17262
|
{
|
|
15646
|
-
llm = std::make_unique<llm_build_xverse>(*this, params
|
|
17263
|
+
llm = std::make_unique<llm_build_xverse>(*this, params);
|
|
15647
17264
|
} break;
|
|
15648
17265
|
case LLM_ARCH_COMMAND_R:
|
|
15649
17266
|
{
|
|
15650
|
-
llm = std::make_unique<llm_build_command_r>(*this, params
|
|
17267
|
+
llm = std::make_unique<llm_build_command_r>(*this, params);
|
|
15651
17268
|
} break;
|
|
15652
17269
|
case LLM_ARCH_COHERE2:
|
|
15653
17270
|
{
|
|
15654
|
-
llm = std::make_unique<llm_build_cohere2_iswa>(*this, params
|
|
17271
|
+
llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
|
|
15655
17272
|
} break;
|
|
15656
17273
|
case LLM_ARCH_DBRX:
|
|
15657
17274
|
{
|
|
15658
|
-
llm = std::make_unique<llm_build_dbrx>(*this, params
|
|
17275
|
+
llm = std::make_unique<llm_build_dbrx>(*this, params);
|
|
15659
17276
|
} break;
|
|
15660
17277
|
case LLM_ARCH_OLMO:
|
|
15661
17278
|
{
|
|
15662
|
-
llm = std::make_unique<llm_build_olmo>(*this, params
|
|
17279
|
+
llm = std::make_unique<llm_build_olmo>(*this, params);
|
|
15663
17280
|
} break;
|
|
15664
17281
|
case LLM_ARCH_OLMO2:
|
|
15665
17282
|
{
|
|
15666
|
-
llm = std::make_unique<llm_build_olmo2>(*this, params
|
|
17283
|
+
llm = std::make_unique<llm_build_olmo2>(*this, params);
|
|
15667
17284
|
} break;
|
|
15668
17285
|
case LLM_ARCH_OLMOE:
|
|
15669
17286
|
{
|
|
15670
|
-
llm = std::make_unique<llm_build_olmoe>(*this, params
|
|
17287
|
+
llm = std::make_unique<llm_build_olmoe>(*this, params);
|
|
15671
17288
|
} break;
|
|
15672
17289
|
case LLM_ARCH_OPENELM:
|
|
15673
17290
|
{
|
|
15674
|
-
llm = std::make_unique<llm_build_openelm>(*this, params
|
|
17291
|
+
llm = std::make_unique<llm_build_openelm>(*this, params);
|
|
15675
17292
|
} break;
|
|
15676
17293
|
case LLM_ARCH_GPTNEOX:
|
|
15677
17294
|
{
|
|
15678
|
-
llm = std::make_unique<llm_build_gptneox>(*this, params
|
|
17295
|
+
llm = std::make_unique<llm_build_gptneox>(*this, params);
|
|
15679
17296
|
} break;
|
|
15680
17297
|
case LLM_ARCH_ARCTIC:
|
|
15681
17298
|
{
|
|
15682
|
-
llm = std::make_unique<llm_build_arctic>(*this, params
|
|
17299
|
+
llm = std::make_unique<llm_build_arctic>(*this, params);
|
|
15683
17300
|
} break;
|
|
15684
17301
|
case LLM_ARCH_DEEPSEEK:
|
|
15685
17302
|
{
|
|
15686
|
-
llm = std::make_unique<llm_build_deepseek>(*this, params
|
|
17303
|
+
llm = std::make_unique<llm_build_deepseek>(*this, params);
|
|
15687
17304
|
} break;
|
|
15688
17305
|
case LLM_ARCH_DEEPSEEK2:
|
|
15689
17306
|
{
|
|
15690
|
-
llm = std::make_unique<llm_build_deepseek2>(*this, params
|
|
17307
|
+
llm = std::make_unique<llm_build_deepseek2>(*this, params);
|
|
15691
17308
|
} break;
|
|
15692
17309
|
case LLM_ARCH_CHATGLM:
|
|
15693
17310
|
{
|
|
15694
|
-
llm = std::make_unique<llm_build_chatglm>(*this, params
|
|
17311
|
+
llm = std::make_unique<llm_build_chatglm>(*this, params);
|
|
15695
17312
|
} break;
|
|
15696
17313
|
case LLM_ARCH_GLM4:
|
|
15697
17314
|
{
|
|
15698
|
-
llm = std::make_unique<llm_build_glm4>(*this, params
|
|
17315
|
+
llm = std::make_unique<llm_build_glm4>(*this, params);
|
|
15699
17316
|
} break;
|
|
15700
17317
|
case LLM_ARCH_BITNET:
|
|
15701
17318
|
{
|
|
15702
|
-
llm = std::make_unique<llm_build_bitnet>(*this, params
|
|
17319
|
+
llm = std::make_unique<llm_build_bitnet>(*this, params);
|
|
15703
17320
|
} break;
|
|
15704
17321
|
case LLM_ARCH_T5:
|
|
15705
17322
|
{
|
|
15706
|
-
switch (
|
|
17323
|
+
switch (params.gtype) {
|
|
15707
17324
|
case LLM_GRAPH_TYPE_ENCODER:
|
|
15708
|
-
llm = std::make_unique<llm_build_t5_enc>(*this, params
|
|
17325
|
+
llm = std::make_unique<llm_build_t5_enc>(*this, params);
|
|
15709
17326
|
break;
|
|
15710
17327
|
case LLM_GRAPH_TYPE_DEFAULT:
|
|
15711
17328
|
case LLM_GRAPH_TYPE_DECODER:
|
|
15712
|
-
llm = std::make_unique<llm_build_t5_dec>(*this, params
|
|
17329
|
+
llm = std::make_unique<llm_build_t5_dec>(*this, params);
|
|
15713
17330
|
break;
|
|
15714
17331
|
default:
|
|
15715
17332
|
GGML_ABORT("invalid graph type");
|
|
@@ -15717,91 +17334,111 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
15717
17334
|
} break;
|
|
15718
17335
|
case LLM_ARCH_T5ENCODER:
|
|
15719
17336
|
{
|
|
15720
|
-
llm = std::make_unique<llm_build_t5_enc>(*this, params
|
|
17337
|
+
llm = std::make_unique<llm_build_t5_enc>(*this, params);
|
|
15721
17338
|
}
|
|
15722
17339
|
break;
|
|
15723
17340
|
case LLM_ARCH_JAIS:
|
|
15724
17341
|
{
|
|
15725
|
-
llm = std::make_unique<llm_build_jais>(*this, params
|
|
17342
|
+
llm = std::make_unique<llm_build_jais>(*this, params);
|
|
15726
17343
|
} break;
|
|
15727
17344
|
case LLM_ARCH_NEMOTRON:
|
|
15728
17345
|
{
|
|
15729
|
-
llm = std::make_unique<llm_build_nemotron>(*this, params
|
|
17346
|
+
llm = std::make_unique<llm_build_nemotron>(*this, params);
|
|
15730
17347
|
} break;
|
|
15731
17348
|
case LLM_ARCH_EXAONE:
|
|
15732
17349
|
{
|
|
15733
|
-
llm = std::make_unique<llm_build_exaone>(*this, params
|
|
17350
|
+
llm = std::make_unique<llm_build_exaone>(*this, params);
|
|
17351
|
+
} break;
|
|
17352
|
+
case LLM_ARCH_EXAONE4:
|
|
17353
|
+
{
|
|
17354
|
+
if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
|
|
17355
|
+
llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
|
|
17356
|
+
} else {
|
|
17357
|
+
llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
|
|
17358
|
+
}
|
|
15734
17359
|
} break;
|
|
15735
17360
|
case LLM_ARCH_RWKV6:
|
|
15736
17361
|
{
|
|
15737
|
-
llm = std::make_unique<llm_build_rwkv6>(*this, params
|
|
17362
|
+
llm = std::make_unique<llm_build_rwkv6>(*this, params);
|
|
15738
17363
|
} break;
|
|
15739
17364
|
case LLM_ARCH_RWKV6QWEN2:
|
|
15740
17365
|
{
|
|
15741
|
-
llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params
|
|
17366
|
+
llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
|
|
15742
17367
|
} break;
|
|
15743
17368
|
case LLM_ARCH_RWKV7:
|
|
15744
17369
|
{
|
|
15745
|
-
llm = std::make_unique<llm_build_rwkv7>(*this, params
|
|
17370
|
+
llm = std::make_unique<llm_build_rwkv7>(*this, params);
|
|
15746
17371
|
} break;
|
|
15747
17372
|
case LLM_ARCH_ARWKV7:
|
|
15748
17373
|
{
|
|
15749
|
-
llm = std::make_unique<llm_build_arwkv7>(*this, params
|
|
17374
|
+
llm = std::make_unique<llm_build_arwkv7>(*this, params);
|
|
15750
17375
|
} break;
|
|
15751
17376
|
case LLM_ARCH_GRANITE:
|
|
15752
17377
|
case LLM_ARCH_GRANITE_MOE:
|
|
15753
17378
|
case LLM_ARCH_MINICPM:
|
|
15754
17379
|
{
|
|
15755
|
-
llm = std::make_unique<llm_build_granite>(*this, params
|
|
17380
|
+
llm = std::make_unique<llm_build_granite>(*this, params);
|
|
17381
|
+
} break;
|
|
17382
|
+
case LLM_ARCH_GRANITE_HYBRID:
|
|
17383
|
+
{
|
|
17384
|
+
llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
|
|
15756
17385
|
} break;
|
|
15757
17386
|
case LLM_ARCH_CHAMELEON:
|
|
15758
17387
|
{
|
|
15759
|
-
llm = std::make_unique<llm_build_chameleon>(*this, params
|
|
17388
|
+
llm = std::make_unique<llm_build_chameleon>(*this, params);
|
|
15760
17389
|
} break;
|
|
15761
17390
|
case LLM_ARCH_WAVTOKENIZER_DEC:
|
|
15762
17391
|
{
|
|
15763
|
-
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params
|
|
17392
|
+
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
|
|
15764
17393
|
} break;
|
|
15765
17394
|
case LLM_ARCH_PLM:
|
|
15766
17395
|
{
|
|
15767
|
-
llm = std::make_unique<llm_build_plm>(*this, params
|
|
17396
|
+
llm = std::make_unique<llm_build_plm>(*this, params);
|
|
15768
17397
|
} break;
|
|
15769
17398
|
case LLM_ARCH_BAILINGMOE:
|
|
15770
17399
|
{
|
|
15771
|
-
llm = std::make_unique<llm_build_bailingmoe>(*this, params
|
|
17400
|
+
llm = std::make_unique<llm_build_bailingmoe>(*this, params);
|
|
15772
17401
|
} break;
|
|
15773
17402
|
case LLM_ARCH_DOTS1:
|
|
15774
17403
|
{
|
|
15775
|
-
llm = std::make_unique<llm_build_dots1>(*this, params
|
|
17404
|
+
llm = std::make_unique<llm_build_dots1>(*this, params);
|
|
15776
17405
|
} break;
|
|
15777
17406
|
case LLM_ARCH_ARCEE:
|
|
15778
17407
|
{
|
|
15779
|
-
llm = std::make_unique<llm_build_arcee>(*this, params
|
|
17408
|
+
llm = std::make_unique<llm_build_arcee>(*this, params);
|
|
15780
17409
|
} break;
|
|
15781
17410
|
case LLM_ARCH_ERNIE4_5:
|
|
15782
17411
|
{
|
|
15783
|
-
llm = std::make_unique<llm_build_ernie4_5>(*this, params
|
|
17412
|
+
llm = std::make_unique<llm_build_ernie4_5>(*this, params);
|
|
17413
|
+
} break;
|
|
17414
|
+
case LLM_ARCH_ERNIE4_5_MOE:
|
|
17415
|
+
{
|
|
17416
|
+
llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
|
|
15784
17417
|
} break;
|
|
15785
17418
|
case LLM_ARCH_HUNYUAN_MOE:
|
|
15786
17419
|
{
|
|
15787
|
-
llm = std::make_unique<llm_build_hunyuan_moe>(*this, params
|
|
17420
|
+
llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
|
|
15788
17421
|
} break;
|
|
15789
17422
|
case LLM_ARCH_SMOLLM3:
|
|
15790
17423
|
{
|
|
15791
|
-
llm = std::make_unique<llm_build_smollm3>(*this, params
|
|
17424
|
+
llm = std::make_unique<llm_build_smollm3>(*this, params);
|
|
15792
17425
|
} break;
|
|
15793
17426
|
case LLM_ARCH_FALCON_H1:
|
|
15794
17427
|
{
|
|
15795
|
-
llm = std::make_unique<llm_build_falcon_h1>(*this, params
|
|
17428
|
+
llm = std::make_unique<llm_build_falcon_h1>(*this, params);
|
|
17429
|
+
} break;
|
|
17430
|
+
case LLM_ARCH_LFM2:
|
|
17431
|
+
{
|
|
17432
|
+
llm = std::make_unique<llm_build_lfm2>(*this, params);
|
|
15796
17433
|
} break;
|
|
15797
17434
|
default:
|
|
15798
17435
|
GGML_ABORT("fatal error");
|
|
15799
17436
|
}
|
|
15800
17437
|
|
|
15801
17438
|
// add on pooling layer
|
|
15802
|
-
llm->build_pooling(
|
|
17439
|
+
llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
|
|
15803
17440
|
|
|
15804
|
-
return
|
|
17441
|
+
return llm->res->get_gf();
|
|
15805
17442
|
}
|
|
15806
17443
|
|
|
15807
17444
|
//
|
|
@@ -15911,6 +17548,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
15911
17548
|
case LLM_ARCH_BLOOM:
|
|
15912
17549
|
case LLM_ARCH_MAMBA:
|
|
15913
17550
|
case LLM_ARCH_MAMBA2:
|
|
17551
|
+
case LLM_ARCH_JAMBA:
|
|
15914
17552
|
case LLM_ARCH_JINA_BERT_V2:
|
|
15915
17553
|
case LLM_ARCH_T5:
|
|
15916
17554
|
case LLM_ARCH_T5ENCODER:
|
|
@@ -15942,12 +17580,14 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
15942
17580
|
case LLM_ARCH_GLM4:
|
|
15943
17581
|
case LLM_ARCH_GRANITE:
|
|
15944
17582
|
case LLM_ARCH_GRANITE_MOE:
|
|
17583
|
+
case LLM_ARCH_GRANITE_HYBRID:
|
|
15945
17584
|
case LLM_ARCH_CHAMELEON:
|
|
15946
17585
|
case LLM_ARCH_BAILINGMOE:
|
|
15947
17586
|
case LLM_ARCH_NEO_BERT:
|
|
15948
17587
|
case LLM_ARCH_SMOLLM3:
|
|
15949
17588
|
case LLM_ARCH_ARCEE:
|
|
15950
17589
|
case LLM_ARCH_ERNIE4_5:
|
|
17590
|
+
case LLM_ARCH_ERNIE4_5_MOE:
|
|
15951
17591
|
return LLAMA_ROPE_TYPE_NORM;
|
|
15952
17592
|
|
|
15953
17593
|
// the pairs of head values are offset by n_rot/2
|
|
@@ -15962,6 +17602,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
15962
17602
|
case LLM_ARCH_BITNET:
|
|
15963
17603
|
case LLM_ARCH_QWEN:
|
|
15964
17604
|
case LLM_ARCH_QWEN2:
|
|
17605
|
+
case LLM_ARCH_DREAM:
|
|
15965
17606
|
case LLM_ARCH_QWEN2MOE:
|
|
15966
17607
|
case LLM_ARCH_QWEN3:
|
|
15967
17608
|
case LLM_ARCH_QWEN3MOE:
|
|
@@ -15971,6 +17612,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
15971
17612
|
case LLM_ARCH_PHI3:
|
|
15972
17613
|
case LLM_ARCH_PHIMOE:
|
|
15973
17614
|
case LLM_ARCH_PLAMO:
|
|
17615
|
+
case LLM_ARCH_PLAMO2:
|
|
15974
17616
|
case LLM_ARCH_GEMMA:
|
|
15975
17617
|
case LLM_ARCH_GEMMA2:
|
|
15976
17618
|
case LLM_ARCH_GEMMA3:
|
|
@@ -15982,9 +17624,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
15982
17624
|
case LLM_ARCH_ORION:
|
|
15983
17625
|
case LLM_ARCH_NEMOTRON:
|
|
15984
17626
|
case LLM_ARCH_EXAONE:
|
|
17627
|
+
case LLM_ARCH_EXAONE4:
|
|
15985
17628
|
case LLM_ARCH_MINICPM3:
|
|
15986
17629
|
case LLM_ARCH_DOTS1:
|
|
15987
17630
|
case LLM_ARCH_HUNYUAN_MOE:
|
|
17631
|
+
case LLM_ARCH_LFM2:
|
|
15988
17632
|
return LLAMA_ROPE_TYPE_NEOX;
|
|
15989
17633
|
|
|
15990
17634
|
case LLM_ARCH_QWEN2VL:
|