@fugood/llama.node 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/package.json +14 -14
  2. package/src/llama.cpp/common/CMakeLists.txt +4 -5
  3. package/src/llama.cpp/common/arg.cpp +37 -0
  4. package/src/llama.cpp/common/common.cpp +22 -6
  5. package/src/llama.cpp/common/common.h +14 -1
  6. package/src/llama.cpp/ggml/CMakeLists.txt +3 -0
  7. package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  8. package/src/llama.cpp/ggml/include/ggml.h +13 -0
  9. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
  10. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  11. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +23 -8
  12. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +39 -0
  14. package/src/llama.cpp/include/llama.h +13 -48
  15. package/src/llama.cpp/src/llama-arch.cpp +222 -15
  16. package/src/llama.cpp/src/llama-arch.h +16 -1
  17. package/src/llama.cpp/src/llama-batch.cpp +76 -70
  18. package/src/llama.cpp/src/llama-batch.h +24 -18
  19. package/src/llama.cpp/src/llama-chat.cpp +44 -1
  20. package/src/llama.cpp/src/llama-chat.h +2 -0
  21. package/src/llama.cpp/src/llama-context.cpp +134 -95
  22. package/src/llama.cpp/src/llama-context.h +13 -16
  23. package/src/llama.cpp/src/llama-cparams.h +3 -2
  24. package/src/llama.cpp/src/llama-graph.cpp +239 -154
  25. package/src/llama.cpp/src/llama-graph.h +162 -126
  26. package/src/llama.cpp/src/llama-hparams.cpp +45 -0
  27. package/src/llama.cpp/src/llama-hparams.h +11 -1
  28. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
  29. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
  30. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
  31. package/src/llama.cpp/src/llama-kv-cache-unified.h +89 -31
  32. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
  33. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -9
  34. package/src/llama.cpp/src/llama-model.cpp +2309 -665
  35. package/src/llama.cpp/src/llama-model.h +18 -4
  36. package/src/llama.cpp/src/llama-quant.cpp +2 -2
  37. package/src/llama.cpp/src/llama-vocab.cpp +368 -9
  38. package/src/llama.cpp/src/llama-vocab.h +43 -0
  39. package/src/llama.cpp/src/unicode.cpp +207 -0
  40. package/src/llama.cpp/src/unicode.h +2 -0
@@ -40,17 +40,21 @@ const char * llm_type_name(llm_type type) {
40
40
  case LLM_TYPE_190M: return "190M";
41
41
  case LLM_TYPE_220M: return "220M";
42
42
  case LLM_TYPE_250M: return "250M";
43
+ case LLM_TYPE_256M: return "256M";
43
44
  case LLM_TYPE_270M: return "270M";
44
45
  case LLM_TYPE_335M: return "335M";
46
+ case LLM_TYPE_350M: return "350M";
45
47
  case LLM_TYPE_410M: return "410M";
46
48
  case LLM_TYPE_450M: return "450M";
47
49
  case LLM_TYPE_475M: return "475M";
50
+ case LLM_TYPE_700M: return "700M";
48
51
  case LLM_TYPE_770M: return "770M";
49
52
  case LLM_TYPE_780M: return "780M";
50
53
  case LLM_TYPE_0_3B: return "0.3B";
51
54
  case LLM_TYPE_0_5B: return "0.5B";
52
55
  case LLM_TYPE_0_6B: return "0.6B";
53
56
  case LLM_TYPE_1B: return "1B";
57
+ case LLM_TYPE_1_2B: return "1.2B";
54
58
  case LLM_TYPE_1_3B: return "1.3B";
55
59
  case LLM_TYPE_1_4B: return "1.4B";
56
60
  case LLM_TYPE_1_5B: return "1.5B";
@@ -103,8 +107,10 @@ const char * llm_type_name(llm_type type) {
103
107
  case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
104
108
  case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
105
109
  case LLM_TYPE_A13B: return "A13B";
110
+ case LLM_TYPE_21B_A3B: return "21B.A3B";
106
111
  case LLM_TYPE_30B_A3B: return "30B.A3B";
107
112
  case LLM_TYPE_235B_A22B: return "235B.A22B";
113
+ case LLM_TYPE_300B_A47B: return "300B.A47B";
108
114
  case LLM_TYPE_E2B: return "E2B";
109
115
  case LLM_TYPE_E4B: return "E4B";
110
116
  default: return "?B";
@@ -581,6 +587,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
581
587
  case 22: type = LLM_TYPE_1B; break;
582
588
  case 26: type = LLM_TYPE_3B; break;
583
589
  case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
590
+ case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
584
591
  // granite uses a vocab with len 49152
585
592
  case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
586
593
  case 36: type = LLM_TYPE_8B; break; // granite
@@ -844,6 +851,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
844
851
  default: type = LLM_TYPE_UNKNOWN;
845
852
  }
846
853
  } break;
854
+ case LLM_ARCH_DREAM:
855
+ {
856
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
857
+ // Dream models are primarily 7B with 28 layers
858
+ switch (hparams.n_layer) {
859
+ case 28:
860
+ type = LLM_TYPE_7B;
861
+ break;
862
+ default:
863
+ type = LLM_TYPE_UNKNOWN;
864
+ }
865
+ // Set non-causal attention for diffusion models
866
+ hparams.causal_attn = false;
867
+ }
868
+ break;
847
869
  case LLM_ARCH_QWEN2MOE:
848
870
  {
849
871
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -930,6 +952,33 @@ void llama_model::load_hparams(llama_model_loader & ml) {
930
952
  default: type = LLM_TYPE_UNKNOWN;
931
953
  }
932
954
  } break;
955
+ case LLM_ARCH_PLAMO2:
956
+ {
957
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
958
+
959
+ // Load Mamba SSM parameters
960
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
961
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
962
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
963
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
964
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
965
+
966
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
967
+ hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
968
+ }
969
+
970
+ switch (hparams.n_layer) {
971
+ case 16: type = LLM_TYPE_1B; break;
972
+ case 32:
973
+ if (hparams.n_embd == 2048) {
974
+ type = LLM_TYPE_2B;
975
+ } else if (hparams.n_embd == 4096) {
976
+ type = LLM_TYPE_8B;
977
+ }
978
+ break;
979
+ default: type = LLM_TYPE_UNKNOWN;
980
+ }
981
+ } break;
933
982
  case LLM_ARCH_GPT2:
934
983
  {
935
984
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1118,6 +1167,26 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1118
1167
  default: type = LLM_TYPE_UNKNOWN;
1119
1168
  }
1120
1169
  } break;
1170
+ case LLM_ARCH_JAMBA:
1171
+ {
1172
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
1173
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
1174
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
1175
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1176
+
1177
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1178
+
1179
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1180
+ hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
1181
+ }
1182
+
1183
+ switch (hparams.n_layer) {
1184
+ // TODO: Jamba layers are a bit heterogenous, so naming this is hard.
1185
+ case 12: // 900M 8x???M
1186
+ case 32: // 51B 16x?B
1187
+ default: type = LLM_TYPE_UNKNOWN;
1188
+ }
1189
+ } break;
1121
1190
  case LLM_ARCH_XVERSE:
1122
1191
  {
1123
1192
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1421,6 +1490,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1421
1490
  default: type = LLM_TYPE_UNKNOWN;
1422
1491
  }
1423
1492
  } break;
1493
+ case LLM_ARCH_EXAONE4:
1494
+ {
1495
+ if (hparams.n_layer == 64) { // 32B
1496
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1497
+ hparams.n_swa = 4096;
1498
+ hparams.set_swa_pattern(4);
1499
+ }
1500
+
1501
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1502
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1503
+
1504
+ switch (hparams.n_layer) {
1505
+ case 30: type = LLM_TYPE_1_2B; break;
1506
+ case 64: type = LLM_TYPE_32B; break;
1507
+ default: type = LLM_TYPE_UNKNOWN;
1508
+ }
1509
+ } break;
1424
1510
  case LLM_ARCH_RWKV6:
1425
1511
  case LLM_ARCH_RWKV6QWEN2:
1426
1512
  {
@@ -1484,6 +1570,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1484
1570
  ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
1485
1571
  ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
1486
1572
 
1573
+ // Granite uses rope_finetuned as a switch for rope, so default to true
1574
+ bool rope_finetuned = true;
1575
+ ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
1576
+ hparams.rope_finetuned = rope_finetuned;
1577
+
1487
1578
  switch (hparams.n_layer) {
1488
1579
  case 32: type = LLM_TYPE_3B; break;
1489
1580
  case 40: type = LLM_TYPE_3B; break;
@@ -1491,6 +1582,40 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1491
1582
  default: type = LLM_TYPE_UNKNOWN;
1492
1583
  }
1493
1584
 
1585
+ // For Granite MoE Shared
1586
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
1587
+ } break;
1588
+ case LLM_ARCH_GRANITE_HYBRID:
1589
+ {
1590
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1591
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
1592
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /* required */ false);
1593
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /* required */ false);
1594
+ ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, /* required */ false);
1595
+
1596
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
1597
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
1598
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
1599
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1600
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
1601
+
1602
+ // Granite uses rope_finetuned as a switch for rope, so default to true
1603
+ bool rope_finetuned = true;
1604
+ ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
1605
+ hparams.rope_finetuned = rope_finetuned;
1606
+
1607
+ // A layer is recurrent IFF the n_head_kv value is set to 0
1608
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1609
+ hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
1610
+ }
1611
+
1612
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1613
+
1614
+ switch (hparams.n_layer) {
1615
+ // TODO: Add llm type label (not sure this is useful)
1616
+ default: type = LLM_TYPE_UNKNOWN;
1617
+ }
1618
+
1494
1619
  // For Granite MoE Shared
1495
1620
  ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
1496
1621
  } break;
@@ -1543,10 +1668,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1543
1668
  }
1544
1669
  } break;
1545
1670
  case LLM_ARCH_ERNIE4_5:
1671
+ case LLM_ARCH_ERNIE4_5_MOE:
1546
1672
  {
1547
1673
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1674
+ if (arch == LLM_ARCH_ERNIE4_5_MOE) {
1675
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1676
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
1677
+ ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
1678
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
1679
+ }
1680
+
1548
1681
  switch (hparams.n_layer) {
1549
1682
  case 18: type = LLM_TYPE_0_3B; break;
1683
+ case 28: type = LLM_TYPE_21B_A3B; break;
1684
+ case 54: type = LLM_TYPE_300B_A47B; break;
1550
1685
  default: type = LLM_TYPE_UNKNOWN;
1551
1686
  }
1552
1687
  } break;
@@ -1602,6 +1737,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1602
1737
  default: type = LLM_TYPE_UNKNOWN;
1603
1738
  }
1604
1739
  } break;
1740
+ case LLM_ARCH_LFM2:
1741
+ {
1742
+ ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
1743
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1744
+ for (uint32_t il = 0; il < hparams.n_layer; ++il) {
1745
+ hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
1746
+ }
1747
+ switch (hparams.n_embd) {
1748
+ case 1024: type = LLM_TYPE_350M; break;
1749
+ case 1536: type = LLM_TYPE_700M; break;
1750
+ case 2048: type = LLM_TYPE_1_2B; break;
1751
+ default: type = LLM_TYPE_UNKNOWN;
1752
+ }
1753
+ } break;
1605
1754
  default: throw std::runtime_error("unsupported model architecture");
1606
1755
  }
1607
1756
 
@@ -2565,12 +2714,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2565
2714
  } break;
2566
2715
  case LLM_ARCH_QWEN2:
2567
2716
  case LLM_ARCH_QWEN2VL:
2717
+ case LLM_ARCH_DREAM:
2568
2718
  {
2569
2719
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2570
2720
 
2571
2721
  // output
2572
2722
  output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2573
2723
  output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2724
+ output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, TENSOR_NOT_REQUIRED);
2574
2725
  // if output is NULL, init from the input tok embed
2575
2726
  if (output == NULL) {
2576
2727
  output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
@@ -2860,6 +3011,73 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2860
3011
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2861
3012
  }
2862
3013
  } break;
3014
+ case LLM_ARCH_PLAMO2:
3015
+ {
3016
+ const uint32_t d_conv = hparams.ssm_d_conv;
3017
+ const uint32_t d_state = hparams.ssm_d_state;
3018
+ const uint32_t num_heads = hparams.ssm_dt_rank;
3019
+ const uint32_t intermediate_size = hparams.ssm_d_inner;
3020
+ const uint32_t head_dim = intermediate_size / num_heads;
3021
+ const uint32_t qk_dim = head_dim;
3022
+ const uint32_t v_dim = head_dim;
3023
+ const int64_t num_attention_heads = hparams.n_head();
3024
+ const int64_t q_num_heads = num_attention_heads;
3025
+ const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
3026
+
3027
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3028
+
3029
+ // output
3030
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3031
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3032
+ // if output is NULL, init from the input tok embed
3033
+ if (output == NULL) {
3034
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3035
+ }
3036
+
3037
+ for (int i = 0; i < n_layer; ++i) {
3038
+ auto & layer = layers[i];
3039
+ bool is_mamba_layer = hparams.is_recurrent(i);
3040
+
3041
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3042
+
3043
+ if (is_mamba_layer) {
3044
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0);
3045
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
3046
+
3047
+ layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {intermediate_size, dt_dim + 2*d_state}, 0);
3048
+ layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_dim, num_heads}, 0);
3049
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {num_heads}, 0);
3050
+
3051
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {num_heads}, 0);
3052
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {num_heads}, 0);
3053
+
3054
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {intermediate_size, n_embd}, 0);
3055
+
3056
+ layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, i), {dt_dim}, 0);
3057
+ layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
3058
+ layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
3059
+ } else {
3060
+ const int64_t num_key_value_heads = hparams.n_head_kv(i);
3061
+ const int64_t k_num_heads = num_key_value_heads;
3062
+ const int64_t v_num_heads = num_key_value_heads;
3063
+ const int64_t q_proj_dim = q_num_heads * qk_dim;
3064
+ const int64_t k_proj_dim = k_num_heads * qk_dim;
3065
+ const int64_t v_proj_dim = v_num_heads * v_dim;
3066
+
3067
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
3068
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim, num_attention_heads}, 0);
3069
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim, k_num_heads}, 0);
3070
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
3071
+ }
3072
+
3073
+ // All layers have post-attention norm, FFN norm, and FFN tensors
3074
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
3075
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3076
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3077
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
3078
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
3079
+ }
3080
+ } break;
2863
3081
  case LLM_ARCH_GPT2:
2864
3082
  {
2865
3083
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3231,10 +3449,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3231
3449
  {
3232
3450
  output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3233
3451
 
3234
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
3452
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3235
3453
  // if output is NULL, init from the input tok embed, duplicated to allow offloading
3236
3454
  if (output == NULL) {
3237
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
3455
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3238
3456
  }
3239
3457
  }
3240
3458
 
@@ -3261,6 +3479,180 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3261
3479
  layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
3262
3480
  }
3263
3481
  } break;
3482
+ case LLM_ARCH_JAMBA:
3483
+ {
3484
+ const int64_t d_conv = hparams.ssm_d_conv;
3485
+ const int64_t d_inner = hparams.ssm_d_inner;
3486
+ const int64_t d_state = hparams.ssm_d_state;
3487
+ const int64_t dt_rank = hparams.ssm_dt_rank;
3488
+
3489
+ // only an expansion factor of 2 is supported for now
3490
+ GGML_ASSERT(2 * n_embd == d_inner);
3491
+
3492
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3493
+
3494
+ // output
3495
+ {
3496
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3497
+
3498
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3499
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
3500
+ if (output == NULL) {
3501
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3502
+ }
3503
+ }
3504
+
3505
+ for (int i = 0; i < n_layer; ++i) {
3506
+ const int64_t n_head_kv = hparams.n_head_kv(i);
3507
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
3508
+
3509
+ auto & layer = layers[i];
3510
+
3511
+ // norm
3512
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3513
+
3514
+ if (n_head_kv == 0) {
3515
+ // Mamba layer
3516
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
3517
+
3518
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
3519
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
3520
+
3521
+ layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
3522
+
3523
+ layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
3524
+
3525
+ layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
3526
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
3527
+
3528
+ layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
3529
+ layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
3530
+
3531
+ // no "weight" suffix for these
3532
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
3533
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
3534
+
3535
+ // out_proj
3536
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
3537
+ } else {
3538
+ // Attention layers
3539
+
3540
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3541
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3542
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3543
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3544
+ }
3545
+
3546
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3547
+
3548
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
3549
+
3550
+ if (layer.ffn_gate_inp) {
3551
+ // MoE
3552
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
3553
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
3554
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
3555
+ } else {
3556
+ // FFN (no MoE)
3557
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3558
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3559
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3560
+ }
3561
+ }
3562
+ } break;
3563
+ case LLM_ARCH_GRANITE_HYBRID:
3564
+ {
3565
+ // mamba2 Mixer SSM params
3566
+ // NOTE: int64_t for tensor dimensions
3567
+ const int64_t d_conv = hparams.ssm_d_conv;
3568
+ const int64_t d_inner = hparams.ssm_d_inner;
3569
+ const int64_t d_state = hparams.ssm_d_state;
3570
+ const int64_t n_ssm_head = hparams.ssm_dt_rank;
3571
+ const int64_t n_group = hparams.ssm_n_group;
3572
+ const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
3573
+
3574
+ // only an expansion factor of 2 is supported for now
3575
+ GGML_ASSERT(2 * n_embd == d_inner);
3576
+
3577
+ // embeddings
3578
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3579
+
3580
+ // output
3581
+ {
3582
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3583
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3584
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
3585
+ if (output == NULL) {
3586
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3587
+ }
3588
+ }
3589
+
3590
+ for (int i = 0; i < n_layer; ++i) {
3591
+ auto & layer = layers[i];
3592
+
3593
+ // norm
3594
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3595
+
3596
+ if (hparams.is_recurrent(i)) {
3597
+ // ssm layers
3598
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
3599
+
3600
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
3601
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
3602
+
3603
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
3604
+
3605
+ // no "weight" suffix for these
3606
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
3607
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
3608
+
3609
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
3610
+
3611
+ // out_proj
3612
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
3613
+ } else {
3614
+ // attention layers (with optional bias)
3615
+ const int64_t n_head_i = hparams.n_head(i);
3616
+ const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
3617
+ const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
3618
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
3619
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
3620
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
3621
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
3622
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3623
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
3624
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
3625
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3626
+ }
3627
+
3628
+ // feed forward (w/ optional biases)
3629
+ if (n_expert > 0) {
3630
+ // MoE FFN
3631
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3632
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3633
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3634
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
3635
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
3636
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
3637
+
3638
+ // For Granite MoE Shared
3639
+ if (hparams.n_ff_shexp > 0) {
3640
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
3641
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
3642
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
3643
+ }
3644
+ } else {
3645
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3646
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3647
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3648
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3649
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3650
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
3651
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3652
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
3653
+ }
3654
+ }
3655
+ } break;
3264
3656
  case LLM_ARCH_XVERSE:
3265
3657
  {
3266
3658
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3980,6 +4372,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3980
4372
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3981
4373
  }
3982
4374
  } break;
4375
+ case LLM_ARCH_EXAONE4:
4376
+ {
4377
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4378
+
4379
+ // output
4380
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4381
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4382
+
4383
+ // if output is NULL, init from the input tok embed
4384
+ if (output == NULL) {
4385
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4386
+ }
4387
+
4388
+ for (int i = 0; i < n_layer; ++i) {
4389
+ auto & layer = layers[i];
4390
+
4391
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4392
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4393
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4394
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4395
+
4396
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4397
+
4398
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4399
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
4400
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
4401
+
4402
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4403
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4404
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4405
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4406
+ }
4407
+ } break;
3983
4408
  case LLM_ARCH_RWKV6:
3984
4409
  {
3985
4410
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4495,6 +4920,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4495
4920
  }
4496
4921
  } break;
4497
4922
  case LLM_ARCH_ERNIE4_5:
4923
+ case LLM_ARCH_ERNIE4_5_MOE:
4498
4924
  {
4499
4925
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4500
4926
 
@@ -4523,9 +4949,27 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4523
4949
  layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4524
4950
 
4525
4951
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4526
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4527
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4528
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4952
+
4953
+ if (arch == LLM_ARCH_ERNIE4_5_MOE && static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
4954
+ int n_ff_exp = hparams.n_ff_exp;
4955
+
4956
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4957
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
4958
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
4959
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
4960
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
4961
+
4962
+ // Shared expert (if present)
4963
+ if (hparams.n_ff_shexp > 0) {
4964
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
4965
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd }, 0);
4966
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp}, 0);
4967
+ }
4968
+ } else { // Dense layers
4969
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4970
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4971
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4972
+ }
4529
4973
  }
4530
4974
  } break;
4531
4975
  case LLM_ARCH_FALCON_H1:
@@ -4671,6 +5115,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4671
5115
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4672
5116
  }
4673
5117
  } break;
5118
+ case LLM_ARCH_LFM2:
5119
+ {
5120
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5121
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
5122
+
5123
+ for (int i = 0; i < n_layer; ++i) {
5124
+ auto & layer = layers[i];
5125
+ // ffn is same for transformer and conv layers
5126
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5127
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5128
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5129
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5130
+
5131
+ // for operator_norm
5132
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5133
+
5134
+ if (!hparams.is_recurrent(i)) {
5135
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5136
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5137
+ GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
5138
+
5139
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
5140
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
5141
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
5142
+
5143
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5144
+ } else {
5145
+ layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
5146
+ layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
5147
+ layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
5148
+ }
5149
+ }
5150
+ } break;
4674
5151
  default:
4675
5152
  throw std::runtime_error("unknown architecture");
4676
5153
  }
@@ -4910,16 +5387,6 @@ void llama_model::print_info() const {
4910
5387
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
4911
5388
  LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
4912
5389
  LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
4913
- }
4914
-
4915
- if (arch == LLM_ARCH_MAMBA || arch == LLM_ARCH_MAMBA2) {
4916
- LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
4917
- LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
4918
- LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
4919
- LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
4920
- LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
4921
- LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
4922
-
4923
5390
  if (!classifier_labels.empty()) {
4924
5391
  LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
4925
5392
 
@@ -4930,6 +5397,20 @@ void llama_model::print_info() const {
4930
5397
  }
4931
5398
  }
4932
5399
 
5400
+ if (arch == LLM_ARCH_MAMBA ||
5401
+ arch == LLM_ARCH_MAMBA2 ||
5402
+ arch == LLM_ARCH_JAMBA ||
5403
+ arch == LLM_ARCH_FALCON_H1 ||
5404
+ arch == LLM_ARCH_PLAMO2 ||
5405
+ arch == LLM_ARCH_GRANITE_HYBRID) {
5406
+ LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
5407
+ LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
5408
+ LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
5409
+ LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
5410
+ LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
5411
+ LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
5412
+ }
5413
+
4933
5414
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
4934
5415
  if (pimpl->n_elements >= 1e12) {
4935
5416
  LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
@@ -4976,7 +5457,8 @@ void llama_model::print_info() const {
4976
5457
 
4977
5458
  if (arch == LLM_ARCH_MINICPM ||
4978
5459
  arch == LLM_ARCH_GRANITE ||
4979
- arch == LLM_ARCH_GRANITE_MOE) {
5460
+ arch == LLM_ARCH_GRANITE_MOE ||
5461
+ arch == LLM_ARCH_GRANITE_HYBRID) {
4980
5462
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
4981
5463
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
4982
5464
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -5092,7 +5574,7 @@ ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int i
5092
5574
  }
5093
5575
 
5094
5576
  struct llm_build_llama : public llm_graph_context {
5095
- llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
5577
+ llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
5096
5578
  const int64_t n_embd_head = hparams.n_embd_head_v;
5097
5579
 
5098
5580
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -5168,7 +5650,7 @@ struct llm_build_llama : public llm_graph_context {
5168
5650
  cb(Kcur, "Kcur", il);
5169
5651
  cb(Vcur, "Vcur", il);
5170
5652
 
5171
- cur = build_attn(inp_attn, gf,
5653
+ cur = build_attn(inp_attn,
5172
5654
  model.layers[il].wo, model.layers[il].bo,
5173
5655
  Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
5174
5656
  cb(cur, "attn_out", il);
@@ -5248,7 +5730,7 @@ struct llm_build_llama : public llm_graph_context {
5248
5730
  };
5249
5731
 
5250
5732
  struct llm_build_llama_iswa : public llm_graph_context {
5251
- llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
5733
+ llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
5252
5734
  const int64_t n_embd_head = hparams.n_embd_head_v;
5253
5735
 
5254
5736
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -5342,7 +5824,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
5342
5824
  cb(Kcur, "Kcur_normed", il);
5343
5825
  }
5344
5826
 
5345
- cur = build_attn(inp_attn, gf,
5827
+ cur = build_attn(inp_attn,
5346
5828
  model.layers[il].wo, model.layers[il].bo,
5347
5829
  Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
5348
5830
  cb(cur, "attn_out", il);
@@ -5431,7 +5913,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
5431
5913
  };
5432
5914
 
5433
5915
  struct llm_build_deci : public llm_graph_context {
5434
- llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
5916
+ llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
5435
5917
  const int64_t n_embd_head = hparams.n_embd_head_v;
5436
5918
 
5437
5919
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -5519,7 +6001,7 @@ struct llm_build_deci : public llm_graph_context {
5519
6001
  cb(Kcur, "Kcur", il);
5520
6002
  cb(Vcur, "Vcur", il);
5521
6003
 
5522
- cur = build_attn(inp_attn, gf,
6004
+ cur = build_attn(inp_attn,
5523
6005
  model.layers[il].wo, model.layers[il].bo,
5524
6006
  Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
5525
6007
  }
@@ -5587,7 +6069,7 @@ struct llm_build_deci : public llm_graph_context {
5587
6069
  };
5588
6070
 
5589
6071
  struct llm_build_baichuan : public llm_graph_context {
5590
- llm_build_baichuan(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6072
+ llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
5591
6073
  const int64_t n_embd_head = hparams.n_embd_head_v;
5592
6074
 
5593
6075
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -5651,7 +6133,7 @@ struct llm_build_baichuan : public llm_graph_context {
5651
6133
  cb(Kcur, "Kcur", il);
5652
6134
  cb(Vcur, "Vcur", il);
5653
6135
 
5654
- cur = build_attn(inp_attn, gf,
6136
+ cur = build_attn(inp_attn,
5655
6137
  model.layers[il].wo, NULL,
5656
6138
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5657
6139
  }
@@ -5709,7 +6191,7 @@ struct llm_build_baichuan : public llm_graph_context {
5709
6191
  };
5710
6192
 
5711
6193
  struct llm_build_xverse : public llm_graph_context {
5712
- llm_build_xverse(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6194
+ llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
5713
6195
  const int64_t n_embd_head = hparams.n_embd_head_v;
5714
6196
 
5715
6197
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -5766,7 +6248,7 @@ struct llm_build_xverse : public llm_graph_context {
5766
6248
  cb(Kcur, "Kcur", il);
5767
6249
  cb(Vcur, "Vcur", il);
5768
6250
 
5769
- cur = build_attn(inp_attn, gf,
6251
+ cur = build_attn(inp_attn,
5770
6252
  model.layers[il].wo, NULL,
5771
6253
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5772
6254
  }
@@ -5822,7 +6304,7 @@ struct llm_build_xverse : public llm_graph_context {
5822
6304
  };
5823
6305
 
5824
6306
  struct llm_build_falcon : public llm_graph_context {
5825
- llm_build_falcon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6307
+ llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
5826
6308
  const int64_t n_embd_head = hparams.n_embd_head_v;
5827
6309
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
5828
6310
 
@@ -5889,7 +6371,7 @@ struct llm_build_falcon : public llm_graph_context {
5889
6371
  cb(Kcur, "Kcur", il);
5890
6372
  cb(Vcur, "Vcur", il);
5891
6373
 
5892
- cur = build_attn(inp_attn, gf,
6374
+ cur = build_attn(inp_attn,
5893
6375
  model.layers[il].wo, NULL,
5894
6376
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5895
6377
  }
@@ -5944,7 +6426,7 @@ struct llm_build_falcon : public llm_graph_context {
5944
6426
  };
5945
6427
 
5946
6428
  struct llm_build_grok : public llm_graph_context {
5947
- llm_build_grok(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6429
+ llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
5948
6430
  const int64_t n_embd_head = hparams.n_embd_head_v;
5949
6431
 
5950
6432
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -6019,7 +6501,7 @@ struct llm_build_grok : public llm_graph_context {
6019
6501
  cb(Kcur, "Kcur", il);
6020
6502
  cb(Vcur, "Vcur", il);
6021
6503
 
6022
- cur = build_attn(inp_attn, gf,
6504
+ cur = build_attn(inp_attn,
6023
6505
  model.layers[il].wo, model.layers[il].bo,
6024
6506
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
6025
6507
  }
@@ -6106,7 +6588,7 @@ struct llm_build_grok : public llm_graph_context {
6106
6588
  };
6107
6589
 
6108
6590
  struct llm_build_dbrx : public llm_graph_context {
6109
- llm_build_dbrx(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6591
+ llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
6110
6592
  const int64_t n_embd_head = hparams.n_embd_head_v;
6111
6593
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6112
6594
 
@@ -6168,7 +6650,7 @@ struct llm_build_dbrx : public llm_graph_context {
6168
6650
  cb(Kcur, "Kcur", il);
6169
6651
  cb(Vcur, "Vcur", il);
6170
6652
 
6171
- cur = build_attn(inp_attn, gf,
6653
+ cur = build_attn(inp_attn,
6172
6654
  model.layers[il].wo, NULL,
6173
6655
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6174
6656
  }
@@ -6231,7 +6713,7 @@ struct llm_build_dbrx : public llm_graph_context {
6231
6713
  };
6232
6714
 
6233
6715
  struct llm_build_starcoder : public llm_graph_context {
6234
- llm_build_starcoder(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6716
+ llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
6235
6717
  const int64_t n_embd_head = hparams.n_embd_head_v;
6236
6718
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6237
6719
 
@@ -6282,7 +6764,7 @@ struct llm_build_starcoder : public llm_graph_context {
6282
6764
  cb(Kcur, "Kcur", il);
6283
6765
  cb(Vcur, "Vcur", il);
6284
6766
 
6285
- cur = build_attn(inp_attn, gf,
6767
+ cur = build_attn(inp_attn,
6286
6768
  model.layers[il].wo, model.layers[il].bo,
6287
6769
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6288
6770
  }
@@ -6340,7 +6822,7 @@ struct llm_build_starcoder : public llm_graph_context {
6340
6822
  };
6341
6823
 
6342
6824
  struct llm_build_refact : public llm_graph_context {
6343
- llm_build_refact(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6825
+ llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
6344
6826
  const int64_t n_embd_head = hparams.n_embd_head_v;
6345
6827
 
6346
6828
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -6381,7 +6863,7 @@ struct llm_build_refact : public llm_graph_context {
6381
6863
  cb(Kcur, "Kcur", il);
6382
6864
  cb(Vcur, "Vcur", il);
6383
6865
 
6384
- cur = build_attn(inp_attn, gf,
6866
+ cur = build_attn(inp_attn,
6385
6867
  model.layers[il].wo, NULL,
6386
6868
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6387
6869
  }
@@ -6439,7 +6921,7 @@ struct llm_build_refact : public llm_graph_context {
6439
6921
  };
6440
6922
 
6441
6923
  struct llm_build_bert : public llm_graph_context {
6442
- llm_build_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6924
+ llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
6443
6925
  const int64_t n_embd_head = hparams.n_embd_head_v;
6444
6926
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6445
6927
 
@@ -6538,7 +7020,7 @@ struct llm_build_bert : public llm_graph_context {
6538
7020
  cb(Kcur, "Kcur", il);
6539
7021
  cb(Vcur, "Vcur", il);
6540
7022
 
6541
- cur = build_attn(inp_attn, gf,
7023
+ cur = build_attn(inp_attn,
6542
7024
  model.layers[il].wo, model.layers[il].bo,
6543
7025
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6544
7026
  cb(cur, "kqv_out", il);
@@ -6625,7 +7107,7 @@ struct llm_build_bert : public llm_graph_context {
6625
7107
  };
6626
7108
 
6627
7109
  struct llm_build_neo_bert : public llm_graph_context {
6628
- llm_build_neo_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7110
+ llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
6629
7111
  const int64_t n_embd_head = hparams.n_embd_head_v;
6630
7112
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6631
7113
 
@@ -6683,7 +7165,7 @@ struct llm_build_neo_bert : public llm_graph_context {
6683
7165
  cb(Kcur, "Kcur", il);
6684
7166
  cb(Vcur, "Vcur", il);
6685
7167
 
6686
- cur = build_attn(inp_attn, gf,
7168
+ cur = build_attn(inp_attn,
6687
7169
  model.layers[il].wo, nullptr,
6688
7170
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6689
7171
  cb(cur, "kqv_out", il);
@@ -6735,7 +7217,7 @@ struct llm_build_neo_bert : public llm_graph_context {
6735
7217
  };
6736
7218
 
6737
7219
  struct llm_build_bloom : public llm_graph_context {
6738
- llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7220
+ llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
6739
7221
  const int64_t n_embd_head = hparams.n_embd_head_v;
6740
7222
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6741
7223
 
@@ -6783,7 +7265,7 @@ struct llm_build_bloom : public llm_graph_context {
6783
7265
  cb(Kcur, "Kcur", il);
6784
7266
  cb(Vcur, "Vcur", il);
6785
7267
 
6786
- cur = build_attn(inp_attn, gf,
7268
+ cur = build_attn(inp_attn,
6787
7269
  model.layers[il].wo, model.layers[il].bo,
6788
7270
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6789
7271
  }
@@ -6841,7 +7323,7 @@ struct llm_build_bloom : public llm_graph_context {
6841
7323
  };
6842
7324
 
6843
7325
  struct llm_build_mpt : public llm_graph_context {
6844
- llm_build_mpt(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7326
+ llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
6845
7327
  const int64_t n_embd_head = hparams.n_embd_head_v;
6846
7328
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6847
7329
 
@@ -6930,7 +7412,7 @@ struct llm_build_mpt : public llm_graph_context {
6930
7412
  cb(Kcur, "Kcur", il);
6931
7413
  cb(Vcur, "Vcur", il);
6932
7414
 
6933
- cur = build_attn(inp_attn, gf,
7415
+ cur = build_attn(inp_attn,
6934
7416
  model.layers[il].wo, model.layers[il].bo,
6935
7417
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6936
7418
  }
@@ -6989,7 +7471,7 @@ struct llm_build_mpt : public llm_graph_context {
6989
7471
  };
6990
7472
 
6991
7473
  struct llm_build_stablelm : public llm_graph_context {
6992
- llm_build_stablelm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7474
+ llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
6993
7475
  const int64_t n_embd_head = hparams.n_embd_head_v;
6994
7476
 
6995
7477
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7076,7 +7558,7 @@ struct llm_build_stablelm : public llm_graph_context {
7076
7558
  cb(Kcur, "Kcur", il);
7077
7559
  cb(Vcur, "Vcur", il);
7078
7560
 
7079
- cur = build_attn(inp_attn, gf,
7561
+ cur = build_attn(inp_attn,
7080
7562
  model.layers[il].wo, NULL,
7081
7563
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7082
7564
  }
@@ -7141,7 +7623,7 @@ struct llm_build_stablelm : public llm_graph_context {
7141
7623
  };
7142
7624
 
7143
7625
  struct llm_build_qwen : public llm_graph_context {
7144
- llm_build_qwen(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7626
+ llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
7145
7627
  const int64_t n_embd_head = hparams.n_embd_head_v;
7146
7628
 
7147
7629
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7197,7 +7679,7 @@ struct llm_build_qwen : public llm_graph_context {
7197
7679
  cb(Kcur, "Kcur", il);
7198
7680
  cb(Vcur, "Vcur", il);
7199
7681
 
7200
- cur = build_attn(inp_attn, gf,
7682
+ cur = build_attn(inp_attn,
7201
7683
  model.layers[il].wo, NULL,
7202
7684
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7203
7685
  }
@@ -7255,7 +7737,7 @@ struct llm_build_qwen : public llm_graph_context {
7255
7737
  };
7256
7738
 
7257
7739
  struct llm_build_qwen2 : public llm_graph_context {
7258
- llm_build_qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7740
+ llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
7259
7741
  const int64_t n_embd_head = hparams.n_embd_head_v;
7260
7742
 
7261
7743
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7317,7 +7799,7 @@ struct llm_build_qwen2 : public llm_graph_context {
7317
7799
  cb(Kcur, "Kcur", il);
7318
7800
  cb(Vcur, "Vcur", il);
7319
7801
 
7320
- cur = build_attn(inp_attn, gf,
7802
+ cur = build_attn(inp_attn,
7321
7803
  model.layers[il].wo, model.layers[il].bo,
7322
7804
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7323
7805
  }
@@ -7365,6 +7847,113 @@ struct llm_build_qwen2 : public llm_graph_context {
7365
7847
  // lm_head
7366
7848
  cur = build_lora_mm(model.output, cur);
7367
7849
 
7850
+ if (model.output_b != nullptr) {
7851
+ cur = ggml_add(ctx0, cur, model.output_b);
7852
+ }
7853
+
7854
+ cb(cur, "result_output", -1);
7855
+ res->t_logits = cur;
7856
+
7857
+ ggml_build_forward_expand(gf, cur);
7858
+ }
7859
+ };
7860
+
7861
+ struct llm_build_dream : public llm_graph_context {
7862
+ llm_build_dream(const llama_model & model, const llm_graph_params & params) :
7863
+ llm_graph_context(params) {
7864
+ //copied from qwen2
7865
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7866
+
7867
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7868
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
7869
+
7870
+ ggml_tensor * cur;
7871
+ ggml_tensor * inpL;
7872
+
7873
+ inpL = build_inp_embd(model.tok_embd);
7874
+
7875
+ // inp_pos - contains the positions
7876
+ ggml_tensor * inp_pos = build_inp_pos();
7877
+
7878
+ auto * inp_attn = build_attn_inp_no_cache();
7879
+
7880
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7881
+
7882
+ for (int il = 0; il < n_layer; ++il) {
7883
+ ggml_tensor * inpSA = inpL;
7884
+
7885
+ // norm
7886
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
7887
+ cb(cur, "attn_norm", il);
7888
+
7889
+ // self-attention
7890
+ {
7891
+ // compute Q and K and RoPE them
7892
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
7893
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
7894
+ cb(Qcur, "Qcur", il);
7895
+
7896
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
7897
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
7898
+ cb(Kcur, "Kcur", il);
7899
+
7900
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
7901
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
7902
+ cb(Vcur, "Vcur", il);
7903
+
7904
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7905
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7906
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7907
+
7908
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7909
+ ext_factor, attn_factor, beta_fast, beta_slow);
7910
+
7911
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7912
+ ext_factor, attn_factor, beta_fast, beta_slow);
7913
+
7914
+ cb(Qcur, "Qcur", il);
7915
+ cb(Kcur, "Kcur", il);
7916
+ cb(Vcur, "Vcur", il);
7917
+
7918
+ cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr,
7919
+ nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
7920
+ }
7921
+
7922
+ if (il == n_layer - 1 && inp_out_ids) {
7923
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7924
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7925
+ }
7926
+
7927
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7928
+ cb(ffn_inp, "ffn_inp", il);
7929
+
7930
+ // feed-forward network
7931
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
7932
+ cb(cur, "ffn_norm", il);
7933
+
7934
+ cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
7935
+ model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
7936
+ cb(cur, "ffn_out", il);
7937
+
7938
+ cur = ggml_add(ctx0, cur, ffn_inp);
7939
+
7940
+ cur = build_cvec(cur, il);
7941
+ cb(cur, "l_out", il);
7942
+
7943
+ // input for next layer
7944
+ inpL = cur;
7945
+ }
7946
+
7947
+ cur = inpL;
7948
+
7949
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
7950
+
7951
+ cb(cur, "result_norm", -1);
7952
+ res->t_embd = cur;
7953
+
7954
+ // lm_head
7955
+ cur = build_lora_mm(model.output, cur);
7956
+
7368
7957
  cb(cur, "result_output", -1);
7369
7958
  res->t_logits = cur;
7370
7959
 
@@ -7373,7 +7962,7 @@ struct llm_build_qwen2 : public llm_graph_context {
7373
7962
  };
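
Editor's note: the newly added llm_build_dream requests its attention input via build_attn_inp_no_cache(), i.e. attention evaluated without a KV cache, which fits a model that attends over the whole window rather than strictly causally. The toy mask builder below only contrasts the two regimes; it is illustrative C++, not the ggml masking code.

    #include <cstdio>
    #include <vector>

    static std::vector<std::vector<int>> make_mask(int n_tokens, bool causal) {
        std::vector<std::vector<int>> mask(n_tokens, std::vector<int>(n_tokens, 1));
        if (causal) {
            for (int i = 0; i < n_tokens; ++i)
                for (int j = i + 1; j < n_tokens; ++j)
                    mask[i][j] = 0; // future positions are hidden
        }
        return mask;
    }

    int main() {
        for (bool causal : {true, false}) {
            std::printf("%s:\n", causal ? "causal" : "full");
            for (const auto & row : make_mask(4, causal)) {
                for (int v : row) std::printf("%d ", v);
                std::printf("\n");
            }
        }
        return 0;
    }
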
7374
7963
 
7375
7964
  struct llm_build_qwen2vl : public llm_graph_context {
7376
- llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7965
+ llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
7377
7966
  const int64_t n_embd_head = hparams.n_embd_head_v;
7378
7967
 
7379
7968
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7438,7 +8027,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
7438
8027
  cb(Kcur, "Kcur", il);
7439
8028
  cb(Vcur, "Vcur", il);
7440
8029
 
7441
- cur = build_attn(inp_attn, gf,
8030
+ cur = build_attn(inp_attn,
7442
8031
  model.layers[il].wo, model.layers[il].bo,
7443
8032
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7444
8033
  }
@@ -7494,7 +8083,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
7494
8083
  };
7495
8084
 
7496
8085
  struct llm_build_qwen2moe : public llm_graph_context {
7497
- llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8086
+ llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
7498
8087
  const int64_t n_embd_head = hparams.n_embd_head_v;
7499
8088
 
7500
8089
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7565,7 +8154,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
7565
8154
  cb(Kcur, "Kcur", il);
7566
8155
  cb(Vcur, "Vcur", il);
7567
8156
 
7568
- cur = build_attn(inp_attn, gf,
8157
+ cur = build_attn(inp_attn,
7569
8158
  model.layers[il].wo, model.layers[il].bo,
7570
8159
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7571
8160
  }
@@ -7653,7 +8242,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
7653
8242
  };
7654
8243
 
7655
8244
  struct llm_build_qwen3 : public llm_graph_context {
7656
- llm_build_qwen3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8245
+ llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
7657
8246
  const int64_t n_embd_head = hparams.n_embd_head_v;
7658
8247
 
7659
8248
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7718,7 +8307,7 @@ struct llm_build_qwen3 : public llm_graph_context {
7718
8307
  cb(Kcur, "Kcur", il);
7719
8308
  cb(Vcur, "Vcur", il);
7720
8309
 
7721
- cur = build_attn(inp_attn, gf,
8310
+ cur = build_attn(inp_attn,
7722
8311
  model.layers[il].wo, model.layers[il].bo,
7723
8312
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7724
8313
  }
@@ -7774,7 +8363,7 @@ struct llm_build_qwen3 : public llm_graph_context {
7774
8363
  };
7775
8364
 
7776
8365
  struct llm_build_qwen3moe : public llm_graph_context {
7777
- llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8366
+ llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
7778
8367
  const int64_t n_embd_head = hparams.n_embd_head_v;
7779
8368
 
7780
8369
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7839,7 +8428,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
7839
8428
  cb(Kcur, "Kcur", il);
7840
8429
  cb(Vcur, "Vcur", il);
7841
8430
 
7842
- cur = build_attn(inp_attn, gf,
8431
+ cur = build_attn(inp_attn,
7843
8432
  model.layers[il].wo, model.layers[il].bo,
7844
8433
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7845
8434
  }
@@ -7902,7 +8491,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
7902
8491
  };
7903
8492
 
7904
8493
  struct llm_build_phi2 : public llm_graph_context {
7905
- llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8494
+ llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
7906
8495
  const int64_t n_embd_head = hparams.n_embd_head_v;
7907
8496
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
7908
8497
 
@@ -7979,7 +8568,7 @@ struct llm_build_phi2 : public llm_graph_context {
7979
8568
  // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
7980
8569
  Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
7981
8570
 
7982
- cur = build_attn(inp_attn, gf,
8571
+ cur = build_attn(inp_attn,
7983
8572
  model.layers[il].wo, model.layers[il].bo,
7984
8573
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7985
8574
  }
@@ -8033,7 +8622,7 @@ struct llm_build_phi2 : public llm_graph_context {
8033
8622
 
8034
8623
  template<bool iswa>
8035
8624
  struct llm_build_phi3 : public llm_graph_context {
8036
- llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8625
+ llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8037
8626
  const int64_t n_embd_head = hparams.n_embd_head_v;
8038
8627
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
8039
8628
 
@@ -8116,7 +8705,7 @@ struct llm_build_phi3 : public llm_graph_context {
8116
8705
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
8117
8706
  cb(Qcur, "Qcur", il);
8118
8707
 
8119
- cur = build_attn(inp_attn, gf,
8708
+ cur = build_attn(inp_attn,
8120
8709
  model.layers[il].wo, model.layers[il].bo,
8121
8710
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
8122
8711
  }
@@ -8191,7 +8780,7 @@ struct llm_build_phi3 : public llm_graph_context {
8191
8780
  };
8192
8781
 
8193
8782
  struct llm_build_plamo : public llm_graph_context {
8194
- llm_build_plamo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8783
+ llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8195
8784
  const int64_t n_embd_head = hparams.n_embd_head_v;
8196
8785
 
8197
8786
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8250,7 +8839,7 @@ struct llm_build_plamo : public llm_graph_context {
8250
8839
  cb(Kcur, "Kcur", il);
8251
8840
  cb(Vcur, "Vcur", il);
8252
8841
 
8253
- cur = build_attn(inp_attn, gf,
8842
+ cur = build_attn(inp_attn,
8254
8843
  model.layers[il].wo, NULL,
8255
8844
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8256
8845
  }
@@ -8306,7 +8895,7 @@ struct llm_build_plamo : public llm_graph_context {
8306
8895
  };
8307
8896
 
8308
8897
  struct llm_build_gpt2 : public llm_graph_context {
8309
- llm_build_gpt2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8898
+ llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8310
8899
  const int64_t n_embd_head = hparams.n_embd_head_v;
8311
8900
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
8312
8901
 
@@ -8358,7 +8947,7 @@ struct llm_build_gpt2 : public llm_graph_context {
8358
8947
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8359
8948
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8360
8949
 
8361
- cur = build_attn(inp_attn, gf,
8950
+ cur = build_attn(inp_attn,
8362
8951
  model.layers[il].wo, model.layers[il].bo,
8363
8952
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8364
8953
  }
@@ -8416,7 +9005,7 @@ struct llm_build_gpt2 : public llm_graph_context {
8416
9005
  };
8417
9006
 
8418
9007
  struct llm_build_codeshell : public llm_graph_context {
8419
- llm_build_codeshell(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
9008
+ llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8420
9009
  const int64_t n_embd_head = hparams.n_embd_head_v;
8421
9010
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
8422
9011
 
@@ -8472,7 +9061,7 @@ struct llm_build_codeshell : public llm_graph_context {
8472
9061
  cb(Kcur, "Kcur", il);
8473
9062
  cb(Vcur, "Vcur", il);
8474
9063
 
8475
- cur = build_attn(inp_attn, gf,
9064
+ cur = build_attn(inp_attn,
8476
9065
  model.layers[il].wo, model.layers[il].bo,
8477
9066
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8478
9067
  }
@@ -8530,7 +9119,7 @@ struct llm_build_codeshell : public llm_graph_context {
8530
9119
  };
8531
9120
 
8532
9121
  struct llm_build_orion : public llm_graph_context {
8533
- llm_build_orion(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
9122
+ llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8534
9123
  const int64_t n_embd_head = hparams.n_embd_head_v;
8535
9124
 
8536
9125
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8601,7 +9190,7 @@ struct llm_build_orion : public llm_graph_context {
8601
9190
  cb(Kcur, "Kcur", il);
8602
9191
  cb(Vcur, "Vcur", il);
8603
9192
 
8604
- cur = build_attn(inp_attn, gf,
9193
+ cur = build_attn(inp_attn,
8605
9194
  model.layers[il].wo, NULL,
8606
9195
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8607
9196
  }
@@ -8657,7 +9246,7 @@ struct llm_build_orion : public llm_graph_context {
8657
9246
  };
8658
9247
 
8659
9248
  struct llm_build_internlm2 : public llm_graph_context {
8660
- llm_build_internlm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
9249
+ llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8661
9250
  const int64_t n_embd_head = hparams.n_embd_head_v;
8662
9251
 
8663
9252
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8728,7 +9317,7 @@ struct llm_build_internlm2 : public llm_graph_context {
8728
9317
  cb(Kcur, "Kcur", il);
8729
9318
  cb(Vcur, "Vcur", il);
8730
9319
 
8731
- cur = build_attn(inp_attn, gf,
9320
+ cur = build_attn(inp_attn,
8732
9321
  model.layers[il].wo, model.layers[il].bo,
8733
9322
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8734
9323
  }
@@ -8784,7 +9373,7 @@ struct llm_build_internlm2 : public llm_graph_context {
8784
9373
  };
8785
9374
 
8786
9375
  struct llm_build_minicpm3 : public llm_graph_context {
8787
- llm_build_minicpm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
9376
+ llm_build_minicpm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8788
9377
  //TODO: if the model varies, these parameters need to be read from the model
8789
9378
  const int64_t n_embd_base = 256;
8790
9379
  const float scale_embd = 12.0f;
@@ -8916,7 +9505,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
8916
9505
  ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
8917
9506
  cb(k_states, "k_states", il);
8918
9507
 
8919
- cur = build_attn(inp_attn, gf,
9508
+ cur = build_attn(inp_attn,
8920
9509
  model.layers[il].wo, NULL,
8921
9510
  q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
8922
9511
  }
@@ -8988,7 +9577,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
8988
9577
  };
8989
9578
 
8990
9579
  struct llm_build_gemma : public llm_graph_context {
8991
- llm_build_gemma(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
9580
+ llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8992
9581
  const int64_t n_embd_head = hparams.n_embd_head_v;
8993
9582
 
8994
9583
  ggml_tensor * cur;
@@ -9046,7 +9635,7 @@ struct llm_build_gemma : public llm_graph_context {
9046
9635
  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
9047
9636
  cb(Qcur, "Qcur_scaled", il);
9048
9637
 
9049
- cur = build_attn(inp_attn, gf,
9638
+ cur = build_attn(inp_attn,
9050
9639
  model.layers[il].wo, NULL,
9051
9640
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
9052
9641
  }
@@ -9104,7 +9693,7 @@ struct llm_build_gemma : public llm_graph_context {
9104
9693
  };
9105
9694
 
9106
9695
  struct llm_build_gemma2_iswa : public llm_graph_context {
9107
- llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
9696
+ llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
9108
9697
  const int64_t n_embd_head = hparams.n_embd_head_k;
9109
9698
 
9110
9699
  ggml_tensor * cur;
@@ -9161,7 +9750,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
9161
9750
 
9162
9751
  Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
9163
9752
 
9164
- cur = build_attn(inp_attn, gf,
9753
+ cur = build_attn(inp_attn,
9165
9754
  model.layers[il].wo, NULL,
9166
9755
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
9167
9756
  }
@@ -9234,7 +9823,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
9234
9823
  };
9235
9824
 
9236
9825
  struct llm_build_gemma3_iswa : public llm_graph_context {
9237
- llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
9826
+ llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
9238
9827
  const int64_t n_embd_head = hparams.n_embd_head_k;
9239
9828
 
9240
9829
  ggml_tensor * cur;
@@ -9303,7 +9892,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
9303
9892
  // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
9304
9893
  Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
9305
9894
 
9306
- cur = build_attn(inp_attn, gf,
9895
+ cur = build_attn(inp_attn,
9307
9896
  model.layers[il].wo, NULL,
9308
9897
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
9309
9898
  }
@@ -9372,7 +9961,6 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
9372
9961
 
9373
9962
  struct llm_build_gemma3n_iswa : public llm_graph_context {
9374
9963
  const llama_model & model;
9375
- ggml_cgraph * gf;
9376
9964
 
9377
9965
  const int64_t n_embd_head;
9378
9966
  const int64_t n_embd_altup;
@@ -9382,12 +9970,9 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
9382
9970
  const int n_layer_sparsity = 10; // number of layers using activation sparsity
9383
9971
  const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
9384
9972
 
9385
- ggml_tensor * one; // containing single element 1.0f
9386
-
9387
- llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
9973
+ llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params)
9388
9974
  : llm_graph_context(params),
9389
9975
  model(model),
9390
- gf(gf),
9391
9976
  n_embd_head(model.hparams.n_embd_head_k),
9392
9977
  n_embd_altup(model.hparams.n_embd_altup),
9393
9978
  n_altup(model.hparams.n_altup),
@@ -9395,14 +9980,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
9395
9980
  ggml_tensor * cur;
9396
9981
  ggml_tensor * inpL;
9397
9982
 
9398
- // TODO: remove this when ggml_scale_add is implemented
9399
- one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
9400
- {
9401
- auto inp = std::make_unique<llm_graph_input_one>();
9402
- inp->one = one;
9403
- res->add_input(std::move(inp));
9404
- }
9405
-
9406
9983
  inpL = build_inp_embd(model.tok_embd);
9407
9984
 
9408
9985
  // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
@@ -9496,7 +10073,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
9496
10073
  cb(Qcur, "Qcur_pos", il);
9497
10074
  cb(Kcur, "Kcur_pos", il);
9498
10075
 
9499
- cur = build_attn(inp_attn, gf,
10076
+ cur = build_attn(inp_attn,
9500
10077
  model.layers[il].wo, NULL,
9501
10078
  Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
9502
10079
  } else {
@@ -9514,7 +10091,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
9514
10091
  ext_factor, attn_factor, beta_fast, beta_slow);
9515
10092
  cb(Qcur, "Qcur_pos", il);
9516
10093
 
9517
- cur = build_attn(inp_attn, gf,
10094
+ cur = build_attn(inp_attn,
9518
10095
  model.layers[il].wo, NULL,
9519
10096
  Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
9520
10097
  }
@@ -9792,7 +10369,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
9792
10369
  cb(innovation, "innovation", il);
9793
10370
 
9794
10371
  ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
9795
- all_coefs = ggml_add(ctx0, all_coefs, one);
10372
+ all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
9796
10373
  cb(all_coefs, "all_coefs", il);
9797
10374
  all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
9798
10375
  all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
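
Editor's note: the hunk above swaps the auxiliary one tensor (and its llm_graph_input_one plumbing, removed earlier in this file) for a single ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f) call. A hedged sketch of the element-wise semantics y = s*x + b this appears to rely on, per the "+ 1.0" comment; plain C++, not the ggml API.

    #include <cstdio>
    #include <vector>

    static std::vector<float> scale_bias(const std::vector<float> & x, float s, float b) {
        std::vector<float> y(x.size());
        for (size_t i = 0; i < x.size(); ++i) y[i] = s * x[i] + b;
        return y;
    }

    int main() {
        const std::vector<float> coefs = {-0.5f, 0.0f, 0.25f};
        for (float v : scale_bias(coefs, 1.0f, 1.0f)) std::printf("%.2f\n", v); // 0.50 1.00 1.25
        return 0;
    }
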
@@ -9808,7 +10385,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
9808
10385
 
9809
10386
  // TODO: move up next to build_starcoder
9810
10387
  struct llm_build_starcoder2 : public llm_graph_context {
9811
- llm_build_starcoder2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
10388
+ llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
9812
10389
  const int64_t n_embd_head = hparams.n_embd_head_v;
9813
10390
 
9814
10391
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9879,7 +10456,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
9879
10456
  cb(Kcur, "Kcur", il);
9880
10457
  cb(Vcur, "Vcur", il);
9881
10458
 
9882
- cur = build_attn(inp_attn, gf,
10459
+ cur = build_attn(inp_attn,
9883
10460
  model.layers[il].wo, model.layers[il].bo,
9884
10461
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9885
10462
  }
@@ -9935,74 +10512,22 @@ struct llm_build_starcoder2 : public llm_graph_context {
9935
10512
  }
9936
10513
  };
9937
10514
 
9938
- struct llm_build_mamba : public llm_graph_context {
9939
- llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
9940
- ggml_tensor * cur;
9941
- ggml_tensor * inpL;
9942
-
9943
- // {n_embd, n_tokens}
9944
- inpL = build_inp_embd(model.tok_embd);
9945
-
9946
- auto * rs_inp = build_rs_inp();
9947
-
9948
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9949
-
9950
- for (int il = 0; il < n_layer; ++il) {
9951
- // norm
9952
- cur = build_norm(inpL,
9953
- model.layers[il].attn_norm, NULL,
9954
- LLM_NORM_RMS, il);
9955
- cb(cur, "attn_norm", il);
9956
-
9957
- if (model.arch == LLM_ARCH_MAMBA2) {
9958
- cur = build_mamba2_layer(rs_inp, gf, cur, model, ubatch, il);
9959
- } else {
9960
- cur = build_mamba_layer(rs_inp, gf, cur, model, ubatch, il);
9961
- }
9962
-
9963
- if (il == n_layer - 1 && inp_out_ids) {
9964
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9965
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9966
- }
9967
-
9968
- // residual
9969
- cur = ggml_add(ctx0, cur, inpL);
9970
-
9971
- cur = build_cvec(cur, il);
9972
- cb(cur, "l_out", il);
9973
-
9974
- // input for next layer
9975
- inpL = cur;
9976
- }
9977
-
9978
- // final rmsnorm
9979
- cur = build_norm(inpL,
9980
- model.output_norm, NULL,
9981
- LLM_NORM_RMS, -1);
9982
-
9983
- cb(cur, "result_norm", -1);
9984
- res->t_embd = cur;
9985
-
9986
- // lm_head
9987
- cur = build_lora_mm(model.output, cur);
9988
-
9989
- cb(cur, "result_output", -1);
9990
- res->t_logits = cur;
9991
-
9992
- ggml_build_forward_expand(gf, cur);
9993
- }
10515
+ struct llm_graph_context_mamba : public llm_graph_context {
10516
+ llm_graph_context_mamba(const llm_graph_params & params) : llm_graph_context(params) {}
9994
10517
 
9995
10518
  ggml_tensor * build_mamba_layer(
9996
10519
  llm_graph_input_rs * inp,
9997
- ggml_cgraph * gf,
9998
10520
  ggml_tensor * cur,
9999
10521
  const llama_model & model,
10000
10522
  const llama_ubatch & ubatch,
10001
- int il) const {
10002
- const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
10523
+ int il) {
10524
+
10525
+ const auto * mctx_cur = inp->mctx;
10003
10526
 
10004
10527
  const auto kv_head = mctx_cur->get_head();
10005
10528
 
10529
+ const auto & layer = model.layers[il];
10530
+
10006
10531
  const int64_t d_conv = hparams.ssm_d_conv;
10007
10532
  const int64_t d_inner = hparams.ssm_d_inner;
10008
10533
  const int64_t d_state = hparams.ssm_d_state;
@@ -10012,26 +10537,24 @@ struct llm_build_mamba : public llm_graph_context {
10012
10537
  const int64_t n_seqs = ubatch.n_seqs;
10013
10538
  // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
10014
10539
  const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
10015
- // Use the same RMS norm as the final layer norm
10016
- const float norm_rms_eps = hparams.f_norm_rms_eps;
10017
10540
 
10018
10541
  const int64_t n_seq_tokens = ubatch.n_seq_tokens;
10019
10542
 
10020
10543
  GGML_ASSERT(n_seqs != 0);
10021
- GGML_ASSERT(ubatch.equal_seqs);
10544
+ GGML_ASSERT(ubatch.equal_seqs());
10022
10545
  GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
10023
10546
 
10024
10547
  ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
10025
10548
  ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
10026
10549
 
10027
- ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
10550
+ ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
10028
10551
  conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
10029
10552
 
10030
10553
  // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
10031
10554
  cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
10032
10555
 
10033
10556
  // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
10034
- ggml_tensor * xz = build_lora_mm(model.layers[il].ssm_in, cur);
10557
+ ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
10035
10558
  // split the above in two
10036
10559
  // => {d_inner, n_seq_tokens, n_seqs}
10037
10560
  ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
@@ -10060,10 +10583,10 @@ struct llm_build_mamba : public llm_graph_context {
10060
10583
  // then permute away the ne[0] dimension,
10061
10584
  // and then you're left with the resulting x tensor.
10062
10585
  // For simultaneous sequences, all sequences need to have the same length.
10063
- x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
10586
+ x = ggml_ssm_conv(ctx0, conv_x, layer.ssm_conv1d);
10064
10587
 
10065
10588
  // bias
10066
- x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b);
10589
+ x = ggml_add(ctx0, x, layer.ssm_conv1d_b);
10067
10590
 
10068
10591
  x = ggml_silu(ctx0, x);
10069
10592
  }
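
Editor's note: the conv block that ends above feeds ggml_ssm_conv with the concatenation of the stored conv state and the new inputs. A hedged sketch of the per-channel causal convolution this is understood to implement, with the last d_conv-1 inputs carried as rolling state; illustrative C++ only, not the ggml operator.

    #include <cstdio>
    #include <vector>

    static std::vector<float> causal_conv1d(std::vector<float> state,        // d_conv-1 past inputs
                                            const std::vector<float> & x,    // new inputs, one channel
                                            const std::vector<float> & w) {  // d_conv kernel weights
        std::vector<float> out;
        for (float xi : x) {
            state.push_back(xi);                       // slide the window forward
            float acc = 0.0f;
            for (size_t k = 0; k < w.size(); ++k) acc += w[k] * state[state.size() - w.size() + k];
            out.push_back(acc);
            state.erase(state.begin());                // keep only the last d_conv-1 values
        }
        return out;
    }

    int main() {
        const std::vector<float> w     = {0.25f, 0.25f, 0.25f, 0.25f}; // d_conv = 4
        const std::vector<float> state = {0.0f, 0.0f, 0.0f};           // d_conv-1 zeros
        for (float v : causal_conv1d(state, {1.0f, 2.0f, 3.0f}, w)) std::printf("%.3f\n", v);
        return 0;
    }
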
@@ -10071,27 +10594,27 @@ struct llm_build_mamba : public llm_graph_context {
10071
10594
  // ssm
10072
10595
  {
10073
10596
  // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
10074
- ggml_tensor * x_db = build_lora_mm(model.layers[il].ssm_x, x);
10597
+ ggml_tensor * x_db = build_lora_mm(layer.ssm_x, x);
10075
10598
  // split
10076
10599
  ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
10077
10600
  ggml_tensor * B = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
10078
10601
  ggml_tensor * C = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
10079
10602
 
10080
- // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
10081
- if (ssm_dt_b_c_rms) {
10082
- dt = ggml_rms_norm(ctx0, dt, norm_rms_eps);
10083
- B = ggml_rms_norm(ctx0, B, norm_rms_eps);
10084
- C = ggml_rms_norm(ctx0, C, norm_rms_eps);
10603
+ // Some Mamba variants (e.g. FalconMamba, Jamba) apply RMS norm in B, C & Dt layers
10604
+ if (ssm_dt_b_c_rms || (layer.ssm_dt_norm && layer.ssm_b_norm && layer.ssm_c_norm)) {
10605
+ dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il);
10606
+ B = build_norm(B, layer.ssm_b_norm, NULL, LLM_NORM_RMS, il);
10607
+ C = build_norm(C, layer.ssm_c_norm, NULL, LLM_NORM_RMS, il);
10085
10608
  }
10086
10609
 
10087
10610
  // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
10088
- dt = build_lora_mm(model.layers[il].ssm_dt, dt);
10089
- dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
10611
+ dt = build_lora_mm(layer.ssm_dt, dt);
10612
+ dt = ggml_add(ctx0, dt, layer.ssm_dt_b);
10090
10613
 
10091
10614
  cur = x;
10092
10615
  x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
10093
10616
 
10094
- ggml_tensor * A = model.layers[il].ssm_a;
10617
+ ggml_tensor * A = layer.ssm_a;
10095
10618
 
10096
10619
  // use the states and the indices provided by build_recurrent_state
10097
10620
  // (this is necessary in order to properly use the states before they are overwritten,
@@ -10105,7 +10628,7 @@ struct llm_build_mamba : public llm_graph_context {
10105
10628
  return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
10106
10629
  };
10107
10630
 
10108
- ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
10631
+ ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
10109
10632
 
10110
10633
  // store last states
10111
10634
  ggml_build_forward_expand(gf,
@@ -10117,28 +10640,27 @@ struct llm_build_mamba : public llm_graph_context {
10117
10640
 
10118
10641
  // TODO: skip computing output earlier for unused tokens
10119
10642
 
10120
- y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, model.layers[il].ssm_d));
10121
- y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
10643
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d));
10644
+ y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
10122
10645
 
10123
10646
  // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
10124
- cur = build_lora_mm(model.layers[il].ssm_out, y);
10647
+ cur = build_lora_mm(layer.ssm_out, y);
10125
10648
  }
10126
10649
 
10127
10650
  // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
10128
10651
  cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
10129
- // cb(cur, "mamba_out", il);
10130
10652
 
10131
10653
  return cur;
10132
10654
  }
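
Editor's note: inside build_mamba_layer above, the explicit y * silu(z) gating is replaced by ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y), and the same substitution appears again in build_mamba2_layer below. A hedged sketch of the gating the fused call is assumed to compute, with the gate passed first; plain C++, not the ggml API.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    static float silu(float x) { return x / (1.0f + std::exp(-x)); }

    // out[i] = y[i] * silu(gate[i]), the SwiGLU gating spelled out element-wise
    static std::vector<float> swiglu_split(const std::vector<float> & gate,
                                           const std::vector<float> & y) {
        std::vector<float> out(y.size());
        for (size_t i = 0; i < y.size(); ++i) out[i] = y[i] * silu(gate[i]);
        return out;
    }

    int main() {
        const std::vector<float> z = {-1.0f, 0.0f, 2.0f};
        const std::vector<float> y = { 1.0f, 1.0f, 1.0f};
        for (float v : swiglu_split(z, y)) std::printf("%.4f\n", v);
        return 0;
    }
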
10133
10655
 
10134
10656
  ggml_tensor * build_mamba2_layer(
10135
10657
  llm_graph_input_rs * inp,
10136
- ggml_cgraph * gf,
10137
- ggml_tensor * cur,
10138
- const llama_model & model,
10139
- const llama_ubatch & ubatch,
10140
- int il) const {
10141
- const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
10658
+ ggml_tensor * cur,
10659
+ const llama_model & model,
10660
+ const llama_ubatch & ubatch,
10661
+ int il) const {
10662
+
10663
+ const auto * mctx_cur = inp->mctx;
10142
10664
 
10143
10665
  const auto kv_head = mctx_cur->get_head();
10144
10666
 
@@ -10153,13 +10675,13 @@ struct llm_build_mamba : public llm_graph_context {
10153
10675
  const int64_t n_seq_tokens = ubatch.n_seq_tokens;
10154
10676
 
10155
10677
  GGML_ASSERT(n_seqs != 0);
10156
- GGML_ASSERT(ubatch.equal_seqs);
10678
+ GGML_ASSERT(ubatch.equal_seqs());
10157
10679
  GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
10158
10680
 
10159
10681
  ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
10160
10682
  ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
10161
10683
 
10162
- ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
10684
+ ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
10163
10685
  conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
10164
10686
 
10165
10687
  // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
@@ -10229,7 +10751,7 @@ struct llm_build_mamba : public llm_graph_context {
10229
10751
  return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
10230
10752
  };
10231
10753
 
10232
- ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
10754
+ ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
10233
10755
 
10234
10756
  // store last states
10235
10757
  ggml_build_forward_expand(gf,
@@ -10242,11 +10764,14 @@ struct llm_build_mamba : public llm_graph_context {
10242
10764
  // TODO: skip computing output earlier for unused tokens
10243
10765
 
10244
10766
  y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
10245
- y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
10767
+ y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
10246
10768
 
10247
10769
  // grouped RMS norm
10248
- y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
10249
- y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
10770
+ if (model.layers[il].ssm_norm) {
10771
+ y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
10772
+ y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
10773
+ }
10774
+
10250
10775
  y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
10251
10776
 
10252
10777
  // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
@@ -10261,23 +10786,15 @@ struct llm_build_mamba : public llm_graph_context {
10261
10786
  }
10262
10787
  };
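
Editor's note: the struct closed above is part of a refactor in which the Mamba layer builders move out of llm_build_mamba into a shared llm_graph_context_mamba base, so that the new llm_build_jamba below can reuse build_mamba_layer for its recurrent layers. A minimal hedged sketch of that inheritance shape; the class names are stand-ins for the real ones.

    #include <cstdio>

    // Shared base holding the layer builders, mirroring llm_graph_context_mamba.
    struct ContextMamba {
        void build_mamba_layer(int il) const { std::printf("mamba layer %d\n", il); }
    };

    // Pure-Mamba graph: every layer is a recurrent Mamba layer.
    struct BuildMamba : ContextMamba {
        void build() const {
            for (int il = 0; il < 2; ++il) build_mamba_layer(il);
        }
    };

    // Hybrid graph: reuses the same layer builder for its recurrent layers only.
    struct BuildJamba : ContextMamba {
        void build() const {
            build_mamba_layer(0);
            std::printf("attention layer 1\n");
        }
    };

    int main() {
        BuildMamba{}.build();
        BuildJamba{}.build();
        return 0;
    }
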
10263
10788
 
10264
- struct llm_build_command_r : public llm_graph_context {
10265
- llm_build_command_r(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
10266
- const int64_t n_embd_head = hparams.n_embd_head_v;
10267
-
10268
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10269
-
10270
- const float f_logit_scale = hparams.f_logit_scale;
10271
-
10789
+ struct llm_build_mamba : public llm_graph_context_mamba {
10790
+ llm_build_mamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
10272
10791
  ggml_tensor * cur;
10273
10792
  ggml_tensor * inpL;
10274
10793
 
10794
+ // {n_embd, n_tokens}
10275
10795
  inpL = build_inp_embd(model.tok_embd);
10276
10796
 
10277
- // inp_pos - contains the positions
10278
- ggml_tensor * inp_pos = build_inp_pos();
10279
-
10280
- auto * inp_attn = build_attn_inp_kv_unified();
10797
+ auto * rs_inp = build_rs_inp();
10281
10798
 
10282
10799
  ggml_tensor * inp_out_ids = build_inp_out_ids();
10283
10800
 
@@ -10285,20 +10802,194 @@ struct llm_build_command_r : public llm_graph_context {
10285
10802
  // norm
10286
10803
  cur = build_norm(inpL,
10287
10804
  model.layers[il].attn_norm, NULL,
10288
- LLM_NORM, il);
10805
+ LLM_NORM_RMS, il);
10289
10806
  cb(cur, "attn_norm", il);
10290
10807
 
10291
- ggml_tensor * ffn_inp = cur;
10808
+ if (model.arch == LLM_ARCH_MAMBA2) {
10809
+ cur = build_mamba2_layer(rs_inp, cur, model, ubatch, il);
10810
+ } else {
10811
+ cur = build_mamba_layer(rs_inp, cur, model, ubatch, il);
10812
+ }
10292
10813
 
10293
- // self-attention
10294
- {
10295
- // compute Q and K and RoPE them
10296
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
10297
- cb(Qcur, "Qcur", il);
10298
- if (model.layers[il].bq) {
10299
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
10300
- cb(Qcur, "Qcur", il);
10301
- }
10814
+ if (il == n_layer - 1 && inp_out_ids) {
10815
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10816
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
10817
+ }
10818
+
10819
+ // residual
10820
+ cur = ggml_add(ctx0, cur, inpL);
10821
+
10822
+ cur = build_cvec(cur, il);
10823
+ cb(cur, "l_out", il);
10824
+
10825
+ // input for next layer
10826
+ inpL = cur;
10827
+ }
10828
+
10829
+ // final rmsnorm
10830
+ cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
10831
+
10832
+ cb(cur, "result_norm", -1);
10833
+ res->t_embd = cur;
10834
+
10835
+ // lm_head
10836
+ cur = build_lora_mm(model.output, cur);
10837
+
10838
+ cb(cur, "result_output", -1);
10839
+ res->t_logits = cur;
10840
+
10841
+ ggml_build_forward_expand(gf, cur);
10842
+ }
10843
+
10844
+ };
10845
+
10846
+ struct llm_build_jamba : public llm_graph_context_mamba {
10847
+ llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
10848
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10849
+
10850
+ ggml_tensor * cur;
10851
+ ggml_tensor * inpL;
10852
+
10853
+ // {n_embd, n_tokens}
10854
+ inpL = build_inp_embd(model.tok_embd);
10855
+
10856
+ auto * inp_hybrid = build_inp_mem_hybrid();
10857
+
10858
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10859
+
10860
+ for (int il = 0; il < n_layer; ++il) {
10861
+ const int64_t n_head_kv = hparams.n_head_kv(il);
10862
+
10863
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
10864
+ cb(cur, "attn_norm", il);
10865
+
10866
+ if (n_head_kv == 0) {
10867
+ cur = build_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
10868
+ } else {
10869
+ // Attention
10870
+
10871
+ struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
10872
+ struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
10873
+ struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
10874
+
10875
+ cb(Qcur, "Qcur", il);
10876
+ cb(Kcur, "Kcur", il);
10877
+ cb(Vcur, "Vcur", il);
10878
+
10879
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
10880
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
10881
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
10882
+
10883
+ cb(Qcur, "Qcur", il);
10884
+ cb(Kcur, "Kcur", il);
10885
+ cb(Vcur, "Vcur", il);
10886
+
10887
+ // No RoPE :)
10888
+ cur = build_attn(inp_hybrid->get_attn(), model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
10889
+ }
10890
+
10891
+ if (il == n_layer - 1 && inp_out_ids) {
10892
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10893
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
10894
+ }
10895
+
10896
+ // residual
10897
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, inpL, cur);
10898
+ cb(cur, "ffn_inp", il);
10899
+
10900
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
10901
+ cb(cur, "ffn_norm", il);
10902
+
10903
+ // feed-forward network
10904
+ if (model.layers[il].ffn_gate_inp == nullptr) {
10905
+ // FFN
10906
+ cur = build_ffn(cur,
10907
+ model.layers[il].ffn_up, NULL, NULL,
10908
+ model.layers[il].ffn_gate, NULL, NULL,
10909
+ model.layers[il].ffn_down, NULL, NULL,
10910
+ NULL,
10911
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
10912
+ cb(cur, "ffn_out", il);
10913
+ } else {
10914
+ // MoE branch
10915
+ cur = build_moe_ffn(cur,
10916
+ model.layers[il].ffn_gate_inp,
10917
+ model.layers[il].ffn_up_exps,
10918
+ model.layers[il].ffn_gate_exps,
10919
+ model.layers[il].ffn_down_exps,
10920
+ nullptr,
10921
+ n_expert, n_expert_used,
10922
+ LLM_FFN_SILU, false,
10923
+ false, 0.0,
10924
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
10925
+ il);
10926
+ cb(cur, "ffn_moe_out", il);
10927
+ }
10928
+
10929
+ // residual
10930
+ cur = ggml_add(ctx0, ffn_inp, cur);
10931
+
10932
+ cur = build_cvec(cur, il);
10933
+ cb(cur, "l_out", il);
10934
+
10935
+ // input for next layer
10936
+ inpL = cur;
10937
+ }
10938
+
10939
+ // final rmsnorm
10940
+ cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
10941
+
10942
+ cb(cur, "result_norm", -1);
10943
+ res->t_embd = cur;
10944
+
10945
+ // lm_head
10946
+ cur = build_lora_mm(model.output, cur);
10947
+
10948
+ cb(cur, "result_output", -1);
10949
+ res->t_logits = cur;
10950
+
10951
+ ggml_build_forward_expand(gf, cur);
10952
+ }
10953
+ };
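
Editor's note: llm_build_jamba, closed above, picks a dense FFN or a MoE FFN per layer by checking whether the layer carries an expert router tensor (ffn_gate_inp). A hedged sketch of that selection; the Layer struct and the counts below are illustrative, not the llama.cpp types.

    #include <cstdio>

    struct Layer { const void * ffn_gate_inp; };  // stand-in for the router weight tensor

    static void build_ffn_for(const Layer & layer, int il, int n_expert, int n_expert_used) {
        if (layer.ffn_gate_inp == nullptr) {
            std::printf("layer %d: dense FFN\n", il);
        } else {
            std::printf("layer %d: MoE FFN (%d of %d experts)\n", il, n_expert_used, n_expert);
        }
    }

    int main() {
        const Layer dense = {nullptr};
        const int router_marker = 1;
        const Layer moe = {&router_marker};
        build_ffn_for(dense, 0, 16, 2);
        build_ffn_for(moe,   1, 16, 2);
        return 0;
    }
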
10954
+
10955
+ struct llm_build_command_r : public llm_graph_context {
10956
+ llm_build_command_r(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
10957
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10958
+
10959
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
10960
+
10961
+ const float f_logit_scale = hparams.f_logit_scale;
10962
+
10963
+ ggml_tensor * cur;
10964
+ ggml_tensor * inpL;
10965
+
10966
+ inpL = build_inp_embd(model.tok_embd);
10967
+
10968
+ // inp_pos - contains the positions
10969
+ ggml_tensor * inp_pos = build_inp_pos();
10970
+
10971
+ auto * inp_attn = build_attn_inp_kv_unified();
10972
+
10973
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10974
+
10975
+ for (int il = 0; il < n_layer; ++il) {
10976
+ // norm
10977
+ cur = build_norm(inpL,
10978
+ model.layers[il].attn_norm, NULL,
10979
+ LLM_NORM, il);
10980
+ cb(cur, "attn_norm", il);
10981
+
10982
+ ggml_tensor * ffn_inp = cur;
10983
+
10984
+ // self-attention
10985
+ {
10986
+ // compute Q and K and RoPE them
10987
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
10988
+ cb(Qcur, "Qcur", il);
10989
+ if (model.layers[il].bq) {
10990
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
10991
+ cb(Qcur, "Qcur", il);
10992
+ }
10302
10993
 
10303
10994
  ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
10304
10995
  cb(Kcur, "Kcur", il);
@@ -10350,7 +11041,7 @@ struct llm_build_command_r : public llm_graph_context {
10350
11041
  cb(Kcur, "Kcur", il);
10351
11042
  cb(Vcur, "Vcur", il);
10352
11043
 
10353
- cur = build_attn(inp_attn, gf,
11044
+ cur = build_attn(inp_attn,
10354
11045
  model.layers[il].wo, model.layers[il].bo,
10355
11046
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10356
11047
  }
@@ -10409,7 +11100,7 @@ struct llm_build_command_r : public llm_graph_context {
10409
11100
  };
10410
11101
 
10411
11102
  struct llm_build_cohere2_iswa : public llm_graph_context {
10412
- llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
11103
+ llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
10413
11104
  const int64_t n_embd_head = hparams.n_embd_head_v;
10414
11105
 
10415
11106
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -10485,7 +11176,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
10485
11176
  cb(Kcur, "Kcur", il);
10486
11177
  cb(Vcur, "Vcur", il);
10487
11178
 
10488
- cur = build_attn(inp_attn, gf,
11179
+ cur = build_attn(inp_attn,
10489
11180
  model.layers[il].wo, model.layers[il].bo,
10490
11181
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10491
11182
  }
@@ -10545,7 +11236,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
10545
11236
  // * removed bias
10546
11237
  // * removed MoE
10547
11238
  struct llm_build_olmo : public llm_graph_context {
10548
- llm_build_olmo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
11239
+ llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
10549
11240
  const int64_t n_embd_head = hparams.n_embd_head_v;
10550
11241
 
10551
11242
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -10616,7 +11307,7 @@ struct llm_build_olmo : public llm_graph_context {
10616
11307
  cb(Kcur, "Kcur", il);
10617
11308
  cb(Vcur, "Vcur", il);
10618
11309
 
10619
- cur = build_attn(inp_attn, gf,
11310
+ cur = build_attn(inp_attn,
10620
11311
  model.layers[il].wo, nullptr,
10621
11312
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10622
11313
  }
@@ -10673,7 +11364,7 @@ struct llm_build_olmo : public llm_graph_context {
10673
11364
  };
10674
11365
 
10675
11366
  struct llm_build_olmo2 : public llm_graph_context {
10676
- llm_build_olmo2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
11367
+ llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
10677
11368
  const int64_t n_embd_head = hparams.n_embd_head_v;
10678
11369
 
10679
11370
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -10736,7 +11427,7 @@ struct llm_build_olmo2 : public llm_graph_context {
10736
11427
  cb(Kcur, "Kcur", il);
10737
11428
  cb(Vcur, "Vcur", il);
10738
11429
 
10739
- cur = build_attn(inp_attn, gf,
11430
+ cur = build_attn(inp_attn,
10740
11431
  model.layers[il].wo, NULL,
10741
11432
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10742
11433
  }
@@ -10802,7 +11493,7 @@ struct llm_build_olmo2 : public llm_graph_context {
10802
11493
  // * removed bias
10803
11494
  // * added q, k norm
10804
11495
  struct llm_build_olmoe : public llm_graph_context {
10805
- llm_build_olmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
11496
+ llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
10806
11497
  const int64_t n_embd_head = hparams.n_embd_head_v;
10807
11498
 
10808
11499
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -10869,7 +11560,7 @@ struct llm_build_olmoe : public llm_graph_context {
10869
11560
  cb(Kcur, "Kcur", il);
10870
11561
  cb(Vcur, "Vcur", il);
10871
11562
 
10872
- cur = build_attn(inp_attn, gf,
11563
+ cur = build_attn(inp_attn,
10873
11564
  model.layers[il].wo, NULL,
10874
11565
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10875
11566
  }
@@ -10930,7 +11621,7 @@ struct llm_build_olmoe : public llm_graph_context {
10930
11621
  };
10931
11622
 
10932
11623
  struct llm_build_openelm : public llm_graph_context {
10933
- llm_build_openelm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
11624
+ llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
10934
11625
  const int64_t n_embd_head = hparams.n_embd_head_v;
10935
11626
 
10936
11627
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11002,7 +11693,7 @@ struct llm_build_openelm : public llm_graph_context {
11002
11693
  cb(Kcur, "Kcur", il);
11003
11694
  cb(Qcur, "Vcur", il);
11004
11695
 
11005
- cur = build_attn(inp_attn, gf,
11696
+ cur = build_attn(inp_attn,
11006
11697
  model.layers[il].wo, NULL,
11007
11698
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11008
11699
  }
@@ -11059,7 +11750,7 @@ struct llm_build_openelm : public llm_graph_context {
11059
11750
  };
11060
11751
 
11061
11752
  struct llm_build_gptneox : public llm_graph_context {
11062
- llm_build_gptneox(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
11753
+ llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
11063
11754
  const int64_t n_embd_head = hparams.n_embd_head_v;
11064
11755
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
11065
11756
 
@@ -11114,7 +11805,7 @@ struct llm_build_gptneox : public llm_graph_context {
11114
11805
  cb(Kcur, "Kcur", il);
11115
11806
  cb(Vcur, "Vcur", il);
11116
11807
 
11117
- cur = build_attn(inp_attn, gf,
11808
+ cur = build_attn(inp_attn,
11118
11809
  model.layers[il].wo, model.layers[il].bo,
11119
11810
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11120
11811
  }
@@ -11205,7 +11896,7 @@ struct llm_build_gptneox : public llm_graph_context {
11205
11896
  };
11206
11897
 
11207
11898
  struct llm_build_arctic : public llm_graph_context {
11208
- llm_build_arctic(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
11899
+ llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
11209
11900
  const int64_t n_embd_head = hparams.n_embd_head_v;
11210
11901
 
11211
11902
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11264,7 +11955,7 @@ struct llm_build_arctic : public llm_graph_context {
11264
11955
  cb(Kcur, "Kcur", il);
11265
11956
  cb(Vcur, "Vcur", il);
11266
11957
 
11267
- cur = build_attn(inp_attn, gf,
11958
+ cur = build_attn(inp_attn,
11268
11959
  model.layers[il].wo, NULL,
11269
11960
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11270
11961
  }
@@ -11343,7 +12034,7 @@ struct llm_build_arctic : public llm_graph_context {
11343
12034
  };
11344
12035
 
11345
12036
  struct llm_build_deepseek : public llm_graph_context {
11346
- llm_build_deepseek(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
12037
+ llm_build_deepseek(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
11347
12038
  const int64_t n_embd_head = hparams.n_embd_head_v;
11348
12039
 
11349
12040
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11419,7 +12110,7 @@ struct llm_build_deepseek : public llm_graph_context {
11419
12110
  cb(Kcur, "Kcur", il);
11420
12111
  cb(Vcur, "Vcur", il);
11421
12112
 
11422
- cur = build_attn(inp_attn, gf,
12113
+ cur = build_attn(inp_attn,
11423
12114
  model.layers[il].wo, model.layers[il].bo,
11424
12115
  Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
11425
12116
  }
@@ -11505,7 +12196,7 @@ struct llm_build_deepseek : public llm_graph_context {
11505
12196
  };
11506
12197
 
11507
12198
  struct llm_build_deepseek2 : public llm_graph_context {
11508
- llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
12199
+ llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
11509
12200
  bool is_lite = (hparams.n_layer == 27);
11510
12201
 
11511
12202
  const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
@@ -11647,7 +12338,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
11647
12338
  cb(Vcur, "Vcur", il);
11648
12339
 
11649
12340
  // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
11650
- cur = build_attn(inp_attn, gf,
12341
+ cur = build_attn(inp_attn,
11651
12342
  model.layers[il].wo, NULL,
11652
12343
  Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
11653
12344
  } else {
@@ -11681,7 +12372,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
11681
12372
  cb(Kcur, "Kcur", il);
11682
12373
 
11683
12374
  // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
11684
- cur = build_attn(inp_attn, gf,
12375
+ cur = build_attn(inp_attn,
11685
12376
  model.layers[il].wo, NULL,
11686
12377
  Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
11687
12378
  }
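
Editor's note: the two MLA branches above are annotated as converting into MQA ("GQA with 1 group") with the absorption optimization and into MHA ("GQA with full n_head groups") without it. A hedged sketch of that grouping terminology only, showing how many query heads share each KV head; illustrative C++.

    #include <cstdio>

    static void describe(int n_head, int n_kv_groups) {
        std::printf("%2d KV group(s): %2d query head(s) share each KV head%s\n",
                    n_kv_groups, n_head / n_kv_groups,
                    n_kv_groups == 1 ? " (MQA)" : (n_kv_groups == n_head ? " (MHA)" : " (GQA)"));
    }

    int main() {
        const int n_head = 32;
        describe(n_head, 1);       // MQA: one shared KV head
        describe(n_head, 8);       // GQA: 4 query heads per KV head
        describe(n_head, n_head);  // MHA: one KV head per query head
        return 0;
    }
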
@@ -11768,7 +12459,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
11768
12459
  };
11769
12460
 
11770
12461
  struct llm_build_bitnet : public llm_graph_context {
11771
- llm_build_bitnet(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
12462
+ llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
11772
12463
  const int64_t n_embd_head = hparams.n_embd_head_v;
11773
12464
 
11774
12465
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11848,7 +12539,7 @@ struct llm_build_bitnet : public llm_graph_context {
11848
12539
  cb(Kcur, "Kcur", il);
11849
12540
  cb(Vcur, "Vcur", il);
11850
12541
 
11851
- cur = build_attn(inp_attn, gf,
12542
+ cur = build_attn(inp_attn,
11852
12543
  NULL, NULL,
11853
12544
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11854
12545
 
@@ -11928,7 +12619,7 @@ struct llm_build_bitnet : public llm_graph_context {
11928
12619
  };
11929
12620
 
11930
12621
  struct llm_build_t5_enc : public llm_graph_context {
11931
- llm_build_t5_enc(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
12622
+ llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
11932
12623
  const int64_t n_embd_head = hparams.n_embd_head_v;
11933
12624
 
11934
12625
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11971,7 +12662,7 @@ struct llm_build_t5_enc : public llm_graph_context {
11971
12662
  ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
11972
12663
  ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b);
11973
12664
 
11974
- cur = build_attn(inp_attn, gf,
12665
+ cur = build_attn(inp_attn,
11975
12666
  model.layers[il].wo_enc, nullptr,
11976
12667
  Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
11977
12668
  cb(cur, "kqv_out", il);
@@ -12029,7 +12720,7 @@ struct llm_build_t5_enc : public llm_graph_context {
12029
12720
  };
12030
12721
 
12031
12722
  struct llm_build_t5_dec : public llm_graph_context {
12032
- llm_build_t5_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
12723
+ llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
12033
12724
  const int64_t n_embd_head = hparams.n_embd_head_v;
12034
12725
  //const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
12035
12726
 
@@ -12077,7 +12768,7 @@ struct llm_build_t5_dec : public llm_graph_context {
12077
12768
  ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
12078
12769
  ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
12079
12770
 
12080
- cur = build_attn(inp_attn_self, gf,
12771
+ cur = build_attn(inp_attn_self,
12081
12772
  model.layers[il].wo, model.layers[il].bo,
12082
12773
  Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
12083
12774
  cb(cur, "kqv_out", il);
@@ -12109,7 +12800,7 @@ struct llm_build_t5_dec : public llm_graph_context {
12109
12800
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
12110
12801
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc);
12111
12802
 
12112
- cur = build_attn(inp_attn_cross, gf,
12803
+ cur = build_attn(inp_attn_cross,
12113
12804
  model.layers[il].wo_cross, nullptr,
12114
12805
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
12115
12806
  cb(cur, "kqv_out", il);
@@ -12199,7 +12890,7 @@ struct llm_build_t5_dec : public llm_graph_context {
12199
12890
  };
12200
12891
 
12201
12892
  struct llm_build_jais : public llm_graph_context {
12202
- llm_build_jais(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
12893
+ llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
12203
12894
  const int64_t n_embd_head = hparams.n_embd_head_v;
12204
12895
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
12205
12896
 
@@ -12241,7 +12932,7 @@ struct llm_build_jais : public llm_graph_context {
12241
12932
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
12242
12933
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
12243
12934
 
12244
- cur = build_attn(inp_attn, gf,
12935
+ cur = build_attn(inp_attn,
12245
12936
  model.layers[il].wo, model.layers[il].bo,
12246
12937
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
12247
12938
  }
@@ -12294,7 +12985,7 @@ struct llm_build_jais : public llm_graph_context {
12294
12985
  };
12295
12986
 
12296
12987
  struct llm_build_chatglm : public llm_graph_context {
12297
- llm_build_chatglm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
12988
+ llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
12298
12989
  const int64_t n_embd_head = hparams.n_embd_head_v;
12299
12990
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
12300
12991
 
@@ -12373,7 +13064,7 @@ struct llm_build_chatglm : public llm_graph_context {
12373
13064
  cb(Kcur, "Kcur", il);
12374
13065
  cb(Vcur, "Vcur", il);
12375
13066
 
12376
- cur = build_attn(inp_attn, gf,
13067
+ cur = build_attn(inp_attn,
12377
13068
  model.layers[il].wo, NULL,
12378
13069
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12379
13070
  }
@@ -12427,7 +13118,7 @@ struct llm_build_chatglm : public llm_graph_context {
12427
13118
  };
12428
13119
 
12429
13120
  struct llm_build_glm4 : public llm_graph_context {
12430
- llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
13121
+ llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
12431
13122
  const int64_t n_embd_head = hparams.n_embd_head_v;
12432
13123
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
12433
13124
 
@@ -12506,7 +13197,7 @@ struct llm_build_glm4 : public llm_graph_context {
12506
13197
  cb(Kcur, "Kcur", il);
12507
13198
  cb(Vcur, "Vcur", il);
12508
13199
 
12509
- cur = build_attn(inp_attn, gf,
13200
+ cur = build_attn(inp_attn,
12510
13201
  model.layers[il].wo, NULL,
12511
13202
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12512
13203
  }
@@ -12578,7 +13269,7 @@ struct llm_build_glm4 : public llm_graph_context {
12578
13269
  };
12579
13270
 
12580
13271
  struct llm_build_nemotron : public llm_graph_context {
12581
- llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
13272
+ llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
12582
13273
  const int64_t n_embd_head = hparams.n_embd_head_v;
12583
13274
 
12584
13275
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -12650,7 +13341,7 @@ struct llm_build_nemotron : public llm_graph_context {
12650
13341
  cb(Kcur, "Kcur", il);
12651
13342
  cb(Vcur, "Vcur", il);
12652
13343
 
12653
- cur = build_attn(inp_attn, gf,
13344
+ cur = build_attn(inp_attn,
12654
13345
  model.layers[il].wo, model.layers[il].bo,
12655
13346
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12656
13347
  }
@@ -12707,7 +13398,7 @@ struct llm_build_nemotron : public llm_graph_context {
12707
13398
  };
12708
13399
 
12709
13400
  struct llm_build_exaone : public llm_graph_context {
12710
- llm_build_exaone(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
13401
+ llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
12711
13402
  const int64_t n_embd_head = hparams.n_embd_head_v;
12712
13403
 
12713
13404
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -12781,7 +13472,7 @@ struct llm_build_exaone : public llm_graph_context {
12781
13472
  cb(Kcur, "Kcur", il);
12782
13473
  cb(Vcur, "Vcur", il);
12783
13474
 
12784
- cur = build_attn(inp_attn, gf,
13475
+ cur = build_attn(inp_attn,
12785
13476
  model.layers[il].wo, model.layers[il].bo,
12786
13477
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12787
13478
  }
@@ -12837,93 +13528,228 @@ struct llm_build_exaone : public llm_graph_context {
12837
13528
  }
12838
13529
  };
12839
13530
 
12840
- struct llm_build_rwkv6_base : public llm_graph_context {
12841
- const llama_model & model;
13531
+ template <bool iswa>
13532
+ struct llm_build_exaone4 : public llm_graph_context {
13533
+ llm_build_exaone4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
13534
+ const int64_t n_embd_head = hparams.n_embd_head_k;
12842
13535
 
12843
- llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
12844
- }
13536
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
13537
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
12845
13538
 
12846
- ggml_tensor * build_rwkv6_channel_mix(
12847
- const llama_layer * layer,
12848
- ggml_tensor * cur,
12849
- ggml_tensor * x_prev,
12850
- llm_arch arch) const {
12851
- ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
12852
- switch (arch) {
12853
- case LLM_ARCH_RWKV6:
12854
- {
12855
- ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
12856
- ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur);
13539
+ ggml_tensor * cur;
13540
+ ggml_tensor * inpL;
12857
13541
 
12858
- ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr));
12859
- ggml_tensor * k = ggml_sqr(
12860
- ctx0,
12861
- ggml_relu(
12862
- ctx0,
12863
- build_lora_mm(layer->channel_mix_key, xk)
12864
- )
12865
- );
12866
- cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k));
12867
- } break;
12868
- default:
12869
- GGML_ABORT("fatal error");
12870
- }
13542
+ inpL = build_inp_embd(model.tok_embd);
12871
13543
 
12872
- return cur;
12873
- }
13544
+ // inp_pos - contains the positions
13545
+ ggml_tensor * inp_pos = build_inp_pos();
12874
13546
 
12875
- ggml_tensor * build_rwkv6_time_mix(
12876
- llm_graph_input_rs * inp,
12877
- ggml_cgraph * gf,
12878
- ggml_tensor * cur,
12879
- ggml_tensor * x_prev,
12880
- const llama_ubatch & ubatch,
12881
- int il) const {
12882
- const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
13547
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
13548
+ inp_attn_type * inp_attn = nullptr;
12883
13549
 
12884
- const auto n_tokens = ubatch.n_tokens;
12885
- const auto n_seqs = ubatch.n_seqs;
12886
- const auto n_seq_tokens = ubatch.n_seq_tokens;
12887
- const auto n_embd = hparams.n_embd;
12888
- const auto head_size = hparams.wkv_head_size;
12889
- const auto n_head = n_embd / head_size;
12890
- const auto n_head_kv = hparams.n_head_kv(il);
13550
+ if constexpr (iswa) {
13551
+ inp_attn = build_attn_inp_kv_unified_iswa();
13552
+ } else {
13553
+ inp_attn = build_attn_inp_kv_unified();
13554
+ }
12891
13555
 
12892
- const auto kv_head = mctx_cur->get_head();
13556
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12893
13557
 
12894
- const auto & layer = model.layers[il];
13558
+ for (int il = 0; il < n_layer; ++il) {
13559
+ ggml_tensor * inpSA = inpL;
12895
13560
 
12896
- bool is_qrwkv = layer.time_mix_first == nullptr;
13561
+ // use RoPE for SWA layers or non-SWA models
13562
+ const bool use_rope = hparams.is_swa(il) || hparams.swa_type == LLAMA_SWA_TYPE_NONE;
12897
13563
 
12898
- ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
13564
+ cur = inpL;
12899
13565
 
12900
- sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens);
12901
- cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
13566
+ // self-attention
13567
+ {
13568
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
12902
13569
 
12903
- ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_x), cur);
13570
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
13571
+ cb(Qcur, "Qcur", il);
12904
13572
 
12905
- xxx = ggml_reshape_4d(
12906
- ctx0,
12907
- ggml_tanh(
12908
- ctx0,
12909
- ggml_mul_mat(ctx0, layer.time_mix_w1, xxx)
12910
- ),
12911
- layer.time_mix_w1->ne[1] / 5, 1, 5, n_tokens
12912
- );
13573
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
13574
+ cb(Kcur, "Kcur", il);
12913
13575
 
12914
- xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2));
13576
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
13577
+ cb(Vcur, "Vcur", il);
12915
13578
 
12916
- xxx = ggml_mul_mat(
12917
- ctx0,
12918
- ggml_reshape_4d(
12919
- ctx0,
12920
- layer.time_mix_w2,
12921
- layer.time_mix_w2->ne[0], layer.time_mix_w2->ne[1], 1, 5
12922
- ),
12923
- xxx
12924
- );
13579
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13580
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13581
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
12925
13582
 
12926
- ggml_tensor *xw, *xk, *xv, *xr, *xg;
13583
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
13584
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
13585
+ cb(Qcur, "Qcur_normed", il);
13586
+ cb(Kcur, "Kcur_normed", il);
13587
+
13588
+ if (use_rope) {
13589
+ Qcur = ggml_rope_ext(
13590
+ ctx0, Qcur, inp_pos, rope_factors,
13591
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13592
+ ext_factor, attn_factor, beta_fast, beta_slow
13593
+ );
13594
+
13595
+ Kcur = ggml_rope_ext(
13596
+ ctx0, Kcur, inp_pos, rope_factors,
13597
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13598
+ ext_factor, attn_factor, beta_fast, beta_slow
13599
+ );
13600
+ }
13601
+
13602
+ cb(Qcur, "Qcur", il);
13603
+ cb(Kcur, "Kcur", il);
13604
+ cb(Vcur, "Vcur", il);
13605
+
13606
+ cur = build_attn(inp_attn,
13607
+ model.layers[il].wo, NULL,
13608
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13609
+ cb(cur, "attn_out", il);
13610
+ }
13611
+
13612
+ if (il == n_layer - 1 && inp_out_ids) {
13613
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13614
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13615
+ }
13616
+
13617
+ cur = build_norm(cur,
13618
+ model.layers[il].attn_post_norm, NULL,
13619
+ LLM_NORM_RMS, il);
13620
+ cb(cur, "attn_post_norm", il);
13621
+
13622
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13623
+ cb(ffn_inp, "ffn_inp", il);
13624
+
13625
+ // feed-forward network
13626
+ cur = build_ffn(ffn_inp,
13627
+ model.layers[il].ffn_up, NULL, NULL,
13628
+ model.layers[il].ffn_gate, NULL, NULL,
13629
+ model.layers[il].ffn_down, NULL, NULL,
13630
+ NULL,
13631
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
13632
+ cb(cur, "ffn_out", il);
13633
+
13634
+ cur = build_norm(cur,
13635
+ model.layers[il].ffn_post_norm, NULL,
13636
+ LLM_NORM_RMS, -1);
13637
+ cb(cur, "ffn_post_norm", -1);
13638
+
13639
+ cur = ggml_add(ctx0, cur, ffn_inp);
13640
+
13641
+ cur = build_cvec(cur, il);
13642
+ cb(cur, "l_out", il);
13643
+
13644
+ // input for next layer
13645
+ inpL = cur;
13646
+ }
13647
+
13648
+ cur = inpL;
13649
+
13650
+ cur = build_norm(cur,
13651
+ model.output_norm, NULL,
13652
+ LLM_NORM_RMS, -1);
13653
+
13654
+ cb(cur, "result_norm", -1);
13655
+ res->t_embd = cur;
13656
+
13657
+ // lm_head
13658
+ cur = build_lora_mm(model.output, cur);
13659
+
13660
+ cb(cur, "result_output", -1);
13661
+ res->t_logits = cur;
13662
+
13663
+ ggml_build_forward_expand(gf, cur);
13664
+ }
13665
+ };
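Note: llm_build_exaone4 above is templated on a bool iswa flag and picks the sliding-window vs. unified KV-cache input type at compile time with std::conditional_t, then calls the matching factory under if constexpr. A stand-alone sketch of the same C++17 idiom, with hypothetical stand-in types:

    #include <cstdio>
    #include <type_traits>

    struct AttnInputSWA  { const char * name = "sliding-window"; };  // stand-in
    struct AttnInputFull { const char * name = "unified";        };  // stand-in

    template <bool iswa>
    struct Exaone4Sketch {
        // same selection mechanism as the diff: the input type is fixed at compile time
        using inp_attn_type = std::conditional_t<iswa, AttnInputSWA, AttnInputFull>;

        Exaone4Sketch() {
            inp_attn_type inp;
            if constexpr (iswa) {
                std::printf("iswa=true  -> %s input (per-layer sliding window)\n", inp.name);
            } else {
                std::printf("iswa=false -> %s input (full context)\n", inp.name);
            }
        }
    };

    int main() {
        Exaone4Sketch<true>  swa_variant;
        Exaone4Sketch<false> full_variant;
    }

In the diff itself the two branches call build_attn_inp_kv_unified_iswa() and build_attn_inp_kv_unified() respectively.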
13666
+
13667
+ struct llm_build_rwkv6_base : public llm_graph_context {
13668
+ const llama_model & model;
13669
+
13670
+ llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
13671
+ }
13672
+
13673
+ ggml_tensor * build_rwkv6_channel_mix(
13674
+ const llama_layer * layer,
13675
+ ggml_tensor * cur,
13676
+ ggml_tensor * x_prev,
13677
+ llm_arch arch) const {
13678
+ ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
13679
+ switch (arch) {
13680
+ case LLM_ARCH_RWKV6:
13681
+ {
13682
+ ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
13683
+ ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur);
13684
+
13685
+ ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr));
13686
+ ggml_tensor * k = ggml_sqr(
13687
+ ctx0,
13688
+ ggml_relu(
13689
+ ctx0,
13690
+ build_lora_mm(layer->channel_mix_key, xk)
13691
+ )
13692
+ );
13693
+ cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k));
13694
+ } break;
13695
+ default:
13696
+ GGML_ABORT("fatal error");
13697
+ }
13698
+
13699
+ return cur;
13700
+ }
13701
+
13702
+ ggml_tensor * build_rwkv6_time_mix(
13703
+ llm_graph_input_rs * inp,
13704
+ ggml_tensor * cur,
13705
+ ggml_tensor * x_prev,
13706
+ const llama_ubatch & ubatch,
13707
+ int il) const {
13708
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
13709
+
13710
+ const auto n_tokens = ubatch.n_tokens;
13711
+ const auto n_seqs = ubatch.n_seqs;
13712
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
13713
+ const auto n_embd = hparams.n_embd;
13714
+ const auto head_size = hparams.wkv_head_size;
13715
+ const auto n_head = n_embd / head_size;
13716
+ const auto n_head_kv = hparams.n_head_kv(il);
13717
+
13718
+ const auto kv_head = mctx_cur->get_head();
13719
+
13720
+ const auto & layer = model.layers[il];
13721
+
13722
+ bool is_qrwkv = layer.time_mix_first == nullptr;
13723
+
13724
+ ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
13725
+
13726
+ sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens);
13727
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
13728
+
13729
+ ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_x), cur);
13730
+
13731
+ xxx = ggml_reshape_4d(
13732
+ ctx0,
13733
+ ggml_tanh(
13734
+ ctx0,
13735
+ ggml_mul_mat(ctx0, layer.time_mix_w1, xxx)
13736
+ ),
13737
+ layer.time_mix_w1->ne[1] / 5, 1, 5, n_tokens
13738
+ );
13739
+
13740
+ xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2));
13741
+
13742
+ xxx = ggml_mul_mat(
13743
+ ctx0,
13744
+ ggml_reshape_4d(
13745
+ ctx0,
13746
+ layer.time_mix_w2,
13747
+ layer.time_mix_w2->ne[0], layer.time_mix_w2->ne[1], 1, 5
13748
+ ),
13749
+ xxx
13750
+ );
13751
+
13752
+ ggml_tensor *xw, *xk, *xv, *xr, *xg;
12927
13753
  if (layer.time_mix_lerp_fused) {
12928
13754
  // fusing these weights makes some performance improvement
12929
13755
  sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens);
@@ -13001,7 +13827,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
13001
13827
  }
13002
13828
 
13003
13829
  ggml_tensor * wkv_state = build_rs(
13004
- inp, gf, mctx_cur->get_s_l(il),
13830
+ inp, mctx_cur->get_s_l(il),
13005
13831
  hparams.n_embd_s(), n_seqs);
13006
13832
 
13007
13833
  ggml_tensor * wkv_output;
@@ -13047,7 +13873,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
13047
13873
  };
13048
13874
 
13049
13875
  struct llm_build_rwkv6 : public llm_build_rwkv6_base {
13050
- llm_build_rwkv6(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
13876
+ llm_build_rwkv6(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
13051
13877
  GGML_ASSERT(hparams.token_shift_count == 2);
13052
13878
 
13053
13879
  ggml_tensor * cur;
@@ -13068,7 +13894,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
13068
13894
  const llama_layer * layer = &model.layers[il];
13069
13895
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
13070
13896
 
13071
- ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
13897
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
13072
13898
 
13073
13899
  ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
13074
13900
  ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
@@ -13083,7 +13909,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
13083
13909
  1
13084
13910
  );
13085
13911
 
13086
- cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
13912
+ cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
13087
13913
 
13088
13914
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
13089
13915
  cb(ffn_inp, "ffn_inp", il);
@@ -13148,7 +13974,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
13148
13974
 
13149
13975
  // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
13150
13976
  struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
13151
- llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
13977
+ llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
13152
13978
  GGML_ASSERT(n_embd == hparams.n_embd_r());
13153
13979
 
13154
13980
  ggml_tensor * cur;
@@ -13168,7 +13994,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
13168
13994
  const llama_layer * layer = &model.layers[il];
13169
13995
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
13170
13996
 
13171
- ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
13997
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
13172
13998
 
13173
13999
  ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
13174
14000
  cb(att_norm, "attn_norm", il);
@@ -13180,7 +14006,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
13180
14006
  1
13181
14007
  );
13182
14008
 
13183
- cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
14009
+ cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
13184
14010
 
13185
14011
  token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
13186
14012
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -13270,7 +14096,6 @@ struct llm_build_rwkv7_base : public llm_graph_context {
13270
14096
 
13271
14097
  ggml_tensor * build_rwkv7_time_mix(
13272
14098
  llm_graph_input_rs * inp,
13273
- ggml_cgraph * gf,
13274
14099
  ggml_tensor * cur,
13275
14100
  ggml_tensor * x_prev,
13276
14101
  ggml_tensor *& first_layer_value,
@@ -13356,7 +14181,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
13356
14181
  a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
13357
14182
 
13358
14183
  ggml_tensor * wkv_state = build_rs(
13359
- inp, gf, mctx_cur->get_s_l(il),
14184
+ inp, mctx_cur->get_s_l(il),
13360
14185
  hparams.n_embd_s(), n_seqs);
13361
14186
 
13362
14187
  ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
@@ -13403,7 +14228,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
13403
14228
  };
13404
14229
 
13405
14230
  struct llm_build_rwkv7 : public llm_build_rwkv7_base {
13406
- llm_build_rwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
14231
+ llm_build_rwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
13407
14232
  GGML_ASSERT(hparams.token_shift_count == 2);
13408
14233
 
13409
14234
  ggml_tensor * cur;
@@ -13425,7 +14250,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
13425
14250
  const llama_layer * layer = &model.layers[il];
13426
14251
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
13427
14252
 
13428
- ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
14253
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
13429
14254
 
13430
14255
  ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
13431
14256
  ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
@@ -13440,7 +14265,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
13440
14265
  1
13441
14266
  );
13442
14267
 
13443
- cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
14268
+ cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
13444
14269
 
13445
14270
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
13446
14271
  cb(ffn_inp, "ffn_inp", il);
@@ -13499,7 +14324,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
13499
14324
 
13500
14325
 
13501
14326
  struct llm_build_arwkv7 : public llm_build_rwkv7_base {
13502
- llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
14327
+ llm_build_arwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
13503
14328
  GGML_ASSERT(n_embd == hparams.n_embd_r());
13504
14329
 
13505
14330
  ggml_tensor * cur;
@@ -13520,7 +14345,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
13520
14345
  const llama_layer * layer = &model.layers[il];
13521
14346
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
13522
14347
 
13523
- ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
14348
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
13524
14349
 
13525
14350
  ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
13526
14351
  cb(att_norm, "attn_norm", il);
@@ -13532,7 +14357,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
13532
14357
  1
13533
14358
  );
13534
14359
 
13535
- cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
14360
+ cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
13536
14361
 
13537
14362
  token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
13538
14363
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -13586,13 +14411,10 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
13586
14411
  }
13587
14412
  };
13588
14413
 
13589
-
13590
14414
  struct llm_build_granite : public llm_graph_context {
13591
14415
  llm_build_granite(
13592
14416
  const llama_model & model,
13593
- const llm_graph_params & params,
13594
- ggml_cgraph * gf,
13595
- const bool use_rope = true)
14417
+ const llm_graph_params & params)
13596
14418
  : llm_graph_context(params) {
13597
14419
 
13598
14420
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13607,14 +14429,12 @@ struct llm_build_granite : public llm_graph_context {
13607
14429
 
13608
14430
  // inp_pos - built only if rope enabled
13609
14431
  ggml_tensor * inp_pos = nullptr;
13610
- if (use_rope) {
14432
+ if (hparams.rope_finetuned) {
13611
14433
  inp_pos = build_inp_pos();
13612
14434
  }
13613
14435
 
13614
14436
  auto * inp_attn = build_attn_inp_kv_unified();
13615
14437
 
13616
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
13617
-
13618
14438
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13619
14439
 
13620
14440
  for (int il = 0; il < n_layer; ++il) {
@@ -13627,128 +14447,234 @@ struct llm_build_granite : public llm_graph_context {
13627
14447
  cb(cur, "attn_norm", il);
13628
14448
 
13629
14449
  // self-attention
13630
- {
13631
- // compute Q and K and (optionally) RoPE them
13632
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
13633
- cb(Qcur, "Qcur", il);
13634
- if (model.layers[il].bq) {
13635
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
13636
- cb(Qcur, "Qcur", il);
13637
- }
14450
+ cur = build_attention_layer(
14451
+ cur, inp_pos, inp_attn,
14452
+ model, n_embd_head, il);
13638
14453
 
13639
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
13640
- cb(Kcur, "Kcur", il);
13641
- if (model.layers[il].bk) {
13642
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
13643
- cb(Kcur, "Kcur", il);
13644
- }
14454
+ if (il == n_layer - 1 && inp_out_ids) {
14455
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14456
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
14457
+ }
13645
14458
 
13646
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
13647
- cb(Vcur, "Vcur", il);
13648
- if (model.layers[il].bv) {
13649
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
13650
- cb(Vcur, "Vcur", il);
13651
- }
14459
+ // ffn
14460
+ cur = build_layer_ffn(cur, inpSA, model, il);
13652
14461
 
13653
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13654
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13655
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
14462
+ // input for next layer
14463
+ inpL = cur;
14464
+ }
13656
14465
 
13657
- if (use_rope) {
13658
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
13659
- Qcur = ggml_rope_ext(
13660
- ctx0, Qcur, inp_pos, rope_factors,
13661
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13662
- ext_factor, attn_factor, beta_fast, beta_slow
13663
- );
14466
+ cur = inpL;
13664
14467
 
13665
- Kcur = ggml_rope_ext(
13666
- ctx0, Kcur, inp_pos, rope_factors,
13667
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13668
- ext_factor, attn_factor, beta_fast, beta_slow
13669
- );
13670
- }
14468
+ cur = build_norm(cur,
14469
+ model.output_norm, NULL,
14470
+ LLM_NORM_RMS, -1);
13671
14471
 
13672
- cb(Qcur, "Qcur", il);
13673
- cb(Kcur, "Kcur", il);
13674
- cb(Vcur, "Vcur", il);
14472
+ cb(cur, "result_norm", -1);
14473
+ res->t_embd = cur;
13675
14474
 
13676
- cur = build_attn(inp_attn, gf,
13677
- model.layers[il].wo, model.layers[il].bo,
13678
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
14475
+ // lm_head
14476
+ cur = build_lora_mm(model.output, cur);
14477
+
14478
+ // For Granite architectures - scale logits
14479
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
14480
+ cb(cur, "result_output", -1);
14481
+ res->t_logits = cur;
14482
+
14483
+ ggml_build_forward_expand(gf, cur);
14484
+ }
14485
+
14486
+ ggml_tensor * build_attention_layer(
14487
+ ggml_tensor * cur,
14488
+ ggml_tensor * inp_pos,
14489
+ llm_graph_input_attn_kv_unified * inp_attn,
14490
+ const llama_model & model,
14491
+ const int64_t n_embd_head,
14492
+ const int il) {
14493
+
14494
+ // compute Q and K and (optionally) RoPE them
14495
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
14496
+ cb(Qcur, "Qcur", il);
14497
+ if (model.layers[il].bq) {
14498
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
14499
+ cb(Qcur, "Qcur", il);
14500
+ }
14501
+
14502
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
14503
+ cb(Kcur, "Kcur", il);
14504
+ if (model.layers[il].bk) {
14505
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
14506
+ cb(Kcur, "Kcur", il);
14507
+ }
14508
+
14509
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
14510
+ cb(Vcur, "Vcur", il);
14511
+ if (model.layers[il].bv) {
14512
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
14513
+ cb(Vcur, "Vcur", il);
14514
+ }
14515
+
14516
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
14517
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
14518
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
14519
+
14520
+ const bool use_rope = hparams.rope_finetuned;
14521
+ if (use_rope) {
14522
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
14523
+ Qcur = ggml_rope_ext(
14524
+ ctx0, Qcur, inp_pos, rope_factors,
14525
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14526
+ ext_factor, attn_factor, beta_fast, beta_slow
14527
+ );
14528
+
14529
+ Kcur = ggml_rope_ext(
14530
+ ctx0, Kcur, inp_pos, rope_factors,
14531
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14532
+ ext_factor, attn_factor, beta_fast, beta_slow
14533
+ );
14534
+ }
14535
+
14536
+ cb(Qcur, "Qcur", il);
14537
+ cb(Kcur, "Kcur", il);
14538
+ cb(Vcur, "Vcur", il);
14539
+
14540
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
14541
+ cur = build_attn(inp_attn,
14542
+ model.layers[il].wo, model.layers[il].bo,
14543
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
13679
14544
  cb(cur, "attn_out", il);
13680
- }
14545
+ return cur;
14546
+ }
13681
14547
 
13682
- if (il == n_layer - 1 && inp_out_ids) {
13683
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13684
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13685
- }
14548
+ ggml_tensor * build_layer_ffn(
14549
+ ggml_tensor * cur,
14550
+ ggml_tensor * inpSA,
14551
+ const llama_model & model,
14552
+ const int il) {
13686
14553
 
13687
- // For Granite architectures - scale residual
14554
+ // For Granite architectures - scale residual
14555
+ if (hparams.f_residual_scale) {
13688
14556
  cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
13689
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13690
- cb(ffn_inp, "ffn_inp", il);
14557
+ }
14558
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
14559
+ cb(ffn_inp, "ffn_inp", il);
13691
14560
 
13692
- // feed-forward network (non-MoE)
13693
- if (model.layers[il].ffn_gate_inp == nullptr) {
14561
+ // feed-forward network (non-MoE)
14562
+ if (model.layers[il].ffn_gate_inp == nullptr) {
13694
14563
 
13695
- cur = build_norm(ffn_inp,
13696
- model.layers[il].ffn_norm, NULL,
13697
- LLM_NORM_RMS, il);
13698
- cb(cur, "ffn_norm", il);
14564
+ cur = build_norm(ffn_inp,
14565
+ model.layers[il].ffn_norm, NULL,
14566
+ LLM_NORM_RMS, il);
14567
+ cb(cur, "ffn_norm", il);
13699
14568
 
13700
- cur = build_ffn(cur,
13701
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
13702
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
13703
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
13704
- NULL,
13705
- LLM_FFN_SILU, LLM_FFN_PAR, il);
13706
- cb(cur, "ffn_out", il);
14569
+ cur = build_ffn(cur,
14570
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
14571
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
14572
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
14573
+ NULL,
14574
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14575
+ cb(cur, "ffn_out", il);
13707
14576
 
13708
- } else {
13709
- // MoE branch
13710
- cur = build_norm(ffn_inp,
13711
- model.layers[il].ffn_norm, NULL,
13712
- LLM_NORM_RMS, il);
13713
- cb(cur, "ffn_norm", il);
14577
+ } else {
14578
+ // MoE branch
14579
+ cur = build_norm(ffn_inp,
14580
+ model.layers[il].ffn_norm, NULL,
14581
+ LLM_NORM_RMS, il);
14582
+ cb(cur, "ffn_norm", il);
13714
14583
 
13715
- ggml_tensor * moe_out = build_moe_ffn(cur,
13716
- model.layers[il].ffn_gate_inp,
13717
- model.layers[il].ffn_up_exps,
13718
- model.layers[il].ffn_gate_exps,
13719
- model.layers[il].ffn_down_exps,
13720
- nullptr,
13721
- n_expert, n_expert_used,
13722
- LLM_FFN_SILU, true,
13723
- false, 0.0,
13724
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
13725
- il);
13726
- cb(moe_out, "ffn_moe_out", il);
14584
+ ggml_tensor * moe_out = build_moe_ffn(cur,
14585
+ model.layers[il].ffn_gate_inp,
14586
+ model.layers[il].ffn_up_exps,
14587
+ model.layers[il].ffn_gate_exps,
14588
+ model.layers[il].ffn_down_exps,
14589
+ nullptr,
14590
+ n_expert, n_expert_used,
14591
+ LLM_FFN_SILU, true,
14592
+ false, 0.0,
14593
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
14594
+ il);
14595
+ cb(moe_out, "ffn_moe_out", il);
13727
14596
 
13728
- // For Granite MoE Shared
13729
- if (hparams.n_ff_shexp > 0) {
13730
- ggml_tensor * ffn_shexp = build_ffn(cur,
13731
- model.layers[il].ffn_up_shexp, NULL, NULL,
13732
- model.layers[il].ffn_gate_shexp, NULL, NULL,
13733
- model.layers[il].ffn_down_shexp, NULL, NULL,
13734
- NULL,
13735
- LLM_FFN_SILU, LLM_FFN_PAR, il);
13736
- cb(ffn_shexp, "ffn_shexp", il);
14597
+ // For Granite MoE Shared
14598
+ if (hparams.n_ff_shexp > 0) {
14599
+ ggml_tensor * ffn_shexp = build_ffn(cur,
14600
+ model.layers[il].ffn_up_shexp, NULL, NULL,
14601
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
14602
+ model.layers[il].ffn_down_shexp, NULL, NULL,
14603
+ NULL,
14604
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14605
+ cb(ffn_shexp, "ffn_shexp", il);
13737
14606
 
13738
- cur = ggml_add(ctx0, moe_out, ffn_shexp);
13739
- cb(cur, "ffn_out", il);
13740
- } else {
13741
- cur = moe_out;
13742
- }
14607
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
14608
+ cb(cur, "ffn_out", il);
14609
+ } else {
14610
+ cur = moe_out;
13743
14611
  }
14612
+ }
13744
14613
 
13745
- // For Granite architectures - scale residual
14614
+ // For Granite architectures - scale residual
14615
+ if (hparams.f_residual_scale) {
13746
14616
  cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
13747
- cur = ggml_add(ctx0, cur, ffn_inp);
13748
- cb(cur, "ffn_out", il);
14617
+ }
14618
+ cur = ggml_add(ctx0, cur, ffn_inp);
14619
+ cb(cur, "ffn_out", il);
13749
14620
 
13750
- cur = build_cvec(cur, il);
13751
- cb(cur, "l_out", il);
14621
+ cur = build_cvec(cur, il);
14622
+ cb(cur, "l_out", il);
14623
+
14624
+ return cur;
14625
+ }
14626
+ };
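Note: the rewritten llm_build_granite above factors the attention and FFN paths into build_attention_layer()/build_layer_ffn() and keeps the three Granite-specific scales: the KQ scale falls back to 1/sqrt(n_embd_head) when f_attention_scale is 0, the residual is multiplied by f_residual_scale only when it is non-zero, and the logits are divided by f_logit_scale. A small stand-alone numeric sketch of those scales; the field names mirror the hparams in the diff, the values are invented:

    #include <cmath>
    #include <cstdio>

    struct GraniteScales {                 // illustrative subset of hparams
        float f_attention_scale = 0.0f;
        float f_residual_scale  = 0.0f;
        float f_logit_scale     = 1.0f;
    };

    static float kq_scale(const GraniteScales & hp, int n_embd_head) {
        return hp.f_attention_scale == 0.0f ? 1.0f / std::sqrt((float) n_embd_head)
                                            : hp.f_attention_scale;
    }

    static float scale_residual(const GraniteScales & hp, float x) {
        return hp.f_residual_scale ? x * hp.f_residual_scale : x;   // guarded, as in the diff
    }

    static float scale_logits(const GraniteScales & hp, float logit) {
        return logit / hp.f_logit_scale;                            // 1/f_logit_scale factor
    }

    int main() {
        GraniteScales hp{0.0f, 0.5f, 8.0f};                            // made-up values
        std::printf("kq_scale(128) = %f\n", kq_scale(hp, 128));        // ~0.088388
        std::printf("residual(2.0) = %f\n", scale_residual(hp, 2.0f)); // 1.0
        std::printf("logits(4.0)   = %f\n", scale_logits(hp, 4.0f));   // 0.5
    }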
14627
+
14628
+ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
14629
+ llm_build_granite_hybrid(
14630
+ const llama_model & model,
14631
+ const llm_graph_params & params) :
14632
+ llm_graph_context_mamba(params) {
14633
+
14634
+ const int64_t n_embd_head = hparams.n_embd_head_v;
14635
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
14636
+
14637
+ ggml_tensor * cur;
14638
+ ggml_tensor * inpL;
14639
+
14640
+ inpL = build_inp_embd(model.tok_embd);
14641
+
14642
+ auto * inp = build_inp_mem_hybrid();
14643
+
14644
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
14645
+
14646
+ // Positional embeddings populated if rope enabled
14647
+ ggml_tensor * inp_pos = nullptr;
14648
+ if (hparams.rope_finetuned) {
14649
+ inp_pos = build_inp_pos();
14650
+ }
14651
+
14652
+ for (int il = 0; il < n_layer; ++il) {
14653
+ struct ggml_tensor * inpSA = inpL;
14654
+
14655
+ // norm
14656
+ cur = build_norm(inpL,
14657
+ model.layers[il].attn_norm, NULL,
14658
+ LLM_NORM_RMS, il);
14659
+ cb(cur, "attn_norm", il);
14660
+
14661
+ if (hparams.is_recurrent(il)) {
14662
+ // ssm layer //
14663
+ cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
14664
+ } else {
14665
+ // attention layer //
14666
+ cur = build_attention_layer(
14667
+ cur, inp_pos, inp->get_attn(), model,
14668
+ n_embd_head, il);
14669
+ }
14670
+
14671
+ if (il == n_layer - 1 && inp_out_ids) {
14672
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14673
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
14674
+ }
14675
+
14676
+ // ffn
14677
+ cur = build_layer_ffn(cur, inpSA, model, il);
13752
14678
 
13753
14679
  // input for next layer
13754
14680
  inpL = cur;
@@ -13760,18 +14686,161 @@ struct llm_build_granite : public llm_graph_context {
13760
14686
  model.output_norm, NULL,
13761
14687
  LLM_NORM_RMS, -1);
13762
14688
 
13763
- cb(cur, "result_norm", -1);
13764
- res->t_embd = cur;
14689
+ cb(cur, "result_norm", -1);
14690
+ res->t_embd = cur;
14691
+
14692
+ // lm_head
14693
+ cur = build_lora_mm(model.output, cur);
14694
+
14695
+ // For Granite architectures - scale logits
14696
+ if (hparams.f_logit_scale) {
14697
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
14698
+ }
14699
+ cb(cur, "result_output", -1);
14700
+ res->t_logits = cur;
14701
+
14702
+ ggml_build_forward_expand(gf, cur);
14703
+ }
14704
+
14705
+ ggml_tensor * build_attention_layer(
14706
+ ggml_tensor * cur,
14707
+ ggml_tensor * inp_pos,
14708
+ llm_graph_input_attn_kv_unified * inp_attn,
14709
+ const llama_model & model,
14710
+ const int64_t n_embd_head,
14711
+ const int il) {
14712
+
14713
+ // compute Q and K and (optionally) RoPE them
14714
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
14715
+ cb(Qcur, "Qcur", il);
14716
+ if (model.layers[il].bq) {
14717
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
14718
+ cb(Qcur, "Qcur", il);
14719
+ }
14720
+
14721
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
14722
+ cb(Kcur, "Kcur", il);
14723
+ if (model.layers[il].bk) {
14724
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
14725
+ cb(Kcur, "Kcur", il);
14726
+ }
14727
+
14728
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
14729
+ cb(Vcur, "Vcur", il);
14730
+ if (model.layers[il].bv) {
14731
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
14732
+ cb(Vcur, "Vcur", il);
14733
+ }
14734
+
14735
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
14736
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
14737
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
14738
+
14739
+ const bool use_rope = hparams.rope_finetuned;
14740
+ if (use_rope) {
14741
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
14742
+ Qcur = ggml_rope_ext(
14743
+ ctx0, Qcur, inp_pos, rope_factors,
14744
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14745
+ ext_factor, attn_factor, beta_fast, beta_slow
14746
+ );
14747
+
14748
+ Kcur = ggml_rope_ext(
14749
+ ctx0, Kcur, inp_pos, rope_factors,
14750
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14751
+ ext_factor, attn_factor, beta_fast, beta_slow
14752
+ );
14753
+ }
14754
+
14755
+ cb(Qcur, "Qcur", il);
14756
+ cb(Kcur, "Kcur", il);
14757
+ cb(Vcur, "Vcur", il);
14758
+
14759
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
14760
+ cur = build_attn(inp_attn,
14761
+ model.layers[il].wo, model.layers[il].bo,
14762
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
14763
+ cb(cur, "attn_out", il);
14764
+ return cur;
14765
+ }
14766
+
14767
+ ggml_tensor * build_layer_ffn(
14768
+ ggml_tensor * cur,
14769
+ ggml_tensor * inpSA,
14770
+ const llama_model & model,
14771
+ const int il) {
14772
+
14773
+ // For Granite architectures - scale residual
14774
+ if (hparams.f_residual_scale) {
14775
+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
14776
+ }
14777
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
14778
+ cb(ffn_inp, "ffn_inp", il);
14779
+
14780
+ // feed-forward network (non-MoE)
14781
+ if (model.layers[il].ffn_gate_inp == nullptr) {
14782
+
14783
+ cur = build_norm(ffn_inp,
14784
+ model.layers[il].ffn_norm, NULL,
14785
+ LLM_NORM_RMS, il);
14786
+ cb(cur, "ffn_norm", il);
14787
+
14788
+ cur = build_ffn(cur,
14789
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
14790
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
14791
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
14792
+ NULL,
14793
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14794
+ cb(cur, "ffn_out", il);
14795
+
14796
+ } else {
14797
+ // MoE branch
14798
+ cur = build_norm(ffn_inp,
14799
+ model.layers[il].ffn_norm, NULL,
14800
+ LLM_NORM_RMS, il);
14801
+ cb(cur, "ffn_norm", il);
14802
+
14803
+ ggml_tensor * moe_out = build_moe_ffn(cur,
14804
+ model.layers[il].ffn_gate_inp,
14805
+ model.layers[il].ffn_up_exps,
14806
+ model.layers[il].ffn_gate_exps,
14807
+ model.layers[il].ffn_down_exps,
14808
+ nullptr,
14809
+ n_expert, n_expert_used,
14810
+ LLM_FFN_SILU, true,
14811
+ false, 0.0,
14812
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
14813
+ il);
14814
+ cb(moe_out, "ffn_moe_out", il);
14815
+
14816
+ // For Granite MoE Shared
14817
+ if (hparams.n_ff_shexp > 0) {
14818
+ ggml_tensor * ffn_shexp = build_ffn(cur,
14819
+ model.layers[il].ffn_up_shexp, NULL, NULL,
14820
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
14821
+ model.layers[il].ffn_down_shexp, NULL, NULL,
14822
+ NULL,
14823
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14824
+ cb(ffn_shexp, "ffn_shexp", il);
14825
+
14826
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
14827
+ cb(cur, "ffn_out", il);
14828
+ } else {
14829
+ cur = moe_out;
14830
+ }
14831
+ }
13765
14832
 
13766
- // lm_head
13767
- cur = build_lora_mm(model.output, cur);
14833
+ // For Granite architectures - scale residual
14834
+ if (hparams.f_residual_scale) {
14835
+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
14836
+ }
14837
+ cur = ggml_add(ctx0, cur, ffn_inp);
14838
+ cb(cur, "ffn_out", il);
13768
14839
 
13769
- // For Granite architectures - scale logits
13770
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
13771
- cb(cur, "result_output", -1);
13772
- res->t_logits = cur;
14840
+ cur = build_cvec(cur, il);
14841
+ cb(cur, "l_out", il);
13773
14842
 
13774
- ggml_build_forward_expand(gf, cur);
14843
+ return cur;
13775
14844
  }
13776
14845
  };
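Note: llm_build_granite_hybrid above routes each layer either through build_mamba2_layer() (recurrent state) or through the same build_attention_layer() used by plain Granite, keyed on hparams.is_recurrent(il). A stand-alone sketch of that per-layer dispatch; the layer schedule below is invented, not read from any model:

    #include <cstdio>

    // hypothetical stand-in for hparams.is_recurrent(il):
    // pretend every fourth layer is attention, the rest are Mamba-2
    static bool is_recurrent(int il) { return (il % 4) != 3; }

    static const char * build_layer(int il) {
        // mirrors the if/else in llm_build_granite_hybrid
        return is_recurrent(il) ? "mamba2 layer (recurrent state)"
                                : "attention layer (unified KV cache)";
    }

    int main() {
        const int n_layer = 8;              // made-up depth
        for (int il = 0; il < n_layer; ++il) {
            std::printf("layer %d -> %s\n", il, build_layer(il));
        }
    }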
13777
14846
 
@@ -13782,7 +14851,7 @@ struct llm_build_granite : public llm_graph_context {
13782
14851
  // * removed bias
13783
14852
  // * removed MoE
13784
14853
  struct llm_build_chameleon : public llm_graph_context {
13785
- llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
14854
+ llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
13786
14855
  const int64_t n_embd_head = hparams.n_embd_head_v;
13787
14856
 
13788
14857
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -13873,7 +14942,7 @@ struct llm_build_chameleon : public llm_graph_context {
13873
14942
  cb(Kcur, "Kcur", il);
13874
14943
  cb(Vcur, "Vcur", il);
13875
14944
 
13876
- cur = build_attn(inp_attn, gf,
14945
+ cur = build_attn(inp_attn,
13877
14946
  model.layers[il].wo, nullptr,
13878
14947
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13879
14948
  }
@@ -13959,7 +15028,7 @@ struct llm_build_chameleon : public llm_graph_context {
13959
15028
  };
13960
15029
 
13961
15030
  struct llm_build_wavtokenizer_dec : public llm_graph_context {
13962
- llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15031
+ llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
13963
15032
  ggml_tensor * cur;
13964
15033
  ggml_tensor * inpL;
13965
15034
 
@@ -14111,7 +15180,7 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {
14111
15180
  };
14112
15181
 
14113
15182
  struct llm_build_plm : public llm_graph_context {
14114
- llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15183
+ llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
14115
15184
  const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
14116
15185
 
14117
15186
  const uint32_t n_embd_head_qk_rope = hparams.n_rot;
@@ -14229,7 +15298,7 @@ struct llm_build_plm : public llm_graph_context {
14229
15298
  ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
14230
15299
  cb(k_states, "k_states", il);
14231
15300
 
14232
- cur = build_attn(inp_attn, gf,
15301
+ cur = build_attn(inp_attn,
14233
15302
  model.layers[il].wo, NULL,
14234
15303
  q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
14235
15304
  }
@@ -14283,7 +15352,7 @@ struct llm_build_plm : public llm_graph_context {
14283
15352
  };
14284
15353
 
14285
15354
  struct llm_build_bailingmoe : public llm_graph_context {
14286
- llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15355
+ llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
14287
15356
  ggml_tensor * cur;
14288
15357
  ggml_tensor * inpL;
14289
15358
 
@@ -14352,7 +15421,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
14352
15421
  cb(Kcur, "Kcur", il);
14353
15422
  cb(Vcur, "Vcur", il);
14354
15423
 
14355
- cur = build_attn(inp_attn, gf,
15424
+ cur = build_attn(inp_attn,
14356
15425
  model.layers[il].wo, model.layers[il].bo,
14357
15426
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
14358
15427
  }
@@ -14427,7 +15496,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
14427
15496
  };
14428
15497
 
14429
15498
  struct llm_build_dots1 : public llm_graph_context {
14430
- llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15499
+ llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
14431
15500
  const int64_t n_embd_head = hparams.n_embd_head_v;
14432
15501
 
14433
15502
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -14492,7 +15561,7 @@ struct llm_build_dots1 : public llm_graph_context {
14492
15561
  cb(Kcur, "Kcur", il);
14493
15562
  cb(Vcur, "Vcur", il);
14494
15563
 
14495
- cur = build_attn(inp_attn, gf,
15564
+ cur = build_attn(inp_attn,
14496
15565
  model.layers[il].wo, model.layers[il].bo,
14497
15566
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14498
15567
  }
@@ -14577,7 +15646,7 @@ struct llm_build_dots1 : public llm_graph_context {
14577
15646
  };
14578
15647
 
14579
15648
  struct llm_build_ernie4_5 : public llm_graph_context {
14580
- llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15649
+ llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
14581
15650
  const int64_t n_embd_head = hparams.n_embd_head_v;
14582
15651
 
14583
15652
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -14647,7 +15716,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
14647
15716
  cb(Kcur, "Kcur", il);
14648
15717
  cb(Vcur, "Vcur", il);
14649
15718
 
14650
- cur = build_attn(inp_attn, gf,
15719
+ cur = build_attn(inp_attn,
14651
15720
  model.layers[il].wo, NULL,
14652
15721
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14653
15722
  }
@@ -14706,10 +15775,178 @@ struct llm_build_ernie4_5 : public llm_graph_context {
14706
15775
  }
14707
15776
  };
14708
15777
 
14709
- struct llm_build_falcon_h1 : public llm_graph_context {
14710
- const llama_model & model;
15778
+ struct llm_build_ernie4_5_moe : public llm_graph_context {
15779
+ llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
15780
+ const int64_t n_embd_head = hparams.n_embd_head_v;
15781
+
15782
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
15783
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
15784
+
15785
+ ggml_tensor * cur;
15786
+ ggml_tensor * inpL;
15787
+
15788
+ inpL = build_inp_embd(model.tok_embd);
15789
+
15790
+ // inp_pos - contains the positions
15791
+ ggml_tensor * inp_pos = build_inp_pos();
15792
+
15793
+ auto * inp_attn = build_attn_inp_kv_unified();
15794
+
15795
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
15796
+
15797
+ GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Ernie 4.5 MoE requires n_moe_layer_step > 0");
15798
+ for (int il = 0; il < n_layer; ++il) {
15799
+ ggml_tensor * inpSA = inpL;
15800
+ // norm
15801
+ {
15802
+ cur = build_norm(inpL,
15803
+ model.layers[il].attn_norm, NULL,
15804
+ LLM_NORM_RMS, il);
15805
+ cb(cur, "attn_norm", il);
15806
+ }
15807
+
15808
+ // self-attention
15809
+ {
15810
+ // compute Q and K and RoPE them
15811
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
15812
+ cb(Qcur, "Qcur", il);
15813
+ if (model.layers[il].bq) {
15814
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
15815
+ cb(Qcur, "Qcur", il);
15816
+ }
15817
+
15818
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
15819
+ cb(Kcur, "Kcur", il);
15820
+ if (model.layers[il].bk) {
15821
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
15822
+ cb(Kcur, "Kcur", il);
15823
+ }
15824
+
15825
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
15826
+ cb(Vcur, "Vcur", il);
15827
+ if (model.layers[il].bv) {
15828
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
15829
+ cb(Vcur, "Vcur", il);
15830
+ }
15831
+
15832
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
15833
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
15834
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
15835
+
15836
+ Qcur = ggml_rope_ext(
15837
+ ctx0, Qcur, inp_pos, nullptr,
15838
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15839
+ ext_factor, attn_factor, beta_fast, beta_slow
15840
+ );
15841
+
15842
+ Kcur = ggml_rope_ext(
15843
+ ctx0, Kcur, inp_pos, nullptr,
15844
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15845
+ ext_factor, attn_factor, beta_fast, beta_slow
15846
+ );
15847
+
15848
+ cb(Qcur, "Qcur", il);
15849
+ cb(Kcur, "Kcur", il);
15850
+ cb(Vcur, "Vcur", il);
15851
+
15852
+ cur = build_attn(inp_attn,
15853
+ model.layers[il].wo, NULL,
15854
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
15855
+ cb(cur, "attn_out", il);
15856
+ }
15857
+
15858
+ if (il == n_layer - 1 && inp_out_ids) {
15859
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
15860
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
15861
+ }
15862
+
15863
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
15864
+ cb(ffn_inp, "ffn_inp", il);
15865
+
15866
+ // feed-forward network
15867
+ bool is_moe_layer = static_cast<uint32_t>(il) >= hparams.n_layer_dense_lead && (il + 1) % hparams.n_moe_layer_step == 0;
15868
+
15869
+ if (!is_moe_layer) {
15870
+ cur = build_norm(ffn_inp,
15871
+ model.layers[il].ffn_norm, NULL,
15872
+ LLM_NORM_RMS, il);
15873
+ cb(cur, "ffn_norm", il);
15874
+
15875
+ cur = build_ffn(cur,
15876
+ model.layers[il].ffn_up, NULL, NULL,
15877
+ model.layers[il].ffn_gate, NULL, NULL,
15878
+ model.layers[il].ffn_down, NULL, NULL,
15879
+ NULL,
15880
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
15881
+ cb(cur, "ffn_out", il);
15882
+ } else {
15883
+ // MoE branch
15884
+ cur = build_norm(ffn_inp,
15885
+ model.layers[il].ffn_norm, NULL,
15886
+ LLM_NORM_RMS, il);
15887
+ cb(cur, "ffn_norm", il);
15888
+
15889
+ ggml_tensor * moe_out = build_moe_ffn(cur,
15890
+ model.layers[il].ffn_gate_inp,
15891
+ model.layers[il].ffn_up_exps,
15892
+ model.layers[il].ffn_gate_exps,
15893
+ model.layers[il].ffn_down_exps,
15894
+ model.layers[il].ffn_exp_probs_b,
15895
+ n_expert, n_expert_used,
15896
+ LLM_FFN_SILU, true,
15897
+ false, 0.0,
15898
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
15899
+ il);
15900
+ cb(moe_out, "ffn_moe_out", il);
15901
+
15902
+ // Shared expert (if present)
15903
+ if (hparams.n_ff_shexp > 0) {
15904
+ ggml_tensor * ffn_shexp = build_ffn(cur,
15905
+ model.layers[il].ffn_up_shexp, NULL, NULL,
15906
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
15907
+ model.layers[il].ffn_down_shexp, NULL, NULL,
15908
+ NULL,
15909
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
15910
+ cb(ffn_shexp, "ffn_shexp", il);
15911
+
15912
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
15913
+ } else {
15914
+ cur = moe_out;
15915
+ }
15916
+ cb(cur, "ffn_out", il);
15917
+ }
15918
+
15919
+ cur = ggml_add(ctx0, cur, ffn_inp);
15920
+ cb(cur, "ffn_out", il);
15921
+
15922
+ cur = build_cvec(cur, il);
15923
+ cb(cur, "l_out", il);
15924
+
15925
+ // input for next layer
15926
+ inpL = cur;
15927
+ }
15928
+
15929
+ cur = inpL;
15930
+
15931
+ cur = build_norm(cur,
15932
+ model.output_norm, NULL,
15933
+ LLM_NORM_RMS, -1);
15934
+
15935
+ cb(cur, "result_norm", -1);
15936
+ res->t_embd = cur;
15937
+
15938
+ // lm_head
15939
+ cur = build_lora_mm(model.output, cur);
15940
+
15941
+ cb(cur, "result_output", -1);
15942
+ res->t_logits = cur;
14711
15943
 
14712
- llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
15944
+ ggml_build_forward_expand(gf, cur);
15945
+ }
15946
+ };
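Note: the new llm_build_ernie4_5_moe above decides per layer whether to take the dense FFN or the MoE branch: a layer uses MoE once it is past n_layer_dense_lead and its 1-based index is a multiple of n_moe_layer_step. A stand-alone sketch of that schedule with invented values:

    #include <cstdint>
    #include <cstdio>

    // same predicate as `is_moe_layer` in the diff
    static bool is_moe_layer(int il, uint32_t n_layer_dense_lead, uint32_t n_moe_layer_step) {
        return (uint32_t) il >= n_layer_dense_lead && (il + 1) % n_moe_layer_step == 0;
    }

    int main() {
        const int      n_layer            = 12;  // all values below are made up
        const uint32_t n_layer_dense_lead = 2;
        const uint32_t n_moe_layer_step   = 2;

        for (int il = 0; il < n_layer; ++il) {
            std::printf("layer %2d -> %s\n", il,
                        is_moe_layer(il, n_layer_dense_lead, n_moe_layer_step) ? "MoE" : "dense");
        }
    }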
15947
+
15948
+ struct llm_build_falcon_h1 : public llm_graph_context_mamba {
15949
+ llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
14713
15950
  const int64_t n_embd_head = hparams.n_embd_head_v;
14714
15951
 
14715
15952
  ggml_tensor * cur;
@@ -14765,210 +16002,389 @@ struct llm_build_falcon_h1 : public llm_graph_context {
14765
16002
  cb(Kcur, "Kcur-post-rope", il);
14766
16003
  cb(Vcur, "Vcur-post-rope", il);
14767
16004
 
14768
- ggml_tensor * attn_out = build_attn(inp, gf,
16005
+ ggml_tensor * attn_out = build_attn(inp->get_attn(),
14769
16006
  model.layers[il].wo, NULL,
14770
16007
  Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
14771
16008
  cb(attn_out, "attn_out", il);
14772
16009
 
14773
- cur = build_norm(inpL,
14774
- model.layers[il].attn_norm, NULL,
14775
- LLM_NORM_RMS, il);
14776
- // Mamba2 layer
14777
- cb(cur, "ssm_in", il);
16010
+ cur = build_norm(inpL,
16011
+ model.layers[il].attn_norm, NULL,
16012
+ LLM_NORM_RMS, il);
16013
+ // Mamba2 layer
16014
+ cb(cur, "ssm_in", il);
16015
+
16016
+ ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
16017
+ cb(ssm_out, "ssm_out", il);
16018
+
16019
+ // // Aggregation
16020
+ cur = ggml_add(ctx0, attn_out, ssm_out);
16021
+ inpSA = ggml_add(ctx0, cur, inpSA);
16022
+ cb(cur, "layer_out", il);
16023
+
16024
+ if (il == n_layer - 1 && inp_out_ids) {
16025
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
16026
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
16027
+ }
16028
+
16029
+ ggml_tensor * ffn_inp = inpSA;
16030
+ cb(ffn_inp, "ffn_inp", il);
16031
+
16032
+ // feed-forward network
16033
+ cur = build_norm(ffn_inp,
16034
+ model.layers[il].ffn_norm, NULL,
16035
+ LLM_NORM_RMS, il);
16036
+ cb(cur, "ffn_norm", il);
16037
+
16038
+ cur = build_ffn(cur,
16039
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
16040
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
16041
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
16042
+ NULL,
16043
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
16044
+ cb(cur, "ffn_out", il);
16045
+
16046
+ cur = ggml_add(ctx0, cur, inpSA);
16047
+
16048
+ cur = build_cvec(cur, il);
16049
+ cb(cur, "l_out", il);
16050
+
16051
+ // input for next layer
16052
+ inpL = cur;
16053
+ }
16054
+
16055
+ cur = inpL;
16056
+
16057
+ cur = build_norm(cur,
16058
+ model.output_norm, NULL,
16059
+ LLM_NORM_RMS, -1);
16060
+
16061
+ cb(cur, "result_norm", -1);
16062
+ res->t_embd = cur;
16063
+
16064
+ // lm_head
16065
+ cur = build_lora_mm(model.output, cur);
16066
+
16067
+ cb(cur, "result_output", -1);
16068
+ res->t_logits = cur;
16069
+
16070
+ ggml_build_forward_expand(gf, cur);
16071
+ }
16072
+ };
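Note: unlike the granite hybrid, llm_build_falcon_h1 above runs the attention branch and the Mamba-2 branch on every layer and sums the two outputs before the residual add (the "Aggregation" step in the diff). A tiny stand-alone sketch with plain arrays standing in for tensors; all numbers are invented:

    #include <cstdio>

    int main() {
        const int n_embd = 4;                                 // made-up width
        float residual [n_embd] = {1.0f, 1.0f, 1.0f, 1.0f};   // layer input (inpSA)
        float attn_out [n_embd] = {0.1f, 0.2f, 0.3f, 0.4f};   // attention branch
        float ssm_out  [n_embd] = {0.5f, 0.4f, 0.3f, 0.2f};   // Mamba-2 branch

        float layer_out[n_embd];
        for (int i = 0; i < n_embd; ++i) {
            layer_out[i]  = attn_out[i] + ssm_out[i];         // aggregation add
            layer_out[i] += residual[i];                      // residual connection
        }

        for (int i = 0; i < n_embd; ++i) {
            std::printf("%.2f ", layer_out[i]);               // 1.60 1.60 1.60 1.60
        }
        std::printf("\n");
    }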
16073
+
16074
+ struct llm_build_plamo2 : public llm_graph_context_mamba {
16075
+ llm_build_plamo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
16076
+ ggml_tensor * cur;
16077
+ ggml_tensor * inpL;
16078
+
16079
+ // {n_embd, n_tokens}
16080
+ inpL = build_inp_embd(model.tok_embd);
16081
+ cb(inpL, "embedding_output", -1);
16082
+
16083
+ ggml_tensor * inp_pos = build_inp_pos();
16084
+
16085
+ auto * inp_hybrid = build_inp_mem_hybrid();
16086
+
16087
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
16088
+
16089
+ for (int il = 0; il < n_layer; ++il) {
16090
+ ggml_tensor * residual = inpL;
16091
+
16092
+ // ggml_graph_add_node(gf, model.layers[il].attn_norm);
16093
+ // cb(model.layers[il].attn_norm, "attn_norm", il);
14778
16094
 
14779
- ggml_tensor * ssm_out = build_mamba2_layer(inp, gf, cur, ubatch, il);
14780
- cb(ssm_out, "ssm_out", il);
16095
+ // pre_mixer_norm
16096
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
14781
16097
 
14782
- // // Aggregation
14783
- cur = ggml_add(ctx0, attn_out, ssm_out);
14784
- inpSA = ggml_add(ctx0, cur, inpSA);
14785
- cb(cur, "layer_out", il);
16098
+ // check if this layer is Mamba or Attention
16099
+ bool is_mamba_layer = hparams.is_recurrent(il);
14786
16100
 
14787
- if (il == n_layer - 1 && inp_out_ids) {
14788
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14789
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
16101
+ if (is_mamba_layer) {
16102
+ // PLaMo-2 Mamba layer
16103
+ cur = build_plamo2_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
16104
+ } else {
16105
+ // PLaMo-2 Attention layer
16106
+ cur = build_plamo2_attn_layer(inp_hybrid->get_attn(), inp_pos, cur, model, il);
14790
16107
  }
14791
16108
 
14792
- ggml_tensor * ffn_inp = inpSA;
14793
- cb(ffn_inp, "ffn_inp", il);
16109
+ // post_mixer_norm
16110
+ cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
16111
+ cb(cur, "attn_post_norm", il);
14794
16112
 
14795
- // feed-forward network
14796
- cur = build_norm(ffn_inp,
14797
- model.layers[il].ffn_norm, NULL,
14798
- LLM_NORM_RMS, il);
14799
- cb(cur, "ffn_norm", il);
16113
+ // residual connection
16114
+ cur = ggml_add(ctx0, cur, residual);
16115
+ cb(cur, "attn_residual", il);
16116
+ residual = cur;
16117
+
16118
+ // pre-ffn norm
16119
+ cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
16120
+ cb(cur, "ffn_pre_norm", il);
14800
16121
 
16122
+ // feed-forward network
14801
16123
  cur = build_ffn(cur,
14802
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
14803
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
14804
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
16124
+ model.layers[il].ffn_up, NULL, NULL,
16125
+ NULL, NULL, NULL,
16126
+ model.layers[il].ffn_down, NULL, NULL,
14805
16127
  NULL,
14806
- LLM_FFN_SILU, LLM_FFN_PAR, il);
16128
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
14807
16129
  cb(cur, "ffn_out", il);
14808
16130
 
14809
- cur = ggml_add(ctx0, cur, inpSA);
16131
+ // post ffn norm
16132
+ cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
16133
+ cb(cur, "ffn_post_norm", il);
14810
16134
 
14811
- cur = build_cvec(cur, il);
14812
- cb(cur, "l_out", il);
16135
+ if (il == n_layer - 1 && inp_out_ids) {
16136
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
16137
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
16138
+ }
16139
+
16140
+ // residual connection
16141
+ cur = ggml_add(ctx0, cur, residual);
16142
+ cb(cur, "ffn_residual", il);
14813
16143
 
14814
- // input for next layer
14815
16144
  inpL = cur;
14816
16145
  }
14817
16146
 
14818
16147
  cur = inpL;
14819
16148
 
14820
- cur = build_norm(cur,
14821
- model.output_norm, NULL,
14822
- LLM_NORM_RMS, -1);
14823
-
16149
+ // final norm
16150
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
14824
16151
  cb(cur, "result_norm", -1);
14825
- res->t_embd = cur;
14826
16152
 
14827
16153
  // lm_head
14828
16154
  cur = build_lora_mm(model.output, cur);
14829
-
14830
16155
  cb(cur, "result_output", -1);
16156
+
16157
+ // Explicitly mark as output tensor to ensure proper backend assignment
16158
+ ggml_set_output(cur);
16159
+
14831
16160
  res->t_logits = cur;
14832
16161
 
14833
16162
  ggml_build_forward_expand(gf, cur);
14834
16163
  }
14835
16164
 
14836
- ggml_tensor * build_mamba2_layer(
14837
- llm_graph_input_mem_hybrid * inp,
14838
- ggml_cgraph * gf,
14839
- ggml_tensor * cur,
14840
- const llama_ubatch & ubatch,
14841
- int il) const {
14842
- const auto * kv_state = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
16165
+ private:
16166
+ ggml_tensor * build_plamo2_attn_layer(
16167
+ llm_graph_input_attn_kv_unified * inp,
16168
+ ggml_tensor * inp_pos,
16169
+ ggml_tensor * cur,
16170
+ const llama_model & model,
16171
+ int il) {
14843
16172
 
14844
- const auto kv_head = kv_state->get_head();
16173
+ // self-attention
16174
+ {
16175
+ // PLaMo-2 uses combined QKV tensor
16176
+ ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
16177
+ cb(qkv, "qkv", il);
14845
16178
 
14846
- const int64_t d_conv = hparams.ssm_d_conv;
14847
- const int64_t d_inner = hparams.ssm_d_inner;
14848
- const int64_t d_state = hparams.ssm_d_state;
14849
- const int64_t n_head = hparams.ssm_dt_rank;
14850
- const int64_t head_dim = d_inner / n_head;
14851
- const int64_t n_group = hparams.ssm_n_group;
14852
- const int64_t n_seqs = ubatch.n_seqs;
16179
+ // split QKV tensor into Q, K, V
16180
+ const int64_t n_embd_head_q = hparams.n_embd_head_k;
16181
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
16182
+ const int64_t n_embd_head_v = hparams.n_embd_head_v;
16183
+ int32_t n_head_kv = hparams.n_head_kv(il);
16184
+
16185
+ const int64_t q_offset = 0;
16186
+ const int64_t k_offset = n_embd_head_q * n_head;
16187
+ const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;
16188
+
16189
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
16190
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
16191
+ ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv)));
16192
+
16193
+ cb(Qcur, "Qcur", il);
16194
+ cb(Kcur, "Kcur", il);
16195
+ cb(Vcur, "Vcur", il);
16196
+
16197
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
16198
+
16199
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
16200
+ cb(Qcur, "Qcur_normed", il);
16201
+
16202
+ Qcur = ggml_rope_ext(
16203
+ ctx0, Qcur, inp_pos, nullptr,
16204
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
16205
+ ext_factor, attn_factor, beta_fast, beta_slow
16206
+ );
16207
+
16208
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
16209
+ cb(Kcur, "Kcur_normed", il);
16210
+
16211
+ Kcur = ggml_rope_ext(
16212
+ ctx0, Kcur, inp_pos, nullptr,
16213
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
16214
+ ext_factor, attn_factor, beta_fast, beta_slow
16215
+ );
16216
+
16217
+ cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f, il);
16218
+ }
16219
+
16220
+ cb(cur, "attn_out", il);
16221
+
16222
+ return cur;
16223
+ }
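The attention builder above carves Q, K and V out of one fused QKV projection purely by element offset: q starts at 0, k follows the query heads, v follows the key heads. A self-contained sketch of the same offset arithmetic on a flat buffer, using made-up head counts rather than the model's real hparams:

    // standalone illustration of the fused-QKV split performed with ggml views above
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_embd_head = 4;   // per-head dimension (toy value)
        const int n_head      = 3;   // query heads
        const int n_head_kv   = 1;   // key/value heads

        const int q_elems = n_embd_head * n_head;     // q_offset = 0
        const int k_elems = n_embd_head * n_head_kv;  // k_offset = q_elems
        const int v_elems = n_embd_head * n_head_kv;  // v_offset = q_elems + k_elems

        std::vector<float> qkv(q_elems + k_elems + v_elems);
        for (size_t i = 0; i < qkv.size(); ++i) qkv[i] = (float) i;

        const float * Q = qkv.data();
        const float * K = qkv.data() + q_elems;
        const float * V = qkv.data() + q_elems + k_elems;

        printf("Q[0]=%.0f K[0]=%.0f V[0]=%.0f\n", Q[0], K[0], V[0]); // 0, 12, 16
        return 0;
    }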
16224
+
16225
+ ggml_tensor * build_plamo2_mamba_layer(
16226
+ llm_graph_input_rs * inp,
16227
+ ggml_tensor * cur,
16228
+ const llama_model & model,
16229
+ const llama_ubatch & ubatch,
16230
+ int il) {
16231
+
16232
+ const auto * mctx_cur = inp->mctx;
16233
+
16234
+ const auto kv_head = mctx_cur->get_head();
16235
+
16236
+ const int64_t d_conv = hparams.ssm_d_conv;
16237
+ const int64_t d_inner = hparams.ssm_d_inner;
16238
+ const int64_t d_state = hparams.ssm_d_state;
16239
+ const int64_t n_heads = hparams.ssm_dt_rank;
16240
+ const int64_t head_dim = d_inner / n_heads;
16241
+ const int64_t n_group = hparams.ssm_n_group;
16242
+ const int64_t n_seqs = ubatch.n_seqs;
14853
16243
 
14854
16244
  const int64_t n_seq_tokens = ubatch.n_seq_tokens;
14855
16245
 
14856
16246
  GGML_ASSERT(n_seqs != 0);
14857
- GGML_ASSERT(ubatch.equal_seqs);
16247
+ GGML_ASSERT(ubatch.equal_seqs());
14858
16248
  GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
14859
16249
 
14860
- ggml_tensor * conv_states_all = kv_state->get_r_l(il);
14861
- ggml_tensor * ssm_states_all = kv_state->get_s_l(il);
16250
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
16251
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
14862
16252
 
14863
- ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
16253
+ ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
14864
16254
  conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
14865
16255
 
14866
16256
  // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
14867
16257
  cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
14868
16258
 
14869
- // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
14870
-
14871
- // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
14872
- ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
14873
- cb(zxBCdt, "zxBCdt", il);
14874
-
14875
- // split the above in three
14876
- ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0);
14877
- ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt));
14878
- ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt));
14879
-
14880
- // conv
16259
+ // in_proj: {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
16260
+ ggml_tensor * zx = build_lora_mm(model.layers[il].ssm_in, cur);
16261
+ cb(zx, "mamba_in_proj", il);
16262
+ // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
16263
+ zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
16264
+ zx = ggml_cont(ctx0, zx);
16265
+ zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
16266
+ cb(zx, "mamba_in_proj_out", il);
16267
+
16268
+ // split into z and x
16269
+ // => {head_dim * n_heads, n_seq_tokens, n_seqs}
16270
+ ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
16271
+ x = ggml_cont(ctx0, x);
16272
+ x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
16273
+ // x = ggml_permute(ctx0, x, 0, 2, 1, 3);
16274
+ cb(x, "mamba_x_split", il);
16275
+
16276
+ ggml_tensor * z = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], 0);
16277
+ cb(z, "mamba_z_split", il);
16278
+
16279
+ // conv1d
14881
16280
  {
14882
- // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
14883
- ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
16281
+ // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
16282
+ ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
16283
+ cb(conv_x, "mamba_conv1d_input", il);
14884
16284
 
14885
16285
  // copy last (d_conv - 1) columns back into the state cache
14886
- ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
16286
+ ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs,
16287
+ conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
14887
16288
 
14888
16289
  ggml_build_forward_expand(gf,
14889
16290
  ggml_cpy(ctx0, last_conv,
14890
16291
  ggml_view_1d(ctx0, conv_states_all,
14891
- (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
14892
- kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
16292
+ (d_conv - 1)*(d_inner)*(n_seqs),
16293
+ kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
14893
16294
 
14894
16295
  // 1D convolution
14895
- // The equivalent is to make a self-overlapping view of conv_x
14896
- // over d_conv columns at each stride in the 3rd dimension,
14897
- // then element-wise multiply that with the conv1d weight,
14898
- // then sum the elements of each row,
14899
- // (the last two steps are a dot product over rows (also doable with mul_mat))
14900
- // then permute away the ne[0] dimension,
14901
- // and then you're left with the resulting x tensor.
14902
- // For simultaneous sequences, all sequences need to have the same length.
14903
- xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
14904
-
14905
- // bias
14906
- xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
16296
+ x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
16297
+ cb(x, "mamba_conv1d", il);
14907
16298
 
14908
- xBC = ggml_silu(ctx0, xBC);
16299
+ x = ggml_silu(ctx0, x);
16300
+ cb(x, "mamba_conv1d_silu", il);
14909
16301
  }
14910
16302
 
14911
- // ssm
16303
+ // SSM
14912
16304
  {
14913
- // These correspond to V K Q in SSM/attention duality
14914
- ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0);
14915
-
14916
- ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC));
14917
-
14918
- ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC));
16305
+ // bcdt_proj: {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
16306
+ ggml_tensor * x_bcdt = build_lora_mm(model.layers[il].ssm_x, x);
16307
+ cb(x_bcdt, "mamba_bcdt_proj", il);
16308
+
16309
+ // split into dt, B, C
16310
+ const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
16311
+ ggml_tensor * B = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], 0);
16312
+ ggml_tensor * C = ggml_view_3d(ctx0, x_bcdt, d_state, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*d_state);
16313
+ ggml_tensor * dt = ggml_view_3d(ctx0, x_bcdt, dt_dim, n_seq_tokens, n_seqs, x_bcdt->nb[1], x_bcdt->nb[2], ggml_element_size(x_bcdt)*(2*d_state));
16314
+ cb(B, "mamba_B_raw", il);
16315
+ cb(C, "mamba_C_raw", il);
16316
+ cb(dt, "mamba_dt_raw", il);
16317
+
16318
+ // Apply RMS norm to dt, B, C (PLaMo-2 specific)
16319
+ B = build_norm(B, model.layers[il].ssm_b_norm, NULL, LLM_NORM_RMS, il);
16320
+ C = build_norm(C, model.layers[il].ssm_c_norm, NULL, LLM_NORM_RMS, il);
16321
+ dt = build_norm(dt, model.layers[il].ssm_dt_norm, NULL, LLM_NORM_RMS, il);
16322
+ cb(B, "mamba_B_normed", il);
16323
+ cb(C, "mamba_C_normed", il);
16324
+ cb(dt, "mamba_dt_normed", il);
16325
+
16326
+ // dt_proj: {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
16327
+ dt = build_lora_mm(model.layers[il].ssm_dt, dt);
16328
+ dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
16329
+ cb(dt, "mamba_dt_proj", il);
14919
16330
 
14920
- // {n_head, n_seq_tokens, n_seqs}
14921
- dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
16331
+ ggml_tensor * A = ggml_reshape_2d(ctx0, model.layers[il].ssm_a, 1, n_heads);
16332
+ cb(A, "mamba_A", il);
14922
16333
 
14923
- ggml_tensor * A = model.layers[il].ssm_a;
16334
+ x = ggml_view_4d(ctx0, x, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
16335
+ B = ggml_view_4d(ctx0, B, d_state, 1, n_seq_tokens, n_seqs, d_state * B->nb[0], B->nb[1], B->nb[2], 0);
16336
+ C = ggml_view_4d(ctx0, C, d_state, 1, n_seq_tokens, n_seqs, d_state * C->nb[0], C->nb[1], C->nb[2], 0);
14924
16337
 
14925
- // use the states and the indices provided by build_rs
16338
+ // use the states and the indices provided by build_rs
14926
16339
  // (this is necessary in order to properly use the states before they are overwritten,
14927
16340
  // while avoiding to make unnecessary copies of the states)
14928
16341
  auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
14929
- ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, kv_state->get_size());
16342
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_heads, mctx_cur->get_size());
14930
16343
 
14931
- // TODO: use semistructured matrices to implement state-space duality
16344
+ // Custom operator to optimize the parallel associative scan
16345
+ // as described in Annex D of the Mamba paper.
14932
16346
  // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
14933
16347
  return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
14934
16348
  };
14935
16349
 
14936
- ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
16350
+ ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
16351
+ cb(y_ssm, "mamba_ssm_scan", il);
14937
16352
 
14938
16353
  // store last states
14939
16354
  ggml_build_forward_expand(gf,
14940
16355
  ggml_cpy(ctx0,
14941
- ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
14942
- ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
16356
+ ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
16357
+ ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs,
16358
+ kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
14943
16359
 
14944
- ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0);
16360
+ ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
16361
+ cb(y, "mamba_y_view", il);
14945
16362
 
14946
- // TODO: skip computing output earlier for unused tokens
16363
+ // Add D parameter and apply gating with z
16364
+ // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
16365
+ ggml_tensor * D = ggml_reshape_2d(ctx0, model.layers[il].ssm_d, 1, n_heads);
16366
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, x, D));
16367
+ cb(y, "mamba_y_add_d", il);
14947
16368
 
14948
- y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
14949
16369
  y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
16370
+ cb(y, "mamba_y_swiglu_z", il);
14950
16371
 
14951
- // grouped RMS norm
14952
- if (model.layers[il].ssm_norm) {
14953
- y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
14954
- y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
14955
- }
14956
-
14957
- y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
14958
-
14959
- // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
16372
+ // out_proj: {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
16373
+ y = ggml_view_3d(ctx0, y, head_dim * n_heads, n_seq_tokens, n_seqs, y->nb[2], y->nb[3], 0);
14960
16374
  cur = build_lora_mm(model.layers[il].ssm_out, y);
16375
+ cb(cur, "mamba_out_proj", il);
14961
16376
  }
14962
16377
 
14963
16378
  // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
14964
16379
  cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
14965
16380
  cb(cur, "mamba_out", il);
16381
+
14966
16382
  return cur;
14967
16383
  }
14968
16384
  };
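The Mamba branch above funnels its work into ggml_ssm_scan. Stripped of heads, groups, and batching, the recurrence it evaluates per state element is the usual selective-SSM update; a scalar sketch with toy parameters (d_state = 1, nothing taken from a real checkpoint):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float A = -1.0f, B = 0.5f, C = 2.0f, D = 1.0f;  // toy scalars
        float h = 0.0f;                  // recurrent state (persisted in ssm_states_all)

        const float x[4]  = { 1.0f, 0.5f, -0.2f, 0.8f };  // post-conv activations
        const float dt[4] = { 0.1f, 0.2f,  0.1f, 0.3f };  // per-token step sizes

        for (int t = 0; t < 4; ++t) {
            h = std::exp(dt[t] * A) * h + dt[t] * B * x[t];  // discretized state update
            const float y = C * h + D * x[t];                // readout plus the skip (ssm_d) term
            printf("t=%d y=%f\n", t, y);
        }
        return 0;
    }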
14969
16385
 
14970
16386
  struct llm_build_arcee : public llm_graph_context {
14971
- llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
16387
+ llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
14972
16388
  const int64_t n_embd_head = hparams.n_embd_head_v;
14973
16389
 
14974
16390
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -15044,7 +16460,7 @@ struct llm_build_arcee : public llm_graph_context {
15044
16460
  cb(Kcur, "Kcur", il);
15045
16461
  cb(Vcur, "Vcur", il);
15046
16462
 
15047
- cur = build_attn(inp_attn, gf,
16463
+ cur = build_attn(inp_attn,
15048
16464
  model.layers[il].wo, model.layers[il].bo,
15049
16465
  Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15050
16466
  cb(cur, "attn_out", il);
@@ -15103,7 +16519,7 @@ struct llm_build_arcee : public llm_graph_context {
15103
16519
  };
15104
16520
 
15105
16521
  struct llm_build_hunyuan_moe : public llm_graph_context {
15106
- llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
16522
+ llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
15107
16523
  const int64_t n_embd_head = hparams.n_embd_head_v;
15108
16524
 
15109
16525
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -15189,7 +16605,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
15189
16605
  LLM_NORM_RMS, il);
15190
16606
  cb(Qcur, "Qcur_norm", il);
15191
16607
 
15192
- cur = build_attn(inp_attn, gf,
16608
+ cur = build_attn(inp_attn,
15193
16609
  model.layers[il].wo, model.layers[il].bo,
15194
16610
  Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15195
16611
  cb(cur, "attn_out", il);
@@ -15264,7 +16680,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
15264
16680
  };
15265
16681
 
15266
16682
  struct llm_build_smollm3 : public llm_graph_context {
15267
- llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
16683
+ llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
15268
16684
  const int64_t n_embd_head = hparams.n_embd_head_v;
15269
16685
 
15270
16686
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -15341,7 +16757,7 @@ struct llm_build_smollm3 : public llm_graph_context {
15341
16757
  cb(Kcur, "Kcur", il);
15342
16758
  cb(Vcur, "Vcur", il);
15343
16759
 
15344
- cur = build_attn(inp_attn, gf,
16760
+ cur = build_attn(inp_attn,
15345
16761
  model.layers[il].wo, model.layers[il].bo,
15346
16762
  Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15347
16763
  cb(cur, "attn_out", il);
@@ -15400,6 +16816,183 @@ struct llm_build_smollm3 : public llm_graph_context {
15400
16816
  }
15401
16817
  };
15402
16818
 
16819
+ struct llm_build_lfm2 : public llm_graph_context {
16820
+ const llama_model & model;
16821
+
16822
+ llm_build_lfm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) {
16823
+
16824
+ ggml_tensor * cur = build_inp_embd(model.tok_embd);
16825
+ cb(cur, "model.embed_tokens", -1);
16826
+
16827
+ ggml_tensor * inp_pos = build_inp_pos();
16828
+ auto * inp_hybrid = build_inp_mem_hybrid();
16829
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
16830
+
16831
+ for (int il = 0; il < n_layer; ++il) {
16832
+ auto * prev_cur = cur;
16833
+ cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
16834
+ cb(cur, "model.layers.{}.operator_norm", il);
16835
+
16836
+ cur = hparams.is_recurrent(il) ?
16837
+ build_shortconv_block(cur, inp_hybrid->get_recr(), il) :
16838
+ build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il) ;
16839
+
16840
+ if (il == n_layer - 1 && inp_out_ids) {
16841
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
16842
+ prev_cur = ggml_get_rows(ctx0, prev_cur, inp_out_ids);
16843
+ }
16844
+
16845
+ cur = ggml_add(ctx0, prev_cur, cur);
16846
+ cur = ggml_add(ctx0, cur, build_feed_forward(cur, il));
16847
+ }
16848
+
16849
+ cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
16850
+ cb(cur, "model.embedding_norm", -1);
16851
+ res->t_embd = cur;
16852
+
16853
+ // lm_head is tied with embeddings
16854
+ cur = build_lora_mm(model.tok_embd, cur);
16855
+ cb(cur, "lm_head", -1);
16856
+
16857
+ res->t_logits = cur;
16858
+
16859
+ ggml_build_forward_expand(gf, cur);
16860
+ }
16861
+
16862
+ ggml_tensor * build_feed_forward(ggml_tensor * cur,
16863
+ int il) const {
16864
+ cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
16865
+ cb(cur, "model.layers.{}.ffn_norm", il);
16866
+
16867
+ GGML_ASSERT(!model.layers[il].ffn_up_b);
16868
+ GGML_ASSERT(!model.layers[il].ffn_gate_b);
16869
+ GGML_ASSERT(!model.layers[il].ffn_down_b);
16870
+ cur = build_ffn(cur,
16871
+ model.layers[il].ffn_up, NULL, NULL,
16872
+ model.layers[il].ffn_gate, NULL, NULL,
16873
+ model.layers[il].ffn_down, NULL, NULL,
16874
+ NULL,
16875
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
16876
+ cb(cur, "model.layers.{}.feed_forward.w2", il);
16877
+
16878
+ return cur;
16879
+ }
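build_ffn with LLM_FFN_SILU/LLM_FFN_PAR computes the usual gated MLP, down(silu(gate(x)) * up(x)); the PLaMo-2 block earlier instead uses LLM_FFN_SWIGLU/LLM_FFN_SEQ, where the single fused up projection is split in half and one half gates the other. A scalar sketch of the gate itself, with toy pre-activations:

    #include <cmath>
    #include <cstdio>

    static float silu(float v) { return v / (1.0f + std::exp(-v)); }

    int main() {
        const float gate_x = 0.7f;  // gate(x), or the first half of a fused projection
        const float up_x   = 1.3f;  // up(x),   or the second half

        const float h = silu(gate_x) * up_x;  // what the down projection then consumes
        printf("gated hidden = %f\n", h);
        return 0;
    }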
16880
+
16881
+ ggml_tensor * build_attn_block(ggml_tensor * cur,
16882
+ ggml_tensor * inp_pos,
16883
+ llm_graph_input_attn_kv_unified * inp_attn,
16884
+ int il) const {
16885
+ GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
16886
+ auto const n_embd_head = hparams.n_embd_head_v;
16887
+ auto const n_head_kv = hparams.n_head_kv(il);
16888
+
16889
+ auto * q = build_lora_mm(model.layers[il].wq, cur);
16890
+ cb(q, "model.layers.{}.self_attn.q_proj", il);
16891
+ auto * k = build_lora_mm(model.layers[il].wk, cur);
16892
+ cb(k, "model.layers.{}.self_attn.k_proj", il);
16893
+ auto * v = build_lora_mm(model.layers[il].wv, cur);
16894
+ cb(v, "model.layers.{}.self_attn.v_proj", il);
16895
+
16896
+ q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
16897
+ k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
16898
+ v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
16899
+
16900
+ // qk norm
16901
+ q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
16902
+ cb(q, "model.layers.{}.self_attn.q_layernorm", il);
16903
+ k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
16904
+ cb(k, "model.layers.{}.self_attn.k_layernorm", il);
16905
+
16906
+ // RoPE
16907
+ q = ggml_rope_ext(
16908
+ ctx0, q, inp_pos, nullptr,
16909
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
16910
+ ext_factor, attn_factor, beta_fast, beta_slow
16911
+ );
16912
+ k = ggml_rope_ext(
16913
+ ctx0, k, inp_pos, nullptr,
16914
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
16915
+ ext_factor, attn_factor, beta_fast, beta_slow
16916
+ );
16917
+
16918
+ cur = build_attn(inp_attn, model.layers[il].wo, NULL,
16919
+ q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16920
+
16921
+ cb(cur, "model.layers.{}.self_attn.out_proj", il);
16922
+
16923
+ return cur;
16924
+ }
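Both the PLaMo-2 and LFM2 attention blocks apply an RMS QK-norm and then ggml_rope_ext. As a reminder of what the rotation does per channel pair, here is a minimal sketch of the base rotary formula, leaving out the frequency scaling, extension factors, and YaRN corrections that ggml_rope_ext also handles:

    #include <cmath>
    #include <cstdio>

    int main() {
        // rotate one channel pair (x0, x1) at token position `pos`
        const float freq_base = 10000.0f;  // toy value; the real one comes from the rope_freq_base hyperparameter
        const int   n_rot     = 64;        // rotated dimensions
        const int   i         = 0;         // pair index, 0 .. n_rot/2 - 1
        const int   pos       = 7;

        const float theta = pos * std::pow(freq_base, -2.0f * i / n_rot);

        const float x0 = 1.0f, x1 = 0.0f;  // toy inputs
        const float r0 = x0 * std::cos(theta) - x1 * std::sin(theta);
        const float r1 = x0 * std::sin(theta) + x1 * std::cos(theta);

        printf("rotated pair: (%f, %f)\n", r0, r1);
        return 0;
    }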
16925
+
16926
+ ggml_tensor * build_shortconv_block(ggml_tensor * cur,
16927
+ llm_graph_input_rs * inp_recr,
16928
+ int il) {
16929
+ const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
16930
+ const uint32_t kv_head = mctx_cur->get_head();
16931
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
16932
+ const int64_t n_seqs = ubatch.n_seqs;
16933
+ GGML_ASSERT(n_seqs != 0);
16934
+ GGML_ASSERT(ubatch.equal_seqs());
16935
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
16936
+
16937
+ GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
16938
+ const uint32_t d_conv = hparams.n_shortconv_l_cache - 1;
16939
+
16940
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
16941
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
16942
+
16943
+ auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
16944
+ cb(bcx, "model.layers.{}.conv.in_proj", il);
16945
+
16946
+ constexpr auto n_chunks = 3;
16947
+ GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
16948
+ auto const chunk_size = bcx->ne[0] / n_chunks;
16949
+ auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 0*chunk_size*ggml_element_size(bcx));
16950
+ auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 1*chunk_size*ggml_element_size(bcx));
16951
+ auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], 2*chunk_size*ggml_element_size(bcx));
16952
+
16953
+ auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
16954
+
16955
+ // read conv state
16956
+ auto * conv_state = mctx_cur->get_r_l(il);
16957
+ auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
16958
+ auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
16959
+
16960
+ bx = ggml_concat(ctx0, conv, bx, 0);
16961
+ GGML_ASSERT(bx->ne[0] > conv->ne[0]);
16962
+
16963
+ // the last d_conv columns are the new conv state
16964
+ auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2], (bx->ne[0] - conv->ne[0])*ggml_element_size(bx));
16965
+ GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
16966
+
16967
+ // write the new conv state
16968
+ ggml_build_forward_expand(
16969
+ gf,
16970
+ ggml_cpy(
16971
+ ctx0,
16972
+ new_conv,
16973
+ ggml_view_1d(
16974
+ ctx0,
16975
+ conv_state,
16976
+ ggml_nelements(new_conv),
16977
+ kv_head*d_conv*n_embd*ggml_element_size(new_conv)
16978
+ )
16979
+ )
16980
+ );
16981
+
16982
+ auto * conv_kernel = model.layers[il].shortconv.conv;
16983
+ auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
16984
+ cb(conv_out, "model.layers.{}.conv.conv", il);
16985
+
16986
+ auto * y = ggml_mul(ctx0, c, conv_out);
16987
+ y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
16988
+ cb(y, "model.layers.{}.conv.out_proj", il);
16989
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
16990
+ y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs);
16991
+
16992
+ return y;
16993
+ }
16994
+ };
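The shortconv block above is a gated causal convolution: the in_proj output is split into b, c, and x, the elementwise product b*x is run through a depthwise short convolution whose last d_conv columns are written back as the recurrent state, and the result is gated by c before out_proj. A toy single-channel sketch of that rolling-state convolution (made-up kernel of length 3, so the carried state holds the last two inputs):

    #include <cstdio>

    int main() {
        const float w[3]  = { 0.25f, 0.5f, 0.25f };      // made-up kernel weights
        float state[2]    = { 0.0f, 0.0f };              // analogue of the cached conv state

        const float bx[4] = { 1.0f, 2.0f, 3.0f, 4.0f };  // b * x, the gated input stream
        for (int t = 0; t < 4; ++t) {
            const float y = w[0] * state[0] + w[1] * state[1] + w[2] * bx[t];
            state[0] = state[1];                          // shift: this is what gets written back
            state[1] = bx[t];
            printf("t=%d conv=%f\n", t, y);
        }
        return 0;
    }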
16995
+
15403
16996
  llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
15404
16997
  llama_memory_i * res;
15405
16998
 
@@ -15412,6 +17005,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
15412
17005
  case LLM_ARCH_NOMIC_BERT_MOE:
15413
17006
  case LLM_ARCH_NEO_BERT:
15414
17007
  case LLM_ARCH_WAVTOKENIZER_DEC:
17008
+ case LLM_ARCH_DREAM:
15415
17009
  {
15416
17010
  res = nullptr;
15417
17011
  } break;
@@ -15452,7 +17046,18 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
15452
17046
  } else {
15453
17047
  const auto padding = llama_kv_cache_unified::get_padding(cparams);
15454
17048
 
15455
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
17049
+ uint32_t n_ctx_per_stream = cparams.n_ctx;
17050
+
17051
+ if (!cparams.kv_unified) {
17052
+ n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
17053
+ n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
17054
+
17055
+ cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max;
17056
+ } else {
17057
+ n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
17058
+
17059
+ cparams.n_ctx = n_ctx_per_stream;
17060
+ }
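For the non-unified path above, the requested context is divided across the maximum number of sequences, padded per stream, and then multiplied back out, so the effective total can grow slightly. A quick worked example with illustrative numbers (8192 context, 3 sequences, padding 256):

    #include <cstdio>

    #define PAD(x, n) (((x) + (n) - 1) / (n) * (n))  // same rounding as GGML_PAD

    int main() {
        const unsigned n_ctx = 8192, n_seq_max = 3, padding = 256;

        unsigned n_ctx_per_stream = (n_ctx + n_seq_max - 1) / n_seq_max;  // 2731
        n_ctx_per_stream          = PAD(n_ctx_per_stream, padding);       // 2816
        const unsigned n_ctx_new  = n_ctx_per_stream * n_seq_max;         // 8448

        printf("per stream: %u, total: %u\n", n_ctx_per_stream, n_ctx_new);
        return 0;
    }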
15456
17061
 
15457
17062
  LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
15458
17063
 
@@ -15466,7 +17071,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
15466
17071
  !cparams.flash_attn,
15467
17072
  cparams.offload_kqv,
15468
17073
  params.swa_full,
15469
- cparams.n_ctx,
17074
+ cparams.kv_unified,
17075
+ n_ctx_per_stream,
15470
17076
  cparams.n_seq_max,
15471
17077
  cparams.n_ubatch,
15472
17078
  padding);
@@ -15480,7 +17086,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
15480
17086
  params.type_v,
15481
17087
  !cparams.flash_attn,
15482
17088
  cparams.offload_kqv,
15483
- cparams.n_ctx,
17089
+ cparams.kv_unified,
17090
+ n_ctx_per_stream,
15484
17091
  cparams.n_seq_max,
15485
17092
  padding,
15486
17093
  hparams.n_swa,
@@ -15493,223 +17100,233 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
15493
17100
  return res;
15494
17101
  }
15495
17102
 
15496
- llm_graph_result_ptr llama_model::build_graph(
15497
- const llm_graph_params & params,
15498
- ggml_cgraph * gf,
15499
- llm_graph_type type) const {
17103
+ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
15500
17104
  std::unique_ptr<llm_graph_context> llm;
15501
17105
 
15502
17106
  switch (arch) {
15503
17107
  case LLM_ARCH_LLAMA:
15504
17108
  {
15505
- llm = std::make_unique<llm_build_llama>(*this, params, gf);
17109
+ llm = std::make_unique<llm_build_llama>(*this, params);
15506
17110
  } break;
15507
17111
  case LLM_ARCH_LLAMA4:
15508
17112
  {
15509
- llm = std::make_unique<llm_build_llama_iswa>(*this, params, gf);
17113
+ llm = std::make_unique<llm_build_llama_iswa>(*this, params);
15510
17114
  } break;
15511
17115
  case LLM_ARCH_DECI:
15512
17116
  {
15513
- llm = std::make_unique<llm_build_deci>(*this, params, gf);
17117
+ llm = std::make_unique<llm_build_deci>(*this, params);
15514
17118
  } break;
15515
17119
  case LLM_ARCH_BAICHUAN:
15516
17120
  {
15517
- llm = std::make_unique<llm_build_baichuan>(*this, params, gf);
17121
+ llm = std::make_unique<llm_build_baichuan>(*this, params);
15518
17122
  } break;
15519
17123
  case LLM_ARCH_FALCON:
15520
17124
  {
15521
- llm = std::make_unique<llm_build_falcon>(*this, params, gf);
17125
+ llm = std::make_unique<llm_build_falcon>(*this, params);
15522
17126
  } break;
15523
17127
  case LLM_ARCH_GROK:
15524
17128
  {
15525
- llm = std::make_unique<llm_build_grok>(*this, params, gf);
17129
+ llm = std::make_unique<llm_build_grok>(*this, params);
15526
17130
  } break;
15527
17131
  case LLM_ARCH_STARCODER:
15528
17132
  {
15529
- llm = std::make_unique<llm_build_starcoder>(*this, params, gf);
17133
+ llm = std::make_unique<llm_build_starcoder>(*this, params);
15530
17134
  } break;
15531
17135
  case LLM_ARCH_REFACT:
15532
17136
  {
15533
- llm = std::make_unique<llm_build_refact>(*this, params, gf);
17137
+ llm = std::make_unique<llm_build_refact>(*this, params);
15534
17138
  } break;
15535
17139
  case LLM_ARCH_BERT:
15536
17140
  case LLM_ARCH_JINA_BERT_V2:
15537
17141
  case LLM_ARCH_NOMIC_BERT:
15538
17142
  case LLM_ARCH_NOMIC_BERT_MOE:
15539
17143
  {
15540
- llm = std::make_unique<llm_build_bert>(*this, params, gf);
17144
+ llm = std::make_unique<llm_build_bert>(*this, params);
15541
17145
  } break;
15542
17146
  case LLM_ARCH_NEO_BERT:
15543
17147
  {
15544
- llm = std::make_unique<llm_build_neo_bert>(*this, params, gf);
17148
+ llm = std::make_unique<llm_build_neo_bert>(*this, params);
15545
17149
  } break;
15546
17150
  case LLM_ARCH_BLOOM:
15547
17151
  {
15548
- llm = std::make_unique<llm_build_bloom>(*this, params, gf);
17152
+ llm = std::make_unique<llm_build_bloom>(*this, params);
15549
17153
  } break;
15550
17154
  case LLM_ARCH_MPT:
15551
17155
  {
15552
- llm = std::make_unique<llm_build_mpt>(*this, params, gf);
17156
+ llm = std::make_unique<llm_build_mpt>(*this, params);
15553
17157
  } break;
15554
17158
  case LLM_ARCH_STABLELM:
15555
17159
  {
15556
- llm = std::make_unique<llm_build_stablelm>(*this, params, gf);
17160
+ llm = std::make_unique<llm_build_stablelm>(*this, params);
15557
17161
  } break;
15558
17162
  case LLM_ARCH_QWEN:
15559
17163
  {
15560
- llm = std::make_unique<llm_build_qwen>(*this, params, gf);
17164
+ llm = std::make_unique<llm_build_qwen>(*this, params);
15561
17165
  } break;
15562
17166
  case LLM_ARCH_QWEN2:
15563
17167
  {
15564
- llm = std::make_unique<llm_build_qwen2>(*this, params, gf);
17168
+ llm = std::make_unique<llm_build_qwen2>(*this, params);
15565
17169
  } break;
17170
+ case LLM_ARCH_DREAM:
17171
+ {
17172
+ llm = std::make_unique<llm_build_dream>(*this, params);
17173
+ }
17174
+ break;
15566
17175
  case LLM_ARCH_QWEN2VL:
15567
17176
  {
15568
- llm = std::make_unique<llm_build_qwen2vl>(*this, params, gf);
17177
+ llm = std::make_unique<llm_build_qwen2vl>(*this, params);
15569
17178
  } break;
15570
17179
  case LLM_ARCH_QWEN2MOE:
15571
17180
  {
15572
- llm = std::make_unique<llm_build_qwen2moe>(*this, params, gf);
17181
+ llm = std::make_unique<llm_build_qwen2moe>(*this, params);
15573
17182
  } break;
15574
17183
  case LLM_ARCH_QWEN3:
15575
17184
  {
15576
- llm = std::make_unique<llm_build_qwen3>(*this, params, gf);
17185
+ llm = std::make_unique<llm_build_qwen3>(*this, params);
15577
17186
  } break;
15578
17187
  case LLM_ARCH_QWEN3MOE:
15579
17188
  {
15580
- llm = std::make_unique<llm_build_qwen3moe>(*this, params, gf);
17189
+ llm = std::make_unique<llm_build_qwen3moe>(*this, params);
15581
17190
  } break;
15582
17191
  case LLM_ARCH_PHI2:
15583
17192
  {
15584
- llm = std::make_unique<llm_build_phi2>(*this, params, gf);
17193
+ llm = std::make_unique<llm_build_phi2>(*this, params);
15585
17194
  } break;
15586
17195
  case LLM_ARCH_PHI3:
15587
17196
  case LLM_ARCH_PHIMOE:
15588
17197
  {
15589
17198
  if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
15590
- llm = std::make_unique<llm_build_phi3<true>> (*this, params, gf);
17199
+ llm = std::make_unique<llm_build_phi3<true>> (*this, params);
15591
17200
  } else {
15592
- llm = std::make_unique<llm_build_phi3<false>>(*this, params, gf);
17201
+ llm = std::make_unique<llm_build_phi3<false>>(*this, params);
15593
17202
  }
15594
17203
  } break;
15595
17204
  case LLM_ARCH_PLAMO:
15596
17205
  {
15597
- llm = std::make_unique<llm_build_plamo>(*this, params, gf);
17206
+ llm = std::make_unique<llm_build_plamo>(*this, params);
17207
+ } break;
17208
+ case LLM_ARCH_PLAMO2:
17209
+ {
17210
+ llm = std::make_unique<llm_build_plamo2>(*this, params);
15598
17211
  } break;
15599
17212
  case LLM_ARCH_GPT2:
15600
17213
  {
15601
- llm = std::make_unique<llm_build_gpt2>(*this, params, gf);
17214
+ llm = std::make_unique<llm_build_gpt2>(*this, params);
15602
17215
  } break;
15603
17216
  case LLM_ARCH_CODESHELL:
15604
17217
  {
15605
- llm = std::make_unique<llm_build_codeshell>(*this, params, gf);
17218
+ llm = std::make_unique<llm_build_codeshell>(*this, params);
15606
17219
  } break;
15607
17220
  case LLM_ARCH_ORION:
15608
17221
  {
15609
- llm = std::make_unique<llm_build_orion>(*this, params, gf);
17222
+ llm = std::make_unique<llm_build_orion>(*this, params);
15610
17223
  } break;
15611
17224
  case LLM_ARCH_INTERNLM2:
15612
17225
  {
15613
- llm = std::make_unique<llm_build_internlm2>(*this, params, gf);
17226
+ llm = std::make_unique<llm_build_internlm2>(*this, params);
15614
17227
  } break;
15615
17228
  case LLM_ARCH_MINICPM3:
15616
17229
  {
15617
- llm = std::make_unique<llm_build_minicpm3>(*this, params, gf);
17230
+ llm = std::make_unique<llm_build_minicpm3>(*this, params);
15618
17231
  } break;
15619
17232
  case LLM_ARCH_GEMMA:
15620
17233
  {
15621
- llm = std::make_unique<llm_build_gemma>(*this, params, gf);
17234
+ llm = std::make_unique<llm_build_gemma>(*this, params);
15622
17235
  } break;
15623
17236
  case LLM_ARCH_GEMMA2:
15624
17237
  {
15625
- llm = std::make_unique<llm_build_gemma2_iswa>(*this, params, gf);
17238
+ llm = std::make_unique<llm_build_gemma2_iswa>(*this, params);
15626
17239
  } break;
15627
17240
  case LLM_ARCH_GEMMA3:
15628
17241
  {
15629
- llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
17242
+ llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
15630
17243
  } break;
15631
17244
  case LLM_ARCH_GEMMA3N:
15632
17245
  {
15633
- llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params, gf);
17246
+ llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
15634
17247
  } break;
15635
17248
  case LLM_ARCH_STARCODER2:
15636
17249
  {
15637
- llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
17250
+ llm = std::make_unique<llm_build_starcoder2>(*this, params);
15638
17251
  } break;
15639
17252
  case LLM_ARCH_MAMBA:
15640
17253
  case LLM_ARCH_MAMBA2:
15641
17254
  {
15642
- llm = std::make_unique<llm_build_mamba>(*this, params, gf);
17255
+ llm = std::make_unique<llm_build_mamba>(*this, params);
17256
+ } break;
17257
+ case LLM_ARCH_JAMBA:
17258
+ {
17259
+ llm = std::make_unique<llm_build_jamba>(*this, params);
15643
17260
  } break;
15644
17261
  case LLM_ARCH_XVERSE:
15645
17262
  {
15646
- llm = std::make_unique<llm_build_xverse>(*this, params, gf);
17263
+ llm = std::make_unique<llm_build_xverse>(*this, params);
15647
17264
  } break;
15648
17265
  case LLM_ARCH_COMMAND_R:
15649
17266
  {
15650
- llm = std::make_unique<llm_build_command_r>(*this, params, gf);
17267
+ llm = std::make_unique<llm_build_command_r>(*this, params);
15651
17268
  } break;
15652
17269
  case LLM_ARCH_COHERE2:
15653
17270
  {
15654
- llm = std::make_unique<llm_build_cohere2_iswa>(*this, params, gf);
17271
+ llm = std::make_unique<llm_build_cohere2_iswa>(*this, params);
15655
17272
  } break;
15656
17273
  case LLM_ARCH_DBRX:
15657
17274
  {
15658
- llm = std::make_unique<llm_build_dbrx>(*this, params, gf);
17275
+ llm = std::make_unique<llm_build_dbrx>(*this, params);
15659
17276
  } break;
15660
17277
  case LLM_ARCH_OLMO:
15661
17278
  {
15662
- llm = std::make_unique<llm_build_olmo>(*this, params, gf);
17279
+ llm = std::make_unique<llm_build_olmo>(*this, params);
15663
17280
  } break;
15664
17281
  case LLM_ARCH_OLMO2:
15665
17282
  {
15666
- llm = std::make_unique<llm_build_olmo2>(*this, params, gf);
17283
+ llm = std::make_unique<llm_build_olmo2>(*this, params);
15667
17284
  } break;
15668
17285
  case LLM_ARCH_OLMOE:
15669
17286
  {
15670
- llm = std::make_unique<llm_build_olmoe>(*this, params, gf);
17287
+ llm = std::make_unique<llm_build_olmoe>(*this, params);
15671
17288
  } break;
15672
17289
  case LLM_ARCH_OPENELM:
15673
17290
  {
15674
- llm = std::make_unique<llm_build_openelm>(*this, params, gf);
17291
+ llm = std::make_unique<llm_build_openelm>(*this, params);
15675
17292
  } break;
15676
17293
  case LLM_ARCH_GPTNEOX:
15677
17294
  {
15678
- llm = std::make_unique<llm_build_gptneox>(*this, params, gf);
17295
+ llm = std::make_unique<llm_build_gptneox>(*this, params);
15679
17296
  } break;
15680
17297
  case LLM_ARCH_ARCTIC:
15681
17298
  {
15682
- llm = std::make_unique<llm_build_arctic>(*this, params, gf);
17299
+ llm = std::make_unique<llm_build_arctic>(*this, params);
15683
17300
  } break;
15684
17301
  case LLM_ARCH_DEEPSEEK:
15685
17302
  {
15686
- llm = std::make_unique<llm_build_deepseek>(*this, params, gf);
17303
+ llm = std::make_unique<llm_build_deepseek>(*this, params);
15687
17304
  } break;
15688
17305
  case LLM_ARCH_DEEPSEEK2:
15689
17306
  {
15690
- llm = std::make_unique<llm_build_deepseek2>(*this, params, gf);
17307
+ llm = std::make_unique<llm_build_deepseek2>(*this, params);
15691
17308
  } break;
15692
17309
  case LLM_ARCH_CHATGLM:
15693
17310
  {
15694
- llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
17311
+ llm = std::make_unique<llm_build_chatglm>(*this, params);
15695
17312
  } break;
15696
17313
  case LLM_ARCH_GLM4:
15697
17314
  {
15698
- llm = std::make_unique<llm_build_glm4>(*this, params, gf);
17315
+ llm = std::make_unique<llm_build_glm4>(*this, params);
15699
17316
  } break;
15700
17317
  case LLM_ARCH_BITNET:
15701
17318
  {
15702
- llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
17319
+ llm = std::make_unique<llm_build_bitnet>(*this, params);
15703
17320
  } break;
15704
17321
  case LLM_ARCH_T5:
15705
17322
  {
15706
- switch (type) {
17323
+ switch (params.gtype) {
15707
17324
  case LLM_GRAPH_TYPE_ENCODER:
15708
- llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
17325
+ llm = std::make_unique<llm_build_t5_enc>(*this, params);
15709
17326
  break;
15710
17327
  case LLM_GRAPH_TYPE_DEFAULT:
15711
17328
  case LLM_GRAPH_TYPE_DECODER:
15712
- llm = std::make_unique<llm_build_t5_dec>(*this, params, gf);
17329
+ llm = std::make_unique<llm_build_t5_dec>(*this, params);
15713
17330
  break;
15714
17331
  default:
15715
17332
  GGML_ABORT("invalid graph type");
@@ -15717,91 +17334,111 @@ llm_graph_result_ptr llama_model::build_graph(
15717
17334
  } break;
15718
17335
  case LLM_ARCH_T5ENCODER:
15719
17336
  {
15720
- llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
17337
+ llm = std::make_unique<llm_build_t5_enc>(*this, params);
15721
17338
  }
15722
17339
  break;
15723
17340
  case LLM_ARCH_JAIS:
15724
17341
  {
15725
- llm = std::make_unique<llm_build_jais>(*this, params, gf);
17342
+ llm = std::make_unique<llm_build_jais>(*this, params);
15726
17343
  } break;
15727
17344
  case LLM_ARCH_NEMOTRON:
15728
17345
  {
15729
- llm = std::make_unique<llm_build_nemotron>(*this, params, gf);
17346
+ llm = std::make_unique<llm_build_nemotron>(*this, params);
15730
17347
  } break;
15731
17348
  case LLM_ARCH_EXAONE:
15732
17349
  {
15733
- llm = std::make_unique<llm_build_exaone>(*this, params, gf);
17350
+ llm = std::make_unique<llm_build_exaone>(*this, params);
17351
+ } break;
17352
+ case LLM_ARCH_EXAONE4:
17353
+ {
17354
+ if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
17355
+ llm = std::make_unique<llm_build_exaone4<true>>(*this, params);
17356
+ } else {
17357
+ llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
17358
+ }
15734
17359
  } break;
15735
17360
  case LLM_ARCH_RWKV6:
15736
17361
  {
15737
- llm = std::make_unique<llm_build_rwkv6>(*this, params, gf);
17362
+ llm = std::make_unique<llm_build_rwkv6>(*this, params);
15738
17363
  } break;
15739
17364
  case LLM_ARCH_RWKV6QWEN2:
15740
17365
  {
15741
- llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params, gf);
17366
+ llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params);
15742
17367
  } break;
15743
17368
  case LLM_ARCH_RWKV7:
15744
17369
  {
15745
- llm = std::make_unique<llm_build_rwkv7>(*this, params, gf);
17370
+ llm = std::make_unique<llm_build_rwkv7>(*this, params);
15746
17371
  } break;
15747
17372
  case LLM_ARCH_ARWKV7:
15748
17373
  {
15749
- llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
17374
+ llm = std::make_unique<llm_build_arwkv7>(*this, params);
15750
17375
  } break;
15751
17376
  case LLM_ARCH_GRANITE:
15752
17377
  case LLM_ARCH_GRANITE_MOE:
15753
17378
  case LLM_ARCH_MINICPM:
15754
17379
  {
15755
- llm = std::make_unique<llm_build_granite>(*this, params, gf);
17380
+ llm = std::make_unique<llm_build_granite>(*this, params);
17381
+ } break;
17382
+ case LLM_ARCH_GRANITE_HYBRID:
17383
+ {
17384
+ llm = std::make_unique<llm_build_granite_hybrid>(*this, params);
15756
17385
  } break;
15757
17386
  case LLM_ARCH_CHAMELEON:
15758
17387
  {
15759
- llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
17388
+ llm = std::make_unique<llm_build_chameleon>(*this, params);
15760
17389
  } break;
15761
17390
  case LLM_ARCH_WAVTOKENIZER_DEC:
15762
17391
  {
15763
- llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
17392
+ llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
15764
17393
  } break;
15765
17394
  case LLM_ARCH_PLM:
15766
17395
  {
15767
- llm = std::make_unique<llm_build_plm>(*this, params, gf);
17396
+ llm = std::make_unique<llm_build_plm>(*this, params);
15768
17397
  } break;
15769
17398
  case LLM_ARCH_BAILINGMOE:
15770
17399
  {
15771
- llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
17400
+ llm = std::make_unique<llm_build_bailingmoe>(*this, params);
15772
17401
  } break;
15773
17402
  case LLM_ARCH_DOTS1:
15774
17403
  {
15775
- llm = std::make_unique<llm_build_dots1>(*this, params, gf);
17404
+ llm = std::make_unique<llm_build_dots1>(*this, params);
15776
17405
  } break;
15777
17406
  case LLM_ARCH_ARCEE:
15778
17407
  {
15779
- llm = std::make_unique<llm_build_arcee>(*this, params, gf);
17408
+ llm = std::make_unique<llm_build_arcee>(*this, params);
15780
17409
  } break;
15781
17410
  case LLM_ARCH_ERNIE4_5:
15782
17411
  {
15783
- llm = std::make_unique<llm_build_ernie4_5>(*this, params, gf);
17412
+ llm = std::make_unique<llm_build_ernie4_5>(*this, params);
17413
+ } break;
17414
+ case LLM_ARCH_ERNIE4_5_MOE:
17415
+ {
17416
+ llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
15784
17417
  } break;
15785
17418
  case LLM_ARCH_HUNYUAN_MOE:
15786
17419
  {
15787
- llm = std::make_unique<llm_build_hunyuan_moe>(*this, params, gf);
17420
+ llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
15788
17421
  } break;
15789
17422
  case LLM_ARCH_SMOLLM3:
15790
17423
  {
15791
- llm = std::make_unique<llm_build_smollm3>(*this, params, gf);
17424
+ llm = std::make_unique<llm_build_smollm3>(*this, params);
15792
17425
  } break;
15793
17426
  case LLM_ARCH_FALCON_H1:
15794
17427
  {
15795
- llm = std::make_unique<llm_build_falcon_h1>(*this, params, gf);
17428
+ llm = std::make_unique<llm_build_falcon_h1>(*this, params);
17429
+ } break;
17430
+ case LLM_ARCH_LFM2:
17431
+ {
17432
+ llm = std::make_unique<llm_build_lfm2>(*this, params);
15796
17433
  } break;
15797
17434
  default:
15798
17435
  GGML_ABORT("fatal error");
15799
17436
  }
15800
17437
 
15801
17438
  // add on pooling layer
15802
- llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b);
17439
+ llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
15803
17440
 
15804
- return std::move(llm->res);
17441
+ return llm->res->get_gf();
15805
17442
  }
15806
17443
 
15807
17444
  //
@@ -15911,6 +17548,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
15911
17548
  case LLM_ARCH_BLOOM:
15912
17549
  case LLM_ARCH_MAMBA:
15913
17550
  case LLM_ARCH_MAMBA2:
17551
+ case LLM_ARCH_JAMBA:
15914
17552
  case LLM_ARCH_JINA_BERT_V2:
15915
17553
  case LLM_ARCH_T5:
15916
17554
  case LLM_ARCH_T5ENCODER:
@@ -15942,12 +17580,14 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
15942
17580
  case LLM_ARCH_GLM4:
15943
17581
  case LLM_ARCH_GRANITE:
15944
17582
  case LLM_ARCH_GRANITE_MOE:
17583
+ case LLM_ARCH_GRANITE_HYBRID:
15945
17584
  case LLM_ARCH_CHAMELEON:
15946
17585
  case LLM_ARCH_BAILINGMOE:
15947
17586
  case LLM_ARCH_NEO_BERT:
15948
17587
  case LLM_ARCH_SMOLLM3:
15949
17588
  case LLM_ARCH_ARCEE:
15950
17589
  case LLM_ARCH_ERNIE4_5:
17590
+ case LLM_ARCH_ERNIE4_5_MOE:
15951
17591
  return LLAMA_ROPE_TYPE_NORM;
15952
17592
 
15953
17593
  // the pairs of head values are offset by n_rot/2
@@ -15962,6 +17602,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
15962
17602
  case LLM_ARCH_BITNET:
15963
17603
  case LLM_ARCH_QWEN:
15964
17604
  case LLM_ARCH_QWEN2:
17605
+ case LLM_ARCH_DREAM:
15965
17606
  case LLM_ARCH_QWEN2MOE:
15966
17607
  case LLM_ARCH_QWEN3:
15967
17608
  case LLM_ARCH_QWEN3MOE:
@@ -15971,6 +17612,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
15971
17612
  case LLM_ARCH_PHI3:
15972
17613
  case LLM_ARCH_PHIMOE:
15973
17614
  case LLM_ARCH_PLAMO:
17615
+ case LLM_ARCH_PLAMO2:
15974
17616
  case LLM_ARCH_GEMMA:
15975
17617
  case LLM_ARCH_GEMMA2:
15976
17618
  case LLM_ARCH_GEMMA3:
@@ -15982,9 +17624,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
15982
17624
  case LLM_ARCH_ORION:
15983
17625
  case LLM_ARCH_NEMOTRON:
15984
17626
  case LLM_ARCH_EXAONE:
17627
+ case LLM_ARCH_EXAONE4:
15985
17628
  case LLM_ARCH_MINICPM3:
15986
17629
  case LLM_ARCH_DOTS1:
15987
17630
  case LLM_ARCH_HUNYUAN_MOE:
17631
+ case LLM_ARCH_LFM2:
15988
17632
  return LLAMA_ROPE_TYPE_NEOX;
15989
17633
 
15990
17634
  case LLM_ARCH_QWEN2VL: