@fugood/llama.node 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +12 -12
  3. package/src/llama.cpp/CMakeLists.txt +0 -1
  4. package/src/llama.cpp/common/arg.cpp +17 -0
  5. package/src/llama.cpp/common/chat.cpp +37 -20
  6. package/src/llama.cpp/common/chat.h +2 -0
  7. package/src/llama.cpp/common/common.h +4 -0
  8. package/src/llama.cpp/ggml/CMakeLists.txt +7 -2
  9. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  10. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  11. package/src/llama.cpp/ggml/include/ggml.h +181 -10
  12. package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +38 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1297 -211
  17. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +7 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  19. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
  20. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -9
  21. package/src/llama.cpp/include/llama.h +1 -0
  22. package/src/llama.cpp/src/llama-arch.cpp +108 -2
  23. package/src/llama.cpp/src/llama-arch.h +7 -0
  24. package/src/llama.cpp/src/llama-batch.cpp +27 -1
  25. package/src/llama.cpp/src/llama-batch.h +8 -1
  26. package/src/llama.cpp/src/llama-chat.cpp +15 -0
  27. package/src/llama.cpp/src/llama-chat.h +1 -0
  28. package/src/llama.cpp/src/llama-graph.cpp +95 -81
  29. package/src/llama.cpp/src/llama-graph.h +43 -16
  30. package/src/llama.cpp/src/llama-hparams.cpp +2 -1
  31. package/src/llama.cpp/src/llama-hparams.h +1 -0
  32. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
  33. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
  34. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
  35. package/src/llama.cpp/src/llama-kv-cache-unified.h +62 -24
  36. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  37. package/src/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
  38. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  39. package/src/llama.cpp/src/llama-memory-recurrent.cpp +34 -16
  40. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  41. package/src/llama.cpp/src/llama-memory.h +3 -0
  42. package/src/llama.cpp/src/llama-model.cpp +1374 -210
  43. package/src/llama.cpp/src/llama-model.h +3 -0
  44. package/src/llama.cpp/src/llama-vocab.cpp +8 -1
  45. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
@@ -47,6 +47,7 @@ const char * llm_type_name(llm_type type) {
47
47
  case LLM_TYPE_475M: return "475M";
48
48
  case LLM_TYPE_770M: return "770M";
49
49
  case LLM_TYPE_780M: return "780M";
50
+ case LLM_TYPE_0_3B: return "0.3B";
50
51
  case LLM_TYPE_0_5B: return "0.5B";
51
52
  case LLM_TYPE_0_6B: return "0.6B";
52
53
  case LLM_TYPE_1B: return "1B";
@@ -101,6 +102,7 @@ const char * llm_type_name(llm_type type) {
101
102
  case LLM_TYPE_57B_A14B: return "57B.A14B";
102
103
  case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
103
104
  case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
105
+ case LLM_TYPE_A13B: return "A13B";
104
106
  case LLM_TYPE_30B_A3B: return "30B.A3B";
105
107
  case LLM_TYPE_235B_A22B: return "235B.A22B";
106
108
  case LLM_TYPE_E2B: return "E2B";
@@ -207,23 +209,27 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
207
209
  } break;
208
210
  case GGML_OP_SSM_CONV:
209
211
  {
210
- // FIXME
211
- ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
212
+ const int64_t n_seq_tokens = 512;
213
+ const int64_t n_seqs = 3;
214
+ ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
212
215
  op_tensor = ggml_ssm_conv(ctx, conv_x, w);
213
216
  } break;
214
217
  case GGML_OP_SSM_SCAN:
215
218
  {
216
- // FIXME
217
- const int64_t d_state = w->ne[0];
218
- const int64_t d_inner = w->ne[1];
219
+ // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
220
+ const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
221
+ const int64_t n_head = w->ne[1];
222
+ const int64_t head_dim = hparams.ssm_d_inner / n_head;
223
+ const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
219
224
  const int64_t n_seq_tokens = 512;
220
- const int64_t n_seqs = 1;
221
- ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
222
- ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
223
- ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
224
- ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
225
- ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
226
- op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
225
+ const int64_t n_seqs = 3;
226
+ ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
227
+ ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
228
+ ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
229
+ ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
230
+ ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
231
+ ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
232
+ op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
227
233
  } break;
228
234
  case GGML_OP_RWKV_WKV6:
229
235
  {
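
The reworked probe above mirrors the Mamba-2 tensor layout expected by the extended ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids). Below is a minimal compile-and-run sketch of the same shape math; the hyperparameter values are assumptions for illustration only, not taken from this diff.

// --- illustrative sketch, not part of the diff ---
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t d_state      = 128;              // hparams.ssm_d_state (assumed)
    const int64_t n_head       = 64;               // w->ne[1], with w = ssm_a of shape {1, n_head}
    const int64_t d_inner      = 4096;             // hparams.ssm_d_inner (assumed)
    const int64_t head_dim     = d_inner / n_head; // 64, as computed in the probe above
    const int64_t n_group      = 1;                // hparams.ssm_n_group, defaulted to 1
    const int64_t n_seq_tokens = 512;
    const int64_t n_seqs       = 3;

    // shapes of the dummy tensors handed to ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids):
    std::printf("s   : {%lld, %lld, %lld, %lld}\n", (long long) d_state, (long long) head_dim, (long long) n_head, (long long) n_seqs);
    std::printf("x   : {%lld, %lld, %lld, %lld}\n", (long long) head_dim, (long long) n_head, (long long) n_seq_tokens, (long long) n_seqs);
    std::printf("dt  : {%lld, %lld, %lld}\n",       (long long) n_head, (long long) n_seq_tokens, (long long) n_seqs);
    std::printf("B, C: {%lld, %lld, %lld, %lld}\n", (long long) d_state, (long long) n_group, (long long) n_seq_tokens, (long long) n_seqs);
    std::printf("ids : {%lld}\n",                   (long long) n_seqs);
    return 0;
}
// --- end sketch ---
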
@@ -1080,6 +1086,38 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1080
1086
  default: type = LLM_TYPE_UNKNOWN;
1081
1087
  }
1082
1088
  } break;
1089
+ case LLM_ARCH_MAMBA2:
1090
+ {
1091
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
1092
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
1093
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
1094
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1095
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
1096
+
1097
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1098
+
1099
+ switch (hparams.n_layer) {
1100
+ case 24:
1101
+ switch (hparams.n_embd) {
1102
+ case 768: type = LLM_TYPE_SMALL; break;
1103
+ default: type = LLM_TYPE_UNKNOWN;
1104
+ } break;
1105
+ case 48:
1106
+ switch (hparams.n_embd) {
1107
+ case 1024: type = LLM_TYPE_MEDIUM; break;
1108
+ case 1536: type = LLM_TYPE_LARGE; break;
1109
+ case 2048: type = LLM_TYPE_XL; break;
1110
+ default: type = LLM_TYPE_UNKNOWN;
1111
+ } break;
1112
+ case 64:
1113
+ switch (hparams.n_embd) {
1114
+ case 2560: type = LLM_TYPE_3B; break;
1115
+ case 4096: type = LLM_TYPE_7B; break;
1116
+ default: type = LLM_TYPE_UNKNOWN;
1117
+ } break;
1118
+ default: type = LLM_TYPE_UNKNOWN;
1119
+ }
1120
+ } break;
1083
1121
  case LLM_ARCH_XVERSE:
1084
1122
  {
1085
1123
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1504,6 +1542,66 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1504
1542
  default: type = LLM_TYPE_UNKNOWN;
1505
1543
  }
1506
1544
  } break;
1545
+ case LLM_ARCH_ERNIE4_5:
1546
+ {
1547
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1548
+ switch (hparams.n_layer) {
1549
+ case 18: type = LLM_TYPE_0_3B; break;
1550
+ default: type = LLM_TYPE_UNKNOWN;
1551
+ }
1552
+ } break;
1553
+ case LLM_ARCH_FALCON_H1:
1554
+ {
1555
+ // Common parameters
1556
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1557
+
1558
+ // SSM parameters
1559
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
1560
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
1561
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
1562
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1563
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
1564
+
1565
+ std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
1566
+
1567
+ switch (hparams.n_layer) {
1568
+ case 36:
1569
+ type = LLM_TYPE_0_5B; break;
1570
+ case 24:
1571
+ type = LLM_TYPE_1_5B; break;
1572
+ case 66:
1573
+ type = LLM_TYPE_1B; break;
1574
+ case 32:
1575
+ type = LLM_TYPE_3B; break;
1576
+ case 44:
1577
+ type = LLM_TYPE_7B; break;
1578
+ case 72:
1579
+ type = LLM_TYPE_34B; break;
1580
+ default:
1581
+ type = LLM_TYPE_UNKNOWN;
1582
+ }
1583
+ } break;
1584
+ case LLM_ARCH_HUNYUAN_MOE:
1585
+ {
1586
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1587
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1588
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
1589
+
1590
+ switch (hparams.n_layer) {
1591
+ case 32: type = LLM_TYPE_A13B; break;
1592
+ default: type = LLM_TYPE_UNKNOWN;
1593
+ }
1594
+ } break;
1595
+ case LLM_ARCH_SMOLLM3:
1596
+ {
1597
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1598
+ hparams.n_no_rope_layer_step = 4;
1599
+
1600
+ switch (hparams.n_layer) {
1601
+ case 36: type = LLM_TYPE_3B; break;
1602
+ default: type = LLM_TYPE_UNKNOWN;
1603
+ }
1604
+ } break;
1507
1605
  default: throw std::runtime_error("unsupported model architecture");
1508
1606
  }
1509
1607
 
@@ -3111,6 +3209,54 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3111
3209
  layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
3112
3210
  layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
3113
3211
 
3212
+ // out_proj
3213
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
3214
+ }
3215
+ } break;
3216
+ case LLM_ARCH_MAMBA2:
3217
+ {
3218
+ const int64_t d_conv = hparams.ssm_d_conv;
3219
+ const int64_t d_inner = hparams.ssm_d_inner;
3220
+ const int64_t d_state = hparams.ssm_d_state;
3221
+ const int64_t n_head = hparams.ssm_dt_rank;
3222
+ const int64_t n_group = hparams.ssm_n_group;
3223
+ const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
3224
+
3225
+ // only an expansion factor of 2 is supported for now
3226
+ GGML_ASSERT(2 * n_embd == d_inner);
3227
+
3228
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3229
+
3230
+ // output
3231
+ {
3232
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3233
+
3234
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
3235
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
3236
+ if (output == NULL) {
3237
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
3238
+ }
3239
+ }
3240
+
3241
+ for (int i = 0; i < n_layer; ++i) {
3242
+ auto & layer = layers[i];
3243
+
3244
+ // norm
3245
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3246
+
3247
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
3248
+
3249
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
3250
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
3251
+
3252
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
3253
+
3254
+ // no "weight" suffix for these
3255
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
3256
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
3257
+
3258
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
3259
+
3114
3260
  // out_proj
3115
3261
  layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
3116
3262
  }
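
The tensor widths created for LLM_ARCH_MAMBA2 above follow directly from the in-projection split. Here is a small compile-time sketch of that arithmetic with assumed Mamba-2 sizes (illustrative values only, not read from this diff):

// --- illustrative sketch, not part of the diff ---
#include <cstdint>

constexpr int64_t n_embd  = 2560;               // assumed model width
constexpr int64_t d_inner = 2 * n_embd;         // the asserted expansion factor of 2 -> 5120
constexpr int64_t d_state = 128;                // hparams.ssm_d_state (assumed)
constexpr int64_t n_group = 1;                  // hparams.ssm_n_group (assumed)
constexpr int64_t n_head  = 80;                 // hparams.ssm_dt_rank (assumed)

constexpr int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head; // ssm_in:     {n_embd, 10576}
constexpr int64_t d_conv_ch = d_inner + 2*n_group*d_state;            // ssm_conv1d: {d_conv, 5376}

static_assert(d_in_proj == 10576, "z + xBC + dt columns of the in-projection");
static_assert(d_conv_ch == 5376,  "x + B + C channels passed through the 1D conv");
// --- end sketch ---
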
@@ -4344,6 +4490,183 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4344
4490
 
4345
4491
  layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4346
4492
 
4493
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4494
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4495
+ }
4496
+ } break;
4497
+ case LLM_ARCH_ERNIE4_5:
4498
+ {
4499
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4500
+
4501
+ // output
4502
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4503
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4504
+ // if output is NULL, init from the input tok embed
4505
+ if (output == NULL) {
4506
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4507
+ }
4508
+
4509
+ for (int i = 0; i < n_layer; ++i) {
4510
+ auto & layer = layers[i];
4511
+
4512
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4513
+
4514
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4515
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4516
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4517
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4518
+
4519
+ // optional bias tensors
4520
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4521
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
4522
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
4523
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4524
+
4525
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4526
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4527
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4528
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4529
+ }
4530
+ } break;
4531
+ case LLM_ARCH_FALCON_H1:
4532
+ {
4533
+ // Common
4534
+ const int64_t hidden_size = hparams.n_embd; // hidden_size
4535
+
4536
+ // mamba2 Mixer SSM params
4537
+ const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
4538
+ const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
4539
+ const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
4540
+ const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
4541
+ const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
4542
+ const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
4543
+ const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
4544
+
4545
+ // attn params
4546
+ const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head
4547
+ const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
4548
+
4549
+ // ffn params
4550
+ const int64_t ffn_intermediate_size = hparams.n_ff(0);
4551
+
4552
+ // embeddings
4553
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
4554
+
4555
+ // output
4556
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
4557
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
4558
+
4559
+ // if output is NULL, init from the input tok embed
4560
+ if (output == NULL) {
4561
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
4562
+ }
4563
+
4564
+ for (int i = 0; i < n_layer; ++i) {
4565
+ auto & layer = layers[i];
4566
+
4567
+ /*SSM LAYERS*/
4568
+ // ssm in
4569
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
4570
+ // ssm 1d conv
4571
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
4572
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
4573
+ // ssm_dt
4574
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
4575
+ // no "weight" suffix for these
4576
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
4577
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
4578
+ // ssm_norm
4579
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
4580
+ // out_proj
4581
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
4582
+
4583
+ /*ATTENTION LAYERS*/
4584
+ // attention layers (with optional bias)
4585
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
4586
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
4587
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
4588
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
4589
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
4590
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
4591
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
4592
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
4593
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
4594
+
4595
+
4596
+ // feed forward (w/ optional biases)
4597
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
4598
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4599
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
4600
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
4601
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
4602
+
4603
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
4604
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
4605
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
4606
+ }
4607
+ } break;
4608
+ case LLM_ARCH_HUNYUAN_MOE:
4609
+ {
4610
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4611
+
4612
+ // output
4613
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4614
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4615
+ // if output is NULL, init from the input tok embed
4616
+ if (output == NULL) {
4617
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4618
+ }
4619
+
4620
+ for (int i = 0; i < n_layer; ++i) {
4621
+ auto & layer = layers[i];
4622
+
4623
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4624
+
4625
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4626
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4627
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4628
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4629
+
4630
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
4631
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
4632
+
4633
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4634
+
4635
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4636
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4637
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
4638
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4639
+
4640
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
4641
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
4642
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
4643
+ }
4644
+ } break;
4645
+ case LLM_ARCH_SMOLLM3:
4646
+ {
4647
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4648
+
4649
+ // output
4650
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4651
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4652
+
4653
+ // if output is NULL, init from the input tok embed
4654
+ if (output == NULL) {
4655
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4656
+ }
4657
+
4658
+ for (int i = 0; i < n_layer; ++i) {
4659
+ auto & layer = layers[i];
4660
+
4661
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4662
+
4663
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4664
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4665
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4666
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4667
+
4668
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4669
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4347
4670
  layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4348
4671
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4349
4672
  }
@@ -4587,10 +4910,14 @@ void llama_model::print_info() const {
4587
4910
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
4588
4911
  LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
4589
4912
  LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
4913
+ }
4914
+
4915
+ if (arch == LLM_ARCH_MAMBA || arch == LLM_ARCH_MAMBA2) {
4590
4916
  LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
4591
4917
  LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
4592
4918
  LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
4593
4919
  LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
4920
+ LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
4594
4921
  LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
4595
4922
 
4596
4923
  if (!classifier_labels.empty()) {
@@ -5539,12 +5866,10 @@ struct llm_build_falcon : public llm_graph_context {
5539
5866
  cur = build_lora_mm(model.layers[il].wqkv, cur);
5540
5867
  cb(cur, "wqkv", il);
5541
5868
 
5542
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5543
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5869
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
5870
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
5544
5871
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5545
5872
 
5546
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5547
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5548
5873
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
5549
5874
 
5550
5875
  // using mode = 2 for neox mode
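
The same change (viewing Q and K directly as 3-D tensors instead of ggml_cont followed by ggml_reshape_3d) repeats in the DBRX, NeoBERT, Qwen, Phi-2/Phi-3, CodeShell, GPT-NeoX, ChatGLM and GLM-4 builders below, and in a 2-D form in MPT. A minimal sketch of the byte offsets those views assume inside the fused wqkv output, a row-major {n_embd + 2*n_embd_gqa, n_tokens} F32 activation; the concrete head counts are assumptions for illustration:

// --- illustrative sketch, not part of the diff ---
#include <cstddef>
#include <cstdio>

int main() {
    // Illustrative sizes (assumed): 32 query heads of 128, 8 KV heads.
    const size_t n_embd_head = 128;
    const size_t n_head      = 32;
    const size_t n_head_kv   = 8;
    const size_t n_embd      = n_embd_head * n_head;     // 4096
    const size_t n_embd_gqa  = n_embd_head * n_head_kv;  // 1024

    // Byte offsets used by the ggml_view_3d calls above, relative to each row of `cur`:
    const size_t q_off = 0;
    const size_t k_off = sizeof(float) *  n_embd;                // K begins right after Q
    const size_t v_off = sizeof(float) * (n_embd + n_embd_gqa);  // V begins after Q and K

    // Q is viewed as {n_embd_head, n_head, n_tokens} and K as {n_embd_head, n_head_kv, n_tokens},
    // both with inner stride n_embd_head*sizeof(float), so no ggml_cont/ggml_reshape_3d is needed.
    std::printf("q_off=%zu k_off=%zu v_off=%zu\n", q_off, k_off, v_off);
    return 0;
}
// --- end sketch ---
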
@@ -5821,12 +6146,10 @@ struct llm_build_dbrx : public llm_graph_context {
5821
6146
  cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
5822
6147
  cb(cur, "wqkv_clamped", il);
5823
6148
 
5824
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5825
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6149
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
6150
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
5826
6151
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5827
6152
 
5828
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5829
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5830
6153
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
5831
6154
 
5832
6155
  Qcur = ggml_rope_ext(
@@ -6337,12 +6660,10 @@ struct llm_build_neo_bert : public llm_graph_context {
6337
6660
  cur = build_lora_mm(model.layers[il].wqkv, cur);
6338
6661
  cb(cur, "wqkv", il);
6339
6662
 
6340
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6341
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6663
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
6664
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
6342
6665
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6343
6666
 
6344
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6345
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6346
6667
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6347
6668
 
6348
6669
  // RoPE
@@ -6572,8 +6893,8 @@ struct llm_build_mpt : public llm_graph_context {
6572
6893
  cb(cur, "wqkv_clamped", il);
6573
6894
  }
6574
6895
 
6575
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6576
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6896
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
6897
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
6577
6898
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6578
6899
 
6579
6900
  cb(Qcur, "Qcur", il);
@@ -6593,6 +6914,12 @@ struct llm_build_mpt : public llm_graph_context {
6593
6914
  model.layers[il].attn_k_norm_b,
6594
6915
  LLM_NORM, il);
6595
6916
  cb(Kcur, "Kcur", il);
6917
+ } else {
6918
+ Qcur = ggml_cont(ctx0, Qcur);
6919
+ cb(Qcur, "Qcur", il);
6920
+
6921
+ Kcur = ggml_cont(ctx0, Kcur);
6922
+ cb(Kcur, "Kcur", il);
6596
6923
  }
6597
6924
 
6598
6925
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -6847,12 +7174,10 @@ struct llm_build_qwen : public llm_graph_context {
6847
7174
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6848
7175
  cb(cur, "bqkv", il);
6849
7176
 
6850
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6851
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
7177
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
7178
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
6852
7179
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
6853
7180
 
6854
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6855
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6856
7181
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6857
7182
 
6858
7183
  // using mode = 2 for neox mode
@@ -7617,21 +7942,21 @@ struct llm_build_phi2 : public llm_graph_context {
7617
7942
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7618
7943
  cb(cur, "bqkv", il);
7619
7944
 
7620
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
7621
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
7945
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
7946
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
7622
7947
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7623
7948
  } else {
7624
7949
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
7625
7950
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
7626
7951
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
7952
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7953
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7627
7954
  }
7628
7955
 
7629
7956
  cb(Qcur, "Qcur", il);
7630
7957
  cb(Kcur, "Kcur", il);
7631
7958
  cb(Vcur, "Vcur", il);
7632
7959
 
7633
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7634
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7635
7960
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7636
7961
 
7637
7962
  Qcur = ggml_rope_ext(
@@ -7755,21 +8080,21 @@ struct llm_build_phi3 : public llm_graph_context {
7755
8080
  cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
7756
8081
  cb(cur, "wqkv", il);
7757
8082
 
7758
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
7759
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
8083
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
8084
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
7760
8085
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
7761
8086
  } else {
7762
8087
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
7763
8088
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
7764
8089
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
8090
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8091
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7765
8092
  }
7766
8093
 
7767
8094
  cb(Qcur, "Qcur", il);
7768
8095
  cb(Kcur, "Kcur", il);
7769
8096
  cb(Vcur, "Vcur", il);
7770
8097
 
7771
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7772
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7773
8098
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7774
8099
 
7775
8100
  Qcur = ggml_rope_ext(
@@ -8125,12 +8450,10 @@ struct llm_build_codeshell : public llm_graph_context {
8125
8450
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
8126
8451
  cb(cur, "bqkv", il);
8127
8452
 
8128
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
8129
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
8453
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
8454
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
8130
8455
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
8131
8456
 
8132
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8133
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8134
8457
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8135
8458
 
8136
8459
  Qcur = ggml_rope_ext(
@@ -8546,8 +8869,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
8546
8869
  ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
8547
8870
  cb(k_pe, "k_pe", il);
8548
8871
 
8549
- // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
8550
- kv_compressed = ggml_cont(ctx0, kv_compressed);
8551
8872
  kv_compressed = build_norm(kv_compressed,
8552
8873
  model.layers[il].attn_kv_a_norm, NULL,
8553
8874
  LLM_NORM_RMS, il);
@@ -8574,12 +8895,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
8574
8895
  v_states = ggml_cont(ctx0, v_states);
8575
8896
  cb(v_states, "v_states", il);
8576
8897
 
8577
- v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
8578
- ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
8579
- 0);
8580
- cb(v_states, "v_states", il);
8581
-
8582
- q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
8583
8898
  q_pe = ggml_rope_ext(
8584
8899
  ctx0, q_pe, inp_pos, rope_factors,
8585
8900
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -8588,7 +8903,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
8588
8903
  cb(q_pe, "q_pe", il);
8589
8904
 
8590
8905
  // shared RoPE key
8591
- k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
8592
8906
  k_pe = ggml_rope_ext(
8593
8907
  ctx0, k_pe, inp_pos, rope_factors,
8594
8908
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9622,9 +9936,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
9622
9936
  };
9623
9937
 
9624
9938
  struct llm_build_mamba : public llm_graph_context {
9625
- const llama_model & model;
9626
-
9627
- llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
9939
+ llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
9628
9940
  ggml_tensor * cur;
9629
9941
  ggml_tensor * inpL;
9630
9942
 
@@ -9642,7 +9954,11 @@ struct llm_build_mamba : public llm_graph_context {
9642
9954
  LLM_NORM_RMS, il);
9643
9955
  cb(cur, "attn_norm", il);
9644
9956
 
9645
- cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
9957
+ if (model.arch == LLM_ARCH_MAMBA2) {
9958
+ cur = build_mamba2_layer(rs_inp, gf, cur, model, ubatch, il);
9959
+ } else {
9960
+ cur = build_mamba_layer(rs_inp, gf, cur, model, ubatch, il);
9961
+ }
9646
9962
 
9647
9963
  if (il == n_layer - 1 && inp_out_ids) {
9648
9964
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
@@ -9676,11 +9992,11 @@ struct llm_build_mamba : public llm_graph_context {
9676
9992
  ggml_build_forward_expand(gf, cur);
9677
9993
  }
9678
9994
 
9679
- // TODO: split
9680
9995
  ggml_tensor * build_mamba_layer(
9681
9996
  llm_graph_input_rs * inp,
9682
9997
  ggml_cgraph * gf,
9683
9998
  ggml_tensor * cur,
9999
+ const llama_model & model,
9684
10000
  const llama_ubatch & ubatch,
9685
10001
  int il) const {
9686
10002
  const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
@@ -9691,6 +10007,8 @@ struct llm_build_mamba : public llm_graph_context {
9691
10007
  const int64_t d_inner = hparams.ssm_d_inner;
9692
10008
  const int64_t d_state = hparams.ssm_d_state;
9693
10009
  const int64_t dt_rank = hparams.ssm_dt_rank;
10010
+ const int64_t n_head = d_inner;
10011
+ const int64_t head_dim = 1;
9694
10012
  const int64_t n_seqs = ubatch.n_seqs;
9695
10013
  // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
9696
10014
  const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
@@ -9706,15 +10024,8 @@ struct llm_build_mamba : public llm_graph_context {
9706
10024
  ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
9707
10025
  ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
9708
10026
 
9709
- // (ab)using the KV cache to store the states
9710
- ggml_tensor * conv = build_rs(
9711
- inp, gf, conv_states_all,
9712
- hparams.n_embd_r(), n_seqs);
10027
+ ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
9713
10028
  conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
9714
- ggml_tensor * ssm = build_rs(
9715
- inp, gf, ssm_states_all,
9716
- hparams.n_embd_s(), n_seqs);
9717
- ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
9718
10029
 
9719
10030
  // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
9720
10031
  cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
@@ -9763,8 +10074,8 @@ struct llm_build_mamba : public llm_graph_context {
9763
10074
  ggml_tensor * x_db = build_lora_mm(model.layers[il].ssm_x, x);
9764
10075
  // split
9765
10076
  ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
9766
- ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
9767
- ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
10077
+ ggml_tensor * B = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
10078
+ ggml_tensor * C = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
9768
10079
 
9769
10080
  // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
9770
10081
  if (ssm_dt_b_c_rms) {
@@ -9777,32 +10088,174 @@ struct llm_build_mamba : public llm_graph_context {
9777
10088
  dt = build_lora_mm(model.layers[il].ssm_dt, dt);
9778
10089
  dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
9779
10090
 
9780
- // Custom operator to optimize the parallel associative scan
9781
- // as described in the Annex D of the Mamba paper.
9782
- // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
9783
- ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C);
10091
+ cur = x;
10092
+ x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
9784
10093
 
9785
- // store last states
9786
- ggml_build_forward_expand(gf,
10094
+ ggml_tensor * A = model.layers[il].ssm_a;
10095
+
10096
+ // use the states and the indices provided by build_recurrent_state
10097
+ // (this is necessary in order to properly use the states before they are overwritten,
10098
+ // while avoiding to make unnecessary copies of the states)
10099
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
10100
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
10101
+
10102
+ // Custom operator to optimize the parallel associative scan
10103
+ // as described in the Annex D of the Mamba paper.
10104
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
10105
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
10106
+ };
10107
+
10108
+ ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
10109
+
10110
+ // store last states
10111
+ ggml_build_forward_expand(gf,
10112
+ ggml_cpy(ctx0,
10113
+ ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
10114
+ ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
10115
+
10116
+ ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0);
10117
+
10118
+ // TODO: skip computing output earlier for unused tokens
10119
+
10120
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, model.layers[il].ssm_d));
10121
+ y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
10122
+
10123
+ // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
10124
+ cur = build_lora_mm(model.layers[il].ssm_out, y);
10125
+ }
10126
+
10127
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
10128
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
10129
+ // cb(cur, "mamba_out", il);
10130
+
10131
+ return cur;
10132
+ }
10133
+
10134
+ ggml_tensor * build_mamba2_layer(
10135
+ llm_graph_input_rs * inp,
10136
+ ggml_cgraph * gf,
10137
+ ggml_tensor * cur,
10138
+ const llama_model & model,
10139
+ const llama_ubatch & ubatch,
10140
+ int il) const {
10141
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
10142
+
10143
+ const auto kv_head = mctx_cur->get_head();
10144
+
10145
+ const int64_t d_conv = hparams.ssm_d_conv;
10146
+ const int64_t d_inner = hparams.ssm_d_inner;
10147
+ const int64_t d_state = hparams.ssm_d_state;
10148
+ const int64_t n_head = hparams.ssm_dt_rank;
10149
+ const int64_t head_dim = d_inner / n_head;
10150
+ const int64_t n_group = hparams.ssm_n_group;
10151
+ const int64_t n_seqs = ubatch.n_seqs;
10152
+
10153
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
10154
+
10155
+ GGML_ASSERT(n_seqs != 0);
10156
+ GGML_ASSERT(ubatch.equal_seqs);
10157
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
10158
+
10159
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
10160
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
10161
+
10162
+ ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
10163
+ conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
10164
+
10165
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
10166
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
10167
+
10168
+ // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
10169
+
10170
+ // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
10171
+ ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
10172
+
10173
+ // split the above in three
10174
+ ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0);
10175
+ ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt));
10176
+ ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt));
10177
+
10178
+ // conv
10179
+ {
10180
+ // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
10181
+ ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
10182
+
10183
+ // copy last (d_conv - 1) columns back into the state cache
10184
+ ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
10185
+
10186
+ ggml_build_forward_expand(gf,
10187
+ ggml_cpy(ctx0, last_conv,
10188
+ ggml_view_1d(ctx0, conv_states_all,
10189
+ (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
10190
+ kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
10191
+
10192
+ // 1D convolution
10193
+ // The equivalent is to make a self-overlapping view of conv_x
10194
+ // over d_conv columns at each stride in the 3rd dimension,
10195
+ // then element-wise multiply that with the conv1d weight,
10196
+ // then sum the elements of each row,
10197
+ // (the last two steps are a dot product over rows (also doable with mul_mat))
10198
+ // then permute away the ne[0] dimension,
10199
+ // and then you're left with the resulting x tensor.
10200
+ // For simultaneous sequences, all sequences need to have the same length.
10201
+ xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
10202
+
10203
+ // bias
10204
+ xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
10205
+
10206
+ xBC = ggml_silu(ctx0, xBC);
10207
+ }
10208
+
10209
+ // ssm
10210
+ {
10211
+ // These correspond to V K Q in SSM/attention duality
10212
+ ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0);
10213
+ ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC));
10214
+ ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC));
10215
+
10216
+ // {n_head, n_seq_tokens, n_seqs}
10217
+ dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
10218
+
10219
+ ggml_tensor * A = model.layers[il].ssm_a;
10220
+
10221
+ // use the states and the indices provided by build_recurrent_state
10222
+ // (this is necessary in order to properly use the states before they are overwritten,
10223
+ // while avoiding to make unnecessary copies of the states)
10224
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
10225
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
10226
+
10227
+ // TODO: use semistructured matrices to implement state-space duality
10228
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
10229
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
10230
+ };
10231
+
10232
+ ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
10233
+
10234
+ // store last states
10235
+ ggml_build_forward_expand(gf,
9787
10236
  ggml_cpy(ctx0,
9788
- ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]),
10237
+ ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
9789
10238
  ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
9790
10239
 
9791
- ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0);
10240
+ ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0);
9792
10241
 
9793
10242
  // TODO: skip computing output earlier for unused tokens
9794
10243
 
9795
- // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
9796
10244
  y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
9797
10245
  y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
9798
10246
 
10247
+ // grouped RMS norm
10248
+ y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
10249
+ y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
10250
+ y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
10251
+
9799
10252
  // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
9800
10253
  cur = build_lora_mm(model.layers[il].ssm_out, y);
9801
10254
  }
9802
10255
 
9803
10256
  // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
9804
10257
  cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
9805
- //cb(cur, "mamba_out", il);
10258
+ cb(cur, "mamba_out", il);
9806
10259
 
9807
10260
  return cur;
9808
10261
  }
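
In build_mamba2_layer above, zxBCdt is split into three views, z | xBC | dt, purely by element offset. A small compile-time sketch of that split, reusing the same assumed sizes as the earlier Mamba-2 example (illustrative only):

// --- illustrative sketch, not part of the diff ---
#include <cstdint>

constexpr int64_t d_inner = 5120;   // assumed, = 2 * n_embd
constexpr int64_t d_state = 128;    // assumed
constexpr int64_t n_group = 1;      // assumed
constexpr int64_t n_head  = 80;     // assumed, = hparams.ssm_dt_rank

// Element offsets of the three views taken from zxBCdt:
constexpr int64_t z_off   = 0;                               // z:   {head_dim, n_head, ...} = d_inner elements
constexpr int64_t xBC_off = d_inner;                         // xBC: {d_inner + 2*n_group*d_state, ...}
constexpr int64_t dt_off  = 2*d_inner + 2*n_group*d_state;   // dt:  {n_head, ...}

static_assert(xBC_off + (d_inner + 2*n_group*d_state) == dt_off, "xBC ends exactly where dt begins");
static_assert(dt_off + n_head == 10576, "total width equals d_in_proj from the earlier sketch");
// --- end sketch ---
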
@@ -10514,10 +10967,10 @@ struct llm_build_openelm : public llm_graph_context {
10514
10967
 
10515
10968
  cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
10516
10969
 
10517
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0));
10970
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0);
10518
10971
  cb(Qcur, "Qcur", il);
10519
10972
 
10520
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head));
10973
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
10521
10974
  cb(Kcur, "Kcur", il);
10522
10975
 
10523
10976
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
@@ -10639,12 +11092,10 @@ struct llm_build_gptneox : public llm_graph_context {
10639
11092
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
10640
11093
  cb(cur, "bqkv", il);
10641
11094
 
10642
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
10643
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
11095
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
11096
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
10644
11097
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
10645
11098
 
10646
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
10647
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
10648
11099
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
10649
11100
 
10650
11101
  Qcur = ggml_rope_ext(
@@ -11889,6 +12340,8 @@ struct llm_build_chatglm : public llm_graph_context {
11889
12340
  if (model.layers[il].bv) {
11890
12341
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
11891
12342
  }
12343
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12344
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
11892
12345
  } else {
11893
12346
  cur = build_lora_mm(model.layers[il].wqkv, cur);
11894
12347
  cb(cur, "wqkv", il);
@@ -11896,13 +12349,11 @@ struct llm_build_chatglm : public llm_graph_context {
11896
12349
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
11897
12350
  cb(cur, "bqkv", il);
11898
12351
  }
11899
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
11900
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
12352
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
12353
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
11901
12354
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
11902
12355
  }
11903
12356
 
11904
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
11905
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
11906
12357
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
11907
12358
 
11908
12359
  //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
@@ -12023,6 +12474,8 @@ struct llm_build_glm4 : public llm_graph_context {
12023
12474
  if (model.layers[il].bv) {
12024
12475
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
12025
12476
  }
12477
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12478
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
12026
12479
  } else {
12027
12480
  cur = build_lora_mm(model.layers[il].wqkv, cur);
12028
12481
  cb(cur, "wqkv", il);
@@ -12030,13 +12483,11 @@ struct llm_build_glm4 : public llm_graph_context {
12030
12483
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
12031
12484
  cb(cur, "bqkv", il);
12032
12485
  }
12033
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
12034
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
12486
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
12487
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
12035
12488
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
12036
12489
  }
12037
12490
 
12038
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12039
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
12040
12491
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
12041
12492
 
12042
12493
  Qcur = ggml_rope_ext(
@@ -14125,8 +14576,8 @@ struct llm_build_dots1 : public llm_graph_context {
14125
14576
  }
14126
14577
  };
14127
14578
 
14128
- struct llm_build_arcee : public llm_graph_context {
14129
- llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
14579
+ struct llm_build_ernie4_5 : public llm_graph_context {
14580
+ llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
14130
14581
  const int64_t n_embd_head = hparams.n_embd_head_v;
14131
14582
 
14132
14583
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -14142,25 +14593,19 @@ struct llm_build_arcee : public llm_graph_context {
14142
14593
 
14143
14594
  auto * inp_attn = build_attn_inp_kv_unified();
14144
14595
 
14145
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
14146
-
14147
- ggml_tensor * inp_out_ids = build_inp_out_ids();
14148
-
14149
14596
  for (int il = 0; il < n_layer; ++il) {
14150
14597
  ggml_tensor * inpSA = inpL;
14151
14598
 
14152
14599
  // norm
14153
- cur = build_norm(inpL,
14154
- model.layers[il].attn_norm, NULL,
14155
- LLM_NORM_RMS, il);
14156
- cb(cur, "attn_norm", il);
14600
+ {
14601
+ cur = build_norm(inpL,
14602
+ model.layers[il].attn_norm, NULL,
14603
+ LLM_NORM_RMS, il);
14604
+ cb(cur, "attn_norm", il);
14605
+ }
14157
14606
 
14158
14607
  // self-attention
14159
14608
  {
14160
- // rope freq factors for llama3; may return nullptr for llama2 and other models
14161
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
14162
-
14163
- // compute Q and K and RoPE them
14164
14609
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
14165
14610
  cb(Qcur, "Qcur", il);
14166
14611
  if (model.layers[il].bq) {
@@ -14187,13 +14632,13 @@ struct llm_build_arcee : public llm_graph_context {
14187
14632
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
14188
14633
 
14189
14634
  Qcur = ggml_rope_ext(
14190
- ctx0, Qcur, inp_pos, rope_factors,
14635
+ ctx0, Qcur, inp_pos, nullptr,
14191
14636
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14192
14637
  ext_factor, attn_factor, beta_fast, beta_slow
14193
14638
  );
14194
14639
 
14195
14640
  Kcur = ggml_rope_ext(
14196
- ctx0, Kcur, inp_pos, rope_factors,
14641
+ ctx0, Kcur, inp_pos, nullptr,
14197
14642
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14198
14643
  ext_factor, attn_factor, beta_fast, beta_slow
14199
14644
  );
@@ -14203,12 +14648,13 @@ struct llm_build_arcee : public llm_graph_context {
14203
14648
  cb(Vcur, "Vcur", il);
14204
14649
 
14205
14650
  cur = build_attn(inp_attn, gf,
14206
- model.layers[il].wo, model.layers[il].bo,
14207
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
14208
- cb(cur, "attn_out", il);
14651
+ model.layers[il].wo, NULL,
14652
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14209
14653
  }
14210
14654
 
14211
- if (il == n_layer - 1 && inp_out_ids) {
14655
+ if (il == n_layer - 1) {
14656
+ // skip computing output for unused tokens
14657
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
14212
14658
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14213
14659
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
14214
14660
  }
@@ -14217,22 +14663,22 @@ struct llm_build_arcee : public llm_graph_context {
14217
14663
  cb(ffn_inp, "ffn_inp", il);
14218
14664
 
14219
14665
  // feed-forward network
14220
- // ARCEE uses relu^2 instead of silu
14221
- cur = build_norm(ffn_inp,
14222
- model.layers[il].ffn_norm, NULL,
14223
- LLM_NORM_RMS, il);
14224
- cb(cur, "ffn_norm", il);
14666
+ {
14667
+ cur = build_norm(ffn_inp,
14668
+ model.layers[il].ffn_norm, NULL,
14669
+ LLM_NORM_RMS, il);
14670
+ cb(cur, "ffn_norm", il);
14225
14671
 
14226
- cur = build_ffn(cur,
14227
- model.layers[il].ffn_up, NULL, NULL,
14228
- NULL, NULL, NULL,
14229
- model.layers[il].ffn_down, NULL, NULL,
14230
- NULL,
14231
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
14232
- cb(cur, "ffn_out", il);
14672
+ cur = build_ffn(cur,
14673
+ model.layers[il].ffn_up, NULL, NULL,
14674
+ model.layers[il].ffn_gate, NULL, NULL,
14675
+ model.layers[il].ffn_down, NULL, NULL,
14676
+ NULL,
14677
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14678
+ cb(cur, "ffn_out", il);
14679
+ }
14233
14680
 
14234
14681
  cur = ggml_add(ctx0, cur, ffn_inp);
14235
- cb(cur, "ffn_out", il);
14236
14682
 
14237
14683
  cur = build_cvec(cur, il);
14238
14684
  cb(cur, "l_out", il);
@@ -14260,104 +14706,800 @@ struct llm_build_arcee : public llm_graph_context {
14260
14706
  }
14261
14707
  };
14262
14708
 
14263
- llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
14264
- llama_memory_i * res;
14709
+ struct llm_build_falcon_h1 : public llm_graph_context {
14710
+ const llama_model & model;
14265
14711
 
14266
- switch (arch) {
14267
- // Models that need specific instantiation should be handled in the
14268
- // switch statement
14269
- case LLM_ARCH_BERT:
14270
- case LLM_ARCH_JINA_BERT_V2:
14271
- case LLM_ARCH_NOMIC_BERT:
14272
- case LLM_ARCH_NOMIC_BERT_MOE:
14273
- case LLM_ARCH_NEO_BERT:
14274
- case LLM_ARCH_WAVTOKENIZER_DEC:
14275
- {
14276
- res = nullptr;
14277
- } break;
14278
- // Models that need standard caching should rely on recurrent/hybrid
14279
- // checks
14280
- default:
14281
- {
14282
- if (llm_arch_is_recurrent(arch)) {
14283
- res = new llama_memory_recurrent(
14284
- *this,
14285
- nullptr,
14286
- GGML_TYPE_F32,
14287
- GGML_TYPE_F32,
14288
- cparams.offload_kqv,
14289
- std::max((uint32_t) 1, cparams.n_seq_max),
14290
- cparams.n_seq_max);
14291
- } else if (llm_arch_is_hybrid(arch)) {
14292
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
14712
+ llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
14713
+ const int64_t n_embd_head = hparams.n_embd_head_v;
14293
14714
 
14294
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
14715
+ ggml_tensor * cur;
14716
+ ggml_tensor * inpL;
14295
14717
 
14296
- res = new llama_memory_hybrid(
14297
- /* model */ *this,
14298
- /* attn_type_k */ params.type_k,
14299
- /* attn_type_v */ params.type_v,
14300
- /* attn_v_trans */ !cparams.flash_attn,
14301
- /* attn_kv_size */ cparams.n_ctx,
14302
- /* attn_n_pad */ padding,
14303
- /* attn_n_swa */ hparams.n_swa,
14304
- /* attn_swa_type */ hparams.swa_type,
14305
- /* recurrent_type_k */ GGML_TYPE_F32,
14306
- /* recurrent_type_v */ GGML_TYPE_F32,
14307
- /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
14308
- /* n_seq_max */ cparams.n_seq_max,
14309
- /* offload */ cparams.offload_kqv);
14310
- } else {
14311
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
14718
+ inpL = build_inp_embd(model.tok_embd);
14312
14719
 
14313
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
14720
+ // inp_pos - contains the positions
14721
+ ggml_tensor * inp_pos = build_inp_pos();
14314
14722
 
14315
- LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
14723
+ // Build the inputs in the recurrent & kv cache
14724
+ auto * inp = build_inp_mem_hybrid();
14316
14725
 
14317
- if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
14318
- GGML_ASSERT(hparams.is_swa_any());
14726
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
14319
14727
 
14320
- res = new llama_kv_cache_unified_iswa(
14321
- *this,
14322
- params.type_k,
14323
- params.type_v,
14324
- !cparams.flash_attn,
14325
- cparams.offload_kqv,
14326
- params.swa_full,
14327
- cparams.n_ctx,
14328
- cparams.n_seq_max,
14329
- cparams.n_ubatch,
14330
- padding);
14331
- } else {
14332
- GGML_ASSERT(!hparams.is_swa_any());
14728
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
14333
14729
 
14334
- res = new llama_kv_cache_unified(
14335
- *this,
14336
- nullptr,
14337
- params.type_k,
14338
- params.type_v,
14339
- !cparams.flash_attn,
14340
- cparams.offload_kqv,
14341
- cparams.n_ctx,
14342
- cparams.n_seq_max,
14343
- padding,
14344
- hparams.n_swa,
14345
- hparams.swa_type);
14346
- }
14347
- }
14348
- }
14349
- }
14730
+ for (int il = 0; il < n_layer; ++il) {
14731
+ ggml_tensor * inpSA = inpL;
14350
14732
 
14351
- return res;
14352
- }
14733
+ cur = build_norm(inpL,
14734
+ model.layers[il].attn_norm, NULL,
14735
+ LLM_NORM_RMS, il);
14736
+ cb(cur, "attn_norm", il);
14353
14737
 
14354
- llm_graph_result_ptr llama_model::build_graph(
14355
- const llm_graph_params & params,
14356
- ggml_cgraph * gf,
14357
- llm_graph_type type) const {
14358
- std::unique_ptr<llm_graph_context> llm;
14738
+ // self-attention
14739
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
14740
+ cb(Qcur, "Qcur", il);
14359
14741
 
14360
- switch (arch) {
14742
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
14743
+ cb(Kcur, "Kcur", il);
14744
+
14745
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
14746
+ cb(Vcur, "Vcur", il);
14747
+
14748
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
14749
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
14750
+
14751
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
14752
+
14753
+ Qcur = ggml_rope_ext(
14754
+ ctx0, Qcur, inp_pos, nullptr,
14755
+ n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
14756
+ ext_factor, attn_factor, beta_fast, beta_slow);
14757
+
14758
+ Kcur = ggml_rope_ext(
14759
+ ctx0, Kcur, inp_pos, nullptr,
14760
+ n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
14761
+ ext_factor, attn_factor, beta_fast, beta_slow
14762
+ );
14763
+
14764
+ cb(Qcur, "Qcur-post-rope", il);
14765
+ cb(Kcur, "Kcur-post-rope", il);
14766
+ cb(Vcur, "Vcur-post-rope", il);
14767
+
14768
+ ggml_tensor * attn_out = build_attn(inp, gf,
14769
+ model.layers[il].wo, NULL,
14770
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
14771
+ cb(attn_out, "attn_out", il);
14772
+
14773
+ cur = build_norm(inpL,
14774
+ model.layers[il].attn_norm, NULL,
14775
+ LLM_NORM_RMS, il);
14776
+ // Mamba2 layer
14777
+ cb(cur, "ssm_in", il);
14778
+
14779
+ ggml_tensor * ssm_out = build_mamba2_layer(inp, gf, cur, ubatch, il);
14780
+ cb(ssm_out, "ssm_out", il);
14781
+
14782
+ // Aggregation
14783
+ cur = ggml_add(ctx0, attn_out, ssm_out);
14784
+ inpSA = ggml_add(ctx0, cur, inpSA);
14785
+ cb(cur, "layer_out", il);
14786
+
14787
+ if (il == n_layer - 1 && inp_out_ids) {
14788
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14789
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
14790
+ }
14791
+
14792
+ ggml_tensor * ffn_inp = inpSA;
14793
+ cb(ffn_inp, "ffn_inp", il);
14794
+
14795
+ // feed-forward network
14796
+ cur = build_norm(ffn_inp,
14797
+ model.layers[il].ffn_norm, NULL,
14798
+ LLM_NORM_RMS, il);
14799
+ cb(cur, "ffn_norm", il);
14800
+
14801
+ cur = build_ffn(cur,
14802
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
14803
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
14804
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
14805
+ NULL,
14806
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14807
+ cb(cur, "ffn_out", il);
14808
+
14809
+ cur = ggml_add(ctx0, cur, inpSA);
14810
+
14811
+ cur = build_cvec(cur, il);
14812
+ cb(cur, "l_out", il);
14813
+
14814
+ // input for next layer
14815
+ inpL = cur;
14816
+ }
14817
+
14818
+ cur = inpL;
14819
+
14820
+ cur = build_norm(cur,
14821
+ model.output_norm, NULL,
14822
+ LLM_NORM_RMS, -1);
14823
+
14824
+ cb(cur, "result_norm", -1);
14825
+ res->t_embd = cur;
14826
+
14827
+ // lm_head
14828
+ cur = build_lora_mm(model.output, cur);
14829
+
14830
+ cb(cur, "result_output", -1);
14831
+ res->t_logits = cur;
14832
+
14833
+ ggml_build_forward_expand(gf, cur);
14834
+ }
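
Each falcon_h1 layer above runs attention and a Mamba-2 SSM branch on the same attn_norm output and simply adds the two results before the residual, instead of stacking them sequentially. A toy sketch of that aggregation on plain vectors (hypothetical branch outputs):

    #include <cstdio>
    #include <vector>

    int main() {
        // hypothetical per-token hidden state of width 4
        const std::vector<float> residual = {  0.1f, 0.2f, 0.3f, 0.4f }; // inpSA
        const std::vector<float> attn_out = {  0.5f,-0.1f, 0.0f, 0.2f }; // attention branch
        const std::vector<float> ssm_out  = { -0.2f, 0.3f, 0.1f, 0.0f }; // Mamba-2 branch

        std::vector<float> out(residual.size());
        for (size_t i = 0; i < out.size(); ++i) {
            // cur = attn_out + ssm_out; inpSA = cur + inpSA (as in the graph code above)
            out[i] = (attn_out[i] + ssm_out[i]) + residual[i];
        }
        for (const float v : out) printf("% .3f ", v);
        printf("\n");
        return 0;
    }
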
14835
+
14836
+ ggml_tensor * build_mamba2_layer(
14837
+ llm_graph_input_mem_hybrid * inp,
14838
+ ggml_cgraph * gf,
14839
+ ggml_tensor * cur,
14840
+ const llama_ubatch & ubatch,
14841
+ int il) const {
14842
+ const auto * kv_state = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
14843
+
14844
+ const auto kv_head = kv_state->get_head();
14845
+
14846
+ const int64_t d_conv = hparams.ssm_d_conv;
14847
+ const int64_t d_inner = hparams.ssm_d_inner;
14848
+ const int64_t d_state = hparams.ssm_d_state;
14849
+ const int64_t n_head = hparams.ssm_dt_rank;
14850
+ const int64_t head_dim = d_inner / n_head;
14851
+ const int64_t n_group = hparams.ssm_n_group;
14852
+ const int64_t n_seqs = ubatch.n_seqs;
14853
+
14854
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
14855
+
14856
+ GGML_ASSERT(n_seqs != 0);
14857
+ GGML_ASSERT(ubatch.equal_seqs);
14858
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
14859
+
14860
+ ggml_tensor * conv_states_all = kv_state->get_r_l(il);
14861
+ ggml_tensor * ssm_states_all = kv_state->get_s_l(il);
14862
+
14863
+ ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
14864
+ conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
14865
+
14866
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
14867
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
14868
+
14869
+ // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
14870
+
14871
+ // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
14872
+ ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
14873
+ cb(zxBCdt, "zxBCdt", il);
14874
+
14875
+ // split the above in three
14876
+ ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0);
14877
+ ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt));
14878
+ ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt));
14879
+
14880
+ // conv
14881
+ {
14882
+ // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
14883
+ ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
14884
+
14885
+ // copy last (d_conv - 1) columns back into the state cache
14886
+ ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
14887
+
14888
+ ggml_build_forward_expand(gf,
14889
+ ggml_cpy(ctx0, last_conv,
14890
+ ggml_view_1d(ctx0, conv_states_all,
14891
+ (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
14892
+ kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
14893
+
14894
+ // 1D convolution
14895
+ // The equivalent is to make a self-overlapping view of conv_x
14896
+ // over d_conv columns at each stride in the 3rd dimension,
14897
+ // then element-wise multiply that with the conv1d weight,
14898
+ // then sum the elements of each row,
14899
+ // (the last two steps are a dot product over rows (also doable with mul_mat))
14900
+ // then permute away the ne[0] dimension,
14901
+ // and then you're left with the resulting x tensor.
14902
+ // For simultaneous sequences, all sequences need to have the same length.
14903
+ xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
14904
+
14905
+ // bias
14906
+ xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
14907
+
14908
+ xBC = ggml_silu(ctx0, xBC);
14909
+ }
14910
+
14911
+ // ssm
14912
+ {
14913
+ // These correspond to V K Q in SSM/attention duality
14914
+ ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0);
14915
+
14916
+ ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC));
14917
+
14918
+ ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC));
14919
+
14920
+ // {n_head, n_seq_tokens, n_seqs}
14921
+ dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
14922
+
14923
+ ggml_tensor * A = model.layers[il].ssm_a;
14924
+
14925
+ // use the states and the indices provided by build_rs
14926
+ // (this is necessary in order to properly use the states before they are overwritten,
14927
+ // while avoiding to make unnecessary copies of the states)
14928
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
14929
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, kv_state->get_size());
14930
+
14931
+ // TODO: use semistructured matrices to implement state-space duality
14932
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
14933
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
14934
+ };
14935
+
14936
+ ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
14937
+
14938
+ // store last states
14939
+ ggml_build_forward_expand(gf,
14940
+ ggml_cpy(ctx0,
14941
+ ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
14942
+ ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
14943
+
14944
+ ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0);
14945
+
14946
+ // TODO: skip computing output earlier for unused tokens
14947
+
14948
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
14949
+ y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
14950
+
14951
+ // grouped RMS norm
14952
+ if (model.layers[il].ssm_norm) {
14953
+ y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
14954
+ y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
14955
+ }
14956
+
14957
+ y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
14958
+
14959
+ // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
14960
+ cur = build_lora_mm(model.layers[il].ssm_out, y);
14961
+ }
14962
+
14963
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
14964
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
14965
+ cb(cur, "mamba_out", il);
14966
+ return cur;
14967
+ }
14968
+ };
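
The ggml_ssm_conv call in build_mamba2_layer implements the step described in the long comment inside its conv block: concatenate the cached last d_conv - 1 columns with the new tokens, slide a per-channel window of width d_conv over the result, multiply element-wise by the conv1d weight and sum. A plain-C++ sketch of that causal per-channel convolution with toy numbers (an illustration of the idea, not the ggml kernel itself):

    #include <cstdio>
    #include <vector>

    int main() {
        const int d_conv   = 4; // window width
        const int n_tokens = 3; // new tokens in this ubatch
        // one channel for clarity; the real op repeats this for every channel of d_inner + 2*n_group*d_state
        std::vector<float> state  = { 0.1f, 0.2f, 0.3f };           // cached last d_conv - 1 inputs
        std::vector<float> x_new  = { 1.0f, 2.0f, 3.0f };           // this channel's xBC column
        std::vector<float> weight = { 0.25f, 0.25f, 0.25f, 0.25f }; // this channel's conv1d weight

        // conv_x = concat(state, x_new) along time
        std::vector<float> conv_x = state;
        conv_x.insert(conv_x.end(), x_new.begin(), x_new.end());

        for (int t = 0; t < n_tokens; ++t) {
            // self-overlapping window of width d_conv ending at token t
            float y = 0.0f;
            for (int k = 0; k < d_conv; ++k) {
                y += conv_x[t + k] * weight[k];
            }
            printf("token %d -> %.4f\n", t, y);
        }

        // the last d_conv - 1 columns become the next cached state
        const std::vector<float> next_state(conv_x.end() - (d_conv - 1), conv_x.end());
        printf("next state: %.1f %.1f %.1f\n", next_state[0], next_state[1], next_state[2]);
        return 0;
    }
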
14969
+
14970
+ struct llm_build_arcee : public llm_graph_context {
14971
+ llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
14972
+ const int64_t n_embd_head = hparams.n_embd_head_v;
14973
+
14974
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
14975
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
14976
+
14977
+ ggml_tensor * cur;
14978
+ ggml_tensor * inpL;
14979
+
14980
+ inpL = build_inp_embd(model.tok_embd);
14981
+
14982
+ // inp_pos - contains the positions
14983
+ ggml_tensor * inp_pos = build_inp_pos();
14984
+
14985
+ auto * inp_attn = build_attn_inp_kv_unified();
14986
+
14987
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
14988
+
14989
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
14990
+
14991
+ for (int il = 0; il < n_layer; ++il) {
14992
+ ggml_tensor * inpSA = inpL;
14993
+
14994
+ // norm
14995
+ cur = build_norm(inpL,
14996
+ model.layers[il].attn_norm, NULL,
14997
+ LLM_NORM_RMS, il);
14998
+ cb(cur, "attn_norm", il);
14999
+
15000
+ // self-attention
15001
+ {
15002
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
15003
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
15004
+
15005
+ // compute Q and K and RoPE them
15006
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
15007
+ cb(Qcur, "Qcur", il);
15008
+ if (model.layers[il].bq) {
15009
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
15010
+ cb(Qcur, "Qcur", il);
15011
+ }
15012
+
15013
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
15014
+ cb(Kcur, "Kcur", il);
15015
+ if (model.layers[il].bk) {
15016
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
15017
+ cb(Kcur, "Kcur", il);
15018
+ }
15019
+
15020
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
15021
+ cb(Vcur, "Vcur", il);
15022
+ if (model.layers[il].bv) {
15023
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
15024
+ cb(Vcur, "Vcur", il);
15025
+ }
15026
+
15027
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
15028
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
15029
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
15030
+
15031
+ Qcur = ggml_rope_ext(
15032
+ ctx0, Qcur, inp_pos, rope_factors,
15033
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15034
+ ext_factor, attn_factor, beta_fast, beta_slow
15035
+ );
15036
+
15037
+ Kcur = ggml_rope_ext(
15038
+ ctx0, Kcur, inp_pos, rope_factors,
15039
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15040
+ ext_factor, attn_factor, beta_fast, beta_slow
15041
+ );
15042
+
15043
+ cb(Qcur, "Qcur", il);
15044
+ cb(Kcur, "Kcur", il);
15045
+ cb(Vcur, "Vcur", il);
15046
+
15047
+ cur = build_attn(inp_attn, gf,
15048
+ model.layers[il].wo, model.layers[il].bo,
15049
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15050
+ cb(cur, "attn_out", il);
15051
+ }
15052
+
15053
+ if (il == n_layer - 1 && inp_out_ids) {
15054
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
15055
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
15056
+ }
15057
+
15058
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
15059
+ cb(ffn_inp, "ffn_inp", il);
15060
+
15061
+ // feed-forward network
15062
+ // ARCEE uses relu^2 instead of silu
15063
+ cur = build_norm(ffn_inp,
15064
+ model.layers[il].ffn_norm, NULL,
15065
+ LLM_NORM_RMS, il);
15066
+ cb(cur, "ffn_norm", il);
15067
+
15068
+ cur = build_ffn(cur,
15069
+ model.layers[il].ffn_up, NULL, NULL,
15070
+ NULL, NULL, NULL,
15071
+ model.layers[il].ffn_down, NULL, NULL,
15072
+ NULL,
15073
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
15074
+ cb(cur, "ffn_out", il);
15075
+
15076
+ cur = ggml_add(ctx0, cur, ffn_inp);
15077
+ cb(cur, "ffn_out", il);
15078
+
15079
+ cur = build_cvec(cur, il);
15080
+ cb(cur, "l_out", il);
15081
+
15082
+ // input for next layer
15083
+ inpL = cur;
15084
+ }
15085
+
15086
+ cur = inpL;
15087
+
15088
+ cur = build_norm(cur,
15089
+ model.output_norm, NULL,
15090
+ LLM_NORM_RMS, -1);
15091
+
15092
+ cb(cur, "result_norm", -1);
15093
+ res->t_embd = cur;
15094
+
15095
+ // lm_head
15096
+ cur = build_lora_mm(model.output, cur);
15097
+
15098
+ cb(cur, "result_output", -1);
15099
+ res->t_logits = cur;
15100
+
15101
+ ggml_build_forward_expand(gf, cur);
15102
+ }
15103
+ };
15104
+
15105
+ struct llm_build_hunyuan_moe : public llm_graph_context {
15106
+ llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15107
+ const int64_t n_embd_head = hparams.n_embd_head_v;
15108
+
15109
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
15110
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
15111
+
15112
+ ggml_tensor * cur;
15113
+ ggml_tensor * inpL;
15114
+
15115
+ inpL = build_inp_embd(model.tok_embd);
15116
+
15117
+ // inp_pos - contains the positions
15118
+ ggml_tensor * inp_pos = build_inp_pos();
15119
+
15120
+ auto * inp_attn = build_attn_inp_kv_unified();
15121
+
15122
+ const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
15123
+
15124
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
15125
+
15126
+ for (int il = 0; il < n_layer; ++il) {
15127
+ ggml_tensor * inpSA = inpL;
15128
+
15129
+ // norm
15130
+ cur = build_norm(inpL,
15131
+ model.layers[il].attn_norm, NULL,
15132
+ LLM_NORM_RMS, il);
15133
+ cb(cur, "attn_norm", il);
15134
+
15135
+ // self-attention
15136
+ {
15137
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
15138
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
15139
+
15140
+ // compute Q and K and RoPE them
15141
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
15142
+ cb(Qcur, "Qcur", il);
15143
+ if (model.layers[il].bq) {
15144
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
15145
+ cb(Qcur, "Qcur", il);
15146
+ }
15147
+
15148
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
15149
+ cb(Kcur, "Kcur", il);
15150
+ if (model.layers[il].bk) {
15151
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
15152
+ cb(Kcur, "Kcur", il);
15153
+ }
15154
+
15155
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
15156
+ cb(Vcur, "Vcur", il);
15157
+ if (model.layers[il].bv) {
15158
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
15159
+ cb(Vcur, "Vcur", il);
15160
+ }
15161
+
15162
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
15163
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
15164
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
15165
+
15166
+ Qcur = ggml_rope_ext(
15167
+ ctx0, Qcur, inp_pos, rope_factors,
15168
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15169
+ ext_factor, attn_factor, beta_fast, beta_slow
15170
+ );
15171
+
15172
+ cb(Qcur, "Qcur", il);
15173
+ cb(Kcur, "Kcur", il);
15174
+ cb(Vcur, "Vcur", il);
15175
+
15176
+ Kcur = ggml_rope_ext(
15177
+ ctx0, Kcur, inp_pos, rope_factors,
15178
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15179
+ ext_factor, attn_factor, beta_fast, beta_slow
15180
+ );
15181
+
15182
+ Kcur = build_norm(Kcur,
15183
+ model.layers[il].attn_k_norm, nullptr,
15184
+ LLM_NORM_RMS, il);
15185
+ cb(Kcur, "Kcur_norm", il);
15186
+
15187
+ Qcur = build_norm(Qcur,
15188
+ model.layers[il].attn_q_norm, nullptr,
15189
+ LLM_NORM_RMS, il);
15190
+ cb(Qcur, "Qcur_norm", il);
15191
+
15192
+ cur = build_attn(inp_attn, gf,
15193
+ model.layers[il].wo, model.layers[il].bo,
15194
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15195
+ cb(cur, "attn_out", il);
15196
+ }
15197
+
15198
+ if (il == n_layer - 1 && inp_out_ids) {
15199
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
15200
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
15201
+ }
15202
+
15203
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
15204
+ cb(ffn_inp, "ffn_inp", il);
15205
+
15206
+ cur = build_norm(ffn_inp,
15207
+ model.layers[il].ffn_norm, NULL,
15208
+ LLM_NORM_RMS, il);
15209
+ cb(cur, "ffn_norm", il);
15210
+
15211
+ // feed-forward network (non-MoE)
15212
+ ggml_tensor * cur_mlp = build_ffn(cur,
15213
+ model.layers[il].ffn_up_shexp, NULL, NULL,
15214
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
15215
+ model.layers[il].ffn_down_shexp, NULL, NULL,
15216
+ NULL,
15217
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
15218
+ cb(cur_mlp, "ffn_mlp", il);
15219
+
15220
+ // MoE branch
15221
+ ggml_tensor * cur_moe = build_moe_ffn(cur,
15222
+ model.layers[il].ffn_gate_inp,
15223
+ model.layers[il].ffn_up_exps,
15224
+ model.layers[il].ffn_gate_exps,
15225
+ model.layers[il].ffn_down_exps,
15226
+ nullptr,
15227
+ n_expert, n_expert_used,
15228
+ LLM_FFN_SILU,
15229
+ true, // norm_topk_prob
15230
+ false,
15231
+ 0.0,
15232
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
15233
+ il);
15234
+ cb(cur_moe, "ffn_moe_out", il);
15235
+
15236
+ ggml_tensor * ffn_out = ggml_add(ctx0, cur_moe, cur_mlp);
15237
+ cb(ffn_out, "ffn_out", il);
15238
+
15239
+ cur = ggml_add(ctx0, ffn_out, ffn_inp);
15240
+
15241
+ cur = build_cvec(cur, il);
15242
+ cb(cur, "l_out", il);
15243
+
15244
+ // input for next layer
15245
+ inpL = cur;
15246
+ }
15247
+
15248
+ cur = inpL;
15249
+
15250
+ cur = build_norm(cur,
15251
+ model.output_norm, NULL,
15252
+ LLM_NORM_RMS, -1);
15253
+
15254
+ cb(cur, "result_norm", -1);
15255
+ res->t_embd = cur;
15256
+
15257
+ // lm_head
15258
+ cur = build_lora_mm(model.output, cur);
15259
+ cb(cur, "result_output", -1);
15260
+ res->t_logits = cur;
15261
+
15262
+ ggml_build_forward_expand(gf, cur);
15263
+ }
15264
+ };
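
Each hunyuan_moe layer adds the output of a dense shared-expert FFN (the ffn_*_shexp tensors) to a softmax-gated top-k mixture of experts, and only then applies the residual. A self-contained toy sketch of that combination: softmax gating over made-up router logits, top-2 selection with renormalized weights (norm_topk_prob), plus a shared-expert term; scalars stand in for the per-expert outputs.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // hypothetical router logits for 4 experts on one token
        const std::vector<float> logits = { 1.2f, -0.3f, 0.7f, 0.1f };
        const int n_expert_used = 2;

        // softmax gating (LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX)
        const float mx = *std::max_element(logits.begin(), logits.end());
        std::vector<float> probs(logits.size());
        float sum = 0.0f;
        for (size_t i = 0; i < logits.size(); ++i) { probs[i] = std::exp(logits[i] - mx); sum += probs[i]; }
        for (float & p : probs) p /= sum;

        // pick the top-k experts and renormalize their weights (norm_topk_prob = true)
        std::vector<size_t> idx(probs.size());
        for (size_t i = 0; i < idx.size(); ++i) idx[i] = i;
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](size_t a, size_t b) { return probs[a] > probs[b]; });
        float topk_sum = 0.0f;
        for (int k = 0; k < n_expert_used; ++k) topk_sum += probs[idx[k]];

        // toy per-expert and shared-expert outputs
        const std::vector<float> expert_out = { 0.9f, -0.4f, 0.5f, 0.2f };
        const float shexp_out = 0.3f;

        float moe_out = 0.0f;
        for (int k = 0; k < n_expert_used; ++k) {
            moe_out += (probs[idx[k]] / topk_sum) * expert_out[idx[k]];
        }

        // ffn_out = MoE branch + shared-expert branch (added to ffn_inp afterwards in the graph)
        printf("moe_out = %.4f, shexp_out = %.4f, ffn_out = %.4f\n", moe_out, shexp_out, moe_out + shexp_out);
        return 0;
    }
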
15265
+
15266
+ struct llm_build_smollm3 : public llm_graph_context {
15267
+ llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15268
+ const int64_t n_embd_head = hparams.n_embd_head_v;
15269
+
15270
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
15271
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
15272
+
15273
+ ggml_tensor * cur;
15274
+ ggml_tensor * inpL;
15275
+
15276
+ inpL = build_inp_embd(model.tok_embd);
15277
+
15278
+ // inp_pos - contains the positions
15279
+ ggml_tensor * inp_pos = build_inp_pos();
15280
+
15281
+ auto * inp_attn = build_attn_inp_kv_unified();
15282
+
15283
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
15284
+
15285
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
15286
+
15287
+ for (int il = 0; il < n_layer; ++il) {
15288
+ ggml_tensor * inpSA = inpL;
15289
+
15290
+ const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
15291
+
15292
+ // norm
15293
+ cur = build_norm(inpL,
15294
+ model.layers[il].attn_norm, NULL,
15295
+ LLM_NORM_RMS, il);
15296
+ cb(cur, "attn_norm", il);
15297
+
15298
+ // self-attention
15299
+ {
15300
+ // compute Q and K and RoPE them
15301
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
15302
+ cb(Qcur, "Qcur", il);
15303
+ if (model.layers[il].bq) {
15304
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
15305
+ cb(Qcur, "Qcur", il);
15306
+ }
15307
+
15308
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
15309
+ cb(Kcur, "Kcur", il);
15310
+ if (model.layers[il].bk) {
15311
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
15312
+ cb(Kcur, "Kcur", il);
15313
+ }
15314
+
15315
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
15316
+ cb(Vcur, "Vcur", il);
15317
+ if (model.layers[il].bv) {
15318
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
15319
+ cb(Vcur, "Vcur", il);
15320
+ }
15321
+
15322
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
15323
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
15324
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
15325
+
15326
+ if (use_rope) {
15327
+ Qcur = ggml_rope_ext(
15328
+ ctx0, Qcur, inp_pos, nullptr,
15329
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15330
+ ext_factor, attn_factor, beta_fast, beta_slow
15331
+ );
15332
+
15333
+ Kcur = ggml_rope_ext(
15334
+ ctx0, Kcur, inp_pos, nullptr,
15335
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15336
+ ext_factor, attn_factor, beta_fast, beta_slow
15337
+ );
15338
+ }
15339
+
15340
+ cb(Qcur, "Qcur", il);
15341
+ cb(Kcur, "Kcur", il);
15342
+ cb(Vcur, "Vcur", il);
15343
+
15344
+ cur = build_attn(inp_attn, gf,
15345
+ model.layers[il].wo, model.layers[il].bo,
15346
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15347
+ cb(cur, "attn_out", il);
15348
+ }
15349
+
15350
+ if (il == n_layer - 1 && inp_out_ids) {
15351
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
15352
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
15353
+ }
15354
+
15355
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
15356
+ cb(ffn_inp, "ffn_inp", il);
15357
+
15358
+ // feed-forward network
15359
+ {
15360
+ cur = build_norm(ffn_inp,
15361
+ model.layers[il].ffn_norm, NULL,
15362
+ LLM_NORM_RMS, il);
15363
+ cb(cur, "ffn_norm", il);
15364
+
15365
+ cur = build_ffn(cur,
15366
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
15367
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
15368
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
15369
+ NULL,
15370
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
15371
+ cb(cur, "ffn_out", il);
15372
+ }
15373
+
15374
+ cur = ggml_add(ctx0, cur, ffn_inp);
15375
+ cb(cur, "ffn_out", il);
15376
+
15377
+ cur = build_cvec(cur, il);
15378
+ cb(cur, "l_out", il);
15379
+
15380
+ // input for next layer
15381
+ inpL = cur;
15382
+ }
15383
+
15384
+ cur = inpL;
15385
+
15386
+ cur = build_norm(cur,
15387
+ model.output_norm, NULL,
15388
+ LLM_NORM_RMS, -1);
15389
+
15390
+ cb(cur, "result_norm", -1);
15391
+ res->t_embd = cur;
15392
+
15393
+ // lm_head
15394
+ cur = build_lora_mm(model.output, cur);
15395
+
15396
+ cb(cur, "result_output", -1);
15397
+ res->t_logits = cur;
15398
+
15399
+ ggml_build_forward_expand(gf, cur);
15400
+ }
15401
+ };
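
smollm3 skips RoPE on every n_no_rope_layer_step-th layer: use_rope is false whenever (il + 1) is a multiple of that step, so those layers become NoPE layers. A tiny sketch that prints the resulting pattern for hypothetical values (the real step comes from the model's hparams):

    #include <cstdio>

    int main() {
        // hypothetical values, not read from a real GGUF
        const int n_layer = 12;
        const int n_no_rope_layer_step = 4;

        for (int il = 0; il < n_layer; ++il) {
            const bool use_rope = (il + 1) % n_no_rope_layer_step != 0;
            printf("layer %2d: %s\n", il, use_rope ? "rope" : "no rope (NoPE)");
        }
        return 0;
    }
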
15402
+
15403
+ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
15404
+ llama_memory_i * res;
15405
+
15406
+ switch (arch) {
15407
+ // Models that need specific instantiation should be handled in the
15408
+ // switch statement
15409
+ case LLM_ARCH_BERT:
15410
+ case LLM_ARCH_JINA_BERT_V2:
15411
+ case LLM_ARCH_NOMIC_BERT:
15412
+ case LLM_ARCH_NOMIC_BERT_MOE:
15413
+ case LLM_ARCH_NEO_BERT:
15414
+ case LLM_ARCH_WAVTOKENIZER_DEC:
15415
+ {
15416
+ res = nullptr;
15417
+ } break;
15418
+ // Models that need standard caching should rely on recurrent/hybrid
15419
+ // checks
15420
+ default:
15421
+ {
15422
+ if (llm_arch_is_recurrent(arch)) {
15423
+ res = new llama_memory_recurrent(
15424
+ *this,
15425
+ nullptr,
15426
+ GGML_TYPE_F32,
15427
+ GGML_TYPE_F32,
15428
+ cparams.offload_kqv,
15429
+ std::max((uint32_t) 1, cparams.n_seq_max),
15430
+ cparams.n_seq_max);
15431
+ } else if (llm_arch_is_hybrid(arch)) {
15432
+ const auto padding = llama_kv_cache_unified::get_padding(cparams);
15433
+
15434
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
15435
+
15436
+ res = new llama_memory_hybrid(
15437
+ /* model */ *this,
15438
+ /* attn_type_k */ params.type_k,
15439
+ /* attn_type_v */ params.type_v,
15440
+ /* attn_v_trans */ !cparams.flash_attn,
15441
+ /* attn_kv_size */ cparams.n_ctx,
15442
+ /* attn_n_pad */ padding,
15443
+ /* attn_n_swa */ hparams.n_swa,
15444
+ /* attn_swa_type */ hparams.swa_type,
15445
+ /* recurrent_type_k */ GGML_TYPE_F32,
15446
+ /* recurrent_type_v */ GGML_TYPE_F32,
15447
+ /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
15448
+ /* n_seq_max */ cparams.n_seq_max,
15449
+ /* offload */ cparams.offload_kqv,
15450
+ /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
15451
+ /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
15452
+ } else {
15453
+ const auto padding = llama_kv_cache_unified::get_padding(cparams);
15454
+
15455
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
15456
+
15457
+ LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
15458
+
15459
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
15460
+ GGML_ASSERT(hparams.is_swa_any());
15461
+
15462
+ res = new llama_kv_cache_unified_iswa(
15463
+ *this,
15464
+ params.type_k,
15465
+ params.type_v,
15466
+ !cparams.flash_attn,
15467
+ cparams.offload_kqv,
15468
+ params.swa_full,
15469
+ cparams.n_ctx,
15470
+ cparams.n_seq_max,
15471
+ cparams.n_ubatch,
15472
+ padding);
15473
+ } else {
15474
+ GGML_ASSERT(!hparams.is_swa_any());
15475
+
15476
+ res = new llama_kv_cache_unified(
15477
+ *this,
15478
+ nullptr,
15479
+ params.type_k,
15480
+ params.type_v,
15481
+ !cparams.flash_attn,
15482
+ cparams.offload_kqv,
15483
+ cparams.n_ctx,
15484
+ cparams.n_seq_max,
15485
+ padding,
15486
+ hparams.n_swa,
15487
+ hparams.swa_type);
15488
+ }
15489
+ }
15490
+ }
15491
+ }
15492
+
15493
+ return res;
15494
+ }
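
Before building a hybrid or unified KV cache, create_memory rounds cparams.n_ctx up to a multiple of the cache padding with GGML_PAD. A minimal sketch of that rounding with hypothetical numbers (256 is only an illustrative padding; the real value comes from llama_kv_cache_unified::get_padding()):

    #include <cstdint>
    #include <cstdio>

    // round x up to a multiple of n; equivalent in effect to GGML_PAD for the
    // power-of-two paddings used here (the real macro lives in ggml.h)
    static uint32_t pad_up(uint32_t x, uint32_t n) {
        return ((x + n - 1) / n) * n;
    }

    int main() {
        const uint32_t padding = 256;
        const uint32_t requested[] = { 4096, 4097, 5000 };

        for (const uint32_t n_ctx : requested) {
            printf("n_ctx %5u -> padded %5u\n", n_ctx, pad_up(n_ctx, padding));
        }
        return 0;
    }
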
15495
+
15496
+ llm_graph_result_ptr llama_model::build_graph(
15497
+ const llm_graph_params & params,
15498
+ ggml_cgraph * gf,
15499
+ llm_graph_type type) const {
15500
+ std::unique_ptr<llm_graph_context> llm;
15501
+
15502
+ switch (arch) {
14361
15503
  case LLM_ARCH_LLAMA:
14362
15504
  {
14363
15505
  llm = std::make_unique<llm_build_llama>(*this, params, gf);
@@ -14495,6 +15637,7 @@ llm_graph_result_ptr llama_model::build_graph(
14495
15637
  llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
14496
15638
  } break;
14497
15639
  case LLM_ARCH_MAMBA:
15640
+ case LLM_ARCH_MAMBA2:
14498
15641
  {
14499
15642
  llm = std::make_unique<llm_build_mamba>(*this, params, gf);
14500
15643
  } break;
@@ -14635,6 +15778,22 @@ llm_graph_result_ptr llama_model::build_graph(
14635
15778
  {
14636
15779
  llm = std::make_unique<llm_build_arcee>(*this, params, gf);
14637
15780
  } break;
15781
+ case LLM_ARCH_ERNIE4_5:
15782
+ {
15783
+ llm = std::make_unique<llm_build_ernie4_5>(*this, params, gf);
15784
+ } break;
15785
+ case LLM_ARCH_HUNYUAN_MOE:
15786
+ {
15787
+ llm = std::make_unique<llm_build_hunyuan_moe>(*this, params, gf);
15788
+ } break;
15789
+ case LLM_ARCH_SMOLLM3:
15790
+ {
15791
+ llm = std::make_unique<llm_build_smollm3>(*this, params, gf);
15792
+ } break;
15793
+ case LLM_ARCH_FALCON_H1:
15794
+ {
15795
+ llm = std::make_unique<llm_build_falcon_h1>(*this, params, gf);
15796
+ } break;
14638
15797
  default:
14639
15798
  GGML_ABORT("fatal error");
14640
15799
  }
@@ -14751,6 +15910,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
14751
15910
  case LLM_ARCH_REFACT:
14752
15911
  case LLM_ARCH_BLOOM:
14753
15912
  case LLM_ARCH_MAMBA:
15913
+ case LLM_ARCH_MAMBA2:
14754
15914
  case LLM_ARCH_JINA_BERT_V2:
14755
15915
  case LLM_ARCH_T5:
14756
15916
  case LLM_ARCH_T5ENCODER:
@@ -14785,11 +15945,14 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
14785
15945
  case LLM_ARCH_CHAMELEON:
14786
15946
  case LLM_ARCH_BAILINGMOE:
14787
15947
  case LLM_ARCH_NEO_BERT:
15948
+ case LLM_ARCH_SMOLLM3:
14788
15949
  case LLM_ARCH_ARCEE:
15950
+ case LLM_ARCH_ERNIE4_5:
14789
15951
  return LLAMA_ROPE_TYPE_NORM;
14790
15952
 
14791
15953
  // the pairs of head values are offset by n_rot/2
14792
15954
  case LLM_ARCH_FALCON:
15955
+ case LLM_ARCH_FALCON_H1:
14793
15956
  case LLM_ARCH_GROK:
14794
15957
  case LLM_ARCH_DBRX:
14795
15958
  case LLM_ARCH_BERT:
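
The "pairs of head values are offset by n_rot/2" comment is the difference between the two groups in this switch: LLAMA_ROPE_TYPE_NORM rotates adjacent dimension pairs (2i, 2i+1) of each head, while LLAMA_ROPE_TYPE_NEOX pairs dimension i with i + n_rot/2 (first half against second half). A tiny sketch that just prints the two pairings for a toy n_rot:

    #include <cstdio>

    int main() {
        const int n_rot = 8; // toy rotation dimension

        printf("NORM pairs: ");
        for (int i = 0; i < n_rot/2; ++i) printf("(%d,%d) ", 2*i, 2*i + 1);   // adjacent dims
        printf("\nNEOX pairs: ");
        for (int i = 0; i < n_rot/2; ++i) printf("(%d,%d) ", i, i + n_rot/2); // offset by n_rot/2
        printf("\n");
        return 0;
    }
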
@@ -14821,6 +15984,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
14821
15984
  case LLM_ARCH_EXAONE:
14822
15985
  case LLM_ARCH_MINICPM3:
14823
15986
  case LLM_ARCH_DOTS1:
15987
+ case LLM_ARCH_HUNYUAN_MOE:
14824
15988
  return LLAMA_ROPE_TYPE_NEOX;
14825
15989
 
14826
15990
  case LLM_ARCH_QWEN2VL: