@fugood/llama.node 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/CMakeLists.txt +0 -1
  3. package/src/llama.cpp/common/arg.cpp +7 -0
  4. package/src/llama.cpp/common/common.h +1 -0
  5. package/src/llama.cpp/ggml/CMakeLists.txt +7 -2
  6. package/src/llama.cpp/ggml/include/ggml.h +91 -10
  7. package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -1
  8. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  9. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
  10. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +726 -155
  11. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
  12. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +9 -9
  14. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +49 -9
  15. package/src/llama.cpp/include/llama.h +1 -0
  16. package/src/llama.cpp/src/llama-arch.cpp +90 -2
  17. package/src/llama.cpp/src/llama-arch.h +6 -0
  18. package/src/llama.cpp/src/llama-batch.cpp +27 -1
  19. package/src/llama.cpp/src/llama-batch.h +8 -1
  20. package/src/llama.cpp/src/llama-chat.cpp +15 -0
  21. package/src/llama.cpp/src/llama-chat.h +1 -0
  22. package/src/llama.cpp/src/llama-graph.cpp +64 -50
  23. package/src/llama.cpp/src/llama-graph.h +41 -16
  24. package/src/llama.cpp/src/llama-hparams.cpp +2 -1
  25. package/src/llama.cpp/src/llama-hparams.h +1 -0
  26. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
  27. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
  28. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
  29. package/src/llama.cpp/src/llama-kv-cache-unified.h +62 -24
  30. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  31. package/src/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
  32. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  33. package/src/llama.cpp/src/llama-memory-recurrent.cpp +15 -2
  34. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  35. package/src/llama.cpp/src/llama-memory.h +3 -0
  36. package/src/llama.cpp/src/llama-model.cpp +1234 -248
  37. package/src/llama.cpp/src/llama-model.h +2 -0
  38. package/src/llama.cpp/src/llama-vocab.cpp +8 -1
  39. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
@@ -102,6 +102,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_57B_A14B: return "57B.A14B";
  case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
  case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
+ case LLM_TYPE_A13B: return "A13B";
  case LLM_TYPE_30B_A3B: return "30B.A3B";
  case LLM_TYPE_235B_A22B: return "235B.A22B";
  case LLM_TYPE_E2B: return "E2B";
@@ -208,23 +209,27 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
  } break;
  case GGML_OP_SSM_CONV:
  {
- // FIXME
- ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
+ const int64_t n_seq_tokens = 512;
+ const int64_t n_seqs = 3;
+ ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
  op_tensor = ggml_ssm_conv(ctx, conv_x, w);
  } break;
  case GGML_OP_SSM_SCAN:
  {
- // FIXME
- const int64_t d_state = w->ne[0];
- const int64_t d_inner = w->ne[1];
+ // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
+ const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
+ const int64_t n_head = w->ne[1];
+ const int64_t head_dim = hparams.ssm_d_inner / n_head;
+ const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
  const int64_t n_seq_tokens = 512;
- const int64_t n_seqs = 1;
- ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
- ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
- ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
- ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
- ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
- op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
+ const int64_t n_seqs = 3;
+ ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
+ ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
+ ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
+ ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
+ ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
+ ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
+ op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
  } break;
  case GGML_OP_RWKV_WKV6:
  {
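A minimal standalone sketch of the shape bookkeeping behind the dummy ggml_ssm_scan operands above. The shapes are read off this hunk; the concrete values below are illustrative placeholders, not taken from the package:

// Sketch only: no ggml calls, just the operand shapes that the support check
// above builds for ggml_ssm_scan(ctx, s, x, dt, A, B, C, ids).
#include <cassert>
#include <cstdint>

int main() {
    // placeholder hyperparameters, for illustration only
    const int64_t d_state      = 128;
    const int64_t n_head       = 48;
    const int64_t head_dim     = 64;   // hparams.ssm_d_inner / n_head
    const int64_t n_group      = 1;
    const int64_t n_seq_tokens = 512;  // same constant as the support check
    const int64_t n_seqs       = 3;

    const int64_t s [4] = {d_state, head_dim, n_head, n_seqs};       // state
    const int64_t x [4] = {head_dim, n_head, n_seq_tokens, n_seqs};  // input
    const int64_t dt[3] = {n_head, n_seq_tokens, n_seqs};            // step sizes
    const int64_t B [4] = {d_state, n_group, n_seq_tokens, n_seqs};
    const int64_t C [4] = {d_state, n_group, n_seq_tokens, n_seqs};
    const int64_t n_ids = n_seqs;                                    // 1-D I32 ids tensor

    assert(s[1] == x[0] && s[2] == x[1]); // state and input agree on head layout
    assert(B[0] == C[0] && B[1] == C[1]); // B and C share the grouped layout
    (void)dt; (void)n_ids;
    return 0;
}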
@@ -1081,6 +1086,38 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_MAMBA2:
+ {
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 24:
+ switch (hparams.n_embd) {
+ case 768: type = LLM_TYPE_SMALL; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 48:
+ switch (hparams.n_embd) {
+ case 1024: type = LLM_TYPE_MEDIUM; break;
+ case 1536: type = LLM_TYPE_LARGE; break;
+ case 2048: type = LLM_TYPE_XL; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ case 64:
+ switch (hparams.n_embd) {
+ case 2560: type = LLM_TYPE_3B; break;
+ case 4096: type = LLM_TYPE_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ } break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_XVERSE:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1513,6 +1550,58 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_FALCON_H1:
+ {
+ // Common parameters
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ // SSM parameters
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+ std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
+
+ switch (hparams.n_layer) {
+ case 36:
+ type = LLM_TYPE_0_5B; break;
+ case 24:
+ type = LLM_TYPE_1_5B; break;
+ case 66:
+ type = LLM_TYPE_1B; break;
+ case 32:
+ type = LLM_TYPE_3B; break;
+ case 44:
+ type = LLM_TYPE_7B; break;
+ case 72:
+ type = LLM_TYPE_34B; break;
+ default:
+ type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_HUNYUAN_MOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_A13B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_SMOLLM3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ hparams.n_no_rope_layer_step = 4;
+
+ switch (hparams.n_layer) {
+ case 36: type = LLM_TYPE_3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  default: throw std::runtime_error("unsupported model architecture");
  }

@@ -3120,6 +3209,54 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
  layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);

+ // out_proj
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_MAMBA2:
+ {
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t n_head = hparams.ssm_dt_rank;
+ const int64_t n_group = hparams.ssm_n_group;
+ const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
+
+ // only an expansion factor of 2 is supported for now
+ GGML_ASSERT(2 * n_embd == d_inner);
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ // norm
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
+
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
+
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
+
+ // no "weight" suffix for these
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
+
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
+
  // out_proj
  layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  }
@@ -4385,6 +4522,149 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_FALCON_H1:
+ {
+ // Common
+ const int64_t hidden_size = hparams.n_embd; // hidden_size
+
+ // mamba2 Mixer SSM params
+ const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
+ const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
+ const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
+ const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
+ const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
+ const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
+ const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
+
+ // attn params
+ const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head
+ const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
+
+ // ffn params
+ const int64_t ffn_intermediate_size = hparams.n_ff(0);
+
+ // embeddings
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
+
+ // output
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ /*SSM LAYERS*/
+ // ssm in
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
+ // ssm 1d conv
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
+ // ssm_dt
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
+ // no "weight" suffix for these
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
+ // ssm_norm
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
+ // out_proj
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
+
+ /*ATTENTION LAYERS*/
+ // attention layers (with optional bias)
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
+
+
+ // feed forward (w/ optional biases)
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
+
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_HUNYUAN_MOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_SMOLLM3:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
@@ -4630,10 +4910,14 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
+ }
+
+ if (arch == LLM_ARCH_MAMBA || arch == LLM_ARCH_MAMBA2) {
  LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
  LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
  LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
+ LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
  LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);

  if (!classifier_labels.empty()) {
@@ -5582,12 +5866,10 @@ struct llm_build_falcon : public llm_graph_context {
  cur = build_lora_mm(model.layers[il].wqkv, cur);
  cb(cur, "wqkv", il);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  // using mode = 2 for neox mode
@@ -5864,12 +6146,10 @@ struct llm_build_dbrx : public llm_graph_context {
  cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  cb(cur, "wqkv_clamped", il);

- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  Qcur = ggml_rope_ext(
@@ -6380,12 +6660,10 @@ struct llm_build_neo_bert : public llm_graph_context {
  cur = build_lora_mm(model.layers[il].wqkv, cur);
  cb(cur, "wqkv", il);

- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  // RoPE
@@ -6615,8 +6893,8 @@ struct llm_build_mpt : public llm_graph_context {
  cb(cur, "wqkv_clamped", il);
  }

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

  cb(Qcur, "Qcur", il);
@@ -6636,6 +6914,12 @@ struct llm_build_mpt : public llm_graph_context {
  model.layers[il].attn_k_norm_b,
  LLM_NORM, il);
  cb(Kcur, "Kcur", il);
+ } else {
+ Qcur = ggml_cont(ctx0, Qcur);
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_cont(ctx0, Kcur);
+ cb(Kcur, "Kcur", il);
  }

  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -6890,12 +7174,10 @@ struct llm_build_qwen : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  // using mode = 2 for neox mode
@@ -7660,21 +7942,21 @@ struct llm_build_phi2 : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  }

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  Qcur = ggml_rope_ext(
@@ -7798,21 +8080,21 @@ struct llm_build_phi3 : public llm_graph_context {
  cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
  cb(cur, "wqkv", il);

- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  }

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  Qcur = ggml_rope_ext(
@@ -8168,12 +8450,10 @@ struct llm_build_codeshell : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  Qcur = ggml_rope_ext(
@@ -8589,8 +8869,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
  ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
  cb(k_pe, "k_pe", il);

- // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
- kv_compressed = ggml_cont(ctx0, kv_compressed);
  kv_compressed = build_norm(kv_compressed,
  model.layers[il].attn_kv_a_norm, NULL,
  LLM_NORM_RMS, il);
@@ -8617,12 +8895,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
  v_states = ggml_cont(ctx0, v_states);
  cb(v_states, "v_states", il);

- v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
- ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
- 0);
- cb(v_states, "v_states", il);
-
- q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
  q_pe = ggml_rope_ext(
  ctx0, q_pe, inp_pos, rope_factors,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -8631,7 +8903,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
  cb(q_pe, "q_pe", il);

  // shared RoPE key
- k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
  k_pe = ggml_rope_ext(
  ctx0, k_pe, inp_pos, rope_factors,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9665,9 +9936,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
  };

  struct llm_build_mamba : public llm_graph_context {
- const llama_model & model;
-
- llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
+ llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  ggml_tensor * cur;
  ggml_tensor * inpL;

@@ -9685,7 +9954,11 @@ struct llm_build_mamba : public llm_graph_context {
  LLM_NORM_RMS, il);
  cb(cur, "attn_norm", il);

- cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
+ if (model.arch == LLM_ARCH_MAMBA2) {
+ cur = build_mamba2_layer(rs_inp, gf, cur, model, ubatch, il);
+ } else {
+ cur = build_mamba_layer(rs_inp, gf, cur, model, ubatch, il);
+ }

  if (il == n_layer - 1 && inp_out_ids) {
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
@@ -9719,11 +9992,11 @@ struct llm_build_mamba : public llm_graph_context {
  ggml_build_forward_expand(gf, cur);
  }

- // TODO: split
  ggml_tensor * build_mamba_layer(
  llm_graph_input_rs * inp,
  ggml_cgraph * gf,
  ggml_tensor * cur,
+ const llama_model & model,
  const llama_ubatch & ubatch,
  int il) const {
  const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
@@ -9734,6 +10007,8 @@ struct llm_build_mamba : public llm_graph_context {
  const int64_t d_inner = hparams.ssm_d_inner;
  const int64_t d_state = hparams.ssm_d_state;
  const int64_t dt_rank = hparams.ssm_dt_rank;
+ const int64_t n_head = d_inner;
+ const int64_t head_dim = 1;
  const int64_t n_seqs = ubatch.n_seqs;
  // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
  const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
@@ -9749,15 +10024,8 @@ struct llm_build_mamba : public llm_graph_context {
  ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
  ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);

- // (ab)using the KV cache to store the states
- ggml_tensor * conv = build_rs(
- inp, gf, conv_states_all,
- hparams.n_embd_r(), n_seqs);
+ ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
  conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
- ggml_tensor * ssm = build_rs(
- inp, gf, ssm_states_all,
- hparams.n_embd_s(), n_seqs);
- ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);

  // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
@@ -9806,8 +10074,8 @@ struct llm_build_mamba : public llm_graph_context {
  ggml_tensor * x_db = build_lora_mm(model.layers[il].ssm_x, x);
  // split
  ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
- ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
- ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
+ ggml_tensor * B = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
+ ggml_tensor * C = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));

  // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
  if (ssm_dt_b_c_rms) {
@@ -9820,23 +10088,36 @@ struct llm_build_mamba : public llm_graph_context {
  dt = build_lora_mm(model.layers[il].ssm_dt, dt);
  dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);

- // Custom operator to optimize the parallel associative scan
- // as described in the Annex D of the Mamba paper.
- // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
- ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C);
+ cur = x;
+ x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
+
+ ggml_tensor * A = model.layers[il].ssm_a;
+
+ // use the states and the indices provided by build_recurrent_state
+ // (this is necessary in order to properly use the states before they are overwritten,
+ // while avoiding to make unnecessary copies of the states)
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
+
+ // Custom operator to optimize the parallel associative scan
+ // as described in the Annex D of the Mamba paper.
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+ };
+
+ ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);

  // store last states
  ggml_build_forward_expand(gf,
  ggml_cpy(ctx0,
- ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]),
+ ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
  ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));

- ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0);
+ ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0);

  // TODO: skip computing output earlier for unused tokens

- // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
- y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, model.layers[il].ssm_d));
  y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));

  // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
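A side note on the hunk above (my reading of the diff, not upstream text): the Mamba-1 path now drives the Mamba-2 style scan by treating every channel as its own head (n_head = d_inner, head_dim = 1), so the 4-D reshape is a pure relabeling of the same memory. A minimal standalone sketch of that bookkeeping, with illustrative sizes:

// Sketch: with head_dim == 1 and n_head == d_inner, the Mamba-2 style state layout
// {d_state, head_dim, n_head, n_seqs} holds exactly as many elements, in the same
// order, as the old Mamba-1 layout {d_state, d_inner, n_seqs}.
#include <cassert>
#include <cstdint>

int main() {
    const int64_t d_state = 16, d_inner = 1536, n_seqs = 3; // illustrative values only
    const int64_t n_head = d_inner, head_dim = 1;

    const int64_t elems_3d = d_state * d_inner * n_seqs;           // old 3-D view
    const int64_t elems_4d = d_state * head_dim * n_head * n_seqs; // new 4-D view
    assert(elems_3d == elems_4d); // the reshape only changes metadata
    return 0;
}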
@@ -9845,40 +10126,169 @@ struct llm_build_mamba : public llm_graph_context {

  // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
- //cb(cur, "mamba_out", il);
+ // cb(cur, "mamba_out", il);

  return cur;
  }
- };

- struct llm_build_command_r : public llm_graph_context {
- llm_build_command_r(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ ggml_tensor * build_mamba2_layer(
+ llm_graph_input_rs * inp,
+ ggml_cgraph * gf,
+ ggml_tensor * cur,
+ const llama_model & model,
+ const llama_ubatch & ubatch,
+ int il) const {
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);

- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ const auto kv_head = mctx_cur->get_head();

- const float f_logit_scale = hparams.f_logit_scale;
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t n_head = hparams.ssm_dt_rank;
+ const int64_t head_dim = d_inner / n_head;
+ const int64_t n_group = hparams.ssm_n_group;
+ const int64_t n_seqs = ubatch.n_seqs;

- ggml_tensor * cur;
- ggml_tensor * inpL;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;

- inpL = build_inp_embd(model.tok_embd);
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs);
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);

- // inp_pos - contains the positions
- ggml_tensor * inp_pos = build_inp_pos();
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);

- auto * inp_attn = build_attn_inp_kv_unified();
+ ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
+ conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);

- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);

- for (int il = 0; il < n_layer; ++il) {
- // norm
- cur = build_norm(inpL,
- model.layers[il].attn_norm, NULL,
- LLM_NORM, il);
- cb(cur, "attn_norm", il);
+ // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads

- ggml_tensor * ffn_inp = cur;
+ // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
+ ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
+
+ // split the above in three
+ ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0);
+ ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt));
+ ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt));
+
+ // conv
+ {
+ // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
+ ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
+
+ // copy last (d_conv - 1) columns back into the state cache
+ ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
+
+ ggml_build_forward_expand(gf,
+ ggml_cpy(ctx0, last_conv,
+ ggml_view_1d(ctx0, conv_states_all,
+ (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
+ kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
+
+ // 1D convolution
+ // The equivalent is to make a self-overlapping view of conv_x
+ // over d_conv columns at each stride in the 3rd dimension,
+ // then element-wise multiply that with the conv1d weight,
+ // then sum the elements of each row,
+ // (the last two steps are a dot product over rows (also doable with mul_mat))
+ // then permute away the ne[0] dimension,
+ // and then you're left with the resulting x tensor.
+ // For simultaneous sequences, all sequences need to have the same length.
+ xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
+
+ // bias
+ xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
+
+ xBC = ggml_silu(ctx0, xBC);
+ }
+
+ // ssm
+ {
+ // These correspond to V K Q in SSM/attention duality
+ ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0);
+ ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC));
+ ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC));
+
+ // {n_head, n_seq_tokens, n_seqs}
+ dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
+
+ ggml_tensor * A = model.layers[il].ssm_a;
+
+ // use the states and the indices provided by build_recurrent_state
+ // (this is necessary in order to properly use the states before they are overwritten,
+ // while avoiding to make unnecessary copies of the states)
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
+
+ // TODO: use semistructured matrices to implement state-space duality
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+ };
+
+ ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+
+ // store last states
+ ggml_build_forward_expand(gf,
+ ggml_cpy(ctx0,
+ ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
+ ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+
+ ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0);
+
+ // TODO: skip computing output earlier for unused tokens
+
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
+ y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
+
+ // grouped RMS norm
+ y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
+ y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
+ y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
+
+ // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+ cur = build_lora_mm(model.layers[il].ssm_out, y);
+ }
+
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+ cb(cur, "mamba_out", il);
+
+ return cur;
+ }
+ };
+
+ struct llm_build_command_r : public llm_graph_context {
+ llm_build_command_r(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ const float f_logit_scale = hparams.f_logit_scale;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+
+ ggml_tensor * ffn_inp = cur;

  // self-attention
  {
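To make the in-projection split in build_mamba2_layer above easier to follow, here is a standalone arithmetic sketch of the offsets used for the z / xBC / dt views. The formulas come from the hunk itself; the concrete hyperparameter values are illustrative placeholders:

// Sketch: element offsets of the three views carved out of zxBCdt, whose leading
// dimension is d_in_proj = 2*d_inner + 2*n_group*d_state + n_head.
#include <cassert>
#include <cstdint>

int main() {
    // placeholder hyperparameters, for illustration only
    const int64_t d_inner = 4096, d_state = 128, n_group = 1, n_head = 128;

    const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;

    const int64_t off_z   = 0;                             // z   : d_inner elements
    const int64_t off_xBC = d_inner;                       // xBC : d_inner + 2*n_group*d_state elements
    const int64_t off_dt  = 2*d_inner + 2*n_group*d_state; // dt  : n_head elements

    // the three slices tile the projection row exactly
    assert(off_xBC == off_z + d_inner);
    assert(off_dt == off_xBC + d_inner + 2*n_group*d_state);
    assert(off_dt + n_head == d_in_proj);
    return 0;
}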
@@ -10557,10 +10967,10 @@ struct llm_build_openelm : public llm_graph_context {

  cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0));
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0);
  cb(Qcur, "Qcur", il);

- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
  cb(Kcur, "Kcur", il);

  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
@@ -10682,12 +11092,10 @@ struct llm_build_gptneox : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  Qcur = ggml_rope_ext(
@@ -11932,6 +12340,8 @@ struct llm_build_chatglm : public llm_graph_context {
  if (model.layers[il].bv) {
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  } else {
  cur = build_lora_mm(model.layers[il].wqkv, cur);
  cb(cur, "wqkv", il);
@@ -11939,13 +12349,11 @@ struct llm_build_chatglm : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);
  }
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  }

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
@@ -12066,6 +12474,8 @@ struct llm_build_glm4 : public llm_graph_context {
  if (model.layers[il].bv) {
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  } else {
  cur = build_lora_mm(model.layers[il].wqkv, cur);
  cb(cur, "wqkv", il);
@@ -12073,13 +12483,11 @@ struct llm_build_glm4 : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);
  }
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  }

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  Qcur = ggml_rope_ext(
@@ -14298,12 +14706,11 @@ struct llm_build_ernie4_5 : public llm_graph_context {
  }
  };

- struct llm_build_arcee : public llm_graph_context {
- llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
- const int64_t n_embd_head = hparams.n_embd_head_v;
+ struct llm_build_falcon_h1 : public llm_graph_context {
+ const llama_model & model;

- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- GGML_ASSERT(n_embd_head == hparams.n_rot);
+ llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;

  ggml_tensor * cur;
  ggml_tensor * inpL;
@@ -14313,7 +14720,8 @@ struct llm_build_arcee : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ // Build the inputs in the recurrent & kv cache
+ auto * inp = build_inp_mem_hybrid();

  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -14322,90 +14730,83 @@ struct llm_build_arcee : public llm_graph_context {
  for (int il = 0; il < n_layer; ++il) {
  ggml_tensor * inpSA = inpL;

- // norm
  cur = build_norm(inpL,
  model.layers[il].attn_norm, NULL,
  LLM_NORM_RMS, il);
  cb(cur, "attn_norm", il);

  // self-attention
- {
- // rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);

- // compute Q and K and RoPE them
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
- if (model.layers[il].bq) {
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
- cb(Qcur, "Qcur", il);
- }
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);

- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
- if (model.layers[il].bk) {
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
- cb(Kcur, "Kcur", il);
- }
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);

- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
- if (model.layers[il].bv) {
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
- cb(Vcur, "Vcur", il);
- }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);

- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );

- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
+ cb(Qcur, "Qcur-post-rope", il);
+ cb(Kcur, "Kcur-post-rope", il);
+ cb(Vcur, "Vcur-post-rope", il);

- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
- cb(cur, "attn_out", il);
- }
+ ggml_tensor * attn_out = build_attn(inp, gf,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ cb(attn_out, "attn_out", il);
+
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
14775
+ LLM_NORM_RMS, il);
14776
+ // Mamba2 layer
14777
+ cb(cur, "ssm_in", il);
14778
+
14779
+ ggml_tensor * ssm_out = build_mamba2_layer(inp, gf, cur, ubatch, il);
14780
+ cb(ssm_out, "ssm_out", il);
14781
+
14782
+ // aggregation of the attention and Mamba2 branches
14783
+ cur = ggml_add(ctx0, attn_out, ssm_out);
14784
+ inpSA = ggml_add(ctx0, cur, inpSA);
14785
+ cb(cur, "layer_out", il);
14383
14786
 
14384
14787
  if (il == n_layer - 1 && inp_out_ids) {
14385
14788
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14386
14789
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
14387
14790
  }
14388
14791
 
14389
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
14792
+ ggml_tensor * ffn_inp = inpSA;
14390
14793
  cb(ffn_inp, "ffn_inp", il);
14391
14794
 
14392
14795
  // feed-forward network
14393
- // ARCEE uses relu^2 instead of silu
14394
14796
  cur = build_norm(ffn_inp,
14395
14797
  model.layers[il].ffn_norm, NULL,
14396
14798
  LLM_NORM_RMS, il);
14397
14799
  cb(cur, "ffn_norm", il);
14398
14800
 
14399
14801
  cur = build_ffn(cur,
14400
- model.layers[il].ffn_up, NULL, NULL,
14401
- NULL, NULL, NULL,
14402
- model.layers[il].ffn_down, NULL, NULL,
14802
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
14803
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
14804
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
14403
14805
  NULL,
14404
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
14806
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14405
14807
  cb(cur, "ffn_out", il);
14406
14808
 
14407
- cur = ggml_add(ctx0, cur, ffn_inp);
14408
- cb(cur, "ffn_out", il);
14809
+ cur = ggml_add(ctx0, cur, inpSA);
14409
14810
 
14410
14811
  cur = build_cvec(cur, il);
14411
14812
  cb(cur, "l_out", il);
@@ -14431,97 +14832,665 @@ struct llm_build_arcee : public llm_graph_context {
14431
14832
 
14432
14833
  ggml_build_forward_expand(gf, cur);
14433
14834
  }
14434
- };
14435
14835
 
14436
- llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
14437
- llama_memory_i * res;
14836
+ ggml_tensor * build_mamba2_layer(
14837
+ llm_graph_input_mem_hybrid * inp,
14838
+ ggml_cgraph * gf,
14839
+ ggml_tensor * cur,
14840
+ const llama_ubatch & ubatch,
14841
+ int il) const {
14842
+ const auto * kv_state = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
14438
14843
 
14439
- switch (arch) {
14440
- // Models that need specific instantiation should be handled in the
14441
- // switch statement
14442
- case LLM_ARCH_BERT:
14443
- case LLM_ARCH_JINA_BERT_V2:
14444
- case LLM_ARCH_NOMIC_BERT:
14445
- case LLM_ARCH_NOMIC_BERT_MOE:
14446
- case LLM_ARCH_NEO_BERT:
14447
- case LLM_ARCH_WAVTOKENIZER_DEC:
14448
- {
14449
- res = nullptr;
14450
- } break;
14451
- // Models that need standard caching should rely on recurrent/hybrid
14452
- // checks
14453
- default:
14454
- {
14455
- if (llm_arch_is_recurrent(arch)) {
14456
- res = new llama_memory_recurrent(
14457
- *this,
14458
- nullptr,
14459
- GGML_TYPE_F32,
14460
- GGML_TYPE_F32,
14461
- cparams.offload_kqv,
14462
- std::max((uint32_t) 1, cparams.n_seq_max),
14463
- cparams.n_seq_max);
14464
- } else if (llm_arch_is_hybrid(arch)) {
14465
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
14844
+ const auto kv_head = kv_state->get_head();
14466
14845
 
14467
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
14846
+ const int64_t d_conv = hparams.ssm_d_conv;
14847
+ const int64_t d_inner = hparams.ssm_d_inner;
14848
+ const int64_t d_state = hparams.ssm_d_state;
14849
+ const int64_t n_head = hparams.ssm_dt_rank;
14850
+ const int64_t head_dim = d_inner / n_head;
14851
+ const int64_t n_group = hparams.ssm_n_group;
14852
+ const int64_t n_seqs = ubatch.n_seqs;
14468
14853
 
14469
- res = new llama_memory_hybrid(
14470
- /* model */ *this,
14471
- /* attn_type_k */ params.type_k,
14472
- /* attn_type_v */ params.type_v,
14473
- /* attn_v_trans */ !cparams.flash_attn,
14474
- /* attn_kv_size */ cparams.n_ctx,
14475
- /* attn_n_pad */ padding,
14476
- /* attn_n_swa */ hparams.n_swa,
14477
- /* attn_swa_type */ hparams.swa_type,
14478
- /* recurrent_type_k */ GGML_TYPE_F32,
14479
- /* recurrent_type_v */ GGML_TYPE_F32,
14480
- /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
14481
- /* n_seq_max */ cparams.n_seq_max,
14482
- /* offload */ cparams.offload_kqv);
14483
- } else {
14484
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
14854
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
14485
14855
 
14486
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
14856
+ GGML_ASSERT(n_seqs != 0);
14857
+ GGML_ASSERT(ubatch.equal_seqs);
14858
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
14487
14859
 
14488
- LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
14860
+ ggml_tensor * conv_states_all = kv_state->get_r_l(il);
14861
+ ggml_tensor * ssm_states_all = kv_state->get_s_l(il);
14489
14862
 
14490
- if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
14491
- GGML_ASSERT(hparams.is_swa_any());
14863
+ ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
14864
+ conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
14492
14865
 
14493
- res = new llama_kv_cache_unified_iswa(
14494
- *this,
14495
- params.type_k,
14496
- params.type_v,
14497
- !cparams.flash_attn,
14498
- cparams.offload_kqv,
14499
- params.swa_full,
14500
- cparams.n_ctx,
14501
- cparams.n_seq_max,
14502
- cparams.n_ubatch,
14503
- padding);
14504
- } else {
14505
- GGML_ASSERT(!hparams.is_swa_any());
14866
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
14867
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
14506
14868
 
14507
- res = new llama_kv_cache_unified(
14508
- *this,
14509
- nullptr,
14510
- params.type_k,
14511
- params.type_v,
14512
- !cparams.flash_attn,
14513
- cparams.offload_kqv,
14514
- cparams.n_ctx,
14515
- cparams.n_seq_max,
14516
- padding,
14517
- hparams.n_swa,
14518
- hparams.swa_type);
14519
- }
14520
- }
14521
- }
14522
- }
14869
+ // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
14523
14870
 
14524
- return res;
14871
+ // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
14872
+ ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
14873
+ cb(zxBCdt, "zxBCdt", il);
14874
+
14875
+ // split the above into three parts: z, xBC and dt
14876
+ ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0);
14877
+ ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt));
14878
+ ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt));
14879
+
14880
+ // conv
14881
+ {
14882
+ // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
14883
+ ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
14884
+
14885
+ // copy last (d_conv - 1) columns back into the state cache
14886
+ ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
14887
+
14888
+ ggml_build_forward_expand(gf,
14889
+ ggml_cpy(ctx0, last_conv,
14890
+ ggml_view_1d(ctx0, conv_states_all,
14891
+ (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
14892
+ kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
14893
+
14894
+ // 1D convolution
14895
+ // The equivalent is to make a self-overlapping view of conv_x
14896
+ // over d_conv columns at each stride in the 3rd dimension,
14897
+ // then element-wise multiply that with the conv1d weight,
14898
+ // then sum the elements of each row,
14899
+ // (the last two steps are a dot product over rows (also doable with mul_mat))
14900
+ // then permute away the ne[0] dimension,
14901
+ // and then you're left with the resulting x tensor.
14902
+ // For simultaneous sequences, all sequences need to have the same length.
14903
+ xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
14904
+
14905
+ // bias
14906
+ xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
14907
+
14908
+ xBC = ggml_silu(ctx0, xBC);
14909
+ }
14910
+
14911
+ // ssm
14912
+ {
14913
+ // These correspond to V K Q in SSM/attention duality
14914
+ ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0);
14915
+
14916
+ ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC));
14917
+
14918
+ ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC));
14919
+
14920
+ // {n_head, n_seq_tokens, n_seqs}
14921
+ dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
14922
+
14923
+ ggml_tensor * A = model.layers[il].ssm_a;
14924
+
14925
+ // use the states and the indices provided by build_rs
14926
+ // (this is necessary in order to properly use the states before they are overwritten,
14927
+ // while avoiding unnecessary copies of the states)
14928
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
14929
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, kv_state->get_size());
14930
+
14931
+ // TODO: use semistructured matrices to implement state-space duality
14932
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
14933
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
14934
+ };
14935
+
14936
+ ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
14937
+
14938
+ // store last states
14939
+ ggml_build_forward_expand(gf,
14940
+ ggml_cpy(ctx0,
14941
+ ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
14942
+ ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
14943
+
14944
+ ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0);
14945
+
14946
+ // TODO: skip computing output earlier for unused tokens
14947
+
14948
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
14949
+ y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
14950
+
14951
+ // grouped RMS norm
14952
+ if (model.layers[il].ssm_norm) {
14953
+ y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
14954
+ y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
14955
+ }
14956
+
14957
+ y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
14958
+
14959
+ // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
14960
+ cur = build_lora_mm(model.layers[il].ssm_out, y);
14961
+ }
14962
+
14963
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
14964
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
14965
+ cb(cur, "mamba_out", il);
14966
+ return cur;
14967
+ }
14968
+ };
14969
+
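
For orientation, the ggml_ssm_scan call in build_mamba2_layer above evaluates, per head, the Mamba2 selective-state recurrence H_t = exp(dt*A) * H_{t-1} + dt * (B_t outer x_t) with output y_t = C_t^T * H_t; the D*x skip and the gating by z are applied afterwards in the graph. A naive single-sequence, single-head reference in plain C++ with toy sizes and dummy inputs (a readability aid, not the ggml kernel):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // toy sizes for one head of one sequence (illustration only)
        const int d_state = 4, head_dim = 2, n_tok = 3;

        float A = -1.0f;                                   // per-head scalar (negative)
        std::vector<float> D(head_dim, 1.0f);              // skip-connection weight
        std::vector<float> S(d_state * head_dim, 0.0f);    // recurrent state H

        for (int t = 0; t < n_tok; ++t) {
            // per-token inputs produced by the in/conv projections (dummies here)
            float dt = 0.5f;                               // discretization step (post-softplus)
            std::vector<float> x(head_dim, 0.1f * (t + 1));
            std::vector<float> B(d_state, 0.2f), C(d_state, 0.3f);

            const float decay = std::exp(dt * A);          // exp(dt*A), scalar per head
            std::vector<float> y(head_dim, 0.0f);

            for (int s = 0; s < d_state; ++s) {
                for (int d = 0; d < head_dim; ++d) {
                    float & h = S[s*head_dim + d];
                    h = decay * h + dt * B[s] * x[d];      // H_t = exp(dt*A)*H_{t-1} + dt*(B outer x)
                    y[d] += C[s] * h;                      // y_t = C^T * H_t
                }
            }
            for (int d = 0; d < head_dim; ++d) y[d] += D[d] * x[d];  // + D*x skip
            std::printf("t=%d y=(%.4f, %.4f)\n", t, y[0], y[1]);
        }
        return 0;
    }
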
14970
+ struct llm_build_arcee : public llm_graph_context {
14971
+ llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
14972
+ const int64_t n_embd_head = hparams.n_embd_head_v;
14973
+
14974
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
14975
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
14976
+
14977
+ ggml_tensor * cur;
14978
+ ggml_tensor * inpL;
14979
+
14980
+ inpL = build_inp_embd(model.tok_embd);
14981
+
14982
+ // inp_pos - contains the positions
14983
+ ggml_tensor * inp_pos = build_inp_pos();
14984
+
14985
+ auto * inp_attn = build_attn_inp_kv_unified();
14986
+
14987
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
14988
+
14989
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
14990
+
14991
+ for (int il = 0; il < n_layer; ++il) {
14992
+ ggml_tensor * inpSA = inpL;
14993
+
14994
+ // norm
14995
+ cur = build_norm(inpL,
14996
+ model.layers[il].attn_norm, NULL,
14997
+ LLM_NORM_RMS, il);
14998
+ cb(cur, "attn_norm", il);
14999
+
15000
+ // self-attention
15001
+ {
15002
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
15003
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
15004
+
15005
+ // compute Q and K and RoPE them
15006
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
15007
+ cb(Qcur, "Qcur", il);
15008
+ if (model.layers[il].bq) {
15009
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
15010
+ cb(Qcur, "Qcur", il);
15011
+ }
15012
+
15013
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
15014
+ cb(Kcur, "Kcur", il);
15015
+ if (model.layers[il].bk) {
15016
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
15017
+ cb(Kcur, "Kcur", il);
15018
+ }
15019
+
15020
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
15021
+ cb(Vcur, "Vcur", il);
15022
+ if (model.layers[il].bv) {
15023
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
15024
+ cb(Vcur, "Vcur", il);
15025
+ }
15026
+
15027
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
15028
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
15029
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
15030
+
15031
+ Qcur = ggml_rope_ext(
15032
+ ctx0, Qcur, inp_pos, rope_factors,
15033
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15034
+ ext_factor, attn_factor, beta_fast, beta_slow
15035
+ );
15036
+
15037
+ Kcur = ggml_rope_ext(
15038
+ ctx0, Kcur, inp_pos, rope_factors,
15039
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15040
+ ext_factor, attn_factor, beta_fast, beta_slow
15041
+ );
15042
+
15043
+ cb(Qcur, "Qcur", il);
15044
+ cb(Kcur, "Kcur", il);
15045
+ cb(Vcur, "Vcur", il);
15046
+
15047
+ cur = build_attn(inp_attn, gf,
15048
+ model.layers[il].wo, model.layers[il].bo,
15049
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15050
+ cb(cur, "attn_out", il);
15051
+ }
15052
+
15053
+ if (il == n_layer - 1 && inp_out_ids) {
15054
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
15055
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
15056
+ }
15057
+
15058
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
15059
+ cb(ffn_inp, "ffn_inp", il);
15060
+
15061
+ // feed-forward network
15062
+ // ARCEE uses relu^2 instead of silu
15063
+ cur = build_norm(ffn_inp,
15064
+ model.layers[il].ffn_norm, NULL,
15065
+ LLM_NORM_RMS, il);
15066
+ cb(cur, "ffn_norm", il);
15067
+
15068
+ cur = build_ffn(cur,
15069
+ model.layers[il].ffn_up, NULL, NULL,
15070
+ NULL, NULL, NULL,
15071
+ model.layers[il].ffn_down, NULL, NULL,
15072
+ NULL,
15073
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
15074
+ cb(cur, "ffn_out", il);
15075
+
15076
+ cur = ggml_add(ctx0, cur, ffn_inp);
15077
+ cb(cur, "ffn_out", il);
15078
+
15079
+ cur = build_cvec(cur, il);
15080
+ cb(cur, "l_out", il);
15081
+
15082
+ // input for next layer
15083
+ inpL = cur;
15084
+ }
15085
+
15086
+ cur = inpL;
15087
+
15088
+ cur = build_norm(cur,
15089
+ model.output_norm, NULL,
15090
+ LLM_NORM_RMS, -1);
15091
+
15092
+ cb(cur, "result_norm", -1);
15093
+ res->t_embd = cur;
15094
+
15095
+ // lm_head
15096
+ cur = build_lora_mm(model.output, cur);
15097
+
15098
+ cb(cur, "result_output", -1);
15099
+ res->t_logits = cur;
15100
+
15101
+ ggml_build_forward_expand(gf, cur);
15102
+ }
15103
+ };
15104
+
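
The ARCEE feed-forward above is the sequential two-matrix form with a squared-ReLU activation (LLM_FFN_RELU_SQR + LLM_FFN_SEQ, no gate tensor), whereas most other builders in this file use the gated SiLU form (LLM_FFN_SILU + LLM_FFN_PAR). A minimal activation-level comparison in plain C++ with toy scalars:

    #include <cmath>
    #include <cstdio>

    static float relu_sqr(float x) { float r = x > 0.0f ? x : 0.0f; return r * r; }
    static float silu(float x)     { return x / (1.0f + std::exp(-x)); }

    int main() {
        // LLM_FFN_RELU_SQR + LLM_FFN_SEQ: down( relu(up(x))^2 )          -- no gate projection
        // LLM_FFN_SILU     + LLM_FFN_PAR: down( silu(gate(x)) * up(x) )  -- gated, two projections
        const float up = 0.8f, gate = -0.3f;  // toy pre-activation values
        std::printf("relu^2(up)      = %.4f\n", relu_sqr(up));
        std::printf("silu(gate) * up = %.4f\n", silu(gate) * up);
        return 0;
    }
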
15105
+ struct llm_build_hunyuan_moe : public llm_graph_context {
15106
+ llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15107
+ const int64_t n_embd_head = hparams.n_embd_head_v;
15108
+
15109
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
15110
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
15111
+
15112
+ ggml_tensor * cur;
15113
+ ggml_tensor * inpL;
15114
+
15115
+ inpL = build_inp_embd(model.tok_embd);
15116
+
15117
+ // inp_pos - contains the positions
15118
+ ggml_tensor * inp_pos = build_inp_pos();
15119
+
15120
+ auto * inp_attn = build_attn_inp_kv_unified();
15121
+
15122
+ const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
15123
+
15124
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
15125
+
15126
+ for (int il = 0; il < n_layer; ++il) {
15127
+ ggml_tensor * inpSA = inpL;
15128
+
15129
+ // norm
15130
+ cur = build_norm(inpL,
15131
+ model.layers[il].attn_norm, NULL,
15132
+ LLM_NORM_RMS, il);
15133
+ cb(cur, "attn_norm", il);
15134
+
15135
+ // self-attention
15136
+ {
15137
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
15138
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
15139
+
15140
+ // compute Q and K and RoPE them
15141
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
15142
+ cb(Qcur, "Qcur", il);
15143
+ if (model.layers[il].bq) {
15144
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
15145
+ cb(Qcur, "Qcur", il);
15146
+ }
15147
+
15148
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
15149
+ cb(Kcur, "Kcur", il);
15150
+ if (model.layers[il].bk) {
15151
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
15152
+ cb(Kcur, "Kcur", il);
15153
+ }
15154
+
15155
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
15156
+ cb(Vcur, "Vcur", il);
15157
+ if (model.layers[il].bv) {
15158
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
15159
+ cb(Vcur, "Vcur", il);
15160
+ }
15161
+
15162
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
15163
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
15164
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
15165
+
15166
+ Qcur = ggml_rope_ext(
15167
+ ctx0, Qcur, inp_pos, rope_factors,
15168
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15169
+ ext_factor, attn_factor, beta_fast, beta_slow
15170
+ );
15171
+
15172
+ cb(Qcur, "Qcur", il);
15173
+ cb(Kcur, "Kcur", il);
15174
+ cb(Vcur, "Vcur", il);
15175
+
15176
+ Kcur = ggml_rope_ext(
15177
+ ctx0, Kcur, inp_pos, rope_factors,
15178
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15179
+ ext_factor, attn_factor, beta_fast, beta_slow
15180
+ );
15181
+
15182
+ Kcur = build_norm(Kcur,
15183
+ model.layers[il].attn_k_norm, nullptr,
15184
+ LLM_NORM_RMS, il);
15185
+ cb(Kcur, "Kcur_norm", il);
15186
+
15187
+ Qcur = build_norm(Qcur,
15188
+ model.layers[il].attn_q_norm, nullptr,
15189
+ LLM_NORM_RMS, il);
15190
+ cb(Qcur, "Qcur_norm", il);
15191
+
15192
+ cur = build_attn(inp_attn, gf,
15193
+ model.layers[il].wo, model.layers[il].bo,
15194
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15195
+ cb(cur, "attn_out", il);
15196
+ }
15197
+
15198
+ if (il == n_layer - 1 && inp_out_ids) {
15199
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
15200
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
15201
+ }
15202
+
15203
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
15204
+ cb(ffn_inp, "ffn_inp", il);
15205
+
15206
+ cur = build_norm(ffn_inp,
15207
+ model.layers[il].ffn_norm, NULL,
15208
+ LLM_NORM_RMS, il);
15209
+ cb(cur, "ffn_norm", il);
15210
+
15211
+ // feed-forward network (non-MoE)
15212
+ ggml_tensor * cur_mlp = build_ffn(cur,
15213
+ model.layers[il].ffn_up_shexp, NULL, NULL,
15214
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
15215
+ model.layers[il].ffn_down_shexp, NULL, NULL,
15216
+ NULL,
15217
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
15218
+ cb(cur_mlp, "ffn_mlp", il);
15219
+
15220
+ // MoE branch
15221
+ ggml_tensor * cur_moe = build_moe_ffn(cur,
15222
+ model.layers[il].ffn_gate_inp,
15223
+ model.layers[il].ffn_up_exps,
15224
+ model.layers[il].ffn_gate_exps,
15225
+ model.layers[il].ffn_down_exps,
15226
+ nullptr,
15227
+ n_expert, n_expert_used,
15228
+ LLM_FFN_SILU,
15229
+ true, // norm_topk_prob
15230
+ false,
15231
+ 0.0,
15232
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
15233
+ il);
15234
+ cb(cur_moe, "ffn_moe_out", il);
15235
+
15236
+ ggml_tensor * ffn_out = ggml_add(ctx0, cur_moe, cur_mlp);
15237
+ cb(ffn_out, "ffn_out", il);
15238
+
15239
+ cur = ggml_add(ctx0, ffn_out, ffn_inp);
15240
+
15241
+ cur = build_cvec(cur, il);
15242
+ cb(cur, "l_out", il);
15243
+
15244
+ // input for next layer
15245
+ inpL = cur;
15246
+ }
15247
+
15248
+ cur = inpL;
15249
+
15250
+ cur = build_norm(cur,
15251
+ model.output_norm, NULL,
15252
+ LLM_NORM_RMS, -1);
15253
+
15254
+ cb(cur, "result_norm", -1);
15255
+ res->t_embd = cur;
15256
+
15257
+ // lm_head
15258
+ cur = build_lora_mm(model.output, cur);
15259
+ cb(cur, "result_output", -1);
15260
+ res->t_logits = cur;
15261
+
15262
+ ggml_build_forward_expand(gf, cur);
15263
+ }
15264
+ };
15265
+
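
Each Hunyuan block above adds a shared dense FFN branch (the *_shexp tensors) to a top-k, softmax-routed expert branch and sums the two before the residual. A toy single-token router sketch in plain C++; the expert outputs are stand-in scalars and the sizes are hypothetical (with norm_topk_prob the selected weights are renormalized, which is what the subset softmax below amounts to):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_expert = 4, n_expert_used = 2;               // toy values

        std::vector<float> logits = {0.1f, 2.0f, -1.0f, 1.5f};   // router output for one token
        std::vector<int> idx(n_expert);
        for (int i = 0; i < n_expert; ++i) idx[i] = i;
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](int a, int b) { return logits[a] > logits[b]; });

        // softmax restricted to the selected experts == softmax over all, renormalized over top-k
        float denom = 0.0f;
        std::vector<float> w(n_expert_used);
        for (int k = 0; k < n_expert_used; ++k) { w[k] = std::exp(logits[idx[k]]); denom += w[k]; }
        for (int k = 0; k < n_expert_used; ++k) w[k] /= denom;

        // scalar stand-ins for the expert FFN outputs and the shared FFN output
        std::vector<float> expert_out = {0.4f, 1.0f, -0.2f, 0.7f};
        float shared_out = 0.3f;

        float moe_out = 0.0f;
        for (int k = 0; k < n_expert_used; ++k) moe_out += w[k] * expert_out[idx[k]];

        std::printf("ffn_out = shared + moe = %.4f\n", shared_out + moe_out);
        return 0;
    }
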
15266
+ struct llm_build_smollm3 : public llm_graph_context {
15267
+ llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15268
+ const int64_t n_embd_head = hparams.n_embd_head_v;
15269
+
15270
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
15271
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
15272
+
15273
+ ggml_tensor * cur;
15274
+ ggml_tensor * inpL;
15275
+
15276
+ inpL = build_inp_embd(model.tok_embd);
15277
+
15278
+ // inp_pos - contains the positions
15279
+ ggml_tensor * inp_pos = build_inp_pos();
15280
+
15281
+ auto * inp_attn = build_attn_inp_kv_unified();
15282
+
15283
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
15284
+
15285
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
15286
+
15287
+ for (int il = 0; il < n_layer; ++il) {
15288
+ ggml_tensor * inpSA = inpL;
15289
+
15290
+ const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
15291
+
15292
+ // norm
15293
+ cur = build_norm(inpL,
15294
+ model.layers[il].attn_norm, NULL,
15295
+ LLM_NORM_RMS, il);
15296
+ cb(cur, "attn_norm", il);
15297
+
15298
+ // self-attention
15299
+ {
15300
+ // compute Q and K and RoPE them
15301
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
15302
+ cb(Qcur, "Qcur", il);
15303
+ if (model.layers[il].bq) {
15304
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
15305
+ cb(Qcur, "Qcur", il);
15306
+ }
15307
+
15308
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
15309
+ cb(Kcur, "Kcur", il);
15310
+ if (model.layers[il].bk) {
15311
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
15312
+ cb(Kcur, "Kcur", il);
15313
+ }
15314
+
15315
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
15316
+ cb(Vcur, "Vcur", il);
15317
+ if (model.layers[il].bv) {
15318
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
15319
+ cb(Vcur, "Vcur", il);
15320
+ }
15321
+
15322
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
15323
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
15324
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
15325
+
15326
+ if (use_rope) {
15327
+ Qcur = ggml_rope_ext(
15328
+ ctx0, Qcur, inp_pos, nullptr,
15329
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15330
+ ext_factor, attn_factor, beta_fast, beta_slow
15331
+ );
15332
+
15333
+ Kcur = ggml_rope_ext(
15334
+ ctx0, Kcur, inp_pos, nullptr,
15335
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15336
+ ext_factor, attn_factor, beta_fast, beta_slow
15337
+ );
15338
+ }
15339
+
15340
+ cb(Qcur, "Qcur", il);
15341
+ cb(Kcur, "Kcur", il);
15342
+ cb(Vcur, "Vcur", il);
15343
+
15344
+ cur = build_attn(inp_attn, gf,
15345
+ model.layers[il].wo, model.layers[il].bo,
15346
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15347
+ cb(cur, "attn_out", il);
15348
+ }
15349
+
15350
+ if (il == n_layer - 1 && inp_out_ids) {
15351
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
15352
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
15353
+ }
15354
+
15355
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
15356
+ cb(ffn_inp, "ffn_inp", il);
15357
+
15358
+ // feed-forward network
15359
+ {
15360
+ cur = build_norm(ffn_inp,
15361
+ model.layers[il].ffn_norm, NULL,
15362
+ LLM_NORM_RMS, il);
15363
+ cb(cur, "ffn_norm", il);
15364
+
15365
+ cur = build_ffn(cur,
15366
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
15367
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
15368
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
15369
+ NULL,
15370
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
15371
+ cb(cur, "ffn_out", il);
15372
+ }
15373
+
15374
+ cur = ggml_add(ctx0, cur, ffn_inp);
15375
+ cb(cur, "ffn_out", il);
15376
+
15377
+ cur = build_cvec(cur, il);
15378
+ cb(cur, "l_out", il);
15379
+
15380
+ // input for next layer
15381
+ inpL = cur;
15382
+ }
15383
+
15384
+ cur = inpL;
15385
+
15386
+ cur = build_norm(cur,
15387
+ model.output_norm, NULL,
15388
+ LLM_NORM_RMS, -1);
15389
+
15390
+ cb(cur, "result_norm", -1);
15391
+ res->t_embd = cur;
15392
+
15393
+ // lm_head
15394
+ cur = build_lora_mm(model.output, cur);
15395
+
15396
+ cb(cur, "result_output", -1);
15397
+ res->t_logits = cur;
15398
+
15399
+ ggml_build_forward_expand(gf, cur);
15400
+ }
15401
+ };
15402
+
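
The use_rope flag above implements SmolLM3's interleaved NoPE scheme: RoPE is skipped whenever (il + 1) is a multiple of hparams.n_no_rope_layer_step. A tiny schedule printout in plain C++ with a hypothetical layer count and step:

    #include <cstdio>

    int main() {
        const int n_layer = 12;              // hypothetical
        const int n_no_rope_layer_step = 4;  // hypothetical: every 4th layer skips RoPE

        for (int il = 0; il < n_layer; ++il) {
            const bool use_rope = (il + 1) % n_no_rope_layer_step != 0;
            std::printf("layer %2d: %s\n", il, use_rope ? "RoPE" : "NoPE (no position encoding)");
        }
        return 0;
    }
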
15403
+ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
15404
+ llama_memory_i * res;
15405
+
15406
+ switch (arch) {
15407
+ // Models that need specific instantiation should be handled in the
15408
+ // switch statement
15409
+ case LLM_ARCH_BERT:
15410
+ case LLM_ARCH_JINA_BERT_V2:
15411
+ case LLM_ARCH_NOMIC_BERT:
15412
+ case LLM_ARCH_NOMIC_BERT_MOE:
15413
+ case LLM_ARCH_NEO_BERT:
15414
+ case LLM_ARCH_WAVTOKENIZER_DEC:
15415
+ {
15416
+ res = nullptr;
15417
+ } break;
15418
+ // Models that need standard caching should rely on recurrent/hybrid
15419
+ // checks
15420
+ default:
15421
+ {
15422
+ if (llm_arch_is_recurrent(arch)) {
15423
+ res = new llama_memory_recurrent(
15424
+ *this,
15425
+ nullptr,
15426
+ GGML_TYPE_F32,
15427
+ GGML_TYPE_F32,
15428
+ cparams.offload_kqv,
15429
+ std::max((uint32_t) 1, cparams.n_seq_max),
15430
+ cparams.n_seq_max);
15431
+ } else if (llm_arch_is_hybrid(arch)) {
15432
+ const auto padding = llama_kv_cache_unified::get_padding(cparams);
15433
+
15434
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
15435
+
15436
+ res = new llama_memory_hybrid(
15437
+ /* model */ *this,
15438
+ /* attn_type_k */ params.type_k,
15439
+ /* attn_type_v */ params.type_v,
15440
+ /* attn_v_trans */ !cparams.flash_attn,
15441
+ /* attn_kv_size */ cparams.n_ctx,
15442
+ /* attn_n_pad */ padding,
15443
+ /* attn_n_swa */ hparams.n_swa,
15444
+ /* attn_swa_type */ hparams.swa_type,
15445
+ /* recurrent_type_k */ GGML_TYPE_F32,
15446
+ /* recurrent_type_v */ GGML_TYPE_F32,
15447
+ /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
15448
+ /* n_seq_max */ cparams.n_seq_max,
15449
+ /* offload */ cparams.offload_kqv,
15450
+ /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
15451
+ /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
15452
+ } else {
15453
+ const auto padding = llama_kv_cache_unified::get_padding(cparams);
15454
+
15455
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
15456
+
15457
+ LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
15458
+
15459
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
15460
+ GGML_ASSERT(hparams.is_swa_any());
15461
+
15462
+ res = new llama_kv_cache_unified_iswa(
15463
+ *this,
15464
+ params.type_k,
15465
+ params.type_v,
15466
+ !cparams.flash_attn,
15467
+ cparams.offload_kqv,
15468
+ params.swa_full,
15469
+ cparams.n_ctx,
15470
+ cparams.n_seq_max,
15471
+ cparams.n_ubatch,
15472
+ padding);
15473
+ } else {
15474
+ GGML_ASSERT(!hparams.is_swa_any());
15475
+
15476
+ res = new llama_kv_cache_unified(
15477
+ *this,
15478
+ nullptr,
15479
+ params.type_k,
15480
+ params.type_v,
15481
+ !cparams.flash_attn,
15482
+ cparams.offload_kqv,
15483
+ cparams.n_ctx,
15484
+ cparams.n_seq_max,
15485
+ padding,
15486
+ hparams.n_swa,
15487
+ hparams.swa_type);
15488
+ }
15489
+ }
15490
+ }
15491
+ }
15492
+
15493
+ return res;
14525
15494
  }
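
Both the hybrid and the unified paths above round cparams.n_ctx up to the KV-cache padding before constructing the cache. The rounding is a plain align-up; an equivalent helper in C++ with a hypothetical padding value (not the llama.cpp macro itself):

    #include <cstdint>
    #include <cstdio>

    // round x up to the next multiple of pad (what padding cparams.n_ctx achieves)
    static uint32_t pad_up(uint32_t x, uint32_t pad) {
        return ((x + pad - 1) / pad) * pad;
    }

    int main() {
        const uint32_t padding = 256;  // hypothetical value returned by get_padding()
        for (uint32_t n_ctx : {1u, 255u, 256u, 4000u, 4096u}) {
            std::printf("n_ctx = %5u -> padded = %5u\n", n_ctx, pad_up(n_ctx, padding));
        }
        return 0;
    }
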
14526
15495
 
14527
15496
  llm_graph_result_ptr llama_model::build_graph(
@@ -14668,6 +15637,7 @@ llm_graph_result_ptr llama_model::build_graph(
14668
15637
  llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
14669
15638
  } break;
14670
15639
  case LLM_ARCH_MAMBA:
15640
+ case LLM_ARCH_MAMBA2:
14671
15641
  {
14672
15642
  llm = std::make_unique<llm_build_mamba>(*this, params, gf);
14673
15643
  } break;
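
LLM_ARCH_MAMBA2 is added here as a second case label in front of the existing llm_build_mamba construction, so both architectures share one graph builder. The pattern is plain switch fallthrough into a factory; a generic sketch with made-up type names (not the llama.cpp classes):

    #include <cstdio>
    #include <memory>

    enum class arch { mamba, mamba2, other };

    struct graph_builder { virtual ~graph_builder() = default; virtual const char * name() const = 0; };
    struct build_mamba : graph_builder { const char * name() const override { return "mamba"; } };
    struct build_other : graph_builder { const char * name() const override { return "other"; } };

    static std::unique_ptr<graph_builder> make_builder(arch a) {
        switch (a) {
            case arch::mamba:
            case arch::mamba2:  // shared case label: both architectures use the same builder
                return std::make_unique<build_mamba>();
            default:
                return std::make_unique<build_other>();
        }
    }

    int main() {
        std::printf("%s %s\n", make_builder(arch::mamba2)->name(), make_builder(arch::other)->name());
        return 0;
    }
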
@@ -14812,6 +15782,18 @@ llm_graph_result_ptr llama_model::build_graph(
14812
15782
  {
14813
15783
  llm = std::make_unique<llm_build_ernie4_5>(*this, params, gf);
14814
15784
  } break;
15785
+ case LLM_ARCH_HUNYUAN_MOE:
15786
+ {
15787
+ llm = std::make_unique<llm_build_hunyuan_moe>(*this, params, gf);
15788
+ } break;
15789
+ case LLM_ARCH_SMOLLM3:
15790
+ {
15791
+ llm = std::make_unique<llm_build_smollm3>(*this, params, gf);
15792
+ } break;
15793
+ case LLM_ARCH_FALCON_H1:
15794
+ {
15795
+ llm = std::make_unique<llm_build_falcon_h1>(*this, params, gf);
15796
+ } break;
14815
15797
  default:
14816
15798
  GGML_ABORT("fatal error");
14817
15799
  }
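
The remaining hunks register the new architectures in llama_model_rope_type. What the two groups encode is how RoPE pairs dimensions: LLAMA_ROPE_TYPE_NORM rotates adjacent pairs (2i, 2i+1), while LLAMA_ROPE_TYPE_NEOX pairs each value with the one offset by n_rot/2, as the comment in the switch notes. A small printout of the two pairings for a toy rotary width:

    #include <cstdio>

    int main() {
        const int n_rot = 8;  // toy rotary width

        std::printf("NORM pairs :");
        for (int i = 0; i < n_rot/2; ++i) std::printf(" (%d,%d)", 2*i, 2*i + 1);
        std::printf("\nNEOX pairs :");
        for (int i = 0; i < n_rot/2; ++i) std::printf(" (%d,%d)", i, i + n_rot/2);
        std::printf("\n");
        return 0;
    }
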
@@ -14928,6 +15910,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
14928
15910
  case LLM_ARCH_REFACT:
14929
15911
  case LLM_ARCH_BLOOM:
14930
15912
  case LLM_ARCH_MAMBA:
15913
+ case LLM_ARCH_MAMBA2:
14931
15914
  case LLM_ARCH_JINA_BERT_V2:
14932
15915
  case LLM_ARCH_T5:
14933
15916
  case LLM_ARCH_T5ENCODER:
@@ -14962,12 +15945,14 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
14962
15945
  case LLM_ARCH_CHAMELEON:
14963
15946
  case LLM_ARCH_BAILINGMOE:
14964
15947
  case LLM_ARCH_NEO_BERT:
15948
+ case LLM_ARCH_SMOLLM3:
14965
15949
  case LLM_ARCH_ARCEE:
14966
15950
  case LLM_ARCH_ERNIE4_5:
14967
15951
  return LLAMA_ROPE_TYPE_NORM;
14968
15952
 
14969
15953
  // the pairs of head values are offset by n_rot/2
14970
15954
  case LLM_ARCH_FALCON:
15955
+ case LLM_ARCH_FALCON_H1:
14971
15956
  case LLM_ARCH_GROK:
14972
15957
  case LLM_ARCH_DBRX:
14973
15958
  case LLM_ARCH_BERT:
@@ -14999,6 +15984,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
14999
15984
  case LLM_ARCH_EXAONE:
15000
15985
  case LLM_ARCH_MINICPM3:
15001
15986
  case LLM_ARCH_DOTS1:
15987
+ case LLM_ARCH_HUNYUAN_MOE:
15002
15988
  return LLAMA_ROPE_TYPE_NEOX;
15003
15989
 
15004
15990
  case LLM_ARCH_QWEN2VL: