@fugood/llama.node 1.1.8 → 1.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/lib/binding.ts +9 -0
  2. package/lib/index.js +9 -2
  3. package/lib/index.ts +57 -30
  4. package/lib/version.js +2 -2
  5. package/lib/version.ts +2 -2
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +15 -5
  8. package/src/LlamaCompletionWorker.cpp +12 -3
  9. package/src/LlamaCompletionWorker.h +3 -1
  10. package/src/LlamaContext.cpp +14 -1
  11. package/src/llama.cpp/common/arg.cpp +6 -4
  12. package/src/llama.cpp/common/chat.cpp +34 -3
  13. package/src/llama.cpp/common/common.cpp +0 -15
  14. package/src/llama.cpp/common/common.h +1 -2
  15. package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
  16. package/src/llama.cpp/ggml/include/ggml.h +25 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -2
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  23. package/src/llama.cpp/include/llama.h +1 -110
  24. package/src/llama.cpp/src/CMakeLists.txt +2 -2
  25. package/src/llama.cpp/src/llama-arch.cpp +19 -0
  26. package/src/llama.cpp/src/llama-arch.h +1 -0
  27. package/src/llama.cpp/src/llama-chat.cpp +13 -2
  28. package/src/llama.cpp/src/llama-chat.h +1 -0
  29. package/src/llama.cpp/src/llama-context.cpp +5 -192
  30. package/src/llama.cpp/src/llama-context.h +2 -7
  31. package/src/llama.cpp/src/llama-cparams.h +0 -1
  32. package/src/llama.cpp/src/llama-graph.cpp +35 -57
  33. package/src/llama.cpp/src/llama-graph.h +36 -46
  34. package/src/llama.cpp/src/llama-hparams.cpp +25 -0
  35. package/src/llama.cpp/src/llama-hparams.h +6 -0
  36. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +69 -52
  37. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +28 -26
  38. package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +123 -474
  39. package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +34 -59
  40. package/src/llama.cpp/src/llama-kv-cells.h +21 -21
  41. package/src/llama.cpp/src/llama-memory-hybrid.cpp +34 -33
  42. package/src/llama.cpp/src/llama-memory-hybrid.h +24 -28
  43. package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
  44. package/src/llama.cpp/src/llama-memory-recurrent.h +8 -12
  45. package/src/llama.cpp/src/llama-memory.h +11 -8
  46. package/src/llama.cpp/src/llama-model.cpp +396 -187
  47. package/src/llama.cpp/src/llama-model.h +1 -0
@@ -6,8 +6,8 @@
6
6
  #include "llama-cparams.h"
7
7
  #include "llama-model-loader.h"
8
8
 
9
- #include "llama-kv-cache-unified.h"
10
- #include "llama-kv-cache-unified-iswa.h"
9
+ #include "llama-kv-cache.h"
10
+ #include "llama-kv-cache-iswa.h"
11
11
  #include "llama-memory-hybrid.h"
12
12
  #include "llama-memory-recurrent.h"
13
13
 
@@ -83,6 +83,7 @@ const char * llm_type_name(llm_type type) {
83
83
  case LLM_TYPE_32B: return "32B";
84
84
  case LLM_TYPE_34B: return "34B";
85
85
  case LLM_TYPE_35B: return "35B";
86
+ case LLM_TYPE_36B: return "36B";
86
87
  case LLM_TYPE_40B: return "40B";
87
88
  case LLM_TYPE_65B: return "65B";
88
89
  case LLM_TYPE_70B: return "70B";
@@ -1114,6 +1115,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1114
1115
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1115
1116
  hparams.set_swa_pattern(5);
1116
1117
 
1118
+ hparams.n_layer_kv_from_start = 20;
1117
1119
  hparams.rope_freq_base_train_swa = 10000.0f;
1118
1120
  hparams.rope_freq_scale_train_swa = 1.0f;
1119
1121
  hparams.f_attention_scale = 1.0f;
@@ -1288,6 +1290,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1288
1290
  default: type = LLM_TYPE_UNKNOWN;
1289
1291
  }
1290
1292
  } break;
1293
+ case LLM_ARCH_SEED_OSS:
1294
+ {
1295
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1296
+ switch (hparams.n_layer) {
1297
+ case 64: type = LLM_TYPE_36B; break;
1298
+ default: type = LLM_TYPE_UNKNOWN;
1299
+ }
1300
+ } break;
1291
1301
  case LLM_ARCH_OLMOE:
1292
1302
  {
1293
1303
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1465,12 +1475,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1465
1475
  // Expert gating function (GLM-4.5 uses sigmoid)
1466
1476
  ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1467
1477
  if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1468
- hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1478
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1469
1479
  }
1470
1480
 
1471
1481
  // NextN/MTP parameters
1472
1482
  ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1473
1483
 
1484
+ // TODO: when MTP is implemented, this should probably be updated if needed
1485
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
1486
+
1474
1487
  switch (hparams.n_layer) {
1475
1488
  case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
1476
1489
  case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
@@ -3967,6 +3980,43 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3967
3980
  layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
3968
3981
  }
3969
3982
  } break;
3983
+ case LLM_ARCH_SEED_OSS:
3984
+ {
3985
+ const uint32_t head_dim = hparams.n_embd_head_k;
3986
+ const int64_t n_qo_dim = n_head * head_dim;
3987
+ const int64_t n_kv_dim = n_head_kv * head_dim;
3988
+
3989
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3990
+
3991
+ // output
3992
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3993
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3994
+ // if output is NULL, init from the input tok embed
3995
+ if (output == NULL) {
3996
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3997
+ }
3998
+
3999
+ for (int i = 0; i < n_layer; ++i) {
4000
+ auto & layer = layers[i];
4001
+
4002
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
4003
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
4004
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
4005
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
4006
+
4007
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
4008
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
4009
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
4010
+
4011
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4012
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4013
+
4014
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4015
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4016
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4017
+ }
4018
+ } break;
4019
+
3970
4020
  case LLM_ARCH_OLMOE:
3971
4021
  {
3972
4022
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5474,8 +5524,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5474
5524
  } break;
5475
5525
  case LLM_ARCH_LFM2:
5476
5526
  {
5477
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5527
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5478
5528
  tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
5529
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5530
+
5531
+ if (output == NULL) {
5532
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5533
+ }
5479
5534
 
5480
5535
  for (int i = 0; i < n_layer; ++i) {
5481
5536
  auto & layer = layers[i];
@@ -5986,7 +6041,7 @@ struct llm_build_llama : public llm_graph_context {
5986
6041
  // inp_pos - contains the positions
5987
6042
  ggml_tensor * inp_pos = build_inp_pos();
5988
6043
 
5989
- auto * inp_attn = build_attn_inp_kv_unified();
6044
+ auto * inp_attn = build_attn_inp_kv();
5990
6045
 
5991
6046
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
5992
6047
 
@@ -6050,7 +6105,7 @@ struct llm_build_llama : public llm_graph_context {
6050
6105
 
6051
6106
  cur = build_attn(inp_attn,
6052
6107
  model.layers[il].wo, model.layers[il].bo,
6053
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
6108
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
6054
6109
  cb(cur, "attn_out", il);
6055
6110
  }
6056
6111
 
@@ -6146,7 +6201,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
6146
6201
  ggml_tensor * inp_attn_scale = nullptr;
6147
6202
  inp_attn_scale = build_inp_attn_scale();
6148
6203
 
6149
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
6204
+ auto * inp_attn = build_attn_inp_kv_iswa();
6150
6205
 
6151
6206
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
6152
6207
 
@@ -6224,7 +6279,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
6224
6279
 
6225
6280
  cur = build_attn(inp_attn,
6226
6281
  model.layers[il].wo, model.layers[il].bo,
6227
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
6282
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
6228
6283
  cb(cur, "attn_out", il);
6229
6284
  }
6230
6285
 
@@ -6325,7 +6380,7 @@ struct llm_build_deci : public llm_graph_context {
6325
6380
  // inp_pos - contains the positions
6326
6381
  ggml_tensor * inp_pos = build_inp_pos();
6327
6382
 
6328
- auto * inp_attn = build_attn_inp_kv_unified();
6383
+ auto * inp_attn = build_attn_inp_kv();
6329
6384
 
6330
6385
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
6331
6386
 
@@ -6401,7 +6456,7 @@ struct llm_build_deci : public llm_graph_context {
6401
6456
 
6402
6457
  cur = build_attn(inp_attn,
6403
6458
  model.layers[il].wo, model.layers[il].bo,
6404
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
6459
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
6405
6460
  }
6406
6461
 
6407
6462
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6481,7 +6536,7 @@ struct llm_build_baichuan : public llm_graph_context {
6481
6536
  // inp_pos - contains the positions
6482
6537
  ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
6483
6538
 
6484
- auto * inp_attn = build_attn_inp_kv_unified();
6539
+ auto * inp_attn = build_attn_inp_kv();
6485
6540
 
6486
6541
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6487
6542
 
@@ -6533,7 +6588,7 @@ struct llm_build_baichuan : public llm_graph_context {
6533
6588
 
6534
6589
  cur = build_attn(inp_attn,
6535
6590
  model.layers[il].wo, NULL,
6536
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6591
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6537
6592
  }
6538
6593
 
6539
6594
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6603,7 +6658,7 @@ struct llm_build_xverse : public llm_graph_context {
6603
6658
  // inp_pos - contains the positions
6604
6659
  ggml_tensor * inp_pos = build_inp_pos();
6605
6660
 
6606
- auto * inp_attn = build_attn_inp_kv_unified();
6661
+ auto * inp_attn = build_attn_inp_kv();
6607
6662
 
6608
6663
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6609
6664
 
@@ -6648,7 +6703,7 @@ struct llm_build_xverse : public llm_graph_context {
6648
6703
 
6649
6704
  cur = build_attn(inp_attn,
6650
6705
  model.layers[il].wo, NULL,
6651
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6706
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6652
6707
  }
6653
6708
 
6654
6709
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6717,7 +6772,7 @@ struct llm_build_falcon : public llm_graph_context {
6717
6772
  // inp_pos - contains the positions
6718
6773
  ggml_tensor * inp_pos = build_inp_pos();
6719
6774
 
6720
- auto * inp_attn = build_attn_inp_kv_unified();
6775
+ auto * inp_attn = build_attn_inp_kv();
6721
6776
 
6722
6777
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6723
6778
 
@@ -6771,7 +6826,7 @@ struct llm_build_falcon : public llm_graph_context {
6771
6826
 
6772
6827
  cur = build_attn(inp_attn,
6773
6828
  model.layers[il].wo, NULL,
6774
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6829
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6775
6830
  }
6776
6831
 
6777
6832
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6841,7 +6896,7 @@ struct llm_build_grok : public llm_graph_context {
6841
6896
  // inp_pos - contains the positions
6842
6897
  ggml_tensor * inp_pos = build_inp_pos();
6843
6898
 
6844
- auto * inp_attn = build_attn_inp_kv_unified();
6899
+ auto * inp_attn = build_attn_inp_kv();
6845
6900
 
6846
6901
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6847
6902
 
@@ -6901,7 +6956,7 @@ struct llm_build_grok : public llm_graph_context {
6901
6956
 
6902
6957
  cur = build_attn(inp_attn,
6903
6958
  model.layers[il].wo, model.layers[il].bo,
6904
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
6959
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
6905
6960
  }
6906
6961
 
6907
6962
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7001,7 +7056,7 @@ struct llm_build_dbrx : public llm_graph_context {
7001
7056
  // inp_pos - contains the positions
7002
7057
  ggml_tensor * inp_pos = build_inp_pos();
7003
7058
 
7004
- auto * inp_attn = build_attn_inp_kv_unified();
7059
+ auto * inp_attn = build_attn_inp_kv();
7005
7060
 
7006
7061
  ggml_tensor * inp_out_ids = build_inp_out_ids();
7007
7062
 
@@ -7050,7 +7105,7 @@ struct llm_build_dbrx : public llm_graph_context {
7050
7105
 
7051
7106
  cur = build_attn(inp_attn,
7052
7107
  model.layers[il].wo, NULL,
7053
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7108
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7054
7109
  }
7055
7110
 
7056
7111
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7125,7 +7180,7 @@ struct llm_build_starcoder : public llm_graph_context {
7125
7180
  // inp_pos - contains the positions
7126
7181
  ggml_tensor * inp_pos = build_inp_pos();
7127
7182
 
7128
- auto * inp_attn = build_attn_inp_kv_unified();
7183
+ auto * inp_attn = build_attn_inp_kv();
7129
7184
 
7130
7185
  ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
7131
7186
  cb(pos, "pos_embd", -1);
@@ -7164,7 +7219,7 @@ struct llm_build_starcoder : public llm_graph_context {
7164
7219
 
7165
7220
  cur = build_attn(inp_attn,
7166
7221
  model.layers[il].wo, model.layers[il].bo,
7167
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7222
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7168
7223
  }
7169
7224
 
7170
7225
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7230,7 +7285,7 @@ struct llm_build_refact : public llm_graph_context {
7230
7285
 
7231
7286
  inpL = build_inp_embd(model.tok_embd);
7232
7287
 
7233
- auto * inp_attn = build_attn_inp_kv_unified();
7288
+ auto * inp_attn = build_attn_inp_kv();
7234
7289
 
7235
7290
  ggml_tensor * inp_out_ids = build_inp_out_ids();
7236
7291
 
@@ -7263,7 +7318,7 @@ struct llm_build_refact : public llm_graph_context {
7263
7318
 
7264
7319
  cur = build_attn(inp_attn,
7265
7320
  model.layers[il].wo, NULL,
7266
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7321
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7267
7322
  }
7268
7323
 
7269
7324
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7426,7 +7481,7 @@ struct llm_build_bert : public llm_graph_context {
7426
7481
 
7427
7482
  cur = build_attn(inp_attn,
7428
7483
  model.layers[il].wo, model.layers[il].bo,
7429
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7484
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7430
7485
  cb(cur, "kqv_out", il);
7431
7486
  }
7432
7487
 
@@ -7571,7 +7626,7 @@ struct llm_build_neo_bert : public llm_graph_context {
7571
7626
 
7572
7627
  cur = build_attn(inp_attn,
7573
7628
  model.layers[il].wo, nullptr,
7574
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7629
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7575
7630
  cb(cur, "kqv_out", il);
7576
7631
  }
7577
7632
 
@@ -7632,7 +7687,7 @@ struct llm_build_bloom : public llm_graph_context {
7632
7687
 
7633
7688
  inpL = build_inp_embd(model.tok_embd);
7634
7689
 
7635
- auto * inp_attn = build_attn_inp_kv_unified();
7690
+ auto * inp_attn = build_attn_inp_kv();
7636
7691
 
7637
7692
  inpL = build_norm(inpL,
7638
7693
  model.tok_norm,
@@ -7671,7 +7726,7 @@ struct llm_build_bloom : public llm_graph_context {
7671
7726
 
7672
7727
  cur = build_attn(inp_attn,
7673
7728
  model.layers[il].wo, model.layers[il].bo,
7674
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7729
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7675
7730
  }
7676
7731
 
7677
7732
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7739,7 +7794,7 @@ struct llm_build_mpt : public llm_graph_context {
7739
7794
 
7740
7795
  inpL = build_inp_embd(model.tok_embd);
7741
7796
 
7742
- auto * inp_attn = build_attn_inp_kv_unified();
7797
+ auto * inp_attn = build_attn_inp_kv();
7743
7798
 
7744
7799
  if (model.pos_embd) {
7745
7800
  // inp_pos - contains the positions
@@ -7819,7 +7874,7 @@ struct llm_build_mpt : public llm_graph_context {
7819
7874
 
7820
7875
  cur = build_attn(inp_attn,
7821
7876
  model.layers[il].wo, model.layers[il].bo,
7822
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7877
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7823
7878
  }
7824
7879
 
7825
7880
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7889,7 +7944,7 @@ struct llm_build_stablelm : public llm_graph_context {
7889
7944
  // inp_pos - contains the positions
7890
7945
  ggml_tensor * inp_pos = build_inp_pos();
7891
7946
 
7892
- auto * inp_attn = build_attn_inp_kv_unified();
7947
+ auto * inp_attn = build_attn_inp_kv();
7893
7948
 
7894
7949
  ggml_tensor * inp_out_ids = build_inp_out_ids();
7895
7950
 
@@ -7965,7 +8020,7 @@ struct llm_build_stablelm : public llm_graph_context {
7965
8020
 
7966
8021
  cur = build_attn(inp_attn,
7967
8022
  model.layers[il].wo, NULL,
7968
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8023
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7969
8024
  }
7970
8025
 
7971
8026
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8041,7 +8096,7 @@ struct llm_build_qwen : public llm_graph_context {
8041
8096
  // inp_pos - contains the positions
8042
8097
  ggml_tensor * inp_pos = build_inp_pos();
8043
8098
 
8044
- auto * inp_attn = build_attn_inp_kv_unified();
8099
+ auto * inp_attn = build_attn_inp_kv();
8045
8100
 
8046
8101
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8047
8102
 
@@ -8086,7 +8141,7 @@ struct llm_build_qwen : public llm_graph_context {
8086
8141
 
8087
8142
  cur = build_attn(inp_attn,
8088
8143
  model.layers[il].wo, NULL,
8089
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8144
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8090
8145
  }
8091
8146
 
8092
8147
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8156,7 +8211,7 @@ struct llm_build_qwen2 : public llm_graph_context {
8156
8211
  // inp_pos - contains the positions
8157
8212
  ggml_tensor * inp_pos = build_inp_pos();
8158
8213
 
8159
- auto * inp_attn = build_attn_inp_kv_unified();
8214
+ auto * inp_attn = build_attn_inp_kv();
8160
8215
 
8161
8216
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8162
8217
 
@@ -8206,7 +8261,7 @@ struct llm_build_qwen2 : public llm_graph_context {
8206
8261
 
8207
8262
  cur = build_attn(inp_attn,
8208
8263
  model.layers[il].wo, model.layers[il].bo,
8209
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8264
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8210
8265
  }
8211
8266
 
8212
8267
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8320,8 +8375,9 @@ struct llm_build_dream : public llm_graph_context {
8320
8375
  cb(Kcur, "Kcur", il);
8321
8376
  cb(Vcur, "Vcur", il);
8322
8377
 
8323
- cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr,
8324
- nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
8378
+ cur = build_attn(inp_attn,
8379
+ model.layers[il].wo, model.layers[il].bo,
8380
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
8325
8381
  }
8326
8382
 
8327
8383
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8420,8 +8476,9 @@ struct llm_build_llada : public llm_graph_context {
8420
8476
  cb(Kcur, "Kcur", il);
8421
8477
  cb(Vcur, "Vcur", il);
8422
8478
 
8423
- cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
8424
- 1.0f / sqrtf(float(n_embd_head)), il);
8479
+ cur = build_attn(inp_attn,
8480
+ model.layers[il].wo, NULL,
8481
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
8425
8482
  }
8426
8483
 
8427
8484
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8481,7 +8538,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
8481
8538
  // inp_pos - contains the positions
8482
8539
  ggml_tensor * inp_pos = build_inp_pos();
8483
8540
 
8484
- auto * inp_attn = build_attn_inp_kv_unified();
8541
+ auto * inp_attn = build_attn_inp_kv();
8485
8542
 
8486
8543
  int sections[4];
8487
8544
  std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
@@ -8534,7 +8591,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
8534
8591
 
8535
8592
  cur = build_attn(inp_attn,
8536
8593
  model.layers[il].wo, model.layers[il].bo,
8537
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8594
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8538
8595
  }
8539
8596
 
8540
8597
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8602,7 +8659,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
8602
8659
  // inp_pos - contains the positions
8603
8660
  ggml_tensor * inp_pos = build_inp_pos();
8604
8661
 
8605
- auto * inp_attn = build_attn_inp_kv_unified();
8662
+ auto * inp_attn = build_attn_inp_kv();
8606
8663
 
8607
8664
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8608
8665
 
@@ -8661,7 +8718,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
8661
8718
 
8662
8719
  cur = build_attn(inp_attn,
8663
8720
  model.layers[il].wo, model.layers[il].bo,
8664
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8721
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8665
8722
  }
8666
8723
 
8667
8724
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8761,7 +8818,7 @@ struct llm_build_qwen3 : public llm_graph_context {
8761
8818
  // inp_pos - contains the positions
8762
8819
  ggml_tensor * inp_pos = build_inp_pos();
8763
8820
 
8764
- auto * inp_attn = build_attn_inp_kv_unified();
8821
+ auto * inp_attn = build_attn_inp_kv();
8765
8822
 
8766
8823
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8767
8824
 
@@ -8814,7 +8871,7 @@ struct llm_build_qwen3 : public llm_graph_context {
8814
8871
 
8815
8872
  cur = build_attn(inp_attn,
8816
8873
  model.layers[il].wo, model.layers[il].bo,
8817
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8874
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8818
8875
  }
8819
8876
 
8820
8877
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8882,7 +8939,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
8882
8939
  // inp_pos - contains the positions
8883
8940
  ggml_tensor * inp_pos = build_inp_pos();
8884
8941
 
8885
- auto * inp_attn = build_attn_inp_kv_unified();
8942
+ auto * inp_attn = build_attn_inp_kv();
8886
8943
 
8887
8944
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8888
8945
 
@@ -8935,7 +8992,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
8935
8992
 
8936
8993
  cur = build_attn(inp_attn,
8937
8994
  model.layers[il].wo, model.layers[il].bo,
8938
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8995
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8939
8996
  }
8940
8997
 
8941
8998
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9012,7 +9069,7 @@ struct llm_build_phi2 : public llm_graph_context {
9012
9069
  // inp_pos - contains the positions
9013
9070
  ggml_tensor * inp_pos = build_inp_pos();
9014
9071
 
9015
- auto * inp_attn = build_attn_inp_kv_unified();
9072
+ auto * inp_attn = build_attn_inp_kv();
9016
9073
 
9017
9074
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9018
9075
 
@@ -9075,7 +9132,7 @@ struct llm_build_phi2 : public llm_graph_context {
9075
9132
 
9076
9133
  cur = build_attn(inp_attn,
9077
9134
  model.layers[il].wo, model.layers[il].bo,
9078
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
9135
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
9079
9136
  }
9080
9137
 
9081
9138
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9141,13 +9198,13 @@ struct llm_build_phi3 : public llm_graph_context {
9141
9198
  // inp_pos - contains the positions
9142
9199
  ggml_tensor * inp_pos = build_inp_pos();
9143
9200
 
9144
- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
9201
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
9145
9202
  inp_attn_type * inp_attn = nullptr;
9146
9203
 
9147
9204
  if constexpr (iswa) {
9148
- inp_attn = build_attn_inp_kv_unified_iswa();
9205
+ inp_attn = build_attn_inp_kv_iswa();
9149
9206
  } else {
9150
- inp_attn = build_attn_inp_kv_unified();
9207
+ inp_attn = build_attn_inp_kv();
9151
9208
  }
9152
9209
 
9153
9210
  ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -9212,7 +9269,7 @@ struct llm_build_phi3 : public llm_graph_context {
9212
9269
 
9213
9270
  cur = build_attn(inp_attn,
9214
9271
  model.layers[il].wo, model.layers[il].bo,
9215
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
9272
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
9216
9273
  }
9217
9274
 
9218
9275
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9299,7 +9356,7 @@ struct llm_build_plamo : public llm_graph_context {
9299
9356
  // inp_pos - contains the positions
9300
9357
  ggml_tensor * inp_pos = build_inp_pos();
9301
9358
 
9302
- auto * inp_attn = build_attn_inp_kv_unified();
9359
+ auto * inp_attn = build_attn_inp_kv();
9303
9360
 
9304
9361
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9305
9362
 
@@ -9346,7 +9403,7 @@ struct llm_build_plamo : public llm_graph_context {
9346
9403
 
9347
9404
  cur = build_attn(inp_attn,
9348
9405
  model.layers[il].wo, NULL,
9349
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9406
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9350
9407
  }
9351
9408
 
9352
9409
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9415,7 +9472,7 @@ struct llm_build_gpt2 : public llm_graph_context {
9415
9472
  // inp_pos - contains the positions
9416
9473
  ggml_tensor * inp_pos = build_inp_pos();
9417
9474
 
9418
- auto * inp_attn = build_attn_inp_kv_unified();
9475
+ auto * inp_attn = build_attn_inp_kv();
9419
9476
 
9420
9477
  pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
9421
9478
  cb(pos, "pos_embd", -1);
@@ -9454,7 +9511,7 @@ struct llm_build_gpt2 : public llm_graph_context {
9454
9511
 
9455
9512
  cur = build_attn(inp_attn,
9456
9513
  model.layers[il].wo, model.layers[il].bo,
9457
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9514
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9458
9515
  }
9459
9516
 
9460
9517
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9525,7 +9582,7 @@ struct llm_build_codeshell : public llm_graph_context {
9525
9582
  // inp_pos - contains the positions
9526
9583
  ggml_tensor * inp_pos = build_inp_pos();
9527
9584
 
9528
- auto * inp_attn = build_attn_inp_kv_unified();
9585
+ auto * inp_attn = build_attn_inp_kv();
9529
9586
 
9530
9587
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9531
9588
 
@@ -9568,7 +9625,7 @@ struct llm_build_codeshell : public llm_graph_context {
9568
9625
 
9569
9626
  cur = build_attn(inp_attn,
9570
9627
  model.layers[il].wo, model.layers[il].bo,
9571
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9628
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9572
9629
  }
9573
9630
 
9574
9631
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9638,7 +9695,7 @@ struct llm_build_orion : public llm_graph_context {
9638
9695
  // inp_pos - contains the positions
9639
9696
  ggml_tensor * inp_pos = build_inp_pos();
9640
9697
 
9641
- auto * inp_attn = build_attn_inp_kv_unified();
9698
+ auto * inp_attn = build_attn_inp_kv();
9642
9699
 
9643
9700
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9644
9701
 
@@ -9697,7 +9754,7 @@ struct llm_build_orion : public llm_graph_context {
9697
9754
 
9698
9755
  cur = build_attn(inp_attn,
9699
9756
  model.layers[il].wo, NULL,
9700
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9757
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9701
9758
  }
9702
9759
 
9703
9760
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9765,7 +9822,7 @@ struct llm_build_internlm2 : public llm_graph_context {
9765
9822
  // inp_pos - contains the positions
9766
9823
  ggml_tensor * inp_pos = build_inp_pos();
9767
9824
 
9768
- auto * inp_attn = build_attn_inp_kv_unified();
9825
+ auto * inp_attn = build_attn_inp_kv();
9769
9826
 
9770
9827
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9771
9828
 
@@ -9824,7 +9881,7 @@ struct llm_build_internlm2 : public llm_graph_context {
9824
9881
 
9825
9882
  cur = build_attn(inp_attn,
9826
9883
  model.layers[il].wo, model.layers[il].bo,
9827
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9884
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9828
9885
  }
9829
9886
 
9830
9887
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9901,7 +9958,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
9901
9958
  // inp_pos - contains the positions
9902
9959
  ggml_tensor * inp_pos = build_inp_pos();
9903
9960
 
9904
- auto * inp_attn = build_attn_inp_kv_unified();
9961
+ auto * inp_attn = build_attn_inp_kv();
9905
9962
 
9906
9963
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9907
9964
 
@@ -10012,7 +10069,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
10012
10069
 
10013
10070
  cur = build_attn(inp_attn,
10014
10071
  model.layers[il].wo, NULL,
10015
- q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
10072
+ q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
10016
10073
  }
10017
10074
 
10018
10075
  if (il == n_layer - 1 && inp_out_ids) {
@@ -10096,7 +10153,7 @@ struct llm_build_gemma : public llm_graph_context {
10096
10153
  // inp_pos - contains the positions
10097
10154
  ggml_tensor * inp_pos = build_inp_pos();
10098
10155
 
10099
- auto * inp_attn = build_attn_inp_kv_unified();
10156
+ auto * inp_attn = build_attn_inp_kv();
10100
10157
 
10101
10158
  ggml_tensor * inp_out_ids = build_inp_out_ids();
10102
10159
 
@@ -10142,7 +10199,7 @@ struct llm_build_gemma : public llm_graph_context {
10142
10199
 
10143
10200
  cur = build_attn(inp_attn,
10144
10201
  model.layers[il].wo, NULL,
10145
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
10202
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
10146
10203
  }
10147
10204
 
10148
10205
  if (il == n_layer - 1 && inp_out_ids) {
@@ -10212,7 +10269,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
10212
10269
  // inp_pos - contains the positions
10213
10270
  ggml_tensor * inp_pos = build_inp_pos();
10214
10271
 
10215
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
10272
+ auto * inp_attn = build_attn_inp_kv_iswa();
10216
10273
 
10217
10274
  ggml_tensor * inp_out_ids = build_inp_out_ids();
10218
10275
 
@@ -10257,7 +10314,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
10257
10314
 
10258
10315
  cur = build_attn(inp_attn,
10259
10316
  model.layers[il].wo, NULL,
10260
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
10317
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
10261
10318
  }
10262
10319
 
10263
10320
  if (il == n_layer - 1 && inp_out_ids) {
@@ -10346,7 +10403,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
10346
10403
  ggml_tensor * inp_pos = build_inp_pos();
10347
10404
 
10348
10405
  // TODO: is causal == true correct? might need some changes
10349
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
10406
+ auto * inp_attn = build_attn_inp_kv_iswa();
10350
10407
 
10351
10408
  ggml_tensor * inp_out_ids = build_inp_out_ids();
10352
10409
 
@@ -10399,7 +10456,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
10399
10456
 
10400
10457
  cur = build_attn(inp_attn,
10401
10458
  model.layers[il].wo, NULL,
10402
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
10459
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
10403
10460
  }
10404
10461
 
10405
10462
  if (il == n_layer - 1 && inp_out_ids) {
@@ -10471,7 +10528,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
10471
10528
  const int64_t n_embd_altup;
10472
10529
  const int64_t n_altup;
10473
10530
  const int i_altup_act;
10474
- const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
10475
10531
  const int n_layer_sparsity = 10; // number of layers using activation sparsity
10476
10532
  const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
10477
10533
 
@@ -10497,7 +10553,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
10497
10553
  ggml_tensor * inp_pos = build_inp_pos();
10498
10554
 
10499
10555
  // TODO: is causal == true correct? might need some changes
10500
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
10556
+ auto * inp_attn = build_attn_inp_kv_iswa();
10501
10557
 
10502
10558
  // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
10503
10559
  ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
@@ -10521,8 +10577,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
10521
10577
 
10522
10578
  for (int il = 0; il < n_layer; ++il) {
10523
10579
  // this block is made to be closely resemble Gemma3p5DecoderLayer on python code
10524
- const bool has_kv = (il < n_layer_kv);
10525
-
10526
10580
  const float freq_base_l = model.get_rope_freq_base (cparams, il);
10527
10581
  const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
10528
10582
 
@@ -10542,7 +10596,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
10542
10596
  ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
10543
10597
 
10544
10598
  // self-attention
10545
- if (has_kv) {
10599
+ if (hparams.has_kv(il)) {
10546
10600
  // compute Q and K and RoPE them
10547
10601
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
10548
10602
  cb(Qcur, "Qcur", il);
@@ -10580,9 +10634,9 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
10580
10634
 
10581
10635
  cur = build_attn(inp_attn,
10582
10636
  model.layers[il].wo, NULL,
10583
- Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
10637
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
10584
10638
  } else {
10585
- // no KV layers
10639
+ // reuse KV cache of earlier layers
10586
10640
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
10587
10641
  cb(Qcur, "Qcur", il);
10588
10642
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -10598,7 +10652,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
10598
10652
 
10599
10653
  cur = build_attn(inp_attn,
10600
10654
  model.layers[il].wo, NULL,
10601
- Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
10655
+ Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
10602
10656
  }
10603
10657
 
10604
10658
  cur = build_norm(cur,
@@ -10904,7 +10958,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
10904
10958
  // inp_pos - contains the positions
10905
10959
  ggml_tensor * inp_pos = build_inp_pos();
10906
10960
 
10907
- auto * inp_attn = build_attn_inp_kv_unified();
10961
+ auto * inp_attn = build_attn_inp_kv();
10908
10962
 
10909
10963
  ggml_tensor * inp_out_ids = build_inp_out_ids();
10910
10964
 
@@ -10963,7 +11017,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
10963
11017
 
10964
11018
  cur = build_attn(inp_attn,
10965
11019
  model.layers[il].wo, model.layers[il].bo,
10966
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11020
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10967
11021
  }
10968
11022
 
10969
11023
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11390,7 +11444,9 @@ struct llm_build_jamba : public llm_graph_context_mamba {
11390
11444
  cb(Vcur, "Vcur", il);
11391
11445
 
11392
11446
  // No RoPE :)
11393
- cur = build_attn(inp_hybrid->get_attn(), model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
11447
+ cur = build_attn(inp_hybrid->get_attn(),
11448
+ model.layers[il].wo, NULL,
11449
+ Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
11394
11450
  }
11395
11451
 
11396
11452
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11473,7 +11529,7 @@ struct llm_build_command_r : public llm_graph_context {
11473
11529
  // inp_pos - contains the positions
11474
11530
  ggml_tensor * inp_pos = build_inp_pos();
11475
11531
 
11476
- auto * inp_attn = build_attn_inp_kv_unified();
11532
+ auto * inp_attn = build_attn_inp_kv();
11477
11533
 
11478
11534
  ggml_tensor * inp_out_ids = build_inp_out_ids();
11479
11535
 
@@ -11548,7 +11604,7 @@ struct llm_build_command_r : public llm_graph_context {
11548
11604
 
11549
11605
  cur = build_attn(inp_attn,
11550
11606
  model.layers[il].wo, model.layers[il].bo,
11551
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11607
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11552
11608
  }
11553
11609
 
11554
11610
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11620,7 +11676,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
11620
11676
  // inp_pos - contains the positions
11621
11677
  ggml_tensor * inp_pos = build_inp_pos();
11622
11678
 
11623
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
11679
+ auto * inp_attn = build_attn_inp_kv_iswa();
11624
11680
 
11625
11681
  ggml_tensor * inp_out_ids = build_inp_out_ids();
11626
11682
 
@@ -11683,7 +11739,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
11683
11739
 
11684
11740
  cur = build_attn(inp_attn,
11685
11741
  model.layers[il].wo, model.layers[il].bo,
11686
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11742
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11687
11743
  }
11688
11744
 
11689
11745
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11755,7 +11811,7 @@ struct llm_build_olmo : public llm_graph_context {
11755
11811
  // inp_pos - contains the positions
11756
11812
  ggml_tensor * inp_pos = build_inp_pos();
11757
11813
 
11758
- auto * inp_attn = build_attn_inp_kv_unified();
11814
+ auto * inp_attn = build_attn_inp_kv();
11759
11815
 
11760
11816
  ggml_tensor * inp_out_ids = build_inp_out_ids();
11761
11817
 
@@ -11814,7 +11870,7 @@ struct llm_build_olmo : public llm_graph_context {
11814
11870
 
11815
11871
  cur = build_attn(inp_attn,
11816
11872
  model.layers[il].wo, nullptr,
11817
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11873
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11818
11874
  }
11819
11875
 
11820
11876
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11883,7 +11939,7 @@ struct llm_build_olmo2 : public llm_graph_context {
11883
11939
  // inp_pos - contains the positions
11884
11940
  ggml_tensor * inp_pos = build_inp_pos();
11885
11941
 
11886
- auto * inp_attn = build_attn_inp_kv_unified();
11942
+ auto * inp_attn = build_attn_inp_kv();
11887
11943
 
11888
11944
  ggml_tensor * inp_out_ids = build_inp_out_ids();
11889
11945
 
@@ -11934,7 +11990,7 @@ struct llm_build_olmo2 : public llm_graph_context {
11934
11990
 
11935
11991
  cur = build_attn(inp_attn,
11936
11992
  model.layers[il].wo, NULL,
11937
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11993
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11938
11994
  }
11939
11995
 
11940
11996
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12012,7 +12068,7 @@ struct llm_build_olmoe : public llm_graph_context {
12012
12068
  // inp_pos - contains the positions
12013
12069
  ggml_tensor * inp_pos = build_inp_pos();
12014
12070
 
12015
- auto * inp_attn = build_attn_inp_kv_unified();
12071
+ auto * inp_attn = build_attn_inp_kv();
12016
12072
 
12017
12073
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12018
12074
 
@@ -12067,7 +12123,7 @@ struct llm_build_olmoe : public llm_graph_context {
12067
12123
 
12068
12124
  cur = build_attn(inp_attn,
12069
12125
  model.layers[il].wo, NULL,
12070
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12126
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12071
12127
  }
12072
12128
 
12073
12129
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12138,7 +12194,7 @@ struct llm_build_openelm : public llm_graph_context {
12138
12194
  // inp_pos - contains the positions
12139
12195
  ggml_tensor * inp_pos = build_inp_pos();
12140
12196
 
12141
- auto * inp_attn = build_attn_inp_kv_unified();
12197
+ auto * inp_attn = build_attn_inp_kv();
12142
12198
 
12143
12199
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12144
12200
 
@@ -12200,7 +12256,7 @@ struct llm_build_openelm : public llm_graph_context {
12200
12256
 
12201
12257
  cur = build_attn(inp_attn,
12202
12258
  model.layers[il].wo, NULL,
12203
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12259
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12204
12260
  }
12205
12261
 
12206
12262
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12269,7 +12325,7 @@ struct llm_build_gptneox : public llm_graph_context {
12269
12325
  // inp_pos - contains the positions
12270
12326
  ggml_tensor * inp_pos = build_inp_pos();
12271
12327
 
12272
- auto * inp_attn = build_attn_inp_kv_unified();
12328
+ auto * inp_attn = build_attn_inp_kv();
12273
12329
 
12274
12330
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12275
12331
 
@@ -12312,7 +12368,7 @@ struct llm_build_gptneox : public llm_graph_context {
12312
12368
 
12313
12369
  cur = build_attn(inp_attn,
12314
12370
  model.layers[il].wo, model.layers[il].bo,
12315
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12371
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12316
12372
  }
12317
12373
 
12318
12374
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12415,7 +12471,7 @@ struct llm_build_arctic : public llm_graph_context {
12415
12471
  // inp_pos - contains the positions
12416
12472
  ggml_tensor * inp_pos = build_inp_pos();
12417
12473
 
12418
- auto * inp_attn = build_attn_inp_kv_unified();
12474
+ auto * inp_attn = build_attn_inp_kv();
12419
12475
 
12420
12476
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12421
12477
 
@@ -12462,7 +12518,7 @@ struct llm_build_arctic : public llm_graph_context {
12462
12518
 
12463
12519
  cur = build_attn(inp_attn,
12464
12520
  model.layers[il].wo, NULL,
12465
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12521
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12466
12522
  }
12467
12523
 
12468
12524
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12553,7 +12609,7 @@ struct llm_build_deepseek : public llm_graph_context {
12553
12609
  // inp_pos - contains the positions
12554
12610
  ggml_tensor * inp_pos = build_inp_pos();
12555
12611
 
12556
- auto * inp_attn = build_attn_inp_kv_unified();
12612
+ auto * inp_attn = build_attn_inp_kv();
12557
12613
 
12558
12614
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
12559
12615
 
@@ -12617,7 +12673,7 @@ struct llm_build_deepseek : public llm_graph_context {
12617
12673
 
12618
12674
  cur = build_attn(inp_attn,
12619
12675
  model.layers[il].wo, model.layers[il].bo,
12620
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
12676
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
12621
12677
  }
12622
12678
 
12623
12679
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12730,7 +12786,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
12730
12786
  // inp_pos - contains the positions
12731
12787
  ggml_tensor * inp_pos = build_inp_pos();
12732
12788
 
12733
- auto * inp_attn = build_attn_inp_kv_unified();
12789
+ auto * inp_attn = build_attn_inp_kv();
12734
12790
 
12735
12791
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12736
12792
 
@@ -12845,7 +12901,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
12845
12901
  // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
12846
12902
  cur = build_attn(inp_attn,
12847
12903
  model.layers[il].wo, NULL,
12848
- Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
12904
+ Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
12849
12905
  } else {
12850
12906
  ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
12851
12907
  cb(kv, "kv", il);
@@ -12879,7 +12935,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
12879
12935
  // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
12880
12936
  cur = build_attn(inp_attn,
12881
12937
  model.layers[il].wo, NULL,
12882
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
12938
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
12883
12939
  }
12884
12940
  }
12885
12941
 
@@ -12977,7 +13033,7 @@ struct llm_build_bitnet : public llm_graph_context {
12977
13033
  // inp_pos - contains the positions
12978
13034
  ggml_tensor * inp_pos = build_inp_pos();
12979
13035
 
12980
- auto * inp_attn = build_attn_inp_kv_unified();
13036
+ auto * inp_attn = build_attn_inp_kv();
12981
13037
 
12982
13038
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12983
13039
 
@@ -13046,7 +13102,7 @@ struct llm_build_bitnet : public llm_graph_context {
13046
13102
 
13047
13103
  cur = build_attn(inp_attn,
13048
13104
  NULL, NULL,
13049
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13105
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13050
13106
 
13051
13107
  cur = build_norm(cur,
13052
13108
  model.layers[il].attn_sub_norm, NULL,
@@ -13169,7 +13225,7 @@ struct llm_build_t5_enc : public llm_graph_context {
13169
13225
 
13170
13226
  cur = build_attn(inp_attn,
13171
13227
  model.layers[il].wo_enc, nullptr,
13172
- Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
13228
+ Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
13173
13229
  cb(cur, "kqv_out", il);
13174
13230
  }
13175
13231
 
@@ -13241,7 +13297,7 @@ struct llm_build_t5_dec : public llm_graph_context {
13241
13297
 
13242
13298
  const int64_t n_outputs_enc = embd_enc->ne[1];
13243
13299
 
13244
- auto * inp_attn_self = build_attn_inp_kv_unified();
13300
+ auto * inp_attn_self = build_attn_inp_kv();
13245
13301
  auto * inp_attn_cross = build_attn_inp_cross();
13246
13302
 
13247
13303
  ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -13275,7 +13331,7 @@ struct llm_build_t5_dec : public llm_graph_context {
13275
13331
 
13276
13332
  cur = build_attn(inp_attn_self,
13277
13333
  model.layers[il].wo, model.layers[il].bo,
13278
- Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
13334
+ Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
13279
13335
  cb(cur, "kqv_out", il);
13280
13336
  }
13281
13337
 
@@ -13307,7 +13363,7 @@ struct llm_build_t5_dec : public llm_graph_context {
13307
13363
 
13308
13364
  cur = build_attn(inp_attn_cross,
13309
13365
  model.layers[il].wo_cross, nullptr,
13310
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
13366
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
13311
13367
  cb(cur, "kqv_out", il);
13312
13368
 
13313
13369
  //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -13406,7 +13462,7 @@ struct llm_build_jais : public llm_graph_context {
13406
13462
 
13407
13463
  inpL = build_inp_embd(model.tok_embd);
13408
13464
 
13409
- auto * inp_attn = build_attn_inp_kv_unified();
13465
+ auto * inp_attn = build_attn_inp_kv();
13410
13466
 
13411
13467
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13412
13468
 
@@ -13439,7 +13495,7 @@ struct llm_build_jais : public llm_graph_context {
13439
13495
 
13440
13496
  cur = build_attn(inp_attn,
13441
13497
  model.layers[il].wo, model.layers[il].bo,
13442
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
13498
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
13443
13499
  }
13444
13500
 
13445
13501
  if (il == n_layer - 1 && inp_out_ids) {
@@ -13504,7 +13560,7 @@ struct llm_build_chatglm : public llm_graph_context {
13504
13560
  // inp_pos - contains the positions
13505
13561
  ggml_tensor * inp_pos = build_inp_pos();
13506
13562
 
13507
- auto * inp_attn = build_attn_inp_kv_unified();
13563
+ auto * inp_attn = build_attn_inp_kv();
13508
13564
 
13509
13565
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13510
13566
 
@@ -13571,7 +13627,7 @@ struct llm_build_chatglm : public llm_graph_context {
13571
13627
 
13572
13628
  cur = build_attn(inp_attn,
13573
13629
  model.layers[il].wo, NULL,
13574
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13630
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13575
13631
  }
13576
13632
 
13577
13633
  if (il == n_layer - 1 && inp_out_ids) {
@@ -13637,7 +13693,7 @@ struct llm_build_glm4 : public llm_graph_context {
13637
13693
  // inp_pos - contains the positions
13638
13694
  ggml_tensor * inp_pos = build_inp_pos();
13639
13695
 
13640
- auto * inp_attn = build_attn_inp_kv_unified();
13696
+ auto * inp_attn = build_attn_inp_kv();
13641
13697
 
13642
13698
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13643
13699
 
@@ -13704,7 +13760,7 @@ struct llm_build_glm4 : public llm_graph_context {
13704
13760
 
13705
13761
  cur = build_attn(inp_attn,
13706
13762
  model.layers[il].wo, NULL,
13707
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13763
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13708
13764
  }
13709
13765
 
13710
13766
  if (il == n_layer - 1 && inp_out_ids) {
@@ -13787,7 +13843,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
13787
13843
  // inp_pos - contains the positions
13788
13844
  ggml_tensor * inp_pos = build_inp_pos();
13789
13845
 
13790
- auto * inp_attn = build_attn_inp_kv_unified();
13846
+ auto * inp_attn = build_attn_inp_kv();
13791
13847
 
13792
13848
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13793
13849
 
@@ -13853,7 +13909,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
13853
13909
 
13854
13910
  cur = build_attn(inp_attn,
13855
13911
  model.layers[il].wo, NULL,
13856
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13912
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13857
13913
  }
13858
13914
 
13859
13915
  if (il == n_transformer_layers - 1 && inp_out_ids) {
@@ -13947,7 +14003,7 @@ struct llm_build_nemotron : public llm_graph_context {
13947
14003
  // inp_pos - contains the positions
13948
14004
  ggml_tensor * inp_pos = build_inp_pos();
13949
14005
 
13950
- auto * inp_attn = build_attn_inp_kv_unified();
14006
+ auto * inp_attn = build_attn_inp_kv();
13951
14007
 
13952
14008
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13953
14009
 
@@ -14007,7 +14063,7 @@ struct llm_build_nemotron : public llm_graph_context {
14007
14063
 
14008
14064
  cur = build_attn(inp_attn,
14009
14065
  model.layers[il].wo, model.layers[il].bo,
14010
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14066
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14011
14067
  }
14012
14068
 
14013
14069
  if (il == n_layer - 1 && inp_out_ids) {
@@ -14076,7 +14132,7 @@ struct llm_build_exaone : public llm_graph_context {
14076
14132
  // inp_pos - contains the positions
14077
14133
  ggml_tensor * inp_pos = build_inp_pos();
14078
14134
 
14079
- auto * inp_attn = build_attn_inp_kv_unified();
14135
+ auto * inp_attn = build_attn_inp_kv();
14080
14136
 
14081
14137
  ggml_tensor * inp_out_ids = build_inp_out_ids();
14082
14138
 
@@ -14138,7 +14194,7 @@ struct llm_build_exaone : public llm_graph_context {
14138
14194
 
14139
14195
  cur = build_attn(inp_attn,
14140
14196
  model.layers[il].wo, model.layers[il].bo,
14141
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14197
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14142
14198
  }
14143
14199
 
14144
14200
  if (il == n_layer - 1 && inp_out_ids) {
@@ -14208,13 +14264,13 @@ struct llm_build_exaone4 : public llm_graph_context {
14208
14264
  // inp_pos - contains the positions
14209
14265
  ggml_tensor * inp_pos = build_inp_pos();
14210
14266
 
14211
- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
14267
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
14212
14268
  inp_attn_type * inp_attn = nullptr;
14213
14269
 
14214
14270
  if constexpr (iswa) {
14215
- inp_attn = build_attn_inp_kv_unified_iswa();
14271
+ inp_attn = build_attn_inp_kv_iswa();
14216
14272
  } else {
14217
- inp_attn = build_attn_inp_kv_unified();
14273
+ inp_attn = build_attn_inp_kv();
14218
14274
  }
14219
14275
 
14220
14276
  ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -14269,7 +14325,7 @@ struct llm_build_exaone4 : public llm_graph_context {
14269
14325
 
14270
14326
  cur = build_attn(inp_attn,
14271
14327
  model.layers[il].wo, NULL,
14272
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14328
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14273
14329
  cb(cur, "attn_out", il);
14274
14330
  }
14275
14331
 
@@ -15097,7 +15153,7 @@ struct llm_build_granite : public llm_graph_context {
15097
15153
  inp_pos = build_inp_pos();
15098
15154
  }
15099
15155
 
15100
- auto * inp_attn = build_attn_inp_kv_unified();
15156
+ auto * inp_attn = build_attn_inp_kv();
15101
15157
 
15102
15158
  ggml_tensor * inp_out_ids = build_inp_out_ids();
15103
15159
 
@@ -15148,12 +15204,12 @@ struct llm_build_granite : public llm_graph_context {
15148
15204
  }
15149
15205
 
15150
15206
  ggml_tensor * build_attention_layer(
15151
- ggml_tensor * cur,
15152
- ggml_tensor * inp_pos,
15153
- llm_graph_input_attn_kv_unified * inp_attn,
15154
- const llama_model & model,
15155
- const int64_t n_embd_head,
15156
- const int il) {
15207
+ ggml_tensor * cur,
15208
+ ggml_tensor * inp_pos,
15209
+ llm_graph_input_attn_kv * inp_attn,
15210
+ const llama_model & model,
15211
+ const int64_t n_embd_head,
15212
+ const int il) {
15157
15213
 
15158
15214
  // compute Q and K and (optionally) RoPE them
15159
15215
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15204,7 +15260,7 @@ struct llm_build_granite : public llm_graph_context {
15204
15260
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
15205
15261
  cur = build_attn(inp_attn,
15206
15262
  model.layers[il].wo, model.layers[il].bo,
15207
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15263
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
15208
15264
  cb(cur, "attn_out", il);
15209
15265
  return cur;
15210
15266
  }
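
[Editor's note: Granite keeps the per-model attention-scale override seen just above; a one-line restatement with an illustrative head size:]

// Use the model-provided scale if present, otherwise the usual 1/sqrt(head_dim).
// e.g. n_embd_head = 128 -> kq_scale ~= 0.088
const float kq_scale = hparams.f_attention_scale == 0.0f
        ? 1.0f/sqrtf(float(n_embd_head))
        : hparams.f_attention_scale;
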
@@ -15367,12 +15423,12 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
15367
15423
  }
15368
15424
 
15369
15425
  ggml_tensor * build_attention_layer(
15370
- ggml_tensor * cur,
15371
- ggml_tensor * inp_pos,
15372
- llm_graph_input_attn_kv_unified * inp_attn,
15373
- const llama_model & model,
15374
- const int64_t n_embd_head,
15375
- const int il) {
15426
+ ggml_tensor * cur,
15427
+ ggml_tensor * inp_pos,
15428
+ llm_graph_input_attn_kv * inp_attn,
15429
+ const llama_model & model,
15430
+ const int64_t n_embd_head,
15431
+ const int il) {
15376
15432
 
15377
15433
  // compute Q and K and (optionally) RoPE them
15378
15434
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15423,7 +15479,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
15423
15479
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
15424
15480
  cur = build_attn(inp_attn,
15425
15481
  model.layers[il].wo, model.layers[il].bo,
15426
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15482
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
15427
15483
  cb(cur, "attn_out", il);
15428
15484
  return cur;
15429
15485
  }
@@ -15529,7 +15585,7 @@ struct llm_build_chameleon : public llm_graph_context {
15529
15585
  // inp_pos - contains the positions
15530
15586
  ggml_tensor * inp_pos = build_inp_pos();
15531
15587
 
15532
- auto * inp_attn = build_attn_inp_kv_unified();
15588
+ auto * inp_attn = build_attn_inp_kv();
15533
15589
 
15534
15590
  ggml_tensor * inp_out_ids = build_inp_out_ids();
15535
15591
 
@@ -15608,7 +15664,7 @@ struct llm_build_chameleon : public llm_graph_context {
15608
15664
 
15609
15665
  cur = build_attn(inp_attn,
15610
15666
  model.layers[il].wo, nullptr,
15611
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
15667
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
15612
15668
  }
15613
15669
 
15614
15670
  if (il == n_layer - 1 && inp_out_ids) {
@@ -15860,7 +15916,7 @@ struct llm_build_plm : public llm_graph_context {
15860
15916
  // inp_pos - contains the positions
15861
15917
  ggml_tensor * inp_pos = build_inp_pos();
15862
15918
 
15863
- auto * inp_attn = build_attn_inp_kv_unified();
15919
+ auto * inp_attn = build_attn_inp_kv();
15864
15920
 
15865
15921
  ggml_tensor * inp_out_ids = build_inp_out_ids();
15866
15922
 
@@ -15964,7 +16020,7 @@ struct llm_build_plm : public llm_graph_context {
15964
16020
 
15965
16021
  cur = build_attn(inp_attn,
15966
16022
  model.layers[il].wo, NULL,
15967
- q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
16023
+ q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
15968
16024
  }
15969
16025
 
15970
16026
  if (il == n_layer - 1 && inp_out_ids) {
@@ -16025,7 +16081,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
16025
16081
  // inp_pos - contains the positions
16026
16082
  ggml_tensor * inp_pos = build_inp_pos();
16027
16083
 
16028
- auto * inp_attn = build_attn_inp_kv_unified();
16084
+ auto * inp_attn = build_attn_inp_kv();
16029
16085
 
16030
16086
  ggml_tensor * inp_out_ids = build_inp_out_ids();
16031
16087
 
@@ -16087,7 +16143,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
16087
16143
 
16088
16144
  cur = build_attn(inp_attn,
16089
16145
  model.layers[il].wo, model.layers[il].bo,
16090
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
16146
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
16091
16147
  }
16092
16148
 
16093
16149
  if (il == n_layer - 1 && inp_out_ids) {
@@ -16174,7 +16230,7 @@ struct llm_build_dots1 : public llm_graph_context {
16174
16230
  // inp_pos - contains the positions
16175
16231
  ggml_tensor * inp_pos = build_inp_pos();
16176
16232
 
16177
- auto * inp_attn = build_attn_inp_kv_unified();
16233
+ auto * inp_attn = build_attn_inp_kv();
16178
16234
 
16179
16235
  ggml_tensor * inp_out_ids = build_inp_out_ids();
16180
16236
 
@@ -16227,7 +16283,7 @@ struct llm_build_dots1 : public llm_graph_context {
16227
16283
 
16228
16284
  cur = build_attn(inp_attn,
16229
16285
  model.layers[il].wo, model.layers[il].bo,
16230
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16286
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16231
16287
  }
16232
16288
 
16233
16289
  if (il == n_layer - 1 && inp_out_ids) {
@@ -16324,7 +16380,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
16324
16380
  // inp_pos - contains the positions
16325
16381
  ggml_tensor * inp_pos = build_inp_pos();
16326
16382
 
16327
- auto * inp_attn = build_attn_inp_kv_unified();
16383
+ auto * inp_attn = build_attn_inp_kv();
16328
16384
 
16329
16385
  for (int il = 0; il < n_layer; ++il) {
16330
16386
  ggml_tensor * inpSA = inpL;
@@ -16382,7 +16438,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
16382
16438
 
16383
16439
  cur = build_attn(inp_attn,
16384
16440
  model.layers[il].wo, NULL,
16385
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16441
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16386
16442
  }
16387
16443
 
16388
16444
  if (il == n_layer - 1) {
@@ -16454,7 +16510,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
16454
16510
  // inp_pos - contains the positions
16455
16511
  ggml_tensor * inp_pos = build_inp_pos();
16456
16512
 
16457
- auto * inp_attn = build_attn_inp_kv_unified();
16513
+ auto * inp_attn = build_attn_inp_kv();
16458
16514
 
16459
16515
  ggml_tensor * inp_out_ids = build_inp_out_ids();
16460
16516
 
@@ -16515,7 +16571,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
16515
16571
 
16516
16572
  cur = build_attn(inp_attn,
16517
16573
  model.layers[il].wo, NULL,
16518
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16574
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16519
16575
  cb(cur, "attn_out", il);
16520
16576
  }
16521
16577
 
@@ -16668,7 +16724,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba {
16668
16724
 
16669
16725
  ggml_tensor * attn_out = build_attn(inp->get_attn(),
16670
16726
  model.layers[il].wo, NULL,
16671
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
16727
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
16672
16728
  cb(attn_out, "attn_out", il);
16673
16729
 
16674
16730
  cur = build_norm(inpL,
@@ -16828,7 +16884,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
16828
16884
 
16829
16885
  private:
16830
16886
  ggml_tensor * build_plamo2_attn_layer(
16831
- llm_graph_input_attn_kv_unified * inp,
16887
+ llm_graph_input_attn_kv * inp,
16832
16888
  ggml_tensor * inp_pos,
16833
16889
  ggml_tensor * cur,
16834
16890
  const llama_model & model,
@@ -16878,7 +16934,9 @@ private:
16878
16934
  ext_factor, attn_factor, beta_fast, beta_slow
16879
16935
  );
16880
16936
 
16881
- cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
16937
+ cur = build_attn(inp,
16938
+ model.layers[il].wo, NULL,
16939
+ Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
16882
16940
  }
16883
16941
 
16884
16942
  cb(cur, "attn_out", il);
@@ -17061,7 +17119,7 @@ struct llm_build_arcee : public llm_graph_context {
17061
17119
  // inp_pos - contains the positions
17062
17120
  ggml_tensor * inp_pos = build_inp_pos();
17063
17121
 
17064
- auto * inp_attn = build_attn_inp_kv_unified();
17122
+ auto * inp_attn = build_attn_inp_kv();
17065
17123
 
17066
17124
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
17067
17125
 
@@ -17125,7 +17183,7 @@ struct llm_build_arcee : public llm_graph_context {
17125
17183
 
17126
17184
  cur = build_attn(inp_attn,
17127
17185
  model.layers[il].wo, model.layers[il].bo,
17128
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
17186
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
17129
17187
  cb(cur, "attn_out", il);
17130
17188
  }
17131
17189
 
@@ -17196,7 +17254,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
17196
17254
  // inp_pos - contains the positions
17197
17255
  ggml_tensor * inp_pos = build_inp_pos();
17198
17256
 
17199
- auto * inp_attn = build_attn_inp_kv_unified();
17257
+ auto * inp_attn = build_attn_inp_kv();
17200
17258
 
17201
17259
  const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
17202
17260
 
@@ -17270,7 +17328,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
17270
17328
 
17271
17329
  cur = build_attn(inp_attn,
17272
17330
  model.layers[il].wo, model.layers[il].bo,
17273
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
17331
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
17274
17332
  cb(cur, "attn_out", il);
17275
17333
  }
17276
17334
 
@@ -17357,7 +17415,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context {
17357
17415
  // inp_pos - contains the positions
17358
17416
  ggml_tensor * inp_pos = build_inp_pos();
17359
17417
 
17360
- auto * inp_attn = build_attn_inp_kv_unified();
17418
+ auto * inp_attn = build_attn_inp_kv();
17361
17419
 
17362
17420
  const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
17363
17421
 
@@ -17430,7 +17488,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context {
17430
17488
 
17431
17489
  cur = build_attn(inp_attn,
17432
17490
  model.layers[il].wo, model.layers[il].bo,
17433
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
17491
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
17434
17492
  cb(cur, "attn_out", il);
17435
17493
  }
17436
17494
 
@@ -17495,7 +17553,7 @@ struct llm_build_smollm3 : public llm_graph_context {
17495
17553
  // inp_pos - contains the positions
17496
17554
  ggml_tensor * inp_pos = build_inp_pos();
17497
17555
 
17498
- auto * inp_attn = build_attn_inp_kv_unified();
17556
+ auto * inp_attn = build_attn_inp_kv();
17499
17557
 
17500
17558
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
17501
17559
 
@@ -17560,7 +17618,7 @@ struct llm_build_smollm3 : public llm_graph_context {
17560
17618
 
17561
17619
  cur = build_attn(inp_attn,
17562
17620
  model.layers[il].wo, model.layers[il].bo,
17563
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
17621
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
17564
17622
  cb(cur, "attn_out", il);
17565
17623
  }
17566
17624
 
@@ -17627,7 +17685,7 @@ struct llm_build_openai_moe_iswa : public llm_graph_context {
17627
17685
  // inp_pos - contains the positions
17628
17686
  ggml_tensor * inp_pos = build_inp_pos();
17629
17687
 
17630
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
17688
+ auto * inp_attn = build_attn_inp_kv_iswa();
17631
17689
 
17632
17690
  for (int il = 0; il < n_layer; ++il) {
17633
17691
  ggml_tensor * inpSA = inpL;
@@ -17682,9 +17740,9 @@ struct llm_build_openai_moe_iswa : public llm_graph_context {
17682
17740
  cb(Kcur, "Kcur", il);
17683
17741
  cb(Vcur, "Vcur", il);
17684
17742
 
17685
- cur = build_attn_with_sinks(inp_attn,
17743
+ cur = build_attn(inp_attn,
17686
17744
  model.layers[il].wo, model.layers[il].bo,
17687
- Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].attn_sinks, 1.0f/sqrtf(float(n_rot)), il);
17745
+ Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
17688
17746
 
17689
17747
  cb(cur, "attn_out", il);
17690
17748
  }
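
[Editor's note: this hunk is the one that motivates the extra argument everywhere else: the separate build_attn_with_sinks helper is gone, and the sinks tensor now rides through the regular build_attn call. Old vs. new call, copied from the lines above; the interpretation of the trailing nullptr as an MLA value projection is an assumption.]

// Before (1.1.8): dedicated helper, sinks passed last among the optional tensors.
cur = build_attn_with_sinks(inp_attn,
        model.layers[il].wo, model.layers[il].bo,
        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].attn_sinks,
        1.0f/sqrtf(float(n_rot)), il);

// After (1.1.10): plain build_attn, sinks moved into the slot before the last
// optional tensor.
cur = build_attn(inp_attn,
        model.layers[il].wo, model.layers[il].bo,
        Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr,
        1.0f/sqrtf(float(n_rot)), il);
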
@@ -17781,8 +17839,7 @@ struct llm_build_lfm2 : public llm_graph_context {
17781
17839
  cb(cur, "model.embedding_norm", -1);
17782
17840
  res->t_embd = cur;
17783
17841
 
17784
- // lm_head is tied with embeddings
17785
- cur = build_lora_mm(model.tok_embd, cur);
17842
+ cur = build_lora_mm(model.output, cur);
17786
17843
  cb(cur, "lm_head", -1);
17787
17844
 
17788
17845
  res->t_logits = cur;
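
[Editor's note: LFM2's LM head is no longer computed from the tied token-embedding matrix; logits now go through a dedicated output tensor, which presumably must be loaded for this arch.]

// Before: lm_head tied to the input embeddings.
cur = build_lora_mm(model.tok_embd, cur);
// After: dedicated output projection.
cur = build_lora_mm(model.output, cur);
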
@@ -17809,10 +17866,10 @@ struct llm_build_lfm2 : public llm_graph_context {
17809
17866
  return cur;
17810
17867
  }
17811
17868
 
17812
- ggml_tensor * build_attn_block(ggml_tensor * cur,
17813
- ggml_tensor * inp_pos,
17814
- llm_graph_input_attn_kv_unified * inp_attn,
17815
- int il) const {
17869
+ ggml_tensor * build_attn_block(ggml_tensor * cur,
17870
+ ggml_tensor * inp_pos,
17871
+ llm_graph_input_attn_kv * inp_attn,
17872
+ int il) const {
17816
17873
  GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
17817
17874
  auto const n_embd_head = hparams.n_embd_head_v;
17818
17875
  auto const n_head_kv = hparams.n_head_kv(il);
@@ -17847,7 +17904,7 @@ struct llm_build_lfm2 : public llm_graph_context {
17847
17904
  );
17848
17905
 
17849
17906
  cur = build_attn(inp_attn, model.layers[il].wo, NULL,
17850
- q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
17907
+ q, k, v, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
17851
17908
 
17852
17909
  cb(cur, "model.layers.{}.self_attn.out_proj", il);
17853
17910
 
@@ -17924,6 +17981,137 @@ struct llm_build_lfm2 : public llm_graph_context {
17924
17981
  }
17925
17982
  };
17926
17983
 
17984
+ struct llm_build_seed_oss : public llm_graph_context {
17985
+ llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
17986
+ const int64_t n_embd_head = hparams.n_embd_head_v;
17987
+
17988
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
17989
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
17990
+
17991
+ ggml_tensor * cur;
17992
+ ggml_tensor * inpL;
17993
+
17994
+ inpL = build_inp_embd(model.tok_embd);
17995
+
17996
+ // inp_pos - contains the positions
17997
+ ggml_tensor * inp_pos = build_inp_pos();
17998
+
17999
+ auto * inp_attn = build_attn_inp_kv();
18000
+
18001
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
18002
+
18003
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
18004
+
18005
+ for (int il = 0; il < n_layer; ++il) {
18006
+ ggml_tensor * inpSA = inpL;
18007
+
18008
+ // norm
18009
+ cur = build_norm(inpL,
18010
+ model.layers[il].attn_norm, NULL,
18011
+ LLM_NORM_RMS, il);
18012
+ cb(cur, "attn_norm", il);
18013
+
18014
+ // self-attention
18015
+ {
18016
+ // compute Q and K and RoPE them
18017
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
18018
+ cb(Qcur, "Qcur", il);
18019
+ if (model.layers[il].bq) {
18020
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
18021
+ cb(Qcur, "Qcur", il);
18022
+ }
18023
+
18024
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
18025
+ cb(Kcur, "Kcur", il);
18026
+ if (model.layers[il].bk) {
18027
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
18028
+ cb(Kcur, "Kcur", il);
18029
+ }
18030
+
18031
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
18032
+ cb(Vcur, "Vcur", il);
18033
+ if (model.layers[il].bv) {
18034
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
18035
+ cb(Vcur, "Vcur", il);
18036
+ }
18037
+
18038
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
18039
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
18040
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
18041
+
18042
+ Qcur = ggml_rope_ext(
18043
+ ctx0, Qcur, inp_pos, nullptr,
18044
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
18045
+ ext_factor, attn_factor, beta_fast, beta_slow
18046
+ );
18047
+
18048
+ Kcur = ggml_rope_ext(
18049
+ ctx0, Kcur, inp_pos, nullptr,
18050
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
18051
+ ext_factor, attn_factor, beta_fast, beta_slow
18052
+ );
18053
+
18054
+ cb(Qcur, "Qcur", il);
18055
+ cb(Kcur, "Kcur", il);
18056
+ cb(Vcur, "Vcur", il);
18057
+
18058
+ cur = build_attn(inp_attn,
18059
+ model.layers[il].wo, model.layers[il].bo,
18060
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
18061
+ cb(cur, "attn_out", il);
18062
+ }
18063
+
18064
+ if (il == n_layer - 1 && inp_out_ids) {
18065
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
18066
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
18067
+ }
18068
+
18069
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
18070
+ cb(ffn_inp, "ffn_inp", il);
18071
+
18072
+ // feed-forward network
18073
+ cur = build_norm(ffn_inp,
18074
+ model.layers[il].attn_post_norm, NULL,
18075
+ LLM_NORM_RMS, il);
18076
+ cb(cur, "attn_post_norm", il);
18077
+
18078
+ cur = build_ffn(cur,
18079
+ model.layers[il].ffn_up, NULL, NULL,
18080
+ model.layers[il].ffn_gate, NULL, NULL,
18081
+ model.layers[il].ffn_down, NULL, NULL,
18082
+ NULL,
18083
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
18084
+ cb(cur, "ffn_out", il);
18085
+
18086
+ cur = ggml_add(ctx0, cur, ffn_inp);
18087
+ cb(cur, "ffn_out", il);
18088
+
18089
+ cur = build_cvec(cur, il);
18090
+ cb(cur, "l_out", il);
18091
+
18092
+ // input for next layer
18093
+ inpL = cur;
18094
+ }
18095
+
18096
+ cur = inpL;
18097
+
18098
+ cur = build_norm(cur,
18099
+ model.output_norm, NULL,
18100
+ LLM_NORM_RMS, -1);
18101
+
18102
+ cb(cur, "result_norm", -1);
18103
+ res->t_embd = cur;
18104
+
18105
+ // lm_head
18106
+ cur = build_lora_mm(model.output, cur);
18107
+
18108
+ cb(cur, "result_output", -1);
18109
+ res->t_logits = cur;
18110
+
18111
+ ggml_build_forward_expand(gf, cur);
18112
+ }
18113
+ };
18114
+
17927
18115
  template <bool iswa>
17928
18116
  struct llm_build_smallthinker : public llm_graph_context{
17929
18117
  llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
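
[Editor's note: the new llm_build_seed_oss graph added above follows a fairly standard decoder recipe; a condensed per-layer outline of what the added code builds (not part of the patch):]

// Per-layer flow of the SEED_OSS builder (condensed):
//   x   = RMSNorm(inpL)                              // attn_norm
//   a   = Attn(rope(Wq x), rope(Wk x), Wv x)          // optional q/k/v biases
//   h   = inpL + a                                    // residual
//   y   = RMSNorm(h)                                  // attn_post_norm
//   out = h + FFN(y)                                  // SILU-gated, parallel up/gate
// followed by a final RMSNorm and the lm_head projection through model.output.
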
@@ -17940,13 +18128,13 @@ struct llm_build_smallthinker : public llm_graph_context{
17940
18128
  // inp_pos - contains the positions
17941
18129
  ggml_tensor * inp_pos = build_inp_pos();
17942
18130
 
17943
- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
18131
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
17944
18132
  inp_attn_type * inp_attn = nullptr;
17945
18133
 
17946
18134
  if constexpr (iswa) {
17947
- inp_attn = build_attn_inp_kv_unified_iswa();
18135
+ inp_attn = build_attn_inp_kv_iswa();
17948
18136
  } else {
17949
- inp_attn = build_attn_inp_kv_unified();
18137
+ inp_attn = build_attn_inp_kv();
17950
18138
  }
17951
18139
 
17952
18140
  ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -17991,7 +18179,7 @@ struct llm_build_smallthinker : public llm_graph_context{
17991
18179
 
17992
18180
  cur = build_attn(inp_attn,
17993
18181
  model.layers[il].wo, model.layers[il].bo,
17994
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
18182
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
17995
18183
  }
17996
18184
 
17997
18185
  if (il == n_layer - 1 && inp_out_ids) {
@@ -18069,14 +18257,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
18069
18257
  if (llm_arch_is_recurrent(arch)) {
18070
18258
  res = new llama_memory_recurrent(
18071
18259
  *this,
18072
- nullptr,
18073
18260
  GGML_TYPE_F32,
18074
18261
  GGML_TYPE_F32,
18075
18262
  cparams.offload_kqv,
18076
18263
  std::max((uint32_t) 1, cparams.n_seq_max),
18077
- cparams.n_seq_max);
18264
+ cparams.n_seq_max,
18265
+ nullptr);
18078
18266
  } else if (llm_arch_is_hybrid(arch)) {
18079
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
18267
+ const auto padding = llama_kv_cache::get_padding(cparams);
18080
18268
 
18081
18269
  cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
18082
18270
 
@@ -18098,7 +18286,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
18098
18286
  /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
18099
18287
  /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
18100
18288
  } else {
18101
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
18289
+ const auto padding = llama_kv_cache::get_padding(cparams);
18102
18290
 
18103
18291
  uint32_t n_ctx_per_stream = cparams.n_ctx;
18104
18292
 
@@ -18115,10 +18303,22 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
18115
18303
 
18116
18304
  LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
18117
18305
 
18306
+ llama_memory_i::layer_reuse_cb reuse = nullptr;
18307
+
18308
+ if (arch == LLM_ARCH_GEMMA3N) {
18309
+ reuse = [&](int32_t il) {
18310
+ if (il >= (int32_t) hparams.n_layer_kv_from_start) {
18311
+ return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
18312
+ }
18313
+
18314
+ return -1;
18315
+ };
18316
+ }
18317
+
18118
18318
  if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
18119
18319
  GGML_ASSERT(hparams.is_swa_any());
18120
18320
 
18121
- res = new llama_kv_cache_unified_iswa(
18321
+ res = new llama_kv_cache_iswa(
18122
18322
  *this,
18123
18323
  params.type_k,
18124
18324
  params.type_v,
@@ -18129,13 +18329,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
18129
18329
  n_ctx_per_stream,
18130
18330
  cparams.n_seq_max,
18131
18331
  cparams.n_ubatch,
18132
- padding);
18332
+ padding,
18333
+ nullptr,
18334
+ reuse);
18133
18335
  } else {
18134
18336
  GGML_ASSERT(!hparams.is_swa_any());
18135
18337
 
18136
- res = new llama_kv_cache_unified(
18338
+ res = new llama_kv_cache(
18137
18339
  *this,
18138
- nullptr,
18139
18340
  params.type_k,
18140
18341
  params.type_v,
18141
18342
  !cparams.flash_attn,
@@ -18145,7 +18346,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
18145
18346
  cparams.n_seq_max,
18146
18347
  padding,
18147
18348
  hparams.n_swa,
18148
- hparams.swa_type);
18349
+ hparams.swa_type,
18350
+ nullptr,
18351
+ nullptr);
18149
18352
  }
18150
18353
  }
18151
18354
  }
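
[Editor's note: the new layer_reuse_cb lets later GEMMA3N layers share KV storage with an earlier layer instead of allocating their own. The callback below is the one added in this hunk, re-annotated with a worked example; the value 20 for n_layer_kv_from_start is hypothetical.]

// Suppose hparams.n_layer_kv_from_start == 20: only the first 20 layers get
// their own KV cells. For il >= 20 the callback returns the layer to reuse:
//   - SWA layers     reuse layer 20 - 2 == 18
//   - non-SWA layers reuse layer 20 - 1 == 19
// Returning -1 (il < 20) means "allocate this layer's KV normally".
llama_memory_i::layer_reuse_cb reuse = [&](int32_t il) {
    if (il >= (int32_t) hparams.n_layer_kv_from_start) {
        return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
    }
    return -1;
};
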
@@ -18462,6 +18665,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
18462
18665
  {
18463
18666
  llm = std::make_unique<llm_build_bailingmoe>(*this, params);
18464
18667
  } break;
18668
+ case LLM_ARCH_SEED_OSS:
18669
+ {
18670
+ llm = std::make_unique<llm_build_seed_oss>(*this, params);
18671
+ } break;
18465
18672
  case LLM_ARCH_DOTS1:
18466
18673
  {
18467
18674
  llm = std::make_unique<llm_build_dots1>(*this, params);
@@ -18520,6 +18727,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
18520
18727
  return llm->res->get_gf();
18521
18728
  }
18522
18729
 
18730
+
18523
18731
  //
18524
18732
  // interface implementation
18525
18733
  //
@@ -18714,6 +18922,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
18714
18922
  case LLM_ARCH_LFM2:
18715
18923
  case LLM_ARCH_SMALLTHINKER:
18716
18924
  case LLM_ARCH_GLM4_MOE:
18925
+ case LLM_ARCH_SEED_OSS:
18717
18926
  return LLAMA_ROPE_TYPE_NEOX;
18718
18927
 
18719
18928
  case LLM_ARCH_QWEN2VL: