@fugood/llama.node 1.1.8 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +9 -2
  3. package/lib/index.ts +57 -30
  4. package/lib/version.js +2 -2
  5. package/lib/version.ts +2 -2
  6. package/package.json +14 -14
  7. package/src/LlamaContext.cpp +11 -0
  8. package/src/llama.cpp/common/arg.cpp +6 -4
  9. package/src/llama.cpp/common/chat.cpp +33 -1
  10. package/src/llama.cpp/common/common.cpp +0 -15
  11. package/src/llama.cpp/common/common.h +1 -2
  12. package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
  13. package/src/llama.cpp/ggml/include/ggml.h +25 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -2
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  20. package/src/llama.cpp/include/llama.h +1 -110
  21. package/src/llama.cpp/src/CMakeLists.txt +2 -2
  22. package/src/llama.cpp/src/llama-arch.cpp +19 -0
  23. package/src/llama.cpp/src/llama-arch.h +1 -0
  24. package/src/llama.cpp/src/llama-chat.cpp +13 -2
  25. package/src/llama.cpp/src/llama-chat.h +1 -0
  26. package/src/llama.cpp/src/llama-context.cpp +5 -192
  27. package/src/llama.cpp/src/llama-context.h +2 -7
  28. package/src/llama.cpp/src/llama-cparams.h +0 -1
  29. package/src/llama.cpp/src/llama-graph.cpp +35 -57
  30. package/src/llama.cpp/src/llama-graph.h +36 -46
  31. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +47 -47
  32. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +26 -26
  33. package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +88 -441
  34. package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +20 -43
  35. package/src/llama.cpp/src/llama-kv-cells.h +21 -21
  36. package/src/llama.cpp/src/llama-memory-hybrid.cpp +5 -5
  37. package/src/llama.cpp/src/llama-memory-hybrid.h +6 -6
  38. package/src/llama.cpp/src/llama-memory-recurrent.h +1 -1
  39. package/src/llama.cpp/src/llama-memory.h +3 -8
  40. package/src/llama.cpp/src/llama-model.cpp +369 -176
  41. package/src/llama.cpp/src/llama-model.h +1 -0
package/src/llama.cpp/src/llama-model.cpp
@@ -6,8 +6,8 @@
 #include "llama-cparams.h"
 #include "llama-model-loader.h"
 
-#include "llama-kv-cache-unified.h"
-#include "llama-kv-cache-unified-iswa.h"
+#include "llama-kv-cache.h"
+#include "llama-kv-cache-iswa.h"
 #include "llama-memory-hybrid.h"
 #include "llama-memory-recurrent.h"
 
@@ -83,6 +83,7 @@ const char * llm_type_name(llm_type type) {
 case LLM_TYPE_32B: return "32B";
 case LLM_TYPE_34B: return "34B";
 case LLM_TYPE_35B: return "35B";
+case LLM_TYPE_36B: return "36B";
 case LLM_TYPE_40B: return "40B";
 case LLM_TYPE_65B: return "65B";
 case LLM_TYPE_70B: return "70B";
@@ -1288,6 +1289,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 default: type = LLM_TYPE_UNKNOWN;
 }
 } break;
+case LLM_ARCH_SEED_OSS:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+switch (hparams.n_layer) {
+case 64: type = LLM_TYPE_36B; break;
+default: type = LLM_TYPE_UNKNOWN;
+}
+} break;
 case LLM_ARCH_OLMOE:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -3967,6 +3976,43 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
 }
 } break;
+case LLM_ARCH_SEED_OSS:
+{
+const uint32_t head_dim = hparams.n_embd_head_k;
+const int64_t n_qo_dim = n_head * head_dim;
+const int64_t n_kv_dim = n_head_kv * head_dim;
+
+tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+// output
+output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+// if output is NULL, init from the input tok embed
+if (output == NULL) {
+output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+}
+
+for (int i = 0; i < n_layer; ++i) {
+auto & layer = layers[i];
+
+layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
+layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
+layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
+layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
+
+layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
+layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
+layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
+
+layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+}
+} break;
+
 case LLM_ARCH_OLMOE:
 {
 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
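The new SEED_OSS tensor block above sizes its attention weights from the head counts rather than n_embd: wq and wo use n_qo_dim = n_head * head_dim, while wk and wv use n_kv_dim = n_head_kv * head_dim, the usual grouped-query split. A self-contained sketch of that arithmetic follows; the head counts in it are purely illustrative, not the real Seed-OSS hyperparameters.

    // Illustrative GQA dimension math (hypothetical numbers, not Seed-OSS values).
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t head_dim  = 128; // stand-in for hparams.n_embd_head_k
        const int64_t  n_head    = 32;  // query heads (illustrative)
        const int64_t  n_head_kv = 8;   // key/value heads (illustrative)

        const int64_t n_qo_dim = n_head    * head_dim; // wq: {n_embd, n_qo_dim}, wo: {n_qo_dim, n_embd}
        const int64_t n_kv_dim = n_head_kv * head_dim; // wk, wv: {n_embd, n_kv_dim}

        std::printf("n_qo_dim = %lld, n_kv_dim = %lld\n",
                    (long long) n_qo_dim, (long long) n_kv_dim);
        return 0;
    }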
@@ -5474,8 +5520,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 } break;
 case LLM_ARCH_LFM2:
 {
-tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+if (output == NULL) {
+output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+}
 
 for (int i = 0; i < n_layer; ++i) {
 auto & layer = layers[i];
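Both the LFM2 change above and the new SEED_OSS block use the same optional-output idiom: the lm_head tensor is loaded with TENSOR_NOT_REQUIRED and, if the checkpoint does not ship one, it is duplicated from the token embedding. A minimal standalone sketch of that fallback follows; load_optional is a hypothetical helper, not the real model-loader API.

    // Tied-output-head fallback, sketched with stand-in types and a hypothetical loader.
    #include <cstdio>

    struct tensor { const char * name; };

    static tensor tok_embd = { "token_embd.weight" };

    static tensor * load_optional(const char * /*name*/) {
        return nullptr; // pretend this checkpoint ships no separate output.weight
    }

    int main() {
        tensor * output = load_optional("output.weight");
        if (output == nullptr) {
            output = &tok_embd; // reuse (tie) the input embedding as the output projection
        }
        std::printf("output head tensor: %s\n", output->name);
        return 0;
    }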
@@ -5986,7 +6037,7 @@ struct llm_build_llama : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn = build_attn_inp_kv_unified();
+auto * inp_attn = build_attn_inp_kv();
 
 const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -6050,7 +6101,7 @@
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
 cb(cur, "attn_out", il);
 }
 
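The single extra nullptr in the build_attn() call above, repeated at every call site in the hunks that follow, corresponds to one new optional tensor argument inserted between the kq_b and v_mla slots (in the MLA path of llm_build_deepseek2, wv_b visibly shifts one position to the right). Judging by upstream llama.cpp around this release the new slot is probably an attention-sinks tensor, but that reading is an assumption. The toy sketch below only illustrates the call-site change; the types and signature are stand-ins, not the real llama.cpp API.

    // Toy stand-ins for the call-site change; not the real llama.cpp signature.
    #include <cstdio>

    struct tensor { float dummy; };

    // new-style parameter order: Q, K, V, kq_b, sinks, v_mla, kq_scale
    static tensor * build_attn(tensor * q, tensor * k, tensor * v,
                               tensor * kq_b, tensor * sinks, tensor * v_mla,
                               float kq_scale) {
        (void) k; (void) v; (void) kq_b; (void) sinks; (void) v_mla; (void) kq_scale;
        return q; // placeholder result
    }

    int main() {
        tensor Q{}, K{}, V{};
        // before: build_attn(Q, K, V, nullptr, nullptr,          kq_scale)
        // after:  build_attn(Q, K, V, nullptr, nullptr, nullptr, kq_scale)
        tensor * out = build_attn(&Q, &K, &V,
                                  /*kq_b=*/nullptr, /*sinks=*/nullptr, /*v_mla=*/nullptr,
                                  0.125f);
        std::printf("ok: %p\n", (void *) out);
        return 0;
    }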
@@ -6146,7 +6197,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
 ggml_tensor * inp_attn_scale = nullptr;
 inp_attn_scale = build_inp_attn_scale();
 
-auto * inp_attn = build_attn_inp_kv_unified_iswa();
+auto * inp_attn = build_attn_inp_kv_iswa();
 
 const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -6224,7 +6275,7 @@
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
 cb(cur, "attn_out", il);
 }
 
@@ -6325,7 +6376,7 @@ struct llm_build_deci : public llm_graph_context {
6325
6376
  // inp_pos - contains the positions
6326
6377
  ggml_tensor * inp_pos = build_inp_pos();
6327
6378
 
6328
- auto * inp_attn = build_attn_inp_kv_unified();
6379
+ auto * inp_attn = build_attn_inp_kv();
6329
6380
 
6330
6381
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
6331
6382
 
@@ -6401,7 +6452,7 @@ struct llm_build_deci : public llm_graph_context {
6401
6452
 
6402
6453
  cur = build_attn(inp_attn,
6403
6454
  model.layers[il].wo, model.layers[il].bo,
6404
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
6455
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
6405
6456
  }
6406
6457
 
6407
6458
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6481,7 +6532,7 @@ struct llm_build_baichuan : public llm_graph_context {
6481
6532
  // inp_pos - contains the positions
6482
6533
  ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
6483
6534
 
6484
- auto * inp_attn = build_attn_inp_kv_unified();
6535
+ auto * inp_attn = build_attn_inp_kv();
6485
6536
 
6486
6537
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6487
6538
 
@@ -6533,7 +6584,7 @@ struct llm_build_baichuan : public llm_graph_context {
6533
6584
 
6534
6585
  cur = build_attn(inp_attn,
6535
6586
  model.layers[il].wo, NULL,
6536
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6587
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6537
6588
  }
6538
6589
 
6539
6590
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6603,7 +6654,7 @@ struct llm_build_xverse : public llm_graph_context {
6603
6654
  // inp_pos - contains the positions
6604
6655
  ggml_tensor * inp_pos = build_inp_pos();
6605
6656
 
6606
- auto * inp_attn = build_attn_inp_kv_unified();
6657
+ auto * inp_attn = build_attn_inp_kv();
6607
6658
 
6608
6659
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6609
6660
 
@@ -6648,7 +6699,7 @@ struct llm_build_xverse : public llm_graph_context {
6648
6699
 
6649
6700
  cur = build_attn(inp_attn,
6650
6701
  model.layers[il].wo, NULL,
6651
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6702
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6652
6703
  }
6653
6704
 
6654
6705
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6717,7 +6768,7 @@ struct llm_build_falcon : public llm_graph_context {
6717
6768
  // inp_pos - contains the positions
6718
6769
  ggml_tensor * inp_pos = build_inp_pos();
6719
6770
 
6720
- auto * inp_attn = build_attn_inp_kv_unified();
6771
+ auto * inp_attn = build_attn_inp_kv();
6721
6772
 
6722
6773
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6723
6774
 
@@ -6771,7 +6822,7 @@ struct llm_build_falcon : public llm_graph_context {
6771
6822
 
6772
6823
  cur = build_attn(inp_attn,
6773
6824
  model.layers[il].wo, NULL,
6774
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6825
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6775
6826
  }
6776
6827
 
6777
6828
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6841,7 +6892,7 @@ struct llm_build_grok : public llm_graph_context {
6841
6892
  // inp_pos - contains the positions
6842
6893
  ggml_tensor * inp_pos = build_inp_pos();
6843
6894
 
6844
- auto * inp_attn = build_attn_inp_kv_unified();
6895
+ auto * inp_attn = build_attn_inp_kv();
6845
6896
 
6846
6897
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6847
6898
 
@@ -6901,7 +6952,7 @@ struct llm_build_grok : public llm_graph_context {
6901
6952
 
6902
6953
  cur = build_attn(inp_attn,
6903
6954
  model.layers[il].wo, model.layers[il].bo,
6904
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
6955
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
6905
6956
  }
6906
6957
 
6907
6958
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7001,7 +7052,7 @@ struct llm_build_dbrx : public llm_graph_context {
7001
7052
  // inp_pos - contains the positions
7002
7053
  ggml_tensor * inp_pos = build_inp_pos();
7003
7054
 
7004
- auto * inp_attn = build_attn_inp_kv_unified();
7055
+ auto * inp_attn = build_attn_inp_kv();
7005
7056
 
7006
7057
  ggml_tensor * inp_out_ids = build_inp_out_ids();
7007
7058
 
@@ -7050,7 +7101,7 @@ struct llm_build_dbrx : public llm_graph_context {
7050
7101
 
7051
7102
  cur = build_attn(inp_attn,
7052
7103
  model.layers[il].wo, NULL,
7053
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7104
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7054
7105
  }
7055
7106
 
7056
7107
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7125,7 +7176,7 @@ struct llm_build_starcoder : public llm_graph_context {
7125
7176
  // inp_pos - contains the positions
7126
7177
  ggml_tensor * inp_pos = build_inp_pos();
7127
7178
 
7128
- auto * inp_attn = build_attn_inp_kv_unified();
7179
+ auto * inp_attn = build_attn_inp_kv();
7129
7180
 
7130
7181
  ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
7131
7182
  cb(pos, "pos_embd", -1);
@@ -7164,7 +7215,7 @@ struct llm_build_starcoder : public llm_graph_context {
7164
7215
 
7165
7216
  cur = build_attn(inp_attn,
7166
7217
  model.layers[il].wo, model.layers[il].bo,
7167
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7218
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7168
7219
  }
7169
7220
 
7170
7221
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7230,7 +7281,7 @@ struct llm_build_refact : public llm_graph_context {
7230
7281
 
7231
7282
  inpL = build_inp_embd(model.tok_embd);
7232
7283
 
7233
- auto * inp_attn = build_attn_inp_kv_unified();
7284
+ auto * inp_attn = build_attn_inp_kv();
7234
7285
 
7235
7286
  ggml_tensor * inp_out_ids = build_inp_out_ids();
7236
7287
 
@@ -7263,7 +7314,7 @@ struct llm_build_refact : public llm_graph_context {
7263
7314
 
7264
7315
  cur = build_attn(inp_attn,
7265
7316
  model.layers[il].wo, NULL,
7266
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7317
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7267
7318
  }
7268
7319
 
7269
7320
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7426,7 +7477,7 @@ struct llm_build_bert : public llm_graph_context {
7426
7477
 
7427
7478
  cur = build_attn(inp_attn,
7428
7479
  model.layers[il].wo, model.layers[il].bo,
7429
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7480
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7430
7481
  cb(cur, "kqv_out", il);
7431
7482
  }
7432
7483
 
@@ -7571,7 +7622,7 @@ struct llm_build_neo_bert : public llm_graph_context {
7571
7622
 
7572
7623
  cur = build_attn(inp_attn,
7573
7624
  model.layers[il].wo, nullptr,
7574
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7625
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7575
7626
  cb(cur, "kqv_out", il);
7576
7627
  }
7577
7628
 
@@ -7632,7 +7683,7 @@ struct llm_build_bloom : public llm_graph_context {
7632
7683
 
7633
7684
  inpL = build_inp_embd(model.tok_embd);
7634
7685
 
7635
- auto * inp_attn = build_attn_inp_kv_unified();
7686
+ auto * inp_attn = build_attn_inp_kv();
7636
7687
 
7637
7688
  inpL = build_norm(inpL,
7638
7689
  model.tok_norm,
@@ -7671,7 +7722,7 @@ struct llm_build_bloom : public llm_graph_context {
7671
7722
 
7672
7723
  cur = build_attn(inp_attn,
7673
7724
  model.layers[il].wo, model.layers[il].bo,
7674
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7725
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7675
7726
  }
7676
7727
 
7677
7728
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7739,7 +7790,7 @@ struct llm_build_mpt : public llm_graph_context {
7739
7790
 
7740
7791
  inpL = build_inp_embd(model.tok_embd);
7741
7792
 
7742
- auto * inp_attn = build_attn_inp_kv_unified();
7793
+ auto * inp_attn = build_attn_inp_kv();
7743
7794
 
7744
7795
  if (model.pos_embd) {
7745
7796
  // inp_pos - contains the positions
@@ -7819,7 +7870,7 @@ struct llm_build_mpt : public llm_graph_context {
7819
7870
 
7820
7871
  cur = build_attn(inp_attn,
7821
7872
  model.layers[il].wo, model.layers[il].bo,
7822
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7873
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7823
7874
  }
7824
7875
 
7825
7876
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7889,7 +7940,7 @@ struct llm_build_stablelm : public llm_graph_context {
7889
7940
  // inp_pos - contains the positions
7890
7941
  ggml_tensor * inp_pos = build_inp_pos();
7891
7942
 
7892
- auto * inp_attn = build_attn_inp_kv_unified();
7943
+ auto * inp_attn = build_attn_inp_kv();
7893
7944
 
7894
7945
  ggml_tensor * inp_out_ids = build_inp_out_ids();
7895
7946
 
@@ -7965,7 +8016,7 @@ struct llm_build_stablelm : public llm_graph_context {
7965
8016
 
7966
8017
  cur = build_attn(inp_attn,
7967
8018
  model.layers[il].wo, NULL,
7968
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8019
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7969
8020
  }
7970
8021
 
7971
8022
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8041,7 +8092,7 @@ struct llm_build_qwen : public llm_graph_context {
8041
8092
  // inp_pos - contains the positions
8042
8093
  ggml_tensor * inp_pos = build_inp_pos();
8043
8094
 
8044
- auto * inp_attn = build_attn_inp_kv_unified();
8095
+ auto * inp_attn = build_attn_inp_kv();
8045
8096
 
8046
8097
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8047
8098
 
@@ -8086,7 +8137,7 @@ struct llm_build_qwen : public llm_graph_context {
8086
8137
 
8087
8138
  cur = build_attn(inp_attn,
8088
8139
  model.layers[il].wo, NULL,
8089
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8140
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8090
8141
  }
8091
8142
 
8092
8143
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8156,7 +8207,7 @@ struct llm_build_qwen2 : public llm_graph_context {
8156
8207
  // inp_pos - contains the positions
8157
8208
  ggml_tensor * inp_pos = build_inp_pos();
8158
8209
 
8159
- auto * inp_attn = build_attn_inp_kv_unified();
8210
+ auto * inp_attn = build_attn_inp_kv();
8160
8211
 
8161
8212
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8162
8213
 
@@ -8206,7 +8257,7 @@ struct llm_build_qwen2 : public llm_graph_context {
8206
8257
 
8207
8258
  cur = build_attn(inp_attn,
8208
8259
  model.layers[il].wo, model.layers[il].bo,
8209
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8260
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8210
8261
  }
8211
8262
 
8212
8263
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8320,8 +8371,9 @@ struct llm_build_dream : public llm_graph_context {
 cb(Kcur, "Kcur", il);
 cb(Vcur, "Vcur", il);
 
-cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr,
-nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+cur = build_attn(inp_attn,
+model.layers[il].wo, model.layers[il].bo,
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -8420,8 +8472,9 @@ struct llm_build_llada : public llm_graph_context {
 cb(Kcur, "Kcur", il);
 cb(Vcur, "Vcur", il);
 
-cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
-1.0f / sqrtf(float(n_embd_head)), il);
+cur = build_attn(inp_attn,
+model.layers[il].wo, NULL,
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -8481,7 +8534,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
8481
8534
  // inp_pos - contains the positions
8482
8535
  ggml_tensor * inp_pos = build_inp_pos();
8483
8536
 
8484
- auto * inp_attn = build_attn_inp_kv_unified();
8537
+ auto * inp_attn = build_attn_inp_kv();
8485
8538
 
8486
8539
  int sections[4];
8487
8540
  std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
@@ -8534,7 +8587,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
8534
8587
 
8535
8588
  cur = build_attn(inp_attn,
8536
8589
  model.layers[il].wo, model.layers[il].bo,
8537
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8590
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8538
8591
  }
8539
8592
 
8540
8593
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8602,7 +8655,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
8602
8655
  // inp_pos - contains the positions
8603
8656
  ggml_tensor * inp_pos = build_inp_pos();
8604
8657
 
8605
- auto * inp_attn = build_attn_inp_kv_unified();
8658
+ auto * inp_attn = build_attn_inp_kv();
8606
8659
 
8607
8660
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8608
8661
 
@@ -8661,7 +8714,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
8661
8714
 
8662
8715
  cur = build_attn(inp_attn,
8663
8716
  model.layers[il].wo, model.layers[il].bo,
8664
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8717
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8665
8718
  }
8666
8719
 
8667
8720
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8761,7 +8814,7 @@ struct llm_build_qwen3 : public llm_graph_context {
8761
8814
  // inp_pos - contains the positions
8762
8815
  ggml_tensor * inp_pos = build_inp_pos();
8763
8816
 
8764
- auto * inp_attn = build_attn_inp_kv_unified();
8817
+ auto * inp_attn = build_attn_inp_kv();
8765
8818
 
8766
8819
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8767
8820
 
@@ -8814,7 +8867,7 @@ struct llm_build_qwen3 : public llm_graph_context {
8814
8867
 
8815
8868
  cur = build_attn(inp_attn,
8816
8869
  model.layers[il].wo, model.layers[il].bo,
8817
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8870
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8818
8871
  }
8819
8872
 
8820
8873
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8882,7 +8935,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
8882
8935
  // inp_pos - contains the positions
8883
8936
  ggml_tensor * inp_pos = build_inp_pos();
8884
8937
 
8885
- auto * inp_attn = build_attn_inp_kv_unified();
8938
+ auto * inp_attn = build_attn_inp_kv();
8886
8939
 
8887
8940
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8888
8941
 
@@ -8935,7 +8988,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
8935
8988
 
8936
8989
  cur = build_attn(inp_attn,
8937
8990
  model.layers[il].wo, model.layers[il].bo,
8938
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8991
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8939
8992
  }
8940
8993
 
8941
8994
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9012,7 +9065,7 @@ struct llm_build_phi2 : public llm_graph_context {
9012
9065
  // inp_pos - contains the positions
9013
9066
  ggml_tensor * inp_pos = build_inp_pos();
9014
9067
 
9015
- auto * inp_attn = build_attn_inp_kv_unified();
9068
+ auto * inp_attn = build_attn_inp_kv();
9016
9069
 
9017
9070
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9018
9071
 
@@ -9075,7 +9128,7 @@ struct llm_build_phi2 : public llm_graph_context {
9075
9128
 
9076
9129
  cur = build_attn(inp_attn,
9077
9130
  model.layers[il].wo, model.layers[il].bo,
9078
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
9131
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
9079
9132
  }
9080
9133
 
9081
9134
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9141,13 +9194,13 @@ struct llm_build_phi3 : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
 inp_attn_type * inp_attn = nullptr;
 
 if constexpr (iswa) {
-inp_attn = build_attn_inp_kv_unified_iswa();
+inp_attn = build_attn_inp_kv_iswa();
 } else {
-inp_attn = build_attn_inp_kv_unified();
+inp_attn = build_attn_inp_kv();
 }
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
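llm_build_phi3 (like llm_build_exaone4 later in this file) keeps one templated graph builder for both the sliding-window and full-context variants, so the rename only touches the two type names fed to std::conditional_t. A small self-contained sketch of that compile-time selection pattern, with toy types standing in for the real llm_graph_input_attn_kv classes:

    // Compile-time input-type selection, sketched with toy stand-in types.
    #include <type_traits>

    struct attn_inp_kv      {};
    struct attn_inp_kv_iswa {};

    template <bool iswa>
    void build_graph() {
        using inp_attn_type = std::conditional_t<iswa, attn_inp_kv_iswa, attn_inp_kv>;
        inp_attn_type * inp_attn = nullptr;

        if constexpr (iswa) {
            static attn_inp_kv_iswa inp; // stands in for build_attn_inp_kv_iswa()
            inp_attn = &inp;
        } else {
            static attn_inp_kv inp;      // stands in for build_attn_inp_kv()
            inp_attn = &inp;
        }
        (void) inp_attn;
    }

    int main() {
        build_graph<true>();
        build_graph<false>();
        return 0;
    }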
@@ -9212,7 +9265,7 @@ struct llm_build_phi3 : public llm_graph_context {
9212
9265
 
9213
9266
  cur = build_attn(inp_attn,
9214
9267
  model.layers[il].wo, model.layers[il].bo,
9215
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
9268
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
9216
9269
  }
9217
9270
 
9218
9271
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9299,7 +9352,7 @@ struct llm_build_plamo : public llm_graph_context {
9299
9352
  // inp_pos - contains the positions
9300
9353
  ggml_tensor * inp_pos = build_inp_pos();
9301
9354
 
9302
- auto * inp_attn = build_attn_inp_kv_unified();
9355
+ auto * inp_attn = build_attn_inp_kv();
9303
9356
 
9304
9357
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9305
9358
 
@@ -9346,7 +9399,7 @@ struct llm_build_plamo : public llm_graph_context {
9346
9399
 
9347
9400
  cur = build_attn(inp_attn,
9348
9401
  model.layers[il].wo, NULL,
9349
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9402
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9350
9403
  }
9351
9404
 
9352
9405
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9415,7 +9468,7 @@ struct llm_build_gpt2 : public llm_graph_context {
9415
9468
  // inp_pos - contains the positions
9416
9469
  ggml_tensor * inp_pos = build_inp_pos();
9417
9470
 
9418
- auto * inp_attn = build_attn_inp_kv_unified();
9471
+ auto * inp_attn = build_attn_inp_kv();
9419
9472
 
9420
9473
  pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
9421
9474
  cb(pos, "pos_embd", -1);
@@ -9454,7 +9507,7 @@ struct llm_build_gpt2 : public llm_graph_context {
9454
9507
 
9455
9508
  cur = build_attn(inp_attn,
9456
9509
  model.layers[il].wo, model.layers[il].bo,
9457
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9510
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9458
9511
  }
9459
9512
 
9460
9513
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9525,7 +9578,7 @@ struct llm_build_codeshell : public llm_graph_context {
9525
9578
  // inp_pos - contains the positions
9526
9579
  ggml_tensor * inp_pos = build_inp_pos();
9527
9580
 
9528
- auto * inp_attn = build_attn_inp_kv_unified();
9581
+ auto * inp_attn = build_attn_inp_kv();
9529
9582
 
9530
9583
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9531
9584
 
@@ -9568,7 +9621,7 @@ struct llm_build_codeshell : public llm_graph_context {
9568
9621
 
9569
9622
  cur = build_attn(inp_attn,
9570
9623
  model.layers[il].wo, model.layers[il].bo,
9571
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9624
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9572
9625
  }
9573
9626
 
9574
9627
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9638,7 +9691,7 @@ struct llm_build_orion : public llm_graph_context {
9638
9691
  // inp_pos - contains the positions
9639
9692
  ggml_tensor * inp_pos = build_inp_pos();
9640
9693
 
9641
- auto * inp_attn = build_attn_inp_kv_unified();
9694
+ auto * inp_attn = build_attn_inp_kv();
9642
9695
 
9643
9696
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9644
9697
 
@@ -9697,7 +9750,7 @@ struct llm_build_orion : public llm_graph_context {
9697
9750
 
9698
9751
  cur = build_attn(inp_attn,
9699
9752
  model.layers[il].wo, NULL,
9700
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9753
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9701
9754
  }
9702
9755
 
9703
9756
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9765,7 +9818,7 @@ struct llm_build_internlm2 : public llm_graph_context {
9765
9818
  // inp_pos - contains the positions
9766
9819
  ggml_tensor * inp_pos = build_inp_pos();
9767
9820
 
9768
- auto * inp_attn = build_attn_inp_kv_unified();
9821
+ auto * inp_attn = build_attn_inp_kv();
9769
9822
 
9770
9823
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9771
9824
 
@@ -9824,7 +9877,7 @@ struct llm_build_internlm2 : public llm_graph_context {
9824
9877
 
9825
9878
  cur = build_attn(inp_attn,
9826
9879
  model.layers[il].wo, model.layers[il].bo,
9827
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9880
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9828
9881
  }
9829
9882
 
9830
9883
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9901,7 +9954,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
9901
9954
  // inp_pos - contains the positions
9902
9955
  ggml_tensor * inp_pos = build_inp_pos();
9903
9956
 
9904
- auto * inp_attn = build_attn_inp_kv_unified();
9957
+ auto * inp_attn = build_attn_inp_kv();
9905
9958
 
9906
9959
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9907
9960
 
@@ -10012,7 +10065,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
10012
10065
 
10013
10066
  cur = build_attn(inp_attn,
10014
10067
  model.layers[il].wo, NULL,
10015
- q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
10068
+ q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
10016
10069
  }
10017
10070
 
10018
10071
  if (il == n_layer - 1 && inp_out_ids) {
@@ -10096,7 +10149,7 @@ struct llm_build_gemma : public llm_graph_context {
10096
10149
  // inp_pos - contains the positions
10097
10150
  ggml_tensor * inp_pos = build_inp_pos();
10098
10151
 
10099
- auto * inp_attn = build_attn_inp_kv_unified();
10152
+ auto * inp_attn = build_attn_inp_kv();
10100
10153
 
10101
10154
  ggml_tensor * inp_out_ids = build_inp_out_ids();
10102
10155
 
@@ -10142,7 +10195,7 @@ struct llm_build_gemma : public llm_graph_context {
10142
10195
 
10143
10196
  cur = build_attn(inp_attn,
10144
10197
  model.layers[il].wo, NULL,
10145
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
10198
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
10146
10199
  }
10147
10200
 
10148
10201
  if (il == n_layer - 1 && inp_out_ids) {
@@ -10212,7 +10265,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
10212
10265
  // inp_pos - contains the positions
10213
10266
  ggml_tensor * inp_pos = build_inp_pos();
10214
10267
 
10215
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
10268
+ auto * inp_attn = build_attn_inp_kv_iswa();
10216
10269
 
10217
10270
  ggml_tensor * inp_out_ids = build_inp_out_ids();
10218
10271
 
@@ -10257,7 +10310,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
10257
10310
 
10258
10311
  cur = build_attn(inp_attn,
10259
10312
  model.layers[il].wo, NULL,
10260
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
10313
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
10261
10314
  }
10262
10315
 
10263
10316
  if (il == n_layer - 1 && inp_out_ids) {
@@ -10346,7 +10399,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
10346
10399
  ggml_tensor * inp_pos = build_inp_pos();
10347
10400
 
10348
10401
  // TODO: is causal == true correct? might need some changes
10349
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
10402
+ auto * inp_attn = build_attn_inp_kv_iswa();
10350
10403
 
10351
10404
  ggml_tensor * inp_out_ids = build_inp_out_ids();
10352
10405
 
@@ -10399,7 +10452,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
10399
10452
 
10400
10453
  cur = build_attn(inp_attn,
10401
10454
  model.layers[il].wo, NULL,
10402
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
10455
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
10403
10456
  }
10404
10457
 
10405
10458
  if (il == n_layer - 1 && inp_out_ids) {
@@ -10497,7 +10550,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
10497
10550
  ggml_tensor * inp_pos = build_inp_pos();
10498
10551
 
10499
10552
  // TODO: is causal == true correct? might need some changes
10500
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
10553
+ auto * inp_attn = build_attn_inp_kv_iswa();
10501
10554
 
10502
10555
  // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
10503
10556
  ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
@@ -10580,7 +10633,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
10580
10633
 
10581
10634
  cur = build_attn(inp_attn,
10582
10635
  model.layers[il].wo, NULL,
10583
- Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
10636
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
10584
10637
  } else {
10585
10638
  // no KV layers
10586
10639
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -10598,7 +10651,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
10598
10651
 
10599
10652
  cur = build_attn(inp_attn,
10600
10653
  model.layers[il].wo, NULL,
10601
- Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
10654
+ Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
10602
10655
  }
10603
10656
 
10604
10657
  cur = build_norm(cur,
@@ -10904,7 +10957,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
10904
10957
  // inp_pos - contains the positions
10905
10958
  ggml_tensor * inp_pos = build_inp_pos();
10906
10959
 
10907
- auto * inp_attn = build_attn_inp_kv_unified();
10960
+ auto * inp_attn = build_attn_inp_kv();
10908
10961
 
10909
10962
  ggml_tensor * inp_out_ids = build_inp_out_ids();
10910
10963
 
@@ -10963,7 +11016,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
10963
11016
 
10964
11017
  cur = build_attn(inp_attn,
10965
11018
  model.layers[il].wo, model.layers[il].bo,
10966
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11019
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10967
11020
  }
10968
11021
 
10969
11022
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11390,7 +11443,9 @@ struct llm_build_jamba : public llm_graph_context_mamba {
 cb(Vcur, "Vcur", il);
 
 // No RoPE :)
-cur = build_attn(inp_hybrid->get_attn(), model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
+cur = build_attn(inp_hybrid->get_attn(),
+model.layers[il].wo, NULL,
+Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -11473,7 +11528,7 @@ struct llm_build_command_r : public llm_graph_context {
11473
11528
  // inp_pos - contains the positions
11474
11529
  ggml_tensor * inp_pos = build_inp_pos();
11475
11530
 
11476
- auto * inp_attn = build_attn_inp_kv_unified();
11531
+ auto * inp_attn = build_attn_inp_kv();
11477
11532
 
11478
11533
  ggml_tensor * inp_out_ids = build_inp_out_ids();
11479
11534
 
@@ -11548,7 +11603,7 @@ struct llm_build_command_r : public llm_graph_context {
11548
11603
 
11549
11604
  cur = build_attn(inp_attn,
11550
11605
  model.layers[il].wo, model.layers[il].bo,
11551
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11606
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11552
11607
  }
11553
11608
 
11554
11609
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11620,7 +11675,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
11620
11675
  // inp_pos - contains the positions
11621
11676
  ggml_tensor * inp_pos = build_inp_pos();
11622
11677
 
11623
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
11678
+ auto * inp_attn = build_attn_inp_kv_iswa();
11624
11679
 
11625
11680
  ggml_tensor * inp_out_ids = build_inp_out_ids();
11626
11681
 
@@ -11683,7 +11738,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
11683
11738
 
11684
11739
  cur = build_attn(inp_attn,
11685
11740
  model.layers[il].wo, model.layers[il].bo,
11686
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11741
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11687
11742
  }
11688
11743
 
11689
11744
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11755,7 +11810,7 @@ struct llm_build_olmo : public llm_graph_context {
11755
11810
  // inp_pos - contains the positions
11756
11811
  ggml_tensor * inp_pos = build_inp_pos();
11757
11812
 
11758
- auto * inp_attn = build_attn_inp_kv_unified();
11813
+ auto * inp_attn = build_attn_inp_kv();
11759
11814
 
11760
11815
  ggml_tensor * inp_out_ids = build_inp_out_ids();
11761
11816
 
@@ -11814,7 +11869,7 @@ struct llm_build_olmo : public llm_graph_context {
11814
11869
 
11815
11870
  cur = build_attn(inp_attn,
11816
11871
  model.layers[il].wo, nullptr,
11817
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11872
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11818
11873
  }
11819
11874
 
11820
11875
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11883,7 +11938,7 @@ struct llm_build_olmo2 : public llm_graph_context {
11883
11938
  // inp_pos - contains the positions
11884
11939
  ggml_tensor * inp_pos = build_inp_pos();
11885
11940
 
11886
- auto * inp_attn = build_attn_inp_kv_unified();
11941
+ auto * inp_attn = build_attn_inp_kv();
11887
11942
 
11888
11943
  ggml_tensor * inp_out_ids = build_inp_out_ids();
11889
11944
 
@@ -11934,7 +11989,7 @@ struct llm_build_olmo2 : public llm_graph_context {
11934
11989
 
11935
11990
  cur = build_attn(inp_attn,
11936
11991
  model.layers[il].wo, NULL,
11937
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11992
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11938
11993
  }
11939
11994
 
11940
11995
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12012,7 +12067,7 @@ struct llm_build_olmoe : public llm_graph_context {
12012
12067
  // inp_pos - contains the positions
12013
12068
  ggml_tensor * inp_pos = build_inp_pos();
12014
12069
 
12015
- auto * inp_attn = build_attn_inp_kv_unified();
12070
+ auto * inp_attn = build_attn_inp_kv();
12016
12071
 
12017
12072
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12018
12073
 
@@ -12067,7 +12122,7 @@ struct llm_build_olmoe : public llm_graph_context {
12067
12122
 
12068
12123
  cur = build_attn(inp_attn,
12069
12124
  model.layers[il].wo, NULL,
12070
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12125
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12071
12126
  }
12072
12127
 
12073
12128
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12138,7 +12193,7 @@ struct llm_build_openelm : public llm_graph_context {
12138
12193
  // inp_pos - contains the positions
12139
12194
  ggml_tensor * inp_pos = build_inp_pos();
12140
12195
 
12141
- auto * inp_attn = build_attn_inp_kv_unified();
12196
+ auto * inp_attn = build_attn_inp_kv();
12142
12197
 
12143
12198
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12144
12199
 
@@ -12200,7 +12255,7 @@ struct llm_build_openelm : public llm_graph_context {
12200
12255
 
12201
12256
  cur = build_attn(inp_attn,
12202
12257
  model.layers[il].wo, NULL,
12203
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12258
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12204
12259
  }
12205
12260
 
12206
12261
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12269,7 +12324,7 @@ struct llm_build_gptneox : public llm_graph_context {
12269
12324
  // inp_pos - contains the positions
12270
12325
  ggml_tensor * inp_pos = build_inp_pos();
12271
12326
 
12272
- auto * inp_attn = build_attn_inp_kv_unified();
12327
+ auto * inp_attn = build_attn_inp_kv();
12273
12328
 
12274
12329
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12275
12330
 
@@ -12312,7 +12367,7 @@ struct llm_build_gptneox : public llm_graph_context {
12312
12367
 
12313
12368
  cur = build_attn(inp_attn,
12314
12369
  model.layers[il].wo, model.layers[il].bo,
12315
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12370
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12316
12371
  }
12317
12372
 
12318
12373
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12415,7 +12470,7 @@ struct llm_build_arctic : public llm_graph_context {
12415
12470
  // inp_pos - contains the positions
12416
12471
  ggml_tensor * inp_pos = build_inp_pos();
12417
12472
 
12418
- auto * inp_attn = build_attn_inp_kv_unified();
12473
+ auto * inp_attn = build_attn_inp_kv();
12419
12474
 
12420
12475
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12421
12476
 
@@ -12462,7 +12517,7 @@ struct llm_build_arctic : public llm_graph_context {
12462
12517
 
12463
12518
  cur = build_attn(inp_attn,
12464
12519
  model.layers[il].wo, NULL,
12465
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12520
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12466
12521
  }
12467
12522
 
12468
12523
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12553,7 +12608,7 @@ struct llm_build_deepseek : public llm_graph_context {
12553
12608
  // inp_pos - contains the positions
12554
12609
  ggml_tensor * inp_pos = build_inp_pos();
12555
12610
 
12556
- auto * inp_attn = build_attn_inp_kv_unified();
12611
+ auto * inp_attn = build_attn_inp_kv();
12557
12612
 
12558
12613
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
12559
12614
 
@@ -12617,7 +12672,7 @@ struct llm_build_deepseek : public llm_graph_context {
12617
12672
 
12618
12673
  cur = build_attn(inp_attn,
12619
12674
  model.layers[il].wo, model.layers[il].bo,
12620
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
12675
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
12621
12676
  }
12622
12677
 
12623
12678
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12730,7 +12785,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
12730
12785
  // inp_pos - contains the positions
12731
12786
  ggml_tensor * inp_pos = build_inp_pos();
12732
12787
 
12733
- auto * inp_attn = build_attn_inp_kv_unified();
12788
+ auto * inp_attn = build_attn_inp_kv();
12734
12789
 
12735
12790
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12736
12791
 
@@ -12845,7 +12900,7 @@
 // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
 } else {
 ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
 cb(kv, "kv", il);
@@ -12879,7 +12934,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
12879
12934
  // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
12880
12935
  cur = build_attn(inp_attn,
12881
12936
  model.layers[il].wo, NULL,
12882
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
12937
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
12883
12938
  }
12884
12939
  }
12885
12940
 
@@ -12977,7 +13032,7 @@ struct llm_build_bitnet : public llm_graph_context {
12977
13032
  // inp_pos - contains the positions
12978
13033
  ggml_tensor * inp_pos = build_inp_pos();
12979
13034
 
12980
- auto * inp_attn = build_attn_inp_kv_unified();
13035
+ auto * inp_attn = build_attn_inp_kv();
12981
13036
 
12982
13037
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12983
13038
 
@@ -13046,7 +13101,7 @@ struct llm_build_bitnet : public llm_graph_context {
13046
13101
 
13047
13102
  cur = build_attn(inp_attn,
13048
13103
  NULL, NULL,
13049
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13104
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13050
13105
 
13051
13106
  cur = build_norm(cur,
13052
13107
  model.layers[il].attn_sub_norm, NULL,
@@ -13169,7 +13224,7 @@ struct llm_build_t5_enc : public llm_graph_context {
13169
13224
 
13170
13225
  cur = build_attn(inp_attn,
13171
13226
  model.layers[il].wo_enc, nullptr,
13172
- Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
13227
+ Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
13173
13228
  cb(cur, "kqv_out", il);
13174
13229
  }
13175
13230
 
@@ -13241,7 +13296,7 @@ struct llm_build_t5_dec : public llm_graph_context {
13241
13296
 
13242
13297
  const int64_t n_outputs_enc = embd_enc->ne[1];
13243
13298
 
13244
- auto * inp_attn_self = build_attn_inp_kv_unified();
13299
+ auto * inp_attn_self = build_attn_inp_kv();
13245
13300
  auto * inp_attn_cross = build_attn_inp_cross();
13246
13301
 
13247
13302
  ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -13275,7 +13330,7 @@ struct llm_build_t5_dec : public llm_graph_context {
13275
13330
 
13276
13331
  cur = build_attn(inp_attn_self,
13277
13332
  model.layers[il].wo, model.layers[il].bo,
13278
- Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
13333
+ Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
13279
13334
  cb(cur, "kqv_out", il);
13280
13335
  }
13281
13336
 
@@ -13307,7 +13362,7 @@ struct llm_build_t5_dec : public llm_graph_context {
13307
13362
 
13308
13363
  cur = build_attn(inp_attn_cross,
13309
13364
  model.layers[il].wo_cross, nullptr,
13310
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
13365
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
13311
13366
  cb(cur, "kqv_out", il);
13312
13367
 
13313
13368
  //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -13406,7 +13461,7 @@ struct llm_build_jais : public llm_graph_context {
13406
13461
 
13407
13462
  inpL = build_inp_embd(model.tok_embd);
13408
13463
 
13409
- auto * inp_attn = build_attn_inp_kv_unified();
13464
+ auto * inp_attn = build_attn_inp_kv();
13410
13465
 
13411
13466
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13412
13467
 
@@ -13439,7 +13494,7 @@ struct llm_build_jais : public llm_graph_context {
13439
13494
 
13440
13495
  cur = build_attn(inp_attn,
13441
13496
  model.layers[il].wo, model.layers[il].bo,
13442
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
13497
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
13443
13498
  }
13444
13499
 
13445
13500
  if (il == n_layer - 1 && inp_out_ids) {
@@ -13504,7 +13559,7 @@ struct llm_build_chatglm : public llm_graph_context {
13504
13559
  // inp_pos - contains the positions
13505
13560
  ggml_tensor * inp_pos = build_inp_pos();
13506
13561
 
13507
- auto * inp_attn = build_attn_inp_kv_unified();
13562
+ auto * inp_attn = build_attn_inp_kv();
13508
13563
 
13509
13564
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13510
13565
 
@@ -13571,7 +13626,7 @@ struct llm_build_chatglm : public llm_graph_context {
13571
13626
 
13572
13627
  cur = build_attn(inp_attn,
13573
13628
  model.layers[il].wo, NULL,
13574
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13629
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13575
13630
  }
13576
13631
 
13577
13632
  if (il == n_layer - 1 && inp_out_ids) {
@@ -13637,7 +13692,7 @@ struct llm_build_glm4 : public llm_graph_context {
13637
13692
  // inp_pos - contains the positions
13638
13693
  ggml_tensor * inp_pos = build_inp_pos();
13639
13694
 
13640
- auto * inp_attn = build_attn_inp_kv_unified();
13695
+ auto * inp_attn = build_attn_inp_kv();
13641
13696
 
13642
13697
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13643
13698
 
@@ -13704,7 +13759,7 @@ struct llm_build_glm4 : public llm_graph_context {
13704
13759
 
13705
13760
  cur = build_attn(inp_attn,
13706
13761
  model.layers[il].wo, NULL,
13707
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13762
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13708
13763
  }
13709
13764
 
13710
13765
  if (il == n_layer - 1 && inp_out_ids) {
@@ -13787,7 +13842,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
13787
13842
  // inp_pos - contains the positions
13788
13843
  ggml_tensor * inp_pos = build_inp_pos();
13789
13844
 
13790
- auto * inp_attn = build_attn_inp_kv_unified();
13845
+ auto * inp_attn = build_attn_inp_kv();
13791
13846
 
13792
13847
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13793
13848
 
@@ -13853,7 +13908,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
13853
13908
 
13854
13909
  cur = build_attn(inp_attn,
13855
13910
  model.layers[il].wo, NULL,
13856
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13911
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13857
13912
  }
13858
13913
 
13859
13914
  if (il == n_transformer_layers - 1 && inp_out_ids) {
@@ -13947,7 +14002,7 @@ struct llm_build_nemotron : public llm_graph_context {
13947
14002
  // inp_pos - contains the positions
13948
14003
  ggml_tensor * inp_pos = build_inp_pos();
13949
14004
 
13950
- auto * inp_attn = build_attn_inp_kv_unified();
14005
+ auto * inp_attn = build_attn_inp_kv();
13951
14006
 
13952
14007
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13953
14008
 
@@ -14007,7 +14062,7 @@ struct llm_build_nemotron : public llm_graph_context {
14007
14062
 
14008
14063
  cur = build_attn(inp_attn,
14009
14064
  model.layers[il].wo, model.layers[il].bo,
14010
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14065
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14011
14066
  }
14012
14067
 
14013
14068
  if (il == n_layer - 1 && inp_out_ids) {
@@ -14076,7 +14131,7 @@ struct llm_build_exaone : public llm_graph_context {
14076
14131
  // inp_pos - contains the positions
14077
14132
  ggml_tensor * inp_pos = build_inp_pos();
14078
14133
 
14079
- auto * inp_attn = build_attn_inp_kv_unified();
14134
+ auto * inp_attn = build_attn_inp_kv();
14080
14135
 
14081
14136
  ggml_tensor * inp_out_ids = build_inp_out_ids();
14082
14137
 
@@ -14138,7 +14193,7 @@ struct llm_build_exaone : public llm_graph_context {
14138
14193
 
14139
14194
  cur = build_attn(inp_attn,
14140
14195
  model.layers[il].wo, model.layers[il].bo,
14141
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14196
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14142
14197
  }
14143
14198
 
14144
14199
  if (il == n_layer - 1 && inp_out_ids) {
@@ -14208,13 +14263,13 @@ struct llm_build_exaone4 : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
 inp_attn_type * inp_attn = nullptr;
 
 if constexpr (iswa) {
-inp_attn = build_attn_inp_kv_unified_iswa();
+inp_attn = build_attn_inp_kv_iswa();
 } else {
-inp_attn = build_attn_inp_kv_unified();
+inp_attn = build_attn_inp_kv();
 }
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -14269,7 +14324,7 @@ struct llm_build_exaone4 : public llm_graph_context {
14269
14324
 
14270
14325
  cur = build_attn(inp_attn,
14271
14326
  model.layers[il].wo, NULL,
14272
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14327
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14273
14328
  cb(cur, "attn_out", il);
14274
14329
  }
14275
14330
 
@@ -15097,7 +15152,7 @@ struct llm_build_granite : public llm_graph_context {
15097
15152
  inp_pos = build_inp_pos();
15098
15153
  }
15099
15154
 
15100
- auto * inp_attn = build_attn_inp_kv_unified();
15155
+ auto * inp_attn = build_attn_inp_kv();
15101
15156
 
15102
15157
  ggml_tensor * inp_out_ids = build_inp_out_ids();
15103
15158
 
@@ -15148,12 +15203,12 @@
 }
 
 ggml_tensor * build_attention_layer(
-ggml_tensor * cur,
-ggml_tensor * inp_pos,
-llm_graph_input_attn_kv_unified * inp_attn,
-const llama_model & model,
-const int64_t n_embd_head,
-const int il) {
+ggml_tensor * cur,
+ggml_tensor * inp_pos,
+llm_graph_input_attn_kv * inp_attn,
+const llama_model & model,
+const int64_t n_embd_head,
+const int il) {
 
 // compute Q and K and (optionally) RoPE them
 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15204,7 +15259,7 @@ struct llm_build_granite : public llm_graph_context {
15204
15259
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
15205
15260
  cur = build_attn(inp_attn,
15206
15261
  model.layers[il].wo, model.layers[il].bo,
15207
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15262
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
15208
15263
  cb(cur, "attn_out", il);
15209
15264
  return cur;
15210
15265
  }
@@ -15367,12 +15422,12 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
 }
 
 ggml_tensor * build_attention_layer(
-ggml_tensor * cur,
-ggml_tensor * inp_pos,
-llm_graph_input_attn_kv_unified * inp_attn,
-const llama_model & model,
-const int64_t n_embd_head,
-const int il) {
+ggml_tensor * cur,
+ggml_tensor * inp_pos,
+llm_graph_input_attn_kv * inp_attn,
+const llama_model & model,
+const int64_t n_embd_head,
+const int il) {
 
 // compute Q and K and (optionally) RoPE them
 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15423,7 +15478,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
15423
15478
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
15424
15479
  cur = build_attn(inp_attn,
15425
15480
  model.layers[il].wo, model.layers[il].bo,
15426
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15481
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
15427
15482
  cb(cur, "attn_out", il);
15428
15483
  return cur;
15429
15484
  }
@@ -15529,7 +15584,7 @@ struct llm_build_chameleon : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();
 
  ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -15608,7 +15663,7 @@ struct llm_build_chameleon : public llm_graph_context {
 
  cur = build_attn(inp_attn,
  model.layers[il].wo, nullptr,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }
 
  if (il == n_layer - 1 && inp_out_ids) {
@@ -15860,7 +15915,7 @@ struct llm_build_plm : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();
 
  ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -15964,7 +16019,7 @@ struct llm_build_plm : public llm_graph_context {
 
  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
+ q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
  }
 
  if (il == n_layer - 1 && inp_out_ids) {
@@ -16025,7 +16080,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();
 
  ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -16087,7 +16142,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
 
  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
  }
 
  if (il == n_layer - 1 && inp_out_ids) {
@@ -16174,7 +16229,7 @@ struct llm_build_dots1 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();
 
  ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -16227,7 +16282,7 @@ struct llm_build_dots1 : public llm_graph_context {
 
  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }
 
  if (il == n_layer - 1 && inp_out_ids) {
@@ -16324,7 +16379,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();
 
  for (int il = 0; il < n_layer; ++il) {
  ggml_tensor * inpSA = inpL;
@@ -16382,7 +16437,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
 
  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }
 
  if (il == n_layer - 1) {
@@ -16454,7 +16509,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();
 
  ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -16515,7 +16570,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
 
  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  cb(cur, "attn_out", il);
  }
 
@@ -16668,7 +16723,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba {
 
  ggml_tensor * attn_out = build_attn(inp->get_attn(),
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  cb(attn_out, "attn_out", il);
 
  cur = build_norm(inpL,
@@ -16828,7 +16883,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
  private:
  ggml_tensor * build_plamo2_attn_layer(
- llm_graph_input_attn_kv_unified * inp,
+ llm_graph_input_attn_kv * inp,
  ggml_tensor * inp_pos,
  ggml_tensor * cur,
  const llama_model & model,
@@ -16878,7 +16933,9 @@ private:
  ext_factor, attn_factor, beta_fast, beta_slow
  );
 
- cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
+ cur = build_attn(inp,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
  }
 
  cb(cur, "attn_out", il);
@@ -17061,7 +17118,7 @@ struct llm_build_arcee : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();
 
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -17125,7 +17182,7 @@ struct llm_build_arcee : public llm_graph_context {
 
  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  cb(cur, "attn_out", il);
  }
 
@@ -17196,7 +17253,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();
 
  const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
 
@@ -17270,7 +17327,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
 
  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  cb(cur, "attn_out", il);
  }
 
@@ -17357,7 +17414,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();
 
  const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
 
@@ -17430,7 +17487,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context {
 
  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  cb(cur, "attn_out", il);
  }
 
@@ -17495,7 +17552,7 @@ struct llm_build_smollm3 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();
 
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -17560,7 +17617,7 @@ struct llm_build_smollm3 : public llm_graph_context {
 
  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  cb(cur, "attn_out", il);
  }
 
@@ -17627,7 +17684,7 @@ struct llm_build_openai_moe_iswa : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
+ auto * inp_attn = build_attn_inp_kv_iswa();
 
  for (int il = 0; il < n_layer; ++il) {
  ggml_tensor * inpSA = inpL;
@@ -17682,9 +17739,9 @@ struct llm_build_openai_moe_iswa : public llm_graph_context {
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);
 
- cur = build_attn_with_sinks(inp_attn,
+ cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].attn_sinks, 1.0f/sqrtf(float(n_rot)), il);
+ Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
 
  cb(cur, "attn_out", il);
  }
@@ -17781,8 +17838,7 @@ struct llm_build_lfm2 : public llm_graph_context {
  cb(cur, "model.embedding_norm", -1);
  res->t_embd = cur;
 
- // lm_head is tied with embeddings
- cur = build_lora_mm(model.tok_embd, cur);
+ cur = build_lora_mm(model.output, cur);
  cb(cur, "lm_head", -1);
 
  res->t_logits = cur;
@@ -17809,10 +17865,10 @@ struct llm_build_lfm2 : public llm_graph_context {
  return cur;
  }
 
- ggml_tensor * build_attn_block(ggml_tensor * cur,
- ggml_tensor * inp_pos,
- llm_graph_input_attn_kv_unified * inp_attn,
- int il) const {
+ ggml_tensor * build_attn_block(ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ llm_graph_input_attn_kv * inp_attn,
+ int il) const {
  GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
  auto const n_embd_head = hparams.n_embd_head_v;
  auto const n_head_kv = hparams.n_head_kv(il);
@@ -17847,7 +17903,7 @@ struct llm_build_lfm2 : public llm_graph_context {
  );
 
  cur = build_attn(inp_attn, model.layers[il].wo, NULL,
- q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ q, k, v, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 
  cb(cur, "model.layers.{}.self_attn.out_proj", il);
 
@@ -17924,6 +17980,137 @@ struct llm_build_lfm2 : public llm_graph_context {
  }
  };
 
+ struct llm_build_seed_oss : public llm_graph_context {
+ llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = build_norm(ffn_inp,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  template <bool iswa>
  struct llm_build_smallthinker : public llm_graph_context{
  llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
@@ -17940,13 +18127,13 @@ struct llm_build_smallthinker : public llm_graph_context{
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
  inp_attn_type * inp_attn = nullptr;
 
  if constexpr (iswa) {
- inp_attn = build_attn_inp_kv_unified_iswa();
+ inp_attn = build_attn_inp_kv_iswa();
  } else {
- inp_attn = build_attn_inp_kv_unified();
+ inp_attn = build_attn_inp_kv();
  }
 
  ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -17991,7 +18178,7 @@ struct llm_build_smallthinker : public llm_graph_context{
 
  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
  }
 
  if (il == n_layer - 1 && inp_out_ids) {
@@ -18076,7 +18263,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  std::max((uint32_t) 1, cparams.n_seq_max),
  cparams.n_seq_max);
  } else if (llm_arch_is_hybrid(arch)) {
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
+ const auto padding = llama_kv_cache::get_padding(cparams);
 
  cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
 
@@ -18098,7 +18285,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
  /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
  } else {
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
+ const auto padding = llama_kv_cache::get_padding(cparams);
 
  uint32_t n_ctx_per_stream = cparams.n_ctx;
 
@@ -18118,7 +18305,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
  GGML_ASSERT(hparams.is_swa_any());
 
- res = new llama_kv_cache_unified_iswa(
+ res = new llama_kv_cache_iswa(
  *this,
  params.type_k,
  params.type_v,
@@ -18133,7 +18320,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  } else {
  GGML_ASSERT(!hparams.is_swa_any());
 
- res = new llama_kv_cache_unified(
+ res = new llama_kv_cache(
  *this,
  nullptr,
  params.type_k,
@@ -18462,6 +18649,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_bailingmoe>(*this, params);
  } break;
+ case LLM_ARCH_SEED_OSS:
+ {
+ llm = std::make_unique<llm_build_seed_oss>(*this, params);
+ } break;
  case LLM_ARCH_DOTS1:
  {
  llm = std::make_unique<llm_build_dots1>(*this, params);
@@ -18520,6 +18711,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  return llm->res->get_gf();
  }
 
+
  //
  // interface implementation
  //
@@ -18714,6 +18906,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_LFM2:
  case LLM_ARCH_SMALLTHINKER:
  case LLM_ARCH_GLM4_MOE:
+ case LLM_ARCH_SEED_OSS:
  return LLAMA_ROPE_TYPE_NEOX;
 
  case LLM_ARCH_QWEN2VL: