@fugood/llama.node 1.1.7 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +9 -2
  3. package/lib/index.ts +57 -30
  4. package/lib/version.js +2 -2
  5. package/lib/version.ts +2 -2
  6. package/package.json +14 -14
  7. package/src/LlamaContext.cpp +20 -0
  8. package/src/common.hpp +8 -1
  9. package/src/llama.cpp/common/arg.cpp +13 -4
  10. package/src/llama.cpp/common/chat.cpp +33 -2
  11. package/src/llama.cpp/common/common.cpp +0 -15
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
  14. package/src/llama.cpp/ggml/include/ggml.h +25 -0
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -3
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  22. package/src/llama.cpp/include/llama.h +1 -110
  23. package/src/llama.cpp/src/CMakeLists.txt +2 -2
  24. package/src/llama.cpp/src/llama-arch.cpp +19 -0
  25. package/src/llama.cpp/src/llama-arch.h +1 -0
  26. package/src/llama.cpp/src/llama-chat.cpp +13 -2
  27. package/src/llama.cpp/src/llama-chat.h +1 -0
  28. package/src/llama.cpp/src/llama-context.cpp +5 -197
  29. package/src/llama.cpp/src/llama-context.h +2 -7
  30. package/src/llama.cpp/src/llama-cparams.h +0 -1
  31. package/src/llama.cpp/src/llama-graph.cpp +35 -57
  32. package/src/llama.cpp/src/llama-graph.h +36 -46
  33. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +47 -47
  34. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +26 -26
  35. package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +88 -441
  36. package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +20 -43
  37. package/src/llama.cpp/src/llama-kv-cells.h +21 -21
  38. package/src/llama.cpp/src/llama-memory-hybrid.cpp +5 -5
  39. package/src/llama.cpp/src/llama-memory-hybrid.h +6 -6
  40. package/src/llama.cpp/src/llama-memory-recurrent.h +1 -1
  41. package/src/llama.cpp/src/llama-memory.h +3 -8
  42. package/src/llama.cpp/src/llama-model.cpp +449 -246
  43. package/src/llama.cpp/src/llama-model.h +2 -0
@@ -6,8 +6,8 @@
  #include "llama-cparams.h"
  #include "llama-model-loader.h"

- #include "llama-kv-cache-unified.h"
- #include "llama-kv-cache-unified-iswa.h"
+ #include "llama-kv-cache.h"
+ #include "llama-kv-cache-iswa.h"

  #include "llama-memory-hybrid.h"
  #include "llama-memory-recurrent.h"

@@ -83,9 +83,11 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_32B: return "32B";
  case LLM_TYPE_34B: return "34B";
  case LLM_TYPE_35B: return "35B";
+ case LLM_TYPE_36B: return "36B";
  case LLM_TYPE_40B: return "40B";
  case LLM_TYPE_65B: return "65B";
  case LLM_TYPE_70B: return "70B";
+ case LLM_TYPE_120B: return "120B";
  case LLM_TYPE_142B: return "142B";
  case LLM_TYPE_236B: return "236B";
  case LLM_TYPE_290B: return "290B";
@@ -1287,6 +1289,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_SEED_OSS:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 64: type = LLM_TYPE_36B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_OLMOE:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1834,7 +1844,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(2);

- // TODO: switch (hparams.n_layer)
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_20B; break;
+ case 36: type = LLM_TYPE_120B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
  } break;
  case LLM_ARCH_LFM2:
  {
@@ -3962,6 +3976,43 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  }
  } break;
+ case LLM_ARCH_SEED_OSS:
+ {
+ const uint32_t head_dim = hparams.n_embd_head_k;
+ const int64_t n_qo_dim = n_head * head_dim;
+ const int64_t n_kv_dim = n_head_kv * head_dim;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
+
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ }
+ } break;
+
  case LLM_ARCH_OLMOE:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5469,8 +5520,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_LFM2:
  {
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }

  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];
@@ -5981,7 +6037,7 @@ struct llm_build_llama : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -6045,7 +6101,7 @@ struct llm_build_llama : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  cb(cur, "attn_out", il);
  }

@@ -6141,7 +6197,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
  ggml_tensor * inp_attn_scale = nullptr;
  inp_attn_scale = build_inp_attn_scale();

- auto * inp_attn = build_attn_inp_kv_unified_iswa();
+ auto * inp_attn = build_attn_inp_kv_iswa();

  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -6219,7 +6275,7 @@ struct llm_build_llama_iswa : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  cb(cur, "attn_out", il);
  }

@@ -6320,7 +6376,7 @@ struct llm_build_deci : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -6396,7 +6452,7 @@ struct llm_build_deci : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
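The two changes visible in the llm_build_llama, llm_build_llama_iswa, and llm_build_deci hunks above repeat mechanically through every remaining graph builder in this file: build_attn_inp_kv_unified() / build_attn_inp_kv_unified_iswa() become build_attn_inp_kv() / build_attn_inp_kv_iswa() (matching the llama-kv-cache source renames in the file list), and every build_attn(...) call gains one extra optional tensor argument, passed as nullptr at every call site in this diff. A minimal sketch of the call-site change; what the new slot carries is an assumption (upstream llama.cpp gained optional attention-sink tensors around this release), the diff itself only shows the added nullptr:

    // 1.1.7 call shape: Q/K/V plus two optional tensors, then the KQ scale
    cur = build_attn(inp_attn,
            model.layers[il].wo, model.layers[il].bo,
            Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);

    // 1.1.9 call shape: one more optional tensor slot (attention sinks, assumed), still nullptr here
    cur = build_attn(inp_attn,
            model.layers[il].wo, model.layers[il].bo,
            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);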
@@ -6476,7 +6532,7 @@ struct llm_build_baichuan : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -6528,7 +6584,7 @@ struct llm_build_baichuan : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -6598,7 +6654,7 @@ struct llm_build_xverse : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -6643,7 +6699,7 @@ struct llm_build_xverse : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -6712,7 +6768,7 @@ struct llm_build_falcon : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -6743,9 +6799,9 @@ struct llm_build_falcon : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  // using mode = 2 for neox mode
  Qcur = ggml_rope_ext(
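The Vcur rewrite above is the other recurring pattern in this diff: rather than forcing the strided 2-D view into contiguous memory with ggml_cont() and then reshaping the copy, the view stays lazy and a single ggml_cont_3d() writes it straight into the target 3-D layout, one op instead of two. A small self-contained sketch against the public ggml API (the sizes are invented for illustration):

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
        struct ggml_context * ctx = ggml_init(params);

        // stand-in for a fused QKV projection output: each row is [ Q (n_embd) | K (n_embd_gqa) | V (n_embd_gqa) ]
        const int64_t n_embd = 64, n_embd_gqa = 16, n_tokens = 4;
        const int64_t n_embd_head = 8, n_head_kv = 2;
        struct ggml_tensor * qkv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd + 2*n_embd_gqa, n_tokens);

        // 1.1.7 style: materialise the V slice, then reshape the copy (two ops)
        struct ggml_tensor * v_old = ggml_cont(ctx, ggml_view_2d(ctx, qkv, n_embd_gqa, n_tokens,
                                                   qkv->nb[1], sizeof(float)*(n_embd + n_embd_gqa)));
        v_old = ggml_reshape_3d(ctx, v_old, n_embd_head, n_head_kv, n_tokens);

        // 1.1.9 style: keep the lazy view and copy it once, directly into the 3-D shape (one op)
        struct ggml_tensor * v_new = ggml_view_2d(ctx, qkv, n_embd_gqa, n_tokens,
                                                  qkv->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
        v_new = ggml_cont_3d(ctx, v_new, n_embd_head, n_head_kv, n_tokens);

        ggml_free(ctx);
        return 0;
    }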
@@ -6766,7 +6822,7 @@ struct llm_build_falcon : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -6836,7 +6892,7 @@ struct llm_build_grok : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -6896,7 +6952,7 @@ struct llm_build_grok : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -6996,7 +7052,7 @@ struct llm_build_dbrx : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -7023,9 +7079,9 @@ struct llm_build_dbrx : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -7045,7 +7101,7 @@ struct llm_build_dbrx : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -7120,7 +7176,7 @@ struct llm_build_starcoder : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  cb(pos, "pos_embd", -1);
@@ -7145,13 +7201,13 @@ struct llm_build_starcoder : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -7159,7 +7215,7 @@ struct llm_build_starcoder : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -7225,7 +7281,7 @@ struct llm_build_refact : public llm_graph_context {

  inpL = build_inp_embd(model.tok_embd);

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -7258,7 +7314,7 @@ struct llm_build_refact : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -7367,13 +7423,15 @@ struct llm_build_bert : public llm_graph_context {
  cb(cur, "bqkv", il);
  }

- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+ Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

  if (model.layers[il].attn_q_norm) {
@@ -7381,6 +7439,10 @@ struct llm_build_bert : public llm_graph_context {
  model.layers[il].attn_q_norm,
  model.layers[il].attn_q_norm_b,
  LLM_NORM, il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ } else {
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  }

  if (model.layers[il].attn_k_norm) {
@@ -7388,11 +7450,11 @@ struct llm_build_bert : public llm_graph_context {
  model.layers[il].attn_k_norm,
  model.layers[il].attn_k_norm_b,
  LLM_NORM, il);
- }

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ } else {
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ }

  // RoPE
  if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
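Condensed, the BERT hunks above leave the Q/K handling looking like this (identifiers as in the surrounding code; a paraphrase of the added lines, not new behaviour). The split matters because ggml_reshape_3d() only rewrites shape metadata and expects a contiguous input, which the build_norm() output already is, while the fused-QKV path still holds strided views that need the copying ggml_cont_3d():

    if (model.layers[il].attn_q_norm) {
        Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, LLM_NORM, il);
        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);  // already contiguous
    } else {
        Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);     // may still be a view into the QKV buffer
    }
    // K is handled the same way with n_head_kv; V takes ggml_cont_3d() on the fused-QKV branch
    // and ggml_reshape_3d() on the separate wq/wk/wv branch, as shown in the hunks above.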
@@ -7415,7 +7477,7 @@ struct llm_build_bert : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  cb(cur, "kqv_out", il);
  }

@@ -7537,9 +7599,9 @@ struct llm_build_neo_bert : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  // RoPE
  Qcur = ggml_rope_ext(
@@ -7560,7 +7622,7 @@ struct llm_build_neo_bert : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, nullptr,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  cb(cur, "kqv_out", il);
  }

@@ -7621,7 +7683,7 @@ struct llm_build_bloom : public llm_graph_context {

  inpL = build_inp_embd(model.tok_embd);

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  inpL = build_norm(inpL,
  model.tok_norm,
@@ -7646,13 +7708,13 @@ struct llm_build_bloom : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -7660,7 +7722,7 @@ struct llm_build_bloom : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -7728,7 +7790,7 @@ struct llm_build_mpt : public llm_graph_context {

  inpL = build_inp_embd(model.tok_embd);

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  if (model.pos_embd) {
  // inp_pos - contains the positions
@@ -7770,7 +7832,7 @@ struct llm_build_mpt : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -7789,17 +7851,18 @@ struct llm_build_mpt : public llm_graph_context {
  model.layers[il].attn_k_norm_b,
  LLM_NORM, il);
  cb(Kcur, "Kcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  } else {
- Qcur = ggml_cont(ctx0, Qcur);
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  cb(Qcur, "Qcur", il);

- Kcur = ggml_cont(ctx0, Kcur);
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  cb(Kcur, "Kcur", il);
  }

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -7807,7 +7870,7 @@ struct llm_build_mpt : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -7877,7 +7940,7 @@ struct llm_build_stablelm : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -7953,7 +8016,7 @@ struct llm_build_stablelm : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -8029,7 +8092,7 @@ struct llm_build_qwen : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -8051,9 +8114,9 @@ struct llm_build_qwen : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  // using mode = 2 for neox mode
  Qcur = ggml_rope_ext(
@@ -8074,7 +8137,7 @@ struct llm_build_qwen : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -8144,7 +8207,7 @@ struct llm_build_qwen2 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -8194,7 +8257,7 @@ struct llm_build_qwen2 : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -8308,8 +8371,9 @@ struct llm_build_dream : public llm_graph_context {
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr,
- nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -8408,8 +8472,9 @@ struct llm_build_llada : public llm_graph_context {
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr,
- 1.0f / sqrtf(float(n_embd_head)), il);
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -8469,7 +8534,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  int sections[4];
  std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
@@ -8522,7 +8587,7 @@ struct llm_build_qwen2vl : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -8590,7 +8655,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -8649,7 +8714,7 @@ struct llm_build_qwen2moe : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -8749,7 +8814,7 @@ struct llm_build_qwen3 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -8802,7 +8867,7 @@ struct llm_build_qwen3 : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -8870,7 +8935,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -8923,7 +8988,7 @@ struct llm_build_qwen3moe : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -9000,7 +9065,7 @@ struct llm_build_phi2 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -9026,21 +9091,21 @@ struct llm_build_phi2 : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9063,7 +9128,7 @@ struct llm_build_phi2 : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -9129,13 +9194,13 @@ struct llm_build_phi3 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
  inp_attn_type * inp_attn = nullptr;

  if constexpr (iswa) {
- inp_attn = build_attn_inp_kv_unified_iswa();
+ inp_attn = build_attn_inp_kv_iswa();
  } else {
- inp_attn = build_attn_inp_kv_unified();
+ inp_attn = build_attn_inp_kv();
  }

  ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -9164,21 +9229,21 @@ struct llm_build_phi3 : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, rope_factors,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9200,7 +9265,7 @@ struct llm_build_phi3 : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -9287,7 +9352,7 @@ struct llm_build_plamo : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -9334,7 +9399,7 @@ struct llm_build_plamo : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -9403,7 +9468,7 @@ struct llm_build_gpt2 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  cb(pos, "pos_embd", -1);
@@ -9428,21 +9493,21 @@ struct llm_build_gpt2 : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -9513,7 +9578,7 @@ struct llm_build_codeshell : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -9534,9 +9599,9 @@ struct llm_build_codeshell : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -9556,7 +9621,7 @@ struct llm_build_codeshell : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -9626,7 +9691,7 @@ struct llm_build_orion : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -9685,7 +9750,7 @@ struct llm_build_orion : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -9753,7 +9818,7 @@ struct llm_build_internlm2 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -9812,7 +9877,7 @@ struct llm_build_internlm2 : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -9889,7 +9954,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -10000,7 +10065,7 @@ struct llm_build_minicpm3 : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
+ q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -10084,7 +10149,7 @@ struct llm_build_gemma : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -10130,7 +10195,7 @@ struct llm_build_gemma : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -10200,7 +10265,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified_iswa();
+ auto * inp_attn = build_attn_inp_kv_iswa();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -10245,7 +10310,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -10334,7 +10399,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
  ggml_tensor * inp_pos = build_inp_pos();

  // TODO: is causal == true correct? might need some changes
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
+ auto * inp_attn = build_attn_inp_kv_iswa();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -10387,7 +10452,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -10485,7 +10550,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
  ggml_tensor * inp_pos = build_inp_pos();

  // TODO: is causal == true correct? might need some changes
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
+ auto * inp_attn = build_attn_inp_kv_iswa();

  // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
  ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
@@ -10568,7 +10633,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
  } else {
  // no KV layers
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -10586,7 +10651,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
+ Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
  }

  cur = build_norm(cur,
@@ -10864,8 +10929,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
  ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
  all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
  cb(all_coefs, "all_coefs", il);
- all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
- all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
+ all_coefs = ggml_transpose(ctx0, all_coefs); // [n_tokens, n_altup]
+ all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]

  innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
  ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
@@ -10892,7 +10957,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -10951,7 +11016,7 @@ struct llm_build_starcoder2 : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -11378,7 +11443,9 @@ struct llm_build_jamba : public llm_graph_context_mamba {
  cb(Vcur, "Vcur", il);

  // No RoPE :)
- cur = build_attn(inp_hybrid->get_attn(), model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
+ cur = build_attn(inp_hybrid->get_attn(),
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -11461,7 +11528,7 @@ struct llm_build_command_r : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -11536,7 +11603,7 @@ struct llm_build_command_r : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -11608,7 +11675,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified_iswa();
+ auto * inp_attn = build_attn_inp_kv_iswa();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -11671,7 +11738,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -11743,7 +11810,7 @@ struct llm_build_olmo : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -11802,7 +11869,7 @@ struct llm_build_olmo : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, nullptr,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -11871,7 +11938,7 @@ struct llm_build_olmo2 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -11922,7 +11989,7 @@ struct llm_build_olmo2 : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -12000,7 +12067,7 @@ struct llm_build_olmoe : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12055,7 +12122,7 @@ struct llm_build_olmoe : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -12126,7 +12193,7 @@ struct llm_build_openelm : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12188,7 +12255,7 @@ struct llm_build_openelm : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1 && inp_out_ids) {
@@ -12257,7 +12324,7 @@ struct llm_build_gptneox : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12278,9 +12345,9 @@ struct llm_build_gptneox : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -12300,7 +12367,7 @@ struct llm_build_gptneox : public llm_graph_context {

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
12303
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12370
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12304
12371
  }
12305
12372
 
12306
12373
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12403,7 +12470,7 @@ struct llm_build_arctic : public llm_graph_context {
12403
12470
  // inp_pos - contains the positions
12404
12471
  ggml_tensor * inp_pos = build_inp_pos();
12405
12472
 
12406
- auto * inp_attn = build_attn_inp_kv_unified();
12473
+ auto * inp_attn = build_attn_inp_kv();
12407
12474
 
12408
12475
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12409
12476
 
@@ -12450,7 +12517,7 @@ struct llm_build_arctic : public llm_graph_context {
12450
12517
 
12451
12518
  cur = build_attn(inp_attn,
12452
12519
  model.layers[il].wo, NULL,
12453
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12520
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12454
12521
  }
12455
12522
 
12456
12523
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12541,7 +12608,7 @@ struct llm_build_deepseek : public llm_graph_context {
12541
12608
  // inp_pos - contains the positions
12542
12609
  ggml_tensor * inp_pos = build_inp_pos();
12543
12610
 
12544
- auto * inp_attn = build_attn_inp_kv_unified();
12611
+ auto * inp_attn = build_attn_inp_kv();
12545
12612
 
12546
12613
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
12547
12614
 
@@ -12605,7 +12672,7 @@ struct llm_build_deepseek : public llm_graph_context {
12605
12672
 
12606
12673
  cur = build_attn(inp_attn,
12607
12674
  model.layers[il].wo, model.layers[il].bo,
12608
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
12675
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
12609
12676
  }
12610
12677
 
12611
12678
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12718,7 +12785,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
12718
12785
  // inp_pos - contains the positions
12719
12786
  ggml_tensor * inp_pos = build_inp_pos();
12720
12787
 
12721
- auto * inp_attn = build_attn_inp_kv_unified();
12788
+ auto * inp_attn = build_attn_inp_kv();
12722
12789
 
12723
12790
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12724
12791
 
@@ -12833,7 +12900,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
12833
12900
  // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
12834
12901
  cur = build_attn(inp_attn,
12835
12902
  model.layers[il].wo, NULL,
12836
- Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
12903
+ Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
12837
12904
  } else {
12838
12905
  ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
12839
12906
  cb(kv, "kv", il);
@@ -12867,7 +12934,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
12867
12934
  // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
12868
12935
  cur = build_attn(inp_attn,
12869
12936
  model.layers[il].wo, NULL,
12870
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
12937
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
12871
12938
  }
12872
12939
  }
12873
12940
 
@@ -12965,7 +13032,7 @@ struct llm_build_bitnet : public llm_graph_context {
12965
13032
  // inp_pos - contains the positions
12966
13033
  ggml_tensor * inp_pos = build_inp_pos();
12967
13034
 
12968
- auto * inp_attn = build_attn_inp_kv_unified();
13035
+ auto * inp_attn = build_attn_inp_kv();
12969
13036
 
12970
13037
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12971
13038
 
@@ -13034,7 +13101,7 @@ struct llm_build_bitnet : public llm_graph_context {
13034
13101
 
13035
13102
  cur = build_attn(inp_attn,
13036
13103
  NULL, NULL,
13037
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13104
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13038
13105
 
13039
13106
  cur = build_norm(cur,
13040
13107
  model.layers[il].attn_sub_norm, NULL,
@@ -13157,7 +13224,7 @@ struct llm_build_t5_enc : public llm_graph_context {
13157
13224
 
13158
13225
  cur = build_attn(inp_attn,
13159
13226
  model.layers[il].wo_enc, nullptr,
13160
- Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
13227
+ Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
13161
13228
  cb(cur, "kqv_out", il);
13162
13229
  }
13163
13230
 
@@ -13229,7 +13296,7 @@ struct llm_build_t5_dec : public llm_graph_context {
13229
13296
 
13230
13297
  const int64_t n_outputs_enc = embd_enc->ne[1];
13231
13298
 
13232
- auto * inp_attn_self = build_attn_inp_kv_unified();
13299
+ auto * inp_attn_self = build_attn_inp_kv();
13233
13300
  auto * inp_attn_cross = build_attn_inp_cross();
13234
13301
 
13235
13302
  ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -13263,7 +13330,7 @@ struct llm_build_t5_dec : public llm_graph_context {
13263
13330
 
13264
13331
  cur = build_attn(inp_attn_self,
13265
13332
  model.layers[il].wo, model.layers[il].bo,
13266
- Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
13333
+ Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
13267
13334
  cb(cur, "kqv_out", il);
13268
13335
  }
13269
13336
 
@@ -13295,7 +13362,7 @@ struct llm_build_t5_dec : public llm_graph_context {
13295
13362
 
13296
13363
  cur = build_attn(inp_attn_cross,
13297
13364
  model.layers[il].wo_cross, nullptr,
13298
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
13365
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
13299
13366
  cb(cur, "kqv_out", il);
13300
13367
 
13301
13368
  //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -13394,7 +13461,7 @@ struct llm_build_jais : public llm_graph_context {
13394
13461
 
13395
13462
  inpL = build_inp_embd(model.tok_embd);
13396
13463
 
13397
- auto * inp_attn = build_attn_inp_kv_unified();
13464
+ auto * inp_attn = build_attn_inp_kv();
13398
13465
 
13399
13466
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13400
13467
 
@@ -13413,21 +13480,21 @@ struct llm_build_jais : public llm_graph_context {
13413
13480
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
13414
13481
  cb(cur, "bqkv", il);
13415
13482
 
13416
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)));
13417
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)));
13418
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)));
13483
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd));
13484
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd));
13485
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));
13419
13486
 
13420
13487
  cb(Qcur, "Qcur", il);
13421
13488
  cb(Kcur, "Kcur", il);
13422
13489
  cb(Vcur, "Vcur", il);
13423
13490
 
13424
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13425
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13426
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13491
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13492
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13493
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13427
13494
 
13428
13495
  cur = build_attn(inp_attn,
13429
13496
  model.layers[il].wo, model.layers[il].bo,
13430
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
13497
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
13431
13498
  }
13432
13499
 
13433
13500
  if (il == n_layer - 1 && inp_out_ids) {
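Several of the surrounding hunks (GPT-NeoX, JAIS, and the ChatGLM, GLM-4 and PLaMo-2 builders below) drop the two-step ggml_cont() followed by ggml_reshape_3d() and instead call ggml_cont_3d(), which materializes the strided view directly in its final 3-D shape as a single graph node. A small before/after sketch, assuming the ctx0, cur and n_* variables of the surrounding code and using off as a stand-in for each caller's byte offset:

    // Before: copy the strided view to contiguous memory, then reshape it.
    ggml_tensor * v = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], off));
    v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);

    // After: one node that yields the contiguous tensor already in 3-D form.
    ggml_tensor * v2 = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], off);
    v2 = ggml_cont_3d(ctx0, v2, n_embd_head, n_head_kv, n_tokens);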
@@ -13492,7 +13559,7 @@ struct llm_build_chatglm : public llm_graph_context {
13492
13559
  // inp_pos - contains the positions
13493
13560
  ggml_tensor * inp_pos = build_inp_pos();
13494
13561
 
13495
- auto * inp_attn = build_attn_inp_kv_unified();
13562
+ auto * inp_attn = build_attn_inp_kv();
13496
13563
 
13497
13564
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13498
13565
 
@@ -13526,6 +13593,7 @@ struct llm_build_chatglm : public llm_graph_context {
13526
13593
  }
13527
13594
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13528
13595
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13596
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13529
13597
  } else {
13530
13598
  cur = build_lora_mm(model.layers[il].wqkv, cur);
13531
13599
  cb(cur, "wqkv", il);
@@ -13535,11 +13603,10 @@ struct llm_build_chatglm : public llm_graph_context {
13535
13603
  }
13536
13604
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
13537
13605
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
13538
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
13606
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
13607
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13539
13608
  }
13540
13609
 
13541
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13542
-
13543
13610
  //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
13544
13611
  Qcur = ggml_rope_ext(
13545
13612
  ctx0, Qcur, inp_pos, nullptr,
@@ -13559,7 +13626,7 @@ struct llm_build_chatglm : public llm_graph_context {
13559
13626
 
13560
13627
  cur = build_attn(inp_attn,
13561
13628
  model.layers[il].wo, NULL,
13562
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13629
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13563
13630
  }
13564
13631
 
13565
13632
  if (il == n_layer - 1 && inp_out_ids) {
@@ -13625,7 +13692,7 @@ struct llm_build_glm4 : public llm_graph_context {
13625
13692
  // inp_pos - contains the positions
13626
13693
  ggml_tensor * inp_pos = build_inp_pos();
13627
13694
 
13628
- auto * inp_attn = build_attn_inp_kv_unified();
13695
+ auto * inp_attn = build_attn_inp_kv();
13629
13696
 
13630
13697
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13631
13698
 
@@ -13660,6 +13727,7 @@ struct llm_build_glm4 : public llm_graph_context {
13660
13727
  }
13661
13728
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13662
13729
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13730
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13663
13731
  } else {
13664
13732
  cur = build_lora_mm(model.layers[il].wqkv, cur);
13665
13733
  cb(cur, "wqkv", il);
@@ -13669,11 +13737,10 @@ struct llm_build_glm4 : public llm_graph_context {
13669
13737
  }
13670
13738
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
13671
13739
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
13672
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
13740
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
13741
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13673
13742
  }
13674
13743
 
13675
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13676
-
13677
13744
  Qcur = ggml_rope_ext(
13678
13745
  ctx0, Qcur, inp_pos, nullptr,
13679
13746
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -13692,7 +13759,7 @@ struct llm_build_glm4 : public llm_graph_context {
13692
13759
 
13693
13760
  cur = build_attn(inp_attn,
13694
13761
  model.layers[il].wo, NULL,
13695
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13762
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13696
13763
  }
13697
13764
 
13698
13765
  if (il == n_layer - 1 && inp_out_ids) {
@@ -13775,7 +13842,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
13775
13842
  // inp_pos - contains the positions
13776
13843
  ggml_tensor * inp_pos = build_inp_pos();
13777
13844
 
13778
- auto * inp_attn = build_attn_inp_kv_unified();
13845
+ auto * inp_attn = build_attn_inp_kv();
13779
13846
 
13780
13847
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13781
13848
 
@@ -13841,7 +13908,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
13841
13908
 
13842
13909
  cur = build_attn(inp_attn,
13843
13910
  model.layers[il].wo, NULL,
13844
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13911
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13845
13912
  }
13846
13913
 
13847
13914
  if (il == n_transformer_layers - 1 && inp_out_ids) {
@@ -13935,7 +14002,7 @@ struct llm_build_nemotron : public llm_graph_context {
13935
14002
  // inp_pos - contains the positions
13936
14003
  ggml_tensor * inp_pos = build_inp_pos();
13937
14004
 
13938
- auto * inp_attn = build_attn_inp_kv_unified();
14005
+ auto * inp_attn = build_attn_inp_kv();
13939
14006
 
13940
14007
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13941
14008
 
@@ -13995,7 +14062,7 @@ struct llm_build_nemotron : public llm_graph_context {
13995
14062
 
13996
14063
  cur = build_attn(inp_attn,
13997
14064
  model.layers[il].wo, model.layers[il].bo,
13998
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14065
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13999
14066
  }
14000
14067
 
14001
14068
  if (il == n_layer - 1 && inp_out_ids) {
@@ -14064,7 +14131,7 @@ struct llm_build_exaone : public llm_graph_context {
14064
14131
  // inp_pos - contains the positions
14065
14132
  ggml_tensor * inp_pos = build_inp_pos();
14066
14133
 
14067
- auto * inp_attn = build_attn_inp_kv_unified();
14134
+ auto * inp_attn = build_attn_inp_kv();
14068
14135
 
14069
14136
  ggml_tensor * inp_out_ids = build_inp_out_ids();
14070
14137
 
@@ -14126,7 +14193,7 @@ struct llm_build_exaone : public llm_graph_context {
14126
14193
 
14127
14194
  cur = build_attn(inp_attn,
14128
14195
  model.layers[il].wo, model.layers[il].bo,
14129
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14196
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14130
14197
  }
14131
14198
 
14132
14199
  if (il == n_layer - 1 && inp_out_ids) {
@@ -14196,13 +14263,13 @@ struct llm_build_exaone4 : public llm_graph_context {
14196
14263
  // inp_pos - contains the positions
14197
14264
  ggml_tensor * inp_pos = build_inp_pos();
14198
14265
 
14199
- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
14266
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
14200
14267
  inp_attn_type * inp_attn = nullptr;
14201
14268
 
14202
14269
  if constexpr (iswa) {
14203
- inp_attn = build_attn_inp_kv_unified_iswa();
14270
+ inp_attn = build_attn_inp_kv_iswa();
14204
14271
  } else {
14205
- inp_attn = build_attn_inp_kv_unified();
14272
+ inp_attn = build_attn_inp_kv();
14206
14273
  }
14207
14274
 
14208
14275
  ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -14257,7 +14324,7 @@ struct llm_build_exaone4 : public llm_graph_context {
14257
14324
 
14258
14325
  cur = build_attn(inp_attn,
14259
14326
  model.layers[il].wo, NULL,
14260
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14327
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14261
14328
  cb(cur, "attn_out", il);
14262
14329
  }
14263
14330
 
@@ -15085,7 +15152,7 @@ struct llm_build_granite : public llm_graph_context {
15085
15152
  inp_pos = build_inp_pos();
15086
15153
  }
15087
15154
 
15088
- auto * inp_attn = build_attn_inp_kv_unified();
15155
+ auto * inp_attn = build_attn_inp_kv();
15089
15156
 
15090
15157
  ggml_tensor * inp_out_ids = build_inp_out_ids();
15091
15158
 
@@ -15136,12 +15203,12 @@ struct llm_build_granite : public llm_graph_context {
15136
15203
  }
15137
15204
 
15138
15205
  ggml_tensor * build_attention_layer(
15139
- ggml_tensor * cur,
15140
- ggml_tensor * inp_pos,
15141
- llm_graph_input_attn_kv_unified * inp_attn,
15142
- const llama_model & model,
15143
- const int64_t n_embd_head,
15144
- const int il) {
15206
+ ggml_tensor * cur,
15207
+ ggml_tensor * inp_pos,
15208
+ llm_graph_input_attn_kv * inp_attn,
15209
+ const llama_model & model,
15210
+ const int64_t n_embd_head,
15211
+ const int il) {
15145
15212
 
15146
15213
  // compute Q and K and (optionally) RoPE them
15147
15214
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15192,7 +15259,7 @@ struct llm_build_granite : public llm_graph_context {
15192
15259
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
15193
15260
  cur = build_attn(inp_attn,
15194
15261
  model.layers[il].wo, model.layers[il].bo,
15195
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15262
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
15196
15263
  cb(cur, "attn_out", il);
15197
15264
  return cur;
15198
15265
  }
@@ -15355,12 +15422,12 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
15355
15422
  }
15356
15423
 
15357
15424
  ggml_tensor * build_attention_layer(
15358
- ggml_tensor * cur,
15359
- ggml_tensor * inp_pos,
15360
- llm_graph_input_attn_kv_unified * inp_attn,
15361
- const llama_model & model,
15362
- const int64_t n_embd_head,
15363
- const int il) {
15425
+ ggml_tensor * cur,
15426
+ ggml_tensor * inp_pos,
15427
+ llm_graph_input_attn_kv * inp_attn,
15428
+ const llama_model & model,
15429
+ const int64_t n_embd_head,
15430
+ const int il) {
15364
15431
 
15365
15432
  // compute Q and K and (optionally) RoPE them
15366
15433
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15411,7 +15478,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
15411
15478
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
15412
15479
  cur = build_attn(inp_attn,
15413
15480
  model.layers[il].wo, model.layers[il].bo,
15414
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15481
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
15415
15482
  cb(cur, "attn_out", il);
15416
15483
  return cur;
15417
15484
  }
@@ -15517,7 +15584,7 @@ struct llm_build_chameleon : public llm_graph_context {
15517
15584
  // inp_pos - contains the positions
15518
15585
  ggml_tensor * inp_pos = build_inp_pos();
15519
15586
 
15520
- auto * inp_attn = build_attn_inp_kv_unified();
15587
+ auto * inp_attn = build_attn_inp_kv();
15521
15588
 
15522
15589
  ggml_tensor * inp_out_ids = build_inp_out_ids();
15523
15590
 
@@ -15596,7 +15663,7 @@ struct llm_build_chameleon : public llm_graph_context {
15596
15663
 
15597
15664
  cur = build_attn(inp_attn,
15598
15665
  model.layers[il].wo, nullptr,
15599
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
15666
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
15600
15667
  }
15601
15668
 
15602
15669
  if (il == n_layer - 1 && inp_out_ids) {
@@ -15848,7 +15915,7 @@ struct llm_build_plm : public llm_graph_context {
15848
15915
  // inp_pos - contains the positions
15849
15916
  ggml_tensor * inp_pos = build_inp_pos();
15850
15917
 
15851
- auto * inp_attn = build_attn_inp_kv_unified();
15918
+ auto * inp_attn = build_attn_inp_kv();
15852
15919
 
15853
15920
  ggml_tensor * inp_out_ids = build_inp_out_ids();
15854
15921
 
@@ -15952,7 +16019,7 @@ struct llm_build_plm : public llm_graph_context {
15952
16019
 
15953
16020
  cur = build_attn(inp_attn,
15954
16021
  model.layers[il].wo, NULL,
15955
- q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
16022
+ q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
15956
16023
  }
15957
16024
 
15958
16025
  if (il == n_layer - 1 && inp_out_ids) {
@@ -16013,7 +16080,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
16013
16080
  // inp_pos - contains the positions
16014
16081
  ggml_tensor * inp_pos = build_inp_pos();
16015
16082
 
16016
- auto * inp_attn = build_attn_inp_kv_unified();
16083
+ auto * inp_attn = build_attn_inp_kv();
16017
16084
 
16018
16085
  ggml_tensor * inp_out_ids = build_inp_out_ids();
16019
16086
 
@@ -16075,7 +16142,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
16075
16142
 
16076
16143
  cur = build_attn(inp_attn,
16077
16144
  model.layers[il].wo, model.layers[il].bo,
16078
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
16145
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
16079
16146
  }
16080
16147
 
16081
16148
  if (il == n_layer - 1 && inp_out_ids) {
@@ -16162,7 +16229,7 @@ struct llm_build_dots1 : public llm_graph_context {
16162
16229
  // inp_pos - contains the positions
16163
16230
  ggml_tensor * inp_pos = build_inp_pos();
16164
16231
 
16165
- auto * inp_attn = build_attn_inp_kv_unified();
16232
+ auto * inp_attn = build_attn_inp_kv();
16166
16233
 
16167
16234
  ggml_tensor * inp_out_ids = build_inp_out_ids();
16168
16235
 
@@ -16215,7 +16282,7 @@ struct llm_build_dots1 : public llm_graph_context {
16215
16282
 
16216
16283
  cur = build_attn(inp_attn,
16217
16284
  model.layers[il].wo, model.layers[il].bo,
16218
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16285
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16219
16286
  }
16220
16287
 
16221
16288
  if (il == n_layer - 1 && inp_out_ids) {
@@ -16312,7 +16379,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
16312
16379
  // inp_pos - contains the positions
16313
16380
  ggml_tensor * inp_pos = build_inp_pos();
16314
16381
 
16315
- auto * inp_attn = build_attn_inp_kv_unified();
16382
+ auto * inp_attn = build_attn_inp_kv();
16316
16383
 
16317
16384
  for (int il = 0; il < n_layer; ++il) {
16318
16385
  ggml_tensor * inpSA = inpL;
@@ -16370,7 +16437,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
16370
16437
 
16371
16438
  cur = build_attn(inp_attn,
16372
16439
  model.layers[il].wo, NULL,
16373
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16440
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16374
16441
  }
16375
16442
 
16376
16443
  if (il == n_layer - 1) {
@@ -16442,7 +16509,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
16442
16509
  // inp_pos - contains the positions
16443
16510
  ggml_tensor * inp_pos = build_inp_pos();
16444
16511
 
16445
- auto * inp_attn = build_attn_inp_kv_unified();
16512
+ auto * inp_attn = build_attn_inp_kv();
16446
16513
 
16447
16514
  ggml_tensor * inp_out_ids = build_inp_out_ids();
16448
16515
 
@@ -16503,7 +16570,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
16503
16570
 
16504
16571
  cur = build_attn(inp_attn,
16505
16572
  model.layers[il].wo, NULL,
16506
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16573
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16507
16574
  cb(cur, "attn_out", il);
16508
16575
  }
16509
16576
 
@@ -16656,7 +16723,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba {
16656
16723
 
16657
16724
  ggml_tensor * attn_out = build_attn(inp->get_attn(),
16658
16725
  model.layers[il].wo, NULL,
16659
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
16726
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
16660
16727
  cb(attn_out, "attn_out", il);
16661
16728
 
16662
16729
  cur = build_norm(inpL,
@@ -16816,7 +16883,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
16816
16883
 
16817
16884
  private:
16818
16885
  ggml_tensor * build_plamo2_attn_layer(
16819
- llm_graph_input_attn_kv_unified * inp,
16886
+ llm_graph_input_attn_kv * inp,
16820
16887
  ggml_tensor * inp_pos,
16821
16888
  ggml_tensor * cur,
16822
16889
  const llama_model & model,
@@ -16840,13 +16907,13 @@ private:
16840
16907
 
16841
16908
  ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
16842
16909
  ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
16843
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv)));
16910
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv));
16844
16911
 
16845
16912
  cb(Qcur, "Qcur", il);
16846
16913
  cb(Kcur, "Kcur", il);
16847
16914
  cb(Vcur, "Vcur", il);
16848
16915
 
16849
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
16916
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
16850
16917
 
16851
16918
  Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
16852
16919
  cb(Qcur, "Qcur_normed", il);
@@ -16866,7 +16933,9 @@ private:
16866
16933
  ext_factor, attn_factor, beta_fast, beta_slow
16867
16934
  );
16868
16935
 
16869
- cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
16936
+ cur = build_attn(inp,
16937
+ model.layers[il].wo, NULL,
16938
+ Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
16870
16939
  }
16871
16940
 
16872
16941
  cb(cur, "attn_out", il);
@@ -16913,15 +16982,13 @@ private:
16913
16982
  cb(zx, "mamba_in_proj", il);
16914
16983
  // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
16915
16984
  zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
16916
- zx = ggml_cont(ctx0, zx);
16917
- zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
16985
+ zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
16918
16986
  cb(zx, "mamba_in_proj_out", il);
16919
16987
 
16920
16988
  // split into z and x
16921
16989
  // => {head_dim * n_heads, n_seq_tokens, n_seqs}
16922
16990
  ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
16923
- x = ggml_cont(ctx0, x);
16924
- x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
16991
+ x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
16925
16992
  // x = ggml_permute(ctx0, x, 0, 2, 1, 3);
16926
16993
  cb(x, "mamba_x_split", il);
16927
16994
 
@@ -17051,7 +17118,7 @@ struct llm_build_arcee : public llm_graph_context {
17051
17118
  // inp_pos - contains the positions
17052
17119
  ggml_tensor * inp_pos = build_inp_pos();
17053
17120
 
17054
- auto * inp_attn = build_attn_inp_kv_unified();
17121
+ auto * inp_attn = build_attn_inp_kv();
17055
17122
 
17056
17123
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
17057
17124
 
@@ -17115,7 +17182,7 @@ struct llm_build_arcee : public llm_graph_context {
17115
17182
 
17116
17183
  cur = build_attn(inp_attn,
17117
17184
  model.layers[il].wo, model.layers[il].bo,
17118
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
17185
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
17119
17186
  cb(cur, "attn_out", il);
17120
17187
  }
17121
17188
 
@@ -17186,7 +17253,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
17186
17253
  // inp_pos - contains the positions
17187
17254
  ggml_tensor * inp_pos = build_inp_pos();
17188
17255
 
17189
- auto * inp_attn = build_attn_inp_kv_unified();
17256
+ auto * inp_attn = build_attn_inp_kv();
17190
17257
 
17191
17258
  const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
17192
17259
 
@@ -17260,7 +17327,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
17260
17327
 
17261
17328
  cur = build_attn(inp_attn,
17262
17329
  model.layers[il].wo, model.layers[il].bo,
17263
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
17330
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
17264
17331
  cb(cur, "attn_out", il);
17265
17332
  }
17266
17333
 
@@ -17347,7 +17414,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context {
17347
17414
  // inp_pos - contains the positions
17348
17415
  ggml_tensor * inp_pos = build_inp_pos();
17349
17416
 
17350
- auto * inp_attn = build_attn_inp_kv_unified();
17417
+ auto * inp_attn = build_attn_inp_kv();
17351
17418
 
17352
17419
  const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
17353
17420
 
@@ -17420,7 +17487,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context {
17420
17487
 
17421
17488
  cur = build_attn(inp_attn,
17422
17489
  model.layers[il].wo, model.layers[il].bo,
17423
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
17490
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
17424
17491
  cb(cur, "attn_out", il);
17425
17492
  }
17426
17493
 
@@ -17485,7 +17552,7 @@ struct llm_build_smollm3 : public llm_graph_context {
17485
17552
  // inp_pos - contains the positions
17486
17553
  ggml_tensor * inp_pos = build_inp_pos();
17487
17554
 
17488
- auto * inp_attn = build_attn_inp_kv_unified();
17555
+ auto * inp_attn = build_attn_inp_kv();
17489
17556
 
17490
17557
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
17491
17558
 
@@ -17550,7 +17617,7 @@ struct llm_build_smollm3 : public llm_graph_context {
17550
17617
 
17551
17618
  cur = build_attn(inp_attn,
17552
17619
  model.layers[il].wo, model.layers[il].bo,
17553
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
17620
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
17554
17621
  cb(cur, "attn_out", il);
17555
17622
  }
17556
17623
 
@@ -17617,7 +17684,7 @@ struct llm_build_openai_moe_iswa : public llm_graph_context {
17617
17684
  // inp_pos - contains the positions
17618
17685
  ggml_tensor * inp_pos = build_inp_pos();
17619
17686
 
17620
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
17687
+ auto * inp_attn = build_attn_inp_kv_iswa();
17621
17688
 
17622
17689
  for (int il = 0; il < n_layer; ++il) {
17623
17690
  ggml_tensor * inpSA = inpL;
@@ -17672,9 +17739,9 @@ struct llm_build_openai_moe_iswa : public llm_graph_context {
17672
17739
  cb(Kcur, "Kcur", il);
17673
17740
  cb(Vcur, "Vcur", il);
17674
17741
 
17675
- cur = build_attn_with_sinks(inp_attn,
17742
+ cur = build_attn(inp_attn,
17676
17743
  model.layers[il].wo, model.layers[il].bo,
17677
- Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].attn_sinks, 1.0f/sqrtf(float(n_rot)), il);
17744
+ Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
17678
17745
 
17679
17746
  cb(cur, "attn_out", il);
17680
17747
  }
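This hunk is the counterpart of the extra nullptr threaded through the other builders: the dedicated build_attn_with_sinks() call disappears and gpt-oss routes its attn_sinks tensor through the regular build_attn() path, in the slot just before v_mla rather than after it. A hedged sketch of the sinks-carrying call, with the same caveat that the argument names are inferred from this diff:

    // gpt-oss style call after the merge; only the sinks slot differs from the
    // default callers shown above.
    cur = build_attn(inp_attn,
            model.layers[il].wo, model.layers[il].bo,
            Qcur, Kcur, Vcur,
            /*kq_b  =*/ nullptr,
            /*sinks =*/ model.layers[il].attn_sinks,   // per-layer attention-sink tensor
            /*v_mla =*/ nullptr,
            1.0f/sqrtf(float(n_rot)), il);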
@@ -17771,8 +17838,7 @@ struct llm_build_lfm2 : public llm_graph_context {
17771
17838
  cb(cur, "model.embedding_norm", -1);
17772
17839
  res->t_embd = cur;
17773
17840
 
17774
- // lm_head is tied with embeddings
17775
- cur = build_lora_mm(model.tok_embd, cur);
17841
+ cur = build_lora_mm(model.output, cur);
17776
17842
  cb(cur, "lm_head", -1);
17777
17843
 
17778
17844
  res->t_logits = cur;
@@ -17799,10 +17865,10 @@ struct llm_build_lfm2 : public llm_graph_context {
17799
17865
  return cur;
17800
17866
  }
17801
17867
 
17802
- ggml_tensor * build_attn_block(ggml_tensor * cur,
17803
- ggml_tensor * inp_pos,
17804
- llm_graph_input_attn_kv_unified * inp_attn,
17805
- int il) const {
17868
+ ggml_tensor * build_attn_block(ggml_tensor * cur,
17869
+ ggml_tensor * inp_pos,
17870
+ llm_graph_input_attn_kv * inp_attn,
17871
+ int il) const {
17806
17872
  GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
17807
17873
  auto const n_embd_head = hparams.n_embd_head_v;
17808
17874
  auto const n_head_kv = hparams.n_head_kv(il);
@@ -17837,7 +17903,7 @@ struct llm_build_lfm2 : public llm_graph_context {
17837
17903
  );
17838
17904
 
17839
17905
  cur = build_attn(inp_attn, model.layers[il].wo, NULL,
17840
- q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
17906
+ q, k, v, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
17841
17907
 
17842
17908
  cb(cur, "model.layers.{}.self_attn.out_proj", il);
17843
17909
 
@@ -17914,6 +17980,137 @@ struct llm_build_lfm2 : public llm_graph_context {
17914
17980
  }
17915
17981
  };
17916
17982
 
17983
+ struct llm_build_seed_oss : public llm_graph_context {
17984
+ llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
17985
+ const int64_t n_embd_head = hparams.n_embd_head_v;
17986
+
17987
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
17988
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
17989
+
17990
+ ggml_tensor * cur;
17991
+ ggml_tensor * inpL;
17992
+
17993
+ inpL = build_inp_embd(model.tok_embd);
17994
+
17995
+ // inp_pos - contains the positions
17996
+ ggml_tensor * inp_pos = build_inp_pos();
17997
+
17998
+ auto * inp_attn = build_attn_inp_kv();
17999
+
18000
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
18001
+
18002
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
18003
+
18004
+ for (int il = 0; il < n_layer; ++il) {
18005
+ ggml_tensor * inpSA = inpL;
18006
+
18007
+ // norm
18008
+ cur = build_norm(inpL,
18009
+ model.layers[il].attn_norm, NULL,
18010
+ LLM_NORM_RMS, il);
18011
+ cb(cur, "attn_norm", il);
18012
+
18013
+ // self-attention
18014
+ {
18015
+ // compute Q and K and RoPE them
18016
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
18017
+ cb(Qcur, "Qcur", il);
18018
+ if (model.layers[il].bq) {
18019
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
18020
+ cb(Qcur, "Qcur", il);
18021
+ }
18022
+
18023
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
18024
+ cb(Kcur, "Kcur", il);
18025
+ if (model.layers[il].bk) {
18026
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
18027
+ cb(Kcur, "Kcur", il);
18028
+ }
18029
+
18030
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
18031
+ cb(Vcur, "Vcur", il);
18032
+ if (model.layers[il].bv) {
18033
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
18034
+ cb(Vcur, "Vcur", il);
18035
+ }
18036
+
18037
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
18038
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
18039
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
18040
+
18041
+ Qcur = ggml_rope_ext(
18042
+ ctx0, Qcur, inp_pos, nullptr,
18043
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
18044
+ ext_factor, attn_factor, beta_fast, beta_slow
18045
+ );
18046
+
18047
+ Kcur = ggml_rope_ext(
18048
+ ctx0, Kcur, inp_pos, nullptr,
18049
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
18050
+ ext_factor, attn_factor, beta_fast, beta_slow
18051
+ );
18052
+
18053
+ cb(Qcur, "Qcur", il);
18054
+ cb(Kcur, "Kcur", il);
18055
+ cb(Vcur, "Vcur", il);
18056
+
18057
+ cur = build_attn(inp_attn,
18058
+ model.layers[il].wo, model.layers[il].bo,
18059
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
18060
+ cb(cur, "attn_out", il);
18061
+ }
18062
+
18063
+ if (il == n_layer - 1 && inp_out_ids) {
18064
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
18065
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
18066
+ }
18067
+
18068
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
18069
+ cb(ffn_inp, "ffn_inp", il);
18070
+
18071
+ // feed-forward network
18072
+ cur = build_norm(ffn_inp,
18073
+ model.layers[il].attn_post_norm, NULL,
18074
+ LLM_NORM_RMS, il);
18075
+ cb(cur, "attn_post_norm", il);
18076
+
18077
+ cur = build_ffn(cur,
18078
+ model.layers[il].ffn_up, NULL, NULL,
18079
+ model.layers[il].ffn_gate, NULL, NULL,
18080
+ model.layers[il].ffn_down, NULL, NULL,
18081
+ NULL,
18082
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
18083
+ cb(cur, "ffn_out", il);
18084
+
18085
+ cur = ggml_add(ctx0, cur, ffn_inp);
18086
+ cb(cur, "ffn_out", il);
18087
+
18088
+ cur = build_cvec(cur, il);
18089
+ cb(cur, "l_out", il);
18090
+
18091
+ // input for next layer
18092
+ inpL = cur;
18093
+ }
18094
+
18095
+ cur = inpL;
18096
+
18097
+ cur = build_norm(cur,
18098
+ model.output_norm, NULL,
18099
+ LLM_NORM_RMS, -1);
18100
+
18101
+ cb(cur, "result_norm", -1);
18102
+ res->t_embd = cur;
18103
+
18104
+ // lm_head
18105
+ cur = build_lora_mm(model.output, cur);
18106
+
18107
+ cb(cur, "result_output", -1);
18108
+ res->t_logits = cur;
18109
+
18110
+ ggml_build_forward_expand(gf, cur);
18111
+ }
18112
+ };
18113
+
17917
18114
  template <bool iswa>
17918
18115
  struct llm_build_smallthinker : public llm_graph_context{
17919
18116
  llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
@@ -17930,13 +18127,13 @@ struct llm_build_smallthinker : public llm_graph_context{
17930
18127
  // inp_pos - contains the positions
17931
18128
  ggml_tensor * inp_pos = build_inp_pos();
17932
18129
 
17933
- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
18130
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
17934
18131
  inp_attn_type * inp_attn = nullptr;
17935
18132
 
17936
18133
  if constexpr (iswa) {
17937
- inp_attn = build_attn_inp_kv_unified_iswa();
18134
+ inp_attn = build_attn_inp_kv_iswa();
17938
18135
  } else {
17939
- inp_attn = build_attn_inp_kv_unified();
18136
+ inp_attn = build_attn_inp_kv();
17940
18137
  }
17941
18138
 
17942
18139
  ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -17981,7 +18178,7 @@ struct llm_build_smallthinker : public llm_graph_context{
17981
18178
 
17982
18179
  cur = build_attn(inp_attn,
17983
18180
  model.layers[il].wo, model.layers[il].bo,
17984
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
18181
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
17985
18182
  }
17986
18183
 
17987
18184
  if (il == n_layer - 1 && inp_out_ids) {
@@ -18066,7 +18263,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
18066
18263
  std::max((uint32_t) 1, cparams.n_seq_max),
18067
18264
  cparams.n_seq_max);
18068
18265
  } else if (llm_arch_is_hybrid(arch)) {
18069
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
18266
+ const auto padding = llama_kv_cache::get_padding(cparams);
18070
18267
 
18071
18268
  cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
18072
18269
 
@@ -18088,7 +18285,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
18088
18285
  /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
18089
18286
  /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
18090
18287
  } else {
18091
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
18288
+ const auto padding = llama_kv_cache::get_padding(cparams);
18092
18289
 
18093
18290
  uint32_t n_ctx_per_stream = cparams.n_ctx;
18094
18291
 
@@ -18108,7 +18305,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
18108
18305
  if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
18109
18306
  GGML_ASSERT(hparams.is_swa_any());
18110
18307
 
18111
- res = new llama_kv_cache_unified_iswa(
18308
+ res = new llama_kv_cache_iswa(
18112
18309
  *this,
18113
18310
  params.type_k,
18114
18311
  params.type_v,
@@ -18123,7 +18320,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
18123
18320
  } else {
18124
18321
  GGML_ASSERT(!hparams.is_swa_any());
18125
18322
 
18126
- res = new llama_kv_cache_unified(
18323
+ res = new llama_kv_cache(
18127
18324
  *this,
18128
18325
  nullptr,
18129
18326
  params.type_k,
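The create_memory() hunks rename the cache types: llama_kv_cache_unified becomes llama_kv_cache and llama_kv_cache_unified_iswa becomes llama_kv_cache_iswa, with get_padding() moving along with them. Condensed from the visible context, the selection logic keeps its shape; constructor argument lists are elided and the earlier branches are cut off by the hunk boundaries:

    } else if (llm_arch_is_hybrid(arch)) {
        const auto padding = llama_kv_cache::get_padding(cparams);   // was llama_kv_cache_unified::get_padding
        cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
        res = new llama_memory_hybrid(/* ... */);
    } else {
        const auto padding = llama_kv_cache::get_padding(cparams);
        if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
            GGML_ASSERT(hparams.is_swa_any());
            res = new llama_kv_cache_iswa(/* ... */);                // was llama_kv_cache_unified_iswa
        } else {
            GGML_ASSERT(!hparams.is_swa_any());
            res = new llama_kv_cache(/* ... */);                     // was llama_kv_cache_unified
        }
    }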
@@ -18452,6 +18649,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
18452
18649
  {
18453
18650
  llm = std::make_unique<llm_build_bailingmoe>(*this, params);
18454
18651
  } break;
18652
+ case LLM_ARCH_SEED_OSS:
18653
+ {
18654
+ llm = std::make_unique<llm_build_seed_oss>(*this, params);
18655
+ } break;
18455
18656
  case LLM_ARCH_DOTS1:
18456
18657
  {
18457
18658
  llm = std::make_unique<llm_build_dots1>(*this, params);
@@ -18510,6 +18711,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
18510
18711
  return llm->res->get_gf();
18511
18712
  }
18512
18713
 
18714
+
18513
18715
  //
18514
18716
  // interface implementation
18515
18717
  //
@@ -18704,6 +18906,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
18704
18906
  case LLM_ARCH_LFM2:
18705
18907
  case LLM_ARCH_SMALLTHINKER:
18706
18908
  case LLM_ARCH_GLM4_MOE:
18909
+ case LLM_ARCH_SEED_OSS:
18707
18910
  return LLAMA_ROPE_TYPE_NEOX;
18708
18911
 
18709
18912
  case LLM_ARCH_QWEN2VL: