@fugood/llama.node 1.0.0-beta.5 → 1.0.0-beta.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. package/lib/binding.ts +3 -1
  2. package/lib/index.js +2 -0
  3. package/lib/index.ts +3 -1
  4. package/package.json +14 -14
  5. package/scripts/llama.cpp.patch +27 -26
  6. package/src/EmbeddingWorker.cpp +1 -1
  7. package/src/LlamaCompletionWorker.cpp +28 -7
  8. package/src/LlamaCompletionWorker.h +4 -0
  9. package/src/LlamaContext.cpp +14 -17
  10. package/src/common.hpp +7 -6
  11. package/src/llama.cpp/CMakeLists.txt +15 -4
  12. package/src/llama.cpp/common/CMakeLists.txt +15 -24
  13. package/src/llama.cpp/common/arg.cpp +172 -110
  14. package/src/llama.cpp/common/chat-parser.cpp +385 -0
  15. package/src/llama.cpp/common/chat-parser.h +120 -0
  16. package/src/llama.cpp/common/chat.cpp +726 -596
  17. package/src/llama.cpp/common/chat.h +74 -8
  18. package/src/llama.cpp/common/common.cpp +56 -38
  19. package/src/llama.cpp/common/common.h +9 -3
  20. package/src/llama.cpp/common/json-partial.cpp +256 -0
  21. package/src/llama.cpp/common/json-partial.h +38 -0
  22. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  23. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -4
  24. package/src/llama.cpp/common/sampling.cpp +7 -8
  25. package/src/llama.cpp/common/speculative.cpp +6 -4
  26. package/src/llama.cpp/ggml/CMakeLists.txt +48 -3
  27. package/src/llama.cpp/ggml/include/ggml.h +22 -3
  28. package/src/llama.cpp/ggml/src/CMakeLists.txt +81 -22
  29. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +131 -49
  30. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  31. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  32. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2162 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  40. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  41. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  42. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  43. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  44. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  45. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  46. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +12 -13
  47. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +64 -88
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  49. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  50. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  51. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  52. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  53. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +282 -100
  54. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  55. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  56. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  57. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1570 -0
  58. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  59. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +119 -5
  60. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  61. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  62. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +204 -49
  63. package/src/llama.cpp/include/llama.h +145 -40
  64. package/src/llama.cpp/src/CMakeLists.txt +5 -1
  65. package/src/llama.cpp/src/llama-arch.cpp +99 -3
  66. package/src/llama.cpp/src/llama-arch.h +10 -1
  67. package/src/llama.cpp/src/llama-batch.cpp +728 -272
  68. package/src/llama.cpp/src/llama-batch.h +112 -54
  69. package/src/llama.cpp/src/llama-chat.cpp +19 -2
  70. package/src/llama.cpp/src/llama-chat.h +1 -0
  71. package/src/llama.cpp/src/llama-context.cpp +525 -339
  72. package/src/llama.cpp/src/llama-context.h +38 -17
  73. package/src/llama.cpp/src/llama-cparams.cpp +4 -0
  74. package/src/llama.cpp/src/llama-cparams.h +2 -0
  75. package/src/llama.cpp/src/llama-grammar.cpp +12 -2
  76. package/src/llama.cpp/src/llama-graph.cpp +413 -353
  77. package/src/llama.cpp/src/llama-graph.h +112 -56
  78. package/src/llama.cpp/src/llama-hparams.cpp +10 -2
  79. package/src/llama.cpp/src/llama-hparams.h +13 -2
  80. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +279 -0
  81. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +128 -0
  82. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +1815 -0
  83. package/src/llama.cpp/src/llama-kv-cache-unified.h +303 -0
  84. package/src/llama.cpp/src/llama-kv-cells.h +415 -0
  85. package/src/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  86. package/src/llama.cpp/src/llama-memory-hybrid.h +138 -0
  87. package/src/llama.cpp/src/llama-memory-recurrent.cpp +1112 -0
  88. package/src/llama.cpp/src/llama-memory-recurrent.h +183 -0
  89. package/src/llama.cpp/src/llama-memory.cpp +41 -0
  90. package/src/llama.cpp/src/llama-memory.h +86 -5
  91. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  92. package/src/llama.cpp/src/llama-model-loader.cpp +42 -17
  93. package/src/llama.cpp/src/llama-model-saver.cpp +1 -0
  94. package/src/llama.cpp/src/llama-model.cpp +1137 -528
  95. package/src/llama.cpp/src/llama-model.h +4 -0
  96. package/src/llama.cpp/src/llama-quant.cpp +2 -1
  97. package/src/llama.cpp/src/llama-sampling.cpp +2 -2
  98. package/src/llama.cpp/src/llama-vocab.cpp +69 -32
  99. package/src/llama.cpp/src/llama-vocab.h +1 -0
  100. package/src/llama.cpp/src/llama.cpp +11 -7
  101. package/src/llama.cpp/src/unicode.cpp +5 -0
  102. package/src/tts_utils.h +1 -1
  103. package/src/llama.cpp/common/json.hpp +0 -24766
  104. package/src/llama.cpp/common/minja/chat-template.hpp +0 -541
  105. package/src/llama.cpp/common/minja/minja.hpp +0 -2974
  106. package/src/llama.cpp/common/stb_image.h +0 -7988
  107. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  108. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13326
  109. package/src/llama.cpp/src/llama-kv-cache.cpp +0 -2827
  110. package/src/llama.cpp/src/llama-kv-cache.h +0 -515
  111. /package/src/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  112. /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  113. /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
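The bulk of this release is a sync of the vendored llama.cpp sources: the monolithic llama-kv-cache.{h,cpp} is split into dedicated memory modules (a unified KV cache, a sliding-window iSWA variant, recurrent state, and a hybrid that, per the file names, combines the two), common/ gains incremental chat and JSON parsing (chat-parser, json-partial), the CPU backend moves its quantization and repack kernels into per-architecture directories under ggml-cpu/arch/, and the model loader adds the ARCEE, NEO_BERT and DOTS1 architectures. The hunks below appear to come from src/llama-model.cpp. As a rough mental model of the memory split (a sketch only; the type and function names here are placeholders, not the upstream API):

```cpp
#include <memory>

struct memory_i {
    virtual ~memory_i() = default;
    virtual const char * name() const = 0;
};

struct kv_unified    : memory_i { const char * name() const override { return "kv-cache-unified"; } };
struct kv_iswa       : memory_i { const char * name() const override { return "kv-cache-unified-iswa"; } };
struct mem_recurrent : memory_i { const char * name() const override { return "memory-recurrent"; } };
struct mem_hybrid    : memory_i { const char * name() const override { return "memory-hybrid"; } };

// hypothetical selector mirroring the new source files listed above
// (the hybrid case, which mixes attention and recurrent state, is omitted for brevity)
std::unique_ptr<memory_i> make_memory(bool is_recurrent, bool has_swa) {
    if (is_recurrent) return std::make_unique<mem_recurrent>();
    if (has_swa)      return std::make_unique<kv_iswa>();
    return std::make_unique<kv_unified>();
}
```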
@@ -5,7 +5,11 @@
  #include "llama-batch.h"
  #include "llama-cparams.h"
  #include "llama-model-loader.h"
- #include "llama-kv-cache.h"
+
+ #include "llama-kv-cache-unified.h"
+ #include "llama-kv-cache-unified-iswa.h"
+ #include "llama-memory-hybrid.h"
+ #include "llama-memory-recurrent.h"

  #include "ggml-cpp.h"

@@ -77,6 +81,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_40B: return "40B";
  case LLM_TYPE_65B: return "65B";
  case LLM_TYPE_70B: return "70B";
+ case LLM_TYPE_142B: return "142B";
  case LLM_TYPE_236B: return "236B";
  case LLM_TYPE_290B: return "290B";
  case LLM_TYPE_314B: return "314B";
@@ -466,6 +471,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
  std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
  std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+ std::fill(
+ hparams.recurrent_layer_arr.begin(),
+ hparams.recurrent_layer_arr.end(),
+ llm_arch_is_recurrent(ml.get_arch()));

  std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);

@@ -540,6 +549,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  uint32_t n_vocab = 0;
  ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);

+ // for classifier models
+ ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
+ if (!classifier_labels.empty()) {
+ hparams.n_cls_out = classifier_labels.size();
+ }
+
  // arch-specific KVs
  switch (arch) {
  case LLM_ARCH_LLAMA:
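Together with the cls_out shape change further down, the block above removes the assumption that classification heads have exactly one output: hparams.n_cls_out now follows the label array stored in the model metadata. A minimal sketch of the sizing rule (plain C++; the example labels are made up):

```cpp
#include <cstdint>
#include <string>
#include <vector>

// one output per metadata label, otherwise the old default of a single output
uint32_t classifier_outputs(const std::vector<std::string> & classifier_labels) {
    return classifier_labels.empty() ? 1u : (uint32_t) classifier_labels.size();
}

// usage sketch: classifier_outputs({"negative", "neutral", "positive"}) == 3
```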
@@ -589,6 +604,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.use_kq_norm = false;
  }
  } break;
+ case LLM_ARCH_ARCEE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ // Arcee uses the same structure as Llama
+ switch (hparams.n_layer) {
+ case 36: type = LLM_TYPE_4B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_DECI:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -729,6 +754,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  }
  } break;
+ case LLM_ARCH_NEO_BERT:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+ if (hparams.n_layer == 28) {
+ type = LLM_TYPE_250M;
+ }
+ } break;
  case LLM_ARCH_BLOOM:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -952,6 +987,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case 46: type = LLM_TYPE_27B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
+
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
+ hparams.f_attention_scale = type == LLM_TYPE_27B
+ ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
  } break;
  case LLM_ARCH_GEMMA3:
  {
@@ -972,6 +1012,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }

+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
  hparams.f_attention_scale = type == LLM_TYPE_27B
  ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
  : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
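Both Gemma hunks pin the attention scale explicitly: for the 27B variants the query scaling divides by n_embd / n_head (per the referenced gemma_pytorch config) rather than by the per-head key dimension, and the two differ whenever the head dimension is not n_embd / n_head. A small sketch of the rule:

```cpp
#include <cmath>
#include <cstdint>

// scale rule from the two hunks above
float gemma_attention_scale(bool is_27b, uint32_t n_embd, uint32_t n_head, uint32_t n_embd_head_k) {
    const float d = is_27b ? float(n_embd) / float(n_head) : float(n_embd_head_k);
    return 1.0f / std::sqrt(d);
}

// e.g. (illustrative numbers, not taken from this diff): n_embd = 4608 with n_head = 32 gives
// 1/sqrt(144), while scaling by the head size n_embd_head_k = 128 would give 1/sqrt(128).
```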
@@ -1429,6 +1470,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_DOTS1:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+ switch (hparams.n_layer) {
+ case 62: type = LLM_TYPE_142B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  default: throw std::runtime_error("unsupported model architecture");
  }

@@ -2113,7 +2168,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  case LLM_ARCH_NOMIC_BERT_MOE:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
+ type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);

  if (arch == LLM_ARCH_BERT) {
  pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
@@ -2121,8 +2176,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);

- cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
- cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, TENSOR_NOT_REQUIRED);
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
  }

  tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
@@ -2131,7 +2186,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];

- if (arch == LLM_ARCH_BERT) {
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+ if (!layer.wqkv) {
  layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);

@@ -2140,12 +2198,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
- } else {
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- }
-
- if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  }

  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
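The BERT-family loader no longer branches on the architecture enum to decide between a fused QKV projection and split Q/K/V tensors; it probes for the fused tensor first (TENSOR_NOT_REQUIRED) and requires the split weights only when it is absent, and the graph builder later keys off the same layer.wqkv pointer. A generic sketch of that loading order (the helper names and the blk.* tensor names are assumptions for illustration):

```cpp
#include <optional>
#include <string>

struct tensor_ref { std::string name; };

struct attn_weights {
    std::optional<tensor_ref> wqkv, bqkv; // fused projection (+ optional bias)
    std::optional<tensor_ref> wq, wk, wv; // split fallback
};

// try_load returns std::nullopt when the tensor is missing; must_load is expected to throw
template <typename TryLoad, typename MustLoad>
attn_weights load_attn(int il, TryLoad && try_load, MustLoad && must_load) {
    const std::string blk = "blk." + std::to_string(il) + ".";
    attn_weights w;
    w.wqkv = try_load(blk + "attn_qkv.weight");
    w.bqkv = try_load(blk + "attn_qkv.bias");
    if (!w.wqkv) { // same condition the graph builder now checks on layer.wqkv
        w.wq = must_load(blk + "attn_q.weight");
        w.wk = must_load(blk + "attn_k.weight");
        w.wv = must_load(blk + "attn_v.weight");
    }
    return w;
}
```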
@@ -2175,6 +2227,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  }
  } break;
+ case LLM_ARCH_NEO_BERT:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+ cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+
+ output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ }
+ } break;
  case LLM_ARCH_JINA_BERT_V2:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
@@ -2212,8 +2290,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);

  layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
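For JINA_BERT_V2 the gate projection becomes optional: when ffn_gate is missing, ffn_up is loaded at double width (2 * n_ff) and the FFN later selects LLM_FFN_GEGLU instead of the parallel GELU gate. A plain-C++ sketch of the fallback activation; only the 2 * n_ff width comes from the diff, the half-split convention below is an assumption for illustration:

```cpp
#include <cassert>
#include <cmath>
#include <vector>

// out[i] = gelu(up[i]) * up[n_ff + i], treating the doubled up-projection as [gate | value]
std::vector<float> geglu(const std::vector<float> & up, size_t n_ff) {
    assert(up.size() == 2 * n_ff);
    auto gelu = [](float x) { return 0.5f * x * (1.0f + std::erf(x / std::sqrt(2.0f))); };
    std::vector<float> out(n_ff);
    for (size_t i = 0; i < n_ff; ++i) {
        out[i] = gelu(up[i]) * up[n_ff + i];
    }
    return out;
}
```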
@@ -2489,7 +2567,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  // output
  output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }

  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];
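This hunk (and the ARCEE branch below) makes output.weight optional and falls back to reusing the token-embedding matrix, the usual tied-embedding layout; TENSOR_DUPLICATED marks that the same data backs both tensors rather than copying it. Reduced to its logic (placeholder type):

```cpp
struct tensor; // opaque placeholder

// some GGUF exports omit "output.weight" and expect the lm_head to reuse tok_embd
const tensor * pick_output_head(const tensor * output, const tensor * tok_embd) {
    return output != nullptr ? output : tok_embd;
}
```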
@@ -4107,6 +4189,89 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  }
  } break;
+ case LLM_ARCH_DOTS1:
+ {
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ if (i < (int) hparams.n_layer_dense_lead) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ } else {
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0");
+ }
+
+ // MoE branch
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert branch
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ }
+ }
+ } break;
+ case LLM_ARCH_ARCEE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
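The dots1 layout above uses n_layer_dense_lead ordinary FFN layers followed by MoE layers that combine n_expert routed experts with n_expert_shared always-on shared experts, each of width n_ff_exp. A rough per-layer weight count implied by those shapes (illustrative only; it ignores biases, norms and the router tensors):

```cpp
#include <cstdint>

uint64_t dense_ffn_params(uint64_t n_embd, uint64_t n_ff) {
    return 3 * n_embd * n_ff; // ffn_gate + ffn_up + ffn_down
}

uint64_t moe_ffn_params(uint64_t n_embd, uint64_t n_ff_exp, uint64_t n_expert, uint64_t n_expert_shared) {
    const uint64_t routed = 3 * n_embd * n_ff_exp * n_expert;        // ffn_{gate,down,up}_exps
    const uint64_t shared = 3 * n_embd * n_ff_exp * n_expert_shared; // ffn_{gate,down,up}_shexp
    return routed + shared;
}
```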
@@ -4351,6 +4516,15 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
  LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
  LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
+
+ if (!classifier_labels.empty()) {
+ LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
+
+ size_t i = 0;
+ for (auto label : classifier_labels) {
+ LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
+ }
+ }
  }

  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
@@ -4533,6 +4707,8 @@ struct llm_build_llama : public llm_graph_context {

  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
  for (int il = 0; il < n_layer; ++il) {
  ggml_tensor * inpSA = inpL;

@@ -4595,9 +4771,7 @@
  cb(cur, "attn_out", il);
  }

- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- ggml_tensor * inp_out_ids = build_inp_out_ids();
+ if (il == n_layer - 1 && inp_out_ids) {
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  }
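The same refactor repeats through every graph builder below: build_inp_out_ids() is now called once, before the layer loop, and the last-layer row gather is guarded on the tensor being non-null, rather than creating the selector inside the loop on the final iteration. The shape of the pattern in plain C++ (std::vector stands in for ggml tensors, one float per row, just to show the control flow):

```cpp
#include <vector>

std::vector<float> gather_rows(const std::vector<float> & rows, const std::vector<int> & ids) {
    std::vector<float> out;
    out.reserve(ids.size());
    for (int id : ids) out.push_back(rows[(size_t) id]);
    return out;
}

void run_layers(int n_layer, std::vector<float> cur, const std::vector<int> * inp_out_ids) {
    // inp_out_ids is built once by the caller; it may be null when every row is needed
    for (int il = 0; il < n_layer; ++il) {
        // ... attention / ffn for layer il ...
        if (il == n_layer - 1 && inp_out_ids) {
            cur = gather_rows(cur, *inp_out_ids); // drop rows whose outputs are never read
        }
    }
}
```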
@@ -4693,6 +4867,8 @@ struct llm_build_llama_iswa : public llm_graph_context {
4693
4867
 
4694
4868
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
4695
4869
 
4870
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
4871
+
4696
4872
  for (int il = 0; il < n_layer; ++il) {
4697
4873
  ggml_tensor * inpSA = inpL;
4698
4874
 
@@ -4769,9 +4945,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
4769
4945
  cb(cur, "attn_out", il);
4770
4946
  }
4771
4947
 
4772
- if (il == n_layer - 1) {
4773
- // skip computing output for unused tokens
4774
- ggml_tensor * inp_out_ids = build_inp_out_ids();
4948
+ if (il == n_layer - 1 && inp_out_ids) {
4775
4949
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
4776
4950
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
4777
4951
  }
@@ -4871,6 +5045,9 @@ struct llm_build_deci : public llm_graph_context {
4871
5045
  auto * inp_attn = build_attn_inp_kv_unified();
4872
5046
 
4873
5047
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
5048
+
5049
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5050
+
4874
5051
  for (int il = 0; il < n_layer; ++il) {
4875
5052
  ggml_tensor * inpSA = inpL;
4876
5053
  const int64_t n_head_kv = hparams.n_head_kv(il);
@@ -4944,9 +5121,7 @@ struct llm_build_deci : public llm_graph_context {
4944
5121
  Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
4945
5122
  }
4946
5123
 
4947
- if (il == n_layer - 1) {
4948
- // skip computing output for unused tokens
4949
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5124
+ if (il == n_layer - 1 && inp_out_ids) {
4950
5125
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
4951
5126
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
4952
5127
  }
@@ -5025,6 +5200,8 @@ struct llm_build_baichuan : public llm_graph_context {
5025
5200
 
5026
5201
  auto * inp_attn = build_attn_inp_kv_unified();
5027
5202
 
5203
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5204
+
5028
5205
  for (int il = 0; il < n_layer; ++il) {
5029
5206
  ggml_tensor * inpSA = inpL;
5030
5207
 
@@ -5076,9 +5253,7 @@ struct llm_build_baichuan : public llm_graph_context {
5076
5253
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5077
5254
  }
5078
5255
 
5079
- if (il == n_layer - 1) {
5080
- // skip computing output for unused tokens
5081
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5256
+ if (il == n_layer - 1 && inp_out_ids) {
5082
5257
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5083
5258
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5084
5259
  }
@@ -5147,6 +5322,8 @@ struct llm_build_xverse : public llm_graph_context {
5147
5322
 
5148
5323
  auto * inp_attn = build_attn_inp_kv_unified();
5149
5324
 
5325
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5326
+
5150
5327
  for (int il = 0; il < n_layer; ++il) {
5151
5328
  ggml_tensor * inpSA = inpL;
5152
5329
 
@@ -5191,9 +5368,7 @@ struct llm_build_xverse : public llm_graph_context {
5191
5368
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5192
5369
  }
5193
5370
 
5194
- if (il == n_layer - 1) {
5195
- // skip computing output for unused tokens
5196
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5371
+ if (il == n_layer - 1 && inp_out_ids) {
5197
5372
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5198
5373
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5199
5374
  }
@@ -5261,6 +5436,8 @@ struct llm_build_falcon : public llm_graph_context {
5261
5436
 
5262
5437
  auto * inp_attn = build_attn_inp_kv_unified();
5263
5438
 
5439
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5440
+
5264
5441
  for (int il = 0; il < n_layer; ++il) {
5265
5442
  ggml_tensor * attn_norm;
5266
5443
 
@@ -5316,9 +5493,7 @@ struct llm_build_falcon : public llm_graph_context {
5316
5493
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5317
5494
  }
5318
5495
 
5319
- if (il == n_layer - 1) {
5320
- // skip computing output for unused tokens
5321
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5496
+ if (il == n_layer - 1 && inp_out_ids) {
5322
5497
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5323
5498
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
5324
5499
  attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
@@ -5387,6 +5562,8 @@ struct llm_build_grok : public llm_graph_context {
5387
5562
 
5388
5563
  auto * inp_attn = build_attn_inp_kv_unified();
5389
5564
 
5565
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5566
+
5390
5567
  for (int il = 0; il < n_layer; ++il) {
5391
5568
  ggml_tensor * inpSA = inpL;
5392
5569
 
@@ -5446,9 +5623,7 @@ struct llm_build_grok : public llm_graph_context {
5446
5623
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
5447
5624
  }
5448
5625
 
5449
- if (il == n_layer - 1) {
5450
- // skip computing output for unused tokens
5451
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5626
+ if (il == n_layer - 1 && inp_out_ids) {
5452
5627
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5453
5628
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5454
5629
  }
@@ -5547,6 +5722,8 @@ struct llm_build_dbrx : public llm_graph_context {
5547
5722
 
5548
5723
  auto * inp_attn = build_attn_inp_kv_unified();
5549
5724
 
5725
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5726
+
5550
5727
  for (int il = 0; il < n_layer; ++il) {
5551
5728
  ggml_tensor * inpSA = inpL;
5552
5729
 
@@ -5597,9 +5774,7 @@ struct llm_build_dbrx : public llm_graph_context {
5597
5774
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5598
5775
  }
5599
5776
 
5600
- if (il == n_layer - 1) {
5601
- // skip computing output for unused tokens
5602
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5777
+ if (il == n_layer - 1 && inp_out_ids) {
5603
5778
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5604
5779
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5605
5780
  }
@@ -5679,6 +5854,8 @@ struct llm_build_starcoder : public llm_graph_context {
5679
5854
  inpL = ggml_add(ctx0, inpL, pos);
5680
5855
  cb(inpL, "inpL", -1);
5681
5856
 
5857
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5858
+
5682
5859
  for (int il = 0; il < n_layer; ++il) {
5683
5860
  cur = build_norm(inpL,
5684
5861
  model.layers[il].attn_norm,
@@ -5711,9 +5888,7 @@ struct llm_build_starcoder : public llm_graph_context {
5711
5888
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5712
5889
  }
5713
5890
 
5714
- if (il == n_layer - 1) {
5715
- // skip computing output for unused tokens
5716
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5891
+ if (il == n_layer - 1 && inp_out_ids) {
5717
5892
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5718
5893
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
5719
5894
  }
@@ -5778,6 +5953,8 @@ struct llm_build_refact : public llm_graph_context {
5778
5953
 
5779
5954
  auto * inp_attn = build_attn_inp_kv_unified();
5780
5955
 
5956
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5957
+
5781
5958
  for (int il = 0; il < n_layer; ++il) {
5782
5959
  ggml_tensor * inpSA = inpL;
5783
5960
 
@@ -5810,9 +5987,7 @@ struct llm_build_refact : public llm_graph_context {
5810
5987
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5811
5988
  }
5812
5989
 
5813
- if (il == n_layer - 1) {
5814
- // skip computing output for unused tokens
5815
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5990
+ if (il == n_layer - 1 && inp_out_ids) {
5816
5991
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5817
5992
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5818
5993
  }
@@ -5883,8 +6058,10 @@ struct llm_build_bert : public llm_graph_context {
5883
6058
  inpL = build_inp_embd(model.tok_embd);
5884
6059
 
5885
6060
  // token types are hardcoded to zero ("Sentence A")
5886
- ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
5887
- inpL = ggml_add(ctx0, inpL, type_row0);
6061
+ if (model.type_embd) {
6062
+ ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
6063
+ inpL = ggml_add(ctx0, inpL, type_row0);
6064
+ }
5888
6065
  if (model.arch == LLM_ARCH_BERT) {
5889
6066
  inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
5890
6067
  }
@@ -5896,17 +6073,34 @@ struct llm_build_bert : public llm_graph_context {
5896
6073
 
5897
6074
  auto * inp_attn = build_attn_inp_no_cache();
5898
6075
 
5899
- // iterate layers
6076
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6077
+
5900
6078
  for (int il = 0; il < n_layer; ++il) {
5901
6079
  ggml_tensor * cur = inpL;
5902
6080
 
5903
- ggml_tensor * Qcur;
5904
- ggml_tensor * Kcur;
5905
- ggml_tensor * Vcur;
6081
+ {
6082
+ ggml_tensor * Qcur;
6083
+ ggml_tensor * Kcur;
6084
+ ggml_tensor * Vcur;
5906
6085
 
5907
- // self-attention
5908
- if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
5909
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
6086
+ // self-attention
6087
+ if (model.layers[il].wqkv) {
6088
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
6089
+ cb(cur, "wqkv", il);
6090
+
6091
+ if (model.layers[il].bqkv) {
6092
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6093
+ cb(cur, "bqkv", il);
6094
+ }
6095
+
6096
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6097
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6098
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6099
+ } else {
6100
+ Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
6101
+ Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
6102
+ Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
6103
+ }
5910
6104
 
5911
6105
  if (model.layers[il].attn_q_norm) {
5912
6106
  Qcur = build_norm(Qcur,
@@ -5915,8 +6109,6 @@ struct llm_build_bert : public llm_graph_context {
5915
6109
  LLM_NORM, il);
5916
6110
  }
5917
6111
 
5918
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
5919
-
5920
6112
  if (model.layers[il].attn_k_norm) {
5921
6113
  Kcur = build_norm(Kcur,
5922
6114
  model.layers[il].attn_k_norm,
@@ -5924,54 +6116,36 @@ struct llm_build_bert : public llm_graph_context {
5924
6116
  LLM_NORM, il);
5925
6117
  }
5926
6118
 
5927
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
5928
-
5929
6119
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5930
6120
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5931
6121
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
5932
- } else {
5933
- // compute Q and K and RoPE them
5934
- cur = build_lora_mm(model.layers[il].wqkv, cur);
5935
- cb(cur, "wqkv", il);
5936
-
5937
- if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
5938
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5939
- cb(cur, "bqkv", il);
5940
- }
5941
6122
 
5942
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5943
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5944
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6123
+ // RoPE
6124
+ if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
6125
+ Qcur = ggml_rope_ext(
6126
+ ctx0, Qcur, inp_pos, nullptr,
6127
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6128
+ ext_factor, attn_factor, beta_fast, beta_slow
6129
+ );
5945
6130
 
5946
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5947
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5948
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6131
+ Kcur = ggml_rope_ext(
6132
+ ctx0, Kcur, inp_pos, nullptr,
6133
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6134
+ ext_factor, attn_factor, beta_fast, beta_slow
6135
+ );
6136
+ }
5949
6137
 
5950
- Qcur = ggml_rope_ext(
5951
- ctx0, Qcur, inp_pos, nullptr,
5952
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
5953
- ext_factor, attn_factor, beta_fast, beta_slow
5954
- );
6138
+ cb(Qcur, "Qcur", il);
6139
+ cb(Kcur, "Kcur", il);
6140
+ cb(Vcur, "Vcur", il);
5955
6141
 
5956
- Kcur = ggml_rope_ext(
5957
- ctx0, Kcur, inp_pos, nullptr,
5958
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
5959
- ext_factor, attn_factor, beta_fast, beta_slow
5960
- );
6142
+ cur = build_attn(inp_attn, gf,
6143
+ model.layers[il].wo, model.layers[il].bo,
6144
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6145
+ cb(cur, "kqv_out", il);
5961
6146
  }
5962
6147
 
5963
- cb(Qcur, "Qcur", il);
5964
- cb(Kcur, "Kcur", il);
5965
- cb(Vcur, "Vcur", il);
5966
-
5967
- cur = build_attn(inp_attn, gf,
5968
- model.layers[il].wo, model.layers[il].bo,
5969
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5970
- cb(cur, "kqv_out", il);
5971
-
5972
- if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
5973
- // skip computing output for unused tokens
5974
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6148
+ if (il == n_layer - 1 && inp_out_ids) {
5975
6149
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5976
6150
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
5977
6151
  }
@@ -6020,7 +6194,7 @@ struct llm_build_bert : public llm_graph_context {
6020
6194
  model.layers[il].ffn_gate, NULL, NULL,
6021
6195
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
6022
6196
  NULL,
6023
- LLM_FFN_GELU, LLM_FFN_PAR, il);
6197
+ model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
6024
6198
  cb(cur, "ffn_out", il);
6025
6199
  } else {
6026
6200
  cur = build_ffn(cur,
@@ -6051,8 +6225,8 @@ struct llm_build_bert : public llm_graph_context {
6051
6225
  }
6052
6226
  };
6053
6227
 
6054
- struct llm_build_bloom : public llm_graph_context {
6055
- llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6228
+ struct llm_build_neo_bert : public llm_graph_context {
6229
+ llm_build_neo_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6056
6230
  const int64_t n_embd_head = hparams.n_embd_head_v;
6057
6231
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6058
6232
 
@@ -6060,52 +6234,164 @@ struct llm_build_bloom : public llm_graph_context {
6060
6234
 
6061
6235
  ggml_tensor * cur;
6062
6236
  ggml_tensor * inpL;
6237
+ ggml_tensor * inp_pos = build_inp_pos();
6063
6238
 
6239
+ // construct input embeddings (token, type, position)
6064
6240
  inpL = build_inp_embd(model.tok_embd);
6241
+ cb(inpL, "inp_embd", -1);
6065
6242
 
6066
- auto * inp_attn = build_attn_inp_kv_unified();
6243
+ auto * inp_attn = build_attn_inp_no_cache();
6067
6244
 
6068
- inpL = build_norm(inpL,
6069
- model.tok_norm,
6070
- model.tok_norm_b,
6071
- LLM_NORM, -1);
6072
- cb(inpL, "inp_norm", -1);
6245
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6073
6246
 
6074
6247
  for (int il = 0; il < n_layer; ++il) {
6248
+ ggml_tensor * cur = inpL;
6249
+
6250
+ // pre-norm
6075
6251
  cur = build_norm(inpL,
6076
- model.layers[il].attn_norm,
6077
- model.layers[il].attn_norm_b,
6078
- LLM_NORM, il);
6079
- cb(cur, "attn_norm", il);
6252
+ model.layers[il].attn_norm, NULL,
6253
+ LLM_NORM_RMS, il);
6080
6254
 
6081
- // self-attention
6082
6255
  {
6256
+ ggml_tensor * Qcur;
6257
+ ggml_tensor * Kcur;
6258
+ ggml_tensor * Vcur;
6259
+
6260
+ // self-attention
6083
6261
  cur = build_lora_mm(model.layers[il].wqkv, cur);
6084
6262
  cb(cur, "wqkv", il);
6085
6263
 
6086
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6087
- cb(cur, "bqkv", il);
6088
-
6089
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6090
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6091
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6264
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6265
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6266
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6092
6267
 
6093
6268
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6094
6269
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6095
6270
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6096
6271
 
6097
- cb(Qcur, "Qcur", il);
6098
- cb(Kcur, "Kcur", il);
6099
- cb(Vcur, "Vcur", il);
6100
-
6272
+ // RoPE
6273
+ Qcur = ggml_rope_ext(
6274
+ ctx0, Qcur, inp_pos, nullptr,
6275
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6276
+ ext_factor, attn_factor, beta_fast, beta_slow
6277
+ );
6278
+
6279
+ Kcur = ggml_rope_ext(
6280
+ ctx0, Kcur, inp_pos, nullptr,
6281
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6282
+ ext_factor, attn_factor, beta_fast, beta_slow
6283
+ );
6284
+
6285
+ cb(Qcur, "Qcur", il);
6286
+ cb(Kcur, "Kcur", il);
6287
+ cb(Vcur, "Vcur", il);
6288
+
6289
+ cur = build_attn(inp_attn, gf,
6290
+ model.layers[il].wo, nullptr,
6291
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6292
+ cb(cur, "kqv_out", il);
6293
+ }
6294
+
6295
+ if (il == n_layer - 1 && inp_out_ids) {
6296
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6297
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6298
+ }
6299
+
6300
+ // re-add the layer input
6301
+ cur = ggml_add(ctx0, cur, inpL);
6302
+
6303
+ ggml_tensor * ffn_inp = cur;
6304
+ cb(ffn_inp, "ffn_inp", il);
6305
+
6306
+ // pre-norm
6307
+ cur = build_norm(ffn_inp,
6308
+ model.layers[il].ffn_norm, NULL,
6309
+ LLM_NORM_RMS, il);
6310
+ cb(cur, "ffn_norm", il);
6311
+
6312
+ // feed-forward network
6313
+ cur = build_ffn(cur,
6314
+ model.layers[il].ffn_up,
6315
+ NULL, NULL, NULL, NULL, NULL,
6316
+ model.layers[il].ffn_down,
6317
+ NULL, NULL, NULL,
6318
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
6319
+
6320
+ // attentions bypass the intermediate layer
6321
+ cur = ggml_add(ctx0, cur, ffn_inp);
6322
+
6323
+ // input for next layer
6324
+ inpL = cur;
6325
+ }
6326
+
6327
+ cur = inpL;
6328
+
6329
+ cur = build_norm(cur,
6330
+ model.output_norm_enc, NULL,
6331
+ LLM_NORM_RMS, -1);
6332
+
6333
+ cb(cur, "result_embd", -1);
6334
+ res->t_embd = cur;
6335
+
6336
+ ggml_build_forward_expand(gf, cur);
6337
+ }
6338
+ };
6339
+
6340
+ struct llm_build_bloom : public llm_graph_context {
6341
+ llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6342
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6343
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6344
+
6345
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6346
+
6347
+ ggml_tensor * cur;
6348
+ ggml_tensor * inpL;
6349
+
6350
+ inpL = build_inp_embd(model.tok_embd);
6351
+
6352
+ auto * inp_attn = build_attn_inp_kv_unified();
6353
+
6354
+ inpL = build_norm(inpL,
6355
+ model.tok_norm,
6356
+ model.tok_norm_b,
6357
+ LLM_NORM, -1);
6358
+ cb(inpL, "inp_norm", -1);
6359
+
6360
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6361
+
6362
+ for (int il = 0; il < n_layer; ++il) {
6363
+ cur = build_norm(inpL,
6364
+ model.layers[il].attn_norm,
6365
+ model.layers[il].attn_norm_b,
6366
+ LLM_NORM, il);
6367
+ cb(cur, "attn_norm", il);
6368
+
6369
+ // self-attention
6370
+ {
6371
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
6372
+ cb(cur, "wqkv", il);
6373
+
6374
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6375
+ cb(cur, "bqkv", il);
6376
+
6377
+ ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6378
+ ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6379
+ ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6380
+
6381
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6382
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6383
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6384
+
6385
+ cb(Qcur, "Qcur", il);
6386
+ cb(Kcur, "Kcur", il);
6387
+ cb(Vcur, "Vcur", il);
6388
+
6101
6389
  cur = build_attn(inp_attn, gf,
6102
6390
  model.layers[il].wo, model.layers[il].bo,
6103
6391
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6104
6392
  }
6105
6393
 
6106
- if (il == n_layer - 1) {
6107
- // skip computing output for unused tokens
6108
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6394
+ if (il == n_layer - 1 && inp_out_ids) {
6109
6395
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6110
6396
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6111
6397
  }
@@ -6182,6 +6468,8 @@ struct llm_build_mpt : public llm_graph_context {
6182
6468
  cb(inpL, "inpL", -1);
6183
6469
  }
6184
6470
 
6471
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6472
+
6185
6473
  for (int il = 0; il < n_layer; ++il) {
6186
6474
  ggml_tensor * attn_norm;
6187
6475
 
@@ -6244,9 +6532,7 @@ struct llm_build_mpt : public llm_graph_context {
6244
6532
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6245
6533
  }
6246
6534
 
6247
- if (il == n_layer - 1) {
6248
- // skip computing output for unused tokens
6249
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6535
+ if (il == n_layer - 1 && inp_out_ids) {
6250
6536
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6251
6537
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6252
6538
  }
@@ -6315,6 +6601,8 @@ struct llm_build_stablelm : public llm_graph_context {
6315
6601
 
6316
6602
  auto * inp_attn = build_attn_inp_kv_unified();
6317
6603
 
6604
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6605
+
6318
6606
  for (int il = 0; il < n_layer; ++il) {
6319
6607
  // norm
6320
6608
  cur = build_norm(inpL,
@@ -6390,9 +6678,7 @@ struct llm_build_stablelm : public llm_graph_context {
6390
6678
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6391
6679
  }
6392
6680
 
6393
- if (il == n_layer - 1) {
6394
- // skip computing output for unused tokens
6395
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6681
+ if (il == n_layer - 1 && inp_out_ids) {
6396
6682
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6397
6683
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6398
6684
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -6467,6 +6753,8 @@ struct llm_build_qwen : public llm_graph_context {
6467
6753
 
6468
6754
  auto * inp_attn = build_attn_inp_kv_unified();
6469
6755
 
6756
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6757
+
6470
6758
  for (int il = 0; il < n_layer; ++il) {
6471
6759
  ggml_tensor * inpSA = inpL;
6472
6760
 
@@ -6513,9 +6801,7 @@ struct llm_build_qwen : public llm_graph_context {
6513
6801
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6514
6802
  }
6515
6803
 
6516
- if (il == n_layer - 1) {
6517
- // skip computing output for unused tokens
6518
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6804
+ if (il == n_layer - 1 && inp_out_ids) {
6519
6805
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6520
6806
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6521
6807
  }
@@ -6584,6 +6870,8 @@ struct llm_build_qwen2 : public llm_graph_context {
6584
6870
 
6585
6871
  auto * inp_attn = build_attn_inp_kv_unified();
6586
6872
 
6873
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6874
+
6587
6875
  for (int il = 0; il < n_layer; ++il) {
6588
6876
  ggml_tensor * inpSA = inpL;
6589
6877
 
@@ -6633,9 +6921,7 @@ struct llm_build_qwen2 : public llm_graph_context {
6633
6921
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6634
6922
  }
6635
6923
 
6636
- if (il == n_layer - 1) {
6637
- // skip computing output for unused tokens
6638
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6924
+ if (il == n_layer - 1 && inp_out_ids) {
6639
6925
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6640
6926
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6641
6927
  }
@@ -6705,6 +6991,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
6705
6991
  int sections[4];
6706
6992
  std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
6707
6993
 
6994
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6995
+
6708
6996
  for (int il = 0; il < n_layer; ++il) {
6709
6997
  ggml_tensor * inpSA = inpL;
6710
6998
 
@@ -6754,9 +7042,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
6754
7042
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6755
7043
  }
6756
7044
 
6757
- if (il == n_layer - 1) {
6758
- // skip computing output for unused tokens
6759
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7045
+ if (il == n_layer - 1 && inp_out_ids) {
6760
7046
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6761
7047
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6762
7048
  }
@@ -6823,6 +7109,8 @@ struct llm_build_qwen2moe : public llm_graph_context {
6823
7109
 
6824
7110
  auto * inp_attn = build_attn_inp_kv_unified();
6825
7111
 
7112
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7113
+
6826
7114
  for (int il = 0; il < n_layer; ++il) {
6827
7115
  ggml_tensor * inpSA = inpL;
6828
7116
 
@@ -6881,9 +7169,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
6881
7169
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6882
7170
  }
6883
7171
 
6884
- if (il == n_layer - 1) {
6885
- // skip computing output for unused tokens
6886
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7172
+ if (il == n_layer - 1 && inp_out_ids) {
6887
7173
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6888
7174
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6889
7175
  }
@@ -6982,6 +7268,8 @@ struct llm_build_qwen3 : public llm_graph_context {
6982
7268
 
6983
7269
  auto * inp_attn = build_attn_inp_kv_unified();
6984
7270
 
7271
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7272
+
6985
7273
  for (int il = 0; il < n_layer; ++il) {
6986
7274
  ggml_tensor * inpSA = inpL;
6987
7275
 
@@ -7034,9 +7322,7 @@ struct llm_build_qwen3 : public llm_graph_context {
7034
7322
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7035
7323
  }
7036
7324
 
7037
- if (il == n_layer - 1) {
7038
- // skip computing output for unused tokens
7039
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7325
+ if (il == n_layer - 1 && inp_out_ids) {
7040
7326
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7041
7327
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7042
7328
  }
@@ -7103,6 +7389,8 @@ struct llm_build_qwen3moe : public llm_graph_context {
7103
7389
 
7104
7390
  auto * inp_attn = build_attn_inp_kv_unified();
7105
7391
 
7392
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7393
+
7106
7394
  for (int il = 0; il < n_layer; ++il) {
7107
7395
  ggml_tensor * inpSA = inpL;
7108
7396
 
@@ -7155,9 +7443,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
7155
7443
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7156
7444
  }
7157
7445
 
7158
- if (il == n_layer - 1) {
7159
- // skip computing output for unused tokens
7160
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7446
+ if (il == n_layer - 1 && inp_out_ids) {
7161
7447
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7162
7448
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7163
7449
  }
@@ -7233,6 +7519,8 @@ struct llm_build_phi2 : public llm_graph_context {
7233
7519
 
7234
7520
  auto * inp_attn = build_attn_inp_kv_unified();
7235
7521
 
7522
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7523
+
7236
7524
  for (int il = 0; il < n_layer; ++il) {
7237
7525
  attn_norm_output = build_norm(inpL,
7238
7526
  model.layers[il].attn_norm,
@@ -7295,9 +7583,7 @@ struct llm_build_phi2 : public llm_graph_context {
7295
7583
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7296
7584
  }
7297
7585
 
7298
- if (il == n_layer - 1) {
7299
- // skip computing output for unused tokens
7300
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7586
+ if (il == n_layer - 1 && inp_out_ids) {
7301
7587
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7302
7588
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7303
7589
  attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
@@ -7369,6 +7655,8 @@ struct llm_build_phi3 : public llm_graph_context {
7369
7655
  inp_attn = build_attn_inp_kv_unified();
7370
7656
  }
7371
7657
 
7658
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7659
+
7372
7660
  for (int il = 0; il < n_layer; ++il) {
7373
7661
  auto * residual = inpL;
7374
7662
 
@@ -7432,9 +7720,7 @@ struct llm_build_phi3 : public llm_graph_context {
7432
7720
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7433
7721
  }
7434
7722
 
7435
- if (il == n_layer - 1) {
7436
- // skip computing output for unused tokens
7437
- ggml_tensor* inp_out_ids = build_inp_out_ids();
7723
+ if (il == n_layer - 1 && inp_out_ids) {
7438
7724
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7439
7725
  residual = ggml_get_rows(ctx0, residual, inp_out_ids);
7440
7726
  }
@@ -7520,15 +7806,16 @@ struct llm_build_plamo : public llm_graph_context {
7520
7806
 
7521
7807
  auto * inp_attn = build_attn_inp_kv_unified();
7522
7808
 
7523
- for (int il = 0; il < n_layer; ++il) {
7809
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7524
7810
 
7811
+ for (int il = 0; il < n_layer; ++il) {
7525
7812
  // norm
7526
7813
  cur = build_norm(inpL,
7527
7814
  model.layers[il].attn_norm, NULL,
7528
7815
  LLM_NORM_RMS, il);
7529
7816
  cb(cur, "attn_norm", il);
7530
7817
 
7531
- ggml_tensor * attention_norm = cur;
7818
+ ggml_tensor * sa_inp = cur;
7532
7819
 
7533
7820
  // self-attention
7534
7821
  {
@@ -7566,18 +7853,17 @@ struct llm_build_plamo : public llm_graph_context {
7566
7853
  model.layers[il].wo, NULL,
7567
7854
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7568
7855
  }
7569
- ggml_tensor * sa_out = cur;
7570
7856
 
7571
- cur = attention_norm;
7572
-
7573
- if (il == n_layer - 1) {
7574
- // skip computing output for unused tokens
7575
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7857
+ if (il == n_layer - 1 && inp_out_ids) {
7576
7858
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7577
- sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
7859
+ sa_inp = ggml_get_rows(ctx0, sa_inp, inp_out_ids);
7578
7860
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7579
7861
  }
7580
7862
 
7863
+ ggml_tensor * sa_out = cur;
7864
+
7865
+ cur = sa_inp;
7866
+
7581
7867
  // feed-forward network
7582
7868
  {
7583
7869
  cur = build_ffn(cur,
@@ -7642,6 +7928,8 @@ struct llm_build_gpt2 : public llm_graph_context {
7642
7928
  inpL = ggml_add(ctx0, inpL, pos);
7643
7929
  cb(inpL, "inpL", -1);
7644
7930
 
7931
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7932
+
7645
7933
  for (int il = 0; il < n_layer; ++il) {
7646
7934
  cur = build_norm(inpL,
7647
7935
  model.layers[il].attn_norm,
@@ -7674,9 +7962,7 @@ struct llm_build_gpt2 : public llm_graph_context {
7674
7962
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7675
7963
  }
7676
7964
 
7677
- if (il == n_layer - 1) {
7678
- // skip computing output for unused tokens
7679
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7965
+ if (il == n_layer - 1 && inp_out_ids) {
7680
7966
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7681
7967
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7682
7968
  }
@@ -7746,6 +8032,8 @@ struct llm_build_codeshell : public llm_graph_context {
7746
8032
 
7747
8033
  auto * inp_attn = build_attn_inp_kv_unified();
7748
8034
 
8035
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8036
+
7749
8037
  for (int il = 0; il < n_layer; ++il) {
7750
8038
  cur = build_norm(inpL,
7751
8039
  model.layers[il].attn_norm,
@@ -7790,9 +8078,7 @@ struct llm_build_codeshell : public llm_graph_context {
7790
8078
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7791
8079
  }
7792
8080
 
7793
- if (il == n_layer - 1) {
7794
- // skip computing output for unused tokens
7795
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8081
+ if (il == n_layer - 1 && inp_out_ids) {
7796
8082
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7797
8083
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7798
8084
  }
@@ -7846,128 +8132,128 @@ struct llm_build_codeshell : public llm_graph_context {
7846
8132
 
7847
8133
  struct llm_build_orion : public llm_graph_context {
7848
8134
  llm_build_orion(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7849
- const int64_t n_embd_head = hparams.n_embd_head_v;
8135
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7850
8136
 
7851
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7852
- GGML_ASSERT(n_embd_head == hparams.n_rot);
8137
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8138
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
7853
8139
 
7854
- ggml_tensor * cur;
7855
- ggml_tensor * inpL;
8140
+ ggml_tensor * cur;
8141
+ ggml_tensor * inpL;
7856
8142
 
7857
- inpL = build_inp_embd(model.tok_embd);
8143
+ inpL = build_inp_embd(model.tok_embd);
7858
8144
 
7859
- // inp_pos - contains the positions
7860
- ggml_tensor * inp_pos = build_inp_pos();
8145
+ // inp_pos - contains the positions
8146
+ ggml_tensor * inp_pos = build_inp_pos();
7861
8147
 
7862
- auto * inp_attn = build_attn_inp_kv_unified();
8148
+ auto * inp_attn = build_attn_inp_kv_unified();
7863
8149
 
7864
- for (int il = 0; il < n_layer; ++il) {
7865
- ggml_tensor * inpSA = inpL;
8150
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7866
8151
 
7867
- // norm
7868
- cur = build_norm(inpL,
7869
- model.layers[il].attn_norm, model.layers[il].attn_norm_b,
7870
- LLM_NORM, il);
7871
- cb(cur, "attn_norm", il);
8152
+ for (int il = 0; il < n_layer; ++il) {
8153
+ ggml_tensor * inpSA = inpL;
7872
8154
 
7873
- // self-attention
7874
- {
7875
- // compute Q and K and RoPE them
7876
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
7877
- cb(Qcur, "Qcur", il);
7878
- // if (model.layers[il].bq) {
7879
- // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
7880
- // cb(Qcur, "Qcur", il);
7881
- // }
7882
-
7883
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
7884
- cb(Kcur, "Kcur", il);
7885
- // if (model.layers[il].bk) {
7886
- // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
7887
- // cb(Kcur, "Kcur", il);
7888
- // }
7889
-
7890
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
7891
- cb(Vcur, "Vcur", il);
7892
- // if (model.layers[il].bv) {
7893
- // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
7894
- // cb(Vcur, "Vcur", il);
7895
- // }
7896
-
7897
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7898
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7899
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7900
-
7901
- Qcur = ggml_rope_ext(
7902
- ctx0, Qcur, inp_pos, nullptr,
7903
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7904
- ext_factor, attn_factor, beta_fast, beta_slow
7905
- );
8155
+ // norm
8156
+ cur = build_norm(inpL,
8157
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
8158
+ LLM_NORM, il);
8159
+ cb(cur, "attn_norm", il);
7906
8160
 
7907
- Kcur = ggml_rope_ext(
7908
- ctx0, Kcur, inp_pos, nullptr,
7909
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7910
- ext_factor, attn_factor, beta_fast, beta_slow
7911
- );
8161
+ // self-attention
8162
+ {
8163
+ // compute Q and K and RoPE them
8164
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
8165
+ cb(Qcur, "Qcur", il);
8166
+ // if (model.layers[il].bq) {
8167
+ // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8168
+ // cb(Qcur, "Qcur", il);
8169
+ // }
7912
8170
 
7913
- cb(Qcur, "Qcur", il);
7914
- cb(Kcur, "Kcur", il);
7915
- cb(Vcur, "Vcur", il);
8171
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
8172
+ cb(Kcur, "Kcur", il);
8173
+ // if (model.layers[il].bk) {
8174
+ // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8175
+ // cb(Kcur, "Kcur", il);
8176
+ // }
7916
8177
 
7917
- cur = build_attn(inp_attn, gf,
7918
- model.layers[il].wo, NULL,
7919
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7920
- }
8178
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
8179
+ cb(Vcur, "Vcur", il);
8180
+ // if (model.layers[il].bv) {
8181
+ // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8182
+ // cb(Vcur, "Vcur", il);
8183
+ // }
7921
8184
 
7922
- if (il == n_layer - 1) {
7923
- // skip computing output for unused tokens
7924
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7925
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7926
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7927
- }
8185
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8186
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8187
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7928
8188
 
7929
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7930
- cb(ffn_inp, "ffn_inp", il);
8189
+ Qcur = ggml_rope_ext(
8190
+ ctx0, Qcur, inp_pos, nullptr,
8191
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8192
+ ext_factor, attn_factor, beta_fast, beta_slow
8193
+ );
7931
8194
 
7932
- // feed-forward network
7933
- cur = build_norm(ffn_inp,
7934
- model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
7935
- LLM_NORM, il);
7936
- cb(cur, "ffn_norm", il);
8195
+ Kcur = ggml_rope_ext(
8196
+ ctx0, Kcur, inp_pos, nullptr,
8197
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8198
+ ext_factor, attn_factor, beta_fast, beta_slow
8199
+ );
7937
8200
 
7938
- cur = build_ffn(cur,
7939
- model.layers[il].ffn_up, NULL, NULL,
7940
- model.layers[il].ffn_gate, NULL, NULL,
7941
- model.layers[il].ffn_down, NULL, NULL,
7942
- NULL,
7943
- LLM_FFN_SILU, LLM_FFN_PAR, il);
7944
- cb(cur, "ffn_out", il);
8201
+ cb(Qcur, "Qcur", il);
8202
+ cb(Kcur, "Kcur", il);
8203
+ cb(Vcur, "Vcur", il);
8204
+
8205
+ cur = build_attn(inp_attn, gf,
8206
+ model.layers[il].wo, NULL,
8207
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8208
+ }
7945
8209
 
7946
- cur = ggml_add(ctx0, cur, ffn_inp);
8210
+ if (il == n_layer - 1 && inp_out_ids) {
8211
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8212
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8213
+ }
7947
8214
 
7948
- cur = build_cvec(cur, il);
7949
- cb(cur, "l_out", il);
8215
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8216
+ cb(ffn_inp, "ffn_inp", il);
7950
8217
 
7951
- // input for next layer
7952
- inpL = cur;
7953
- }
8218
+ // feed-forward network
8219
+ cur = build_norm(ffn_inp,
8220
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
8221
+ LLM_NORM, il);
8222
+ cb(cur, "ffn_norm", il);
8223
+
8224
+ cur = build_ffn(cur,
8225
+ model.layers[il].ffn_up, NULL, NULL,
8226
+ model.layers[il].ffn_gate, NULL, NULL,
8227
+ model.layers[il].ffn_down, NULL, NULL,
8228
+ NULL,
8229
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
8230
+ cb(cur, "ffn_out", il);
8231
+
8232
+ cur = ggml_add(ctx0, cur, ffn_inp);
8233
+
8234
+ cur = build_cvec(cur, il);
8235
+ cb(cur, "l_out", il);
7954
8236
 
7955
- cur = inpL;
8237
+ // input for next layer
8238
+ inpL = cur;
8239
+ }
8240
+
8241
+ cur = inpL;
7956
8242
 
7957
- cur = build_norm(cur,
7958
- model.output_norm, model.output_norm_b,
7959
- LLM_NORM, -1);
8243
+ cur = build_norm(cur,
8244
+ model.output_norm, model.output_norm_b,
8245
+ LLM_NORM, -1);
7960
8246
 
7961
- cb(cur, "result_norm", -1);
7962
- res->t_embd = cur;
8247
+ cb(cur, "result_norm", -1);
8248
+ res->t_embd = cur;
7963
8249
 
7964
- // lm_head
7965
- cur = build_lora_mm(model.output, cur);
8250
+ // lm_head
8251
+ cur = build_lora_mm(model.output, cur);
7966
8252
 
7967
- cb(cur, "result_output", -1);
7968
- res->t_logits = cur;
8253
+ cb(cur, "result_output", -1);
8254
+ res->t_logits = cur;
7969
8255
 
7970
- ggml_build_forward_expand(gf, cur);
8256
+ ggml_build_forward_expand(gf, cur);
7971
8257
  }
7972
8258
  };
7973
8259
 
@@ -7988,6 +8274,8 @@ struct llm_build_internlm2 : public llm_graph_context {
7988
8274
 
7989
8275
  auto * inp_attn = build_attn_inp_kv_unified();
7990
8276
 
8277
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8278
+
7991
8279
  for (int il = 0; il < n_layer; ++il) {
7992
8280
  ggml_tensor * inpSA = inpL;
7993
8281
 
@@ -8046,9 +8334,7 @@ struct llm_build_internlm2 : public llm_graph_context {
8046
8334
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8047
8335
  }
8048
8336
 
8049
- if (il == n_layer - 1) {
8050
- // skip computing output for unused tokens
8051
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8337
+ if (il == n_layer - 1 && inp_out_ids) {
8052
8338
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8053
8339
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8054
8340
  }
@@ -8124,6 +8410,8 @@ struct llm_build_minicpm3 : public llm_graph_context {
8124
8410
 
8125
8411
  auto * inp_attn = build_attn_inp_kv_unified();
8126
8412
 
8413
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8414
+
8127
8415
  for (int il = 0; il < n_layer; ++il) {
8128
8416
  ggml_tensor * inpSA = inpL;
8129
8417
 
@@ -8243,15 +8531,13 @@ struct llm_build_minicpm3 : public llm_graph_context {
8243
8531
  q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
8244
8532
  }
8245
8533
 
8246
- if (il == n_layer - 1) {
8247
- // skip computing output for unused tokens
8248
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8534
+ if (il == n_layer - 1 && inp_out_ids) {
8249
8535
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8250
8536
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8251
8537
  }
8252
8538
 
8253
8539
  // scale_res - scale the hidden states for residual connection
8254
- const float scale_res = scale_depth/sqrtf(float(n_layer));
8540
+ const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
8255
8541
  cur = ggml_scale(ctx0, cur, scale_res);
8256
8542
  cb(cur, "hidden_scaled", il);
8257
8543
 
@@ -8328,6 +8614,8 @@ struct llm_build_gemma : public llm_graph_context {
8328
8614
 
8329
8615
  auto * inp_attn = build_attn_inp_kv_unified();
8330
8616
 
8617
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8618
+
8331
8619
  for (int il = 0; il < n_layer; ++il) {
8332
8620
  // norm
8333
8621
  cur = build_norm(inpL,
@@ -8373,9 +8661,7 @@ struct llm_build_gemma : public llm_graph_context {
8373
8661
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
8374
8662
  }
8375
8663
 
8376
- if (il == n_layer - 1) {
8377
- // skip computing output for unused tokens
8378
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8664
+ if (il == n_layer - 1 && inp_out_ids) {
8379
8665
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8380
8666
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8381
8667
  }
@@ -8444,6 +8730,8 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
8444
8730
 
8445
8731
  auto * inp_attn = build_attn_inp_kv_unified_iswa();
8446
8732
 
8733
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8734
+
8447
8735
  for (int il = 0; il < n_layer; ++il) {
8448
8736
  // norm
8449
8737
  cur = build_norm(inpL,
@@ -8481,32 +8769,23 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
8481
8769
  cb(Kcur, "Kcur", il);
8482
8770
  cb(Vcur, "Vcur", il);
8483
8771
 
8484
- // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
8485
- switch (model.type) {
8486
- case LLM_TYPE_2B:
8487
- case LLM_TYPE_9B:
8488
- case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); break;
8489
- default: GGML_ABORT("fatal error");
8490
- };
8491
- cb(Qcur, "Qcur_scaled", il);
8772
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
8492
8773
 
8493
8774
  cur = build_attn(inp_attn, gf,
8494
8775
  model.layers[il].wo, NULL,
8495
8776
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
8496
8777
  }
8497
8778
 
8779
+ if (il == n_layer - 1 && inp_out_ids) {
8780
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8781
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8782
+ }
8783
+
8498
8784
  cur = build_norm(cur,
8499
8785
  model.layers[il].attn_post_norm, NULL,
8500
8786
  LLM_NORM_RMS, il);
8501
8787
  cb(cur, "attn_post_norm", il);
8502
8788
 
8503
- if (il == n_layer - 1) {
8504
- // skip computing output for unused tokens
8505
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8506
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8507
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8508
- }
8509
-
8510
8789
  ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
8511
8790
  cb(sa_out, "sa_out", il);
8512
8791
 
@@ -8585,6 +8864,8 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
8585
8864
  // TODO: is causal == true correct? might need some changes
8586
8865
  auto * inp_attn = build_attn_inp_kv_unified_iswa();
8587
8866
 
8867
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8868
+
8588
8869
  for (int il = 0; il < n_layer; ++il) {
8589
8870
  const float freq_base_l = model.get_rope_freq_base (cparams, il);
8590
8871
  const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
@@ -8629,9 +8910,17 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
8629
8910
  cb(Kcur, "Kcur", il);
8630
8911
  cb(Vcur, "Vcur", il);
8631
8912
 
8913
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
8914
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
8915
+
8632
8916
  cur = build_attn(inp_attn, gf,
8633
8917
  model.layers[il].wo, NULL,
8634
- Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
8918
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
8919
+ }
8920
+
8921
+ if (il == n_layer - 1 && inp_out_ids) {
8922
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8923
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8635
8924
  }
8636
8925
 
8637
8926
  cur = build_norm(cur,
@@ -8639,13 +8928,6 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
8639
8928
  LLM_NORM_RMS, il);
8640
8929
  cb(cur, "attn_post_norm", il);
8641
8930
 
8642
- if (il == n_layer - 1) {
8643
- // skip computing output for unused tokens
8644
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8645
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8646
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8647
- }
8648
-
8649
8931
  ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
8650
8932
  cb(sa_out, "sa_out", il);
8651
8933
 
@@ -8716,6 +8998,8 @@ struct llm_build_starcoder2 : public llm_graph_context {
8716
8998
 
8717
8999
  auto * inp_attn = build_attn_inp_kv_unified();
8718
9000
 
9001
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9002
+
8719
9003
  for (int il = 0; il < n_layer; ++il) {
8720
9004
  ggml_tensor * inpSA = inpL;
8721
9005
 
@@ -8774,9 +9058,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
8774
9058
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8775
9059
  }
8776
9060
 
8777
- if (il == n_layer - 1) {
8778
- // skip computing output for unused tokens
8779
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9061
+ if (il == n_layer - 1 && inp_out_ids) {
8780
9062
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8781
9063
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8782
9064
  }
@@ -8837,8 +9119,9 @@ struct llm_build_mamba : public llm_graph_context {
8837
9119
  // {n_embd, n_tokens}
8838
9120
  inpL = build_inp_embd(model.tok_embd);
8839
9121
 
8840
- ggml_tensor * state_copy = build_inp_s_copy();
8841
- ggml_tensor * state_mask = build_inp_s_mask();
9122
+ auto * rs_inp = build_rs_inp();
9123
+
9124
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8842
9125
 
8843
9126
  for (int il = 0; il < n_layer; ++il) {
8844
9127
  // norm
@@ -8847,12 +9130,9 @@ struct llm_build_mamba : public llm_graph_context {
8847
9130
  LLM_NORM_RMS, il);
8848
9131
  cb(cur, "attn_norm", il);
8849
9132
 
8850
- //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il);
8851
- cur = build_mamba_layer(gf, cur, state_copy, state_mask, ubatch, il);
9133
+ cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
8852
9134
 
8853
- if (il == n_layer - 1) {
8854
- // skip computing output for unused tokens
8855
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9135
+ if (il == n_layer - 1 && inp_out_ids) {
8856
9136
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8857
9137
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8858
9138
  }
@@ -8886,15 +9166,14 @@ struct llm_build_mamba : public llm_graph_context {
8886
9166
 
8887
9167
  // TODO: split
8888
9168
  ggml_tensor * build_mamba_layer(
8889
- ggml_cgraph * gf,
8890
- ggml_tensor * cur,
8891
- ggml_tensor * state_copy,
8892
- ggml_tensor * state_mask,
8893
- const llama_ubatch & ubatch,
8894
- int il) const {
8895
- const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
9169
+ llm_graph_input_rs * inp,
9170
+ ggml_cgraph * gf,
9171
+ ggml_tensor * cur,
9172
+ const llama_ubatch & ubatch,
9173
+ int il) const {
9174
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
8896
9175
 
8897
- const auto kv_head = kv_self->head;
9176
+ const auto kv_head = mctx_cur->get_head();
8898
9177
 
8899
9178
  const int64_t d_conv = hparams.ssm_d_conv;
8900
9179
  const int64_t d_inner = hparams.ssm_d_inner;
@@ -8912,17 +9191,17 @@ struct llm_build_mamba : public llm_graph_context {
8912
9191
  GGML_ASSERT(ubatch.equal_seqs);
8913
9192
  GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
8914
9193
 
8915
- ggml_tensor * conv_states_all = kv_self->k_l[il];
8916
- ggml_tensor * ssm_states_all = kv_self->v_l[il];
9194
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
9195
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
8917
9196
 
8918
9197
  // (ab)using the KV cache to store the states
8919
- ggml_tensor * conv = build_copy_mask_state(
8920
- gf, conv_states_all, state_copy, state_mask,
8921
- hparams.n_embd_k_s(), n_seqs);
9198
+ ggml_tensor * conv = build_rs(
9199
+ inp, gf, conv_states_all,
9200
+ hparams.n_embd_r(), n_seqs);
8922
9201
  conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
8923
- ggml_tensor * ssm = build_copy_mask_state(
8924
- gf, ssm_states_all, state_copy, state_mask,
8925
- hparams.n_embd_v_s(), n_seqs);
9202
+ ggml_tensor * ssm = build_rs(
9203
+ inp, gf, ssm_states_all,
9204
+ hparams.n_embd_s(), n_seqs);
8926
9205
  ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
8927
9206
 
8928
9207
  // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
@@ -9035,13 +9314,15 @@ struct llm_build_command_r : public llm_graph_context {
9035
9314
 
9036
9315
  auto * inp_attn = build_attn_inp_kv_unified();
9037
9316
 
9038
- for (int il = 0; il < n_layer; ++il) {
9317
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9039
9318
 
9319
+ for (int il = 0; il < n_layer; ++il) {
9040
9320
  // norm
9041
9321
  cur = build_norm(inpL,
9042
9322
  model.layers[il].attn_norm, NULL,
9043
9323
  LLM_NORM, il);
9044
9324
  cb(cur, "attn_norm", il);
9325
+
9045
9326
  ggml_tensor * ffn_inp = cur;
9046
9327
 
9047
9328
  // self-attention
@@ -9109,9 +9390,7 @@ struct llm_build_command_r : public llm_graph_context {
9109
9390
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9110
9391
  }
9111
9392
 
9112
- if (il == n_layer - 1) {
9113
- // skip computing output for unused tokens
9114
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9393
+ if (il == n_layer - 1 && inp_out_ids) {
9115
9394
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9116
9395
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9117
9396
  ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
@@ -9182,6 +9461,8 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
9182
9461
 
9183
9462
  auto * inp_attn = build_attn_inp_kv_unified_iswa();
9184
9463
 
9464
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9465
+
9185
9466
  for (int il = 0; il < n_layer; ++il) {
9186
9467
  const bool is_swa = hparams.is_swa(il);
9187
9468
 
@@ -9244,9 +9525,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
9244
9525
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9245
9526
  }
9246
9527
 
9247
- if (il == n_layer - 1) {
9248
- // skip computing output for unused tokens
9249
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9528
+ if (il == n_layer - 1 && inp_out_ids) {
9250
9529
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9251
9530
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9252
9531
  ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
@@ -9317,6 +9596,8 @@ struct llm_build_olmo : public llm_graph_context {
9317
9596
 
9318
9597
  auto * inp_attn = build_attn_inp_kv_unified();
9319
9598
 
9599
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9600
+
9320
9601
  for (int il = 0; il < n_layer; ++il) {
9321
9602
  ggml_tensor * inpSA = inpL;
9322
9603
 
@@ -9375,9 +9656,7 @@ struct llm_build_olmo : public llm_graph_context {
9375
9656
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9376
9657
  }
9377
9658
 
9378
- if (il == n_layer - 1) {
9379
- // skip computing output for unused tokens
9380
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9659
+ if (il == n_layer - 1 && inp_out_ids) {
9381
9660
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9382
9661
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
9383
9662
  }
@@ -9445,6 +9724,8 @@ struct llm_build_olmo2 : public llm_graph_context {
9445
9724
 
9446
9725
  auto * inp_attn = build_attn_inp_kv_unified();
9447
9726
 
9727
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9728
+
9448
9729
  for (int il = 0; il < n_layer; ++il) {
9449
9730
  ggml_tensor * inpSA = inpL;
9450
9731
 
@@ -9495,18 +9776,16 @@ struct llm_build_olmo2 : public llm_graph_context {
9495
9776
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9496
9777
  }
9497
9778
 
9779
+ if (il == n_layer - 1 && inp_out_ids) {
9780
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9781
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
9782
+ }
9783
+
9498
9784
  cur = build_norm(cur,
9499
9785
  model.layers[il].attn_post_norm, NULL,
9500
9786
  LLM_NORM_RMS, il);
9501
9787
  cb(cur, "attn_post_norm", il);
9502
9788
 
9503
- if (il == n_layer - 1) {
9504
- // skip computing output for unused tokens
9505
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9506
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9507
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
9508
- }
9509
-
9510
9789
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
9511
9790
  cb(ffn_inp, "ffn_inp", il);
9512
9791
 
@@ -9574,6 +9853,8 @@ struct llm_build_olmoe : public llm_graph_context {
9574
9853
 
9575
9854
  auto * inp_attn = build_attn_inp_kv_unified();
9576
9855
 
9856
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9857
+
9577
9858
  for (int il = 0; il < n_layer; ++il) {
9578
9859
  ggml_tensor * inpSA = inpL;
9579
9860
 
@@ -9628,9 +9909,7 @@ struct llm_build_olmoe : public llm_graph_context {
9628
9909
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9629
9910
  }
9630
9911
 
9631
- if (il == n_layer - 1) {
9632
- // skip computing output for unused tokens
9633
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9912
+ if (il == n_layer - 1 && inp_out_ids) {
9634
9913
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9635
9914
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
9636
9915
  }
@@ -9700,6 +9979,8 @@ struct llm_build_openelm : public llm_graph_context {
9700
9979
 
9701
9980
  auto * inp_attn = build_attn_inp_kv_unified();
9702
9981
 
9982
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9983
+
9703
9984
  for (int il = 0; il < n_layer; ++il) {
9704
9985
  const int64_t n_head = hparams.n_head(il);
9705
9986
  const int64_t n_head_kv = hparams.n_head_kv(il);
@@ -9761,11 +10042,9 @@ struct llm_build_openelm : public llm_graph_context {
9761
10042
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9762
10043
  }
9763
10044
 
9764
- if (il == n_layer - 1) {
9765
- // skip computing output for unused tokens
9766
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10045
+ if (il == n_layer - 1 && inp_out_ids) {
9767
10046
  residual = ggml_get_rows(ctx0, residual, inp_out_ids);
9768
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10047
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9769
10048
  }
9770
10049
 
9771
10050
  ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
@@ -9831,6 +10110,8 @@ struct llm_build_gptneox : public llm_graph_context {
9831
10110
 
9832
10111
  auto * inp_attn = build_attn_inp_kv_unified();
9833
10112
 
10113
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10114
+
9834
10115
  for (int il = 0; il < n_layer; ++il) {
9835
10116
  cur = build_norm(inpL,
9836
10117
  model.layers[il].attn_norm,
@@ -9875,9 +10156,7 @@ struct llm_build_gptneox : public llm_graph_context {
9875
10156
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9876
10157
  }
9877
10158
 
9878
- if (il == n_layer - 1) {
9879
- // skip computing output for unused tokens
9880
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10159
+ if (il == n_layer - 1 && inp_out_ids) {
9881
10160
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9882
10161
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9883
10162
  }
@@ -9979,6 +10258,8 @@ struct llm_build_arctic : public llm_graph_context {
9979
10258
 
9980
10259
  auto * inp_attn = build_attn_inp_kv_unified();
9981
10260
 
10261
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10262
+
9982
10263
  for (int il = 0; il < n_layer; ++il) {
9983
10264
  ggml_tensor * inpSA = inpL;
9984
10265
 
@@ -10025,9 +10306,7 @@ struct llm_build_arctic : public llm_graph_context {
10025
10306
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10026
10307
  }
10027
10308
 
10028
- if (il == n_layer - 1) {
10029
- // skip computing output for unused tokens
10030
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10309
+ if (il == n_layer - 1 && inp_out_ids) {
10031
10310
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10032
10311
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10033
10312
  }
@@ -10119,6 +10398,8 @@ struct llm_build_deepseek : public llm_graph_context {
10119
10398
 
10120
10399
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
10121
10400
 
10401
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10402
+
10122
10403
  for (int il = 0; il < n_layer; ++il) {
10123
10404
  ggml_tensor * inpSA = inpL;
10124
10405
 
@@ -10180,14 +10461,11 @@ struct llm_build_deepseek : public llm_graph_context {
10180
10461
  Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
10181
10462
  }
10182
10463
 
10183
- if (il == n_layer - 1) {
10184
- // skip computing output for unused tokens
10185
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10464
+ if (il == n_layer - 1 && inp_out_ids) {
10186
10465
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10187
10466
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10188
10467
  }
10189
10468
 
10190
-
10191
10469
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
10192
10470
  cb(ffn_inp, "ffn_inp", il);
10193
10471
 
@@ -10295,6 +10573,8 @@ struct llm_build_deepseek2 : public llm_graph_context {
10295
10573
 
10296
10574
  auto * inp_attn = build_attn_inp_kv_unified();
10297
10575
 
10576
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10577
+
10298
10578
  for (int il = 0; il < n_layer; ++il) {
10299
10579
  ggml_tensor * inpSA = inpL;
10300
10580
 
@@ -10444,9 +10724,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
10444
10724
  }
10445
10725
  }
10446
10726
 
10447
- if (il == n_layer - 1) {
10448
- // skip computing output for unused tokens
10449
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10727
+ if (il == n_layer - 1 && inp_out_ids) {
10450
10728
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10451
10729
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10452
10730
  }
@@ -10542,6 +10820,8 @@ struct llm_build_bitnet : public llm_graph_context {
10542
10820
 
10543
10821
  auto * inp_attn = build_attn_inp_kv_unified();
10544
10822
 
10823
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10824
+
10545
10825
  for (int il = 0; il < n_layer; ++il) {
10546
10826
  ggml_tensor * inpSA = inpL;
10547
10827
 
@@ -10624,9 +10904,7 @@ struct llm_build_bitnet : public llm_graph_context {
10624
10904
  cb(cur, "attn_o_out", il);
10625
10905
  }
10626
10906
 
10627
- if (il == n_layer - 1) {
10628
- // skip computing output for unused tokens
10629
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10907
+ if (il == n_layer - 1 && inp_out_ids) {
10630
10908
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10631
10909
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10632
10910
  }
@@ -10701,6 +10979,8 @@ struct llm_build_t5_enc : public llm_graph_context {
10701
10979
 
10702
10980
  auto * inp_attn = build_attn_inp_no_cache();
10703
10981
 
10982
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10983
+
10704
10984
  for (int il = 0; il < n_layer; ++il) {
10705
10985
  ggml_tensor * inpSA = inpL;
10706
10986
 
@@ -10734,9 +11014,7 @@ struct llm_build_t5_enc : public llm_graph_context {
10734
11014
  cb(cur, "kqv_out", il);
10735
11015
  }
10736
11016
 
10737
- if (il == n_layer - 1) {
10738
- // skip computing output for unused tokens
10739
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11017
+ if (il == n_layer - 1 && inp_out_ids) {
10740
11018
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10741
11019
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10742
11020
  }
@@ -10807,6 +11085,8 @@ struct llm_build_t5_dec : public llm_graph_context {
10807
11085
  auto * inp_attn_self = build_attn_inp_kv_unified();
10808
11086
  auto * inp_attn_cross = build_attn_inp_cross();
10809
11087
 
11088
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11089
+
10810
11090
  for (int il = 0; il < n_layer; ++il) {
10811
11091
  ggml_tensor * inpSA = inpL;
10812
11092
 
@@ -10898,11 +11178,8 @@ struct llm_build_t5_dec : public llm_graph_context {
10898
11178
  //cb(cur, "kqv_out", il);
10899
11179
  }
10900
11180
 
10901
- if (il == n_layer - 1) {
10902
- // skip computing output for unused tokens
10903
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11181
+ if (il == n_layer - 1 && inp_out_ids) {
10904
11182
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10905
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10906
11183
  inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
10907
11184
  }
10908
11185
 
@@ -10972,6 +11249,8 @@ struct llm_build_jais : public llm_graph_context {
10972
11249
 
10973
11250
  auto * inp_attn = build_attn_inp_kv_unified();
10974
11251
 
11252
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11253
+
10975
11254
  for (int il = 0; il < n_layer; ++il) {
10976
11255
  cur = build_norm(inpL,
10977
11256
  model.layers[il].attn_norm,
@@ -11004,9 +11283,7 @@ struct llm_build_jais : public llm_graph_context {
11004
11283
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
11005
11284
  }
11006
11285
 
11007
- if (il == n_layer - 1) {
11008
- // skip computing output for unused tokens
11009
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11286
+ if (il == n_layer - 1 && inp_out_ids) {
11010
11287
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11011
11288
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
11012
11289
  }
@@ -11070,6 +11347,8 @@ struct llm_build_chatglm : public llm_graph_context {
11070
11347
 
11071
11348
  auto * inp_attn = build_attn_inp_kv_unified();
11072
11349
 
11350
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11351
+
11073
11352
  for (int il = 0; il < n_layer; ++il) {
11074
11353
  ggml_tensor * inpSA = inpL;
11075
11354
 
@@ -11136,9 +11415,7 @@ struct llm_build_chatglm : public llm_graph_context {
11136
11415
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11137
11416
  }
11138
11417
 
11139
- if (il == n_layer - 1) {
11140
- // skip computing output for unused tokens
11141
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11418
+ if (il == n_layer - 1 && inp_out_ids) {
11142
11419
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11143
11420
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11144
11421
  }
@@ -11203,6 +11480,8 @@ struct llm_build_glm4 : public llm_graph_context {
11203
11480
 
11204
11481
  auto * inp_attn = build_attn_inp_kv_unified();
11205
11482
 
11483
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11484
+
11206
11485
  for (int il = 0; il < n_layer; ++il) {
11207
11486
  ggml_tensor * inpSA = inpL;
11208
11487
 
@@ -11269,9 +11548,7 @@ struct llm_build_glm4 : public llm_graph_context {
11269
11548
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11270
11549
  }
11271
11550
 
11272
- if (il == n_layer - 1) {
11273
- // skip computing output for unused tokens
11274
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11551
+ if (il == n_layer - 1 && inp_out_ids) {
11275
11552
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11276
11553
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11277
11554
  }
@@ -11354,6 +11631,8 @@ struct llm_build_nemotron : public llm_graph_context {
11354
11631
 
11355
11632
  auto * inp_attn = build_attn_inp_kv_unified();
11356
11633
 
11634
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11635
+
11357
11636
  for (int il = 0; il < n_layer; ++il) {
11358
11637
  ggml_tensor * inpSA = inpL;
11359
11638
 
@@ -11413,9 +11692,7 @@ struct llm_build_nemotron : public llm_graph_context {
11413
11692
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11414
11693
  }
11415
11694
 
11416
- if (il == n_layer - 1) {
11417
- // skip computing output for unused tokens
11418
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11695
+ if (il == n_layer - 1 && inp_out_ids) {
11419
11696
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11420
11697
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11421
11698
  }
@@ -11483,6 +11760,8 @@ struct llm_build_exaone : public llm_graph_context {
11483
11760
 
11484
11761
  auto * inp_attn = build_attn_inp_kv_unified();
11485
11762
 
11763
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11764
+
11486
11765
  for (int il = 0; il < n_layer; ++il) {
11487
11766
  ggml_tensor * inpSA = inpL;
11488
11767
 
@@ -11544,9 +11823,7 @@ struct llm_build_exaone : public llm_graph_context {
11544
11823
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11545
11824
  }
11546
11825
 
11547
- if (il == n_layer - 1) {
11548
- // skip computing output for unused tokens
11549
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11826
+ if (il == n_layer - 1 && inp_out_ids) {
11550
11827
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11551
11828
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11552
11829
  }
@@ -11633,14 +11910,13 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11633
11910
  }
11634
11911
 
11635
11912
  ggml_tensor * build_rwkv6_time_mix(
11913
+ llm_graph_input_rs * inp,
11636
11914
  ggml_cgraph * gf,
11637
11915
  ggml_tensor * cur,
11638
11916
  ggml_tensor * x_prev,
11639
- ggml_tensor * state_copy,
11640
- ggml_tensor * state_mask,
11641
11917
  const llama_ubatch & ubatch,
11642
11918
  int il) const {
11643
- const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
11919
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
11644
11920
 
11645
11921
  const auto n_tokens = ubatch.n_tokens;
11646
11922
  const auto n_seqs = ubatch.n_seqs;
@@ -11650,7 +11926,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11650
11926
  const auto n_head = n_embd / head_size;
11651
11927
  const auto n_head_kv = hparams.n_head_kv(il);
11652
11928
 
11653
- const auto kv_head = kv_self->head;
11929
+ const auto kv_head = mctx_cur->get_head();
11654
11930
 
11655
11931
  const auto & layer = model.layers[il];
11656
11932
 
@@ -11761,9 +12037,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11761
12037
  k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
11762
12038
  }
11763
12039
 
11764
- ggml_tensor * wkv_state = build_copy_mask_state(
11765
- gf, kv_self->v_l[il], state_copy, state_mask,
11766
- hparams.n_embd_v_s(), n_seqs);
12040
+ ggml_tensor * wkv_state = build_rs(
12041
+ inp, gf, mctx_cur->get_s_l(il),
12042
+ hparams.n_embd_s(), n_seqs);
11767
12043
 
11768
12044
  ggml_tensor * wkv_output;
11769
12045
  if (is_qrwkv) {
@@ -11781,9 +12057,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11781
12057
  wkv_state,
11782
12058
  ggml_view_1d(
11783
12059
  ctx0,
11784
- kv_self->v_l[il],
11785
- hparams.n_embd_v_s() * n_seqs,
11786
- hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il])
12060
+ mctx_cur->get_s_l(il),
12061
+ hparams.n_embd_s() * n_seqs,
12062
+ hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
11787
12063
  )
11788
12064
  )
11789
12065
  );
@@ -11817,20 +12093,19 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11817
12093
  inpL = build_inp_embd(model.tok_embd);
11818
12094
  inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
11819
12095
 
11820
- ggml_tensor * state_copy = build_inp_s_copy();
11821
- ggml_tensor * state_mask = build_inp_s_mask();
12096
+ auto * rs_inp = build_rs_inp();
11822
12097
 
11823
12098
  const auto n_embd = hparams.n_embd;
11824
12099
  const auto n_seq_tokens = ubatch.n_seq_tokens;
11825
12100
  const auto n_seqs = ubatch.n_seqs;
11826
12101
 
12102
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12103
+
11827
12104
  for (int il = 0; il < n_layer; ++il) {
11828
12105
  const llama_layer * layer = &model.layers[il];
11829
12106
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
11830
12107
 
11831
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
11832
- gf, state_copy, state_mask, ubatch, il
11833
- );
12108
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
11834
12109
 
11835
12110
  ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
11836
12111
  ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
@@ -11845,7 +12120,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11845
12120
  1
11846
12121
  );
11847
12122
 
11848
- cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
12123
+ cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
11849
12124
 
11850
12125
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
11851
12126
  cb(ffn_inp, "ffn_inp", il);
@@ -11867,13 +12142,16 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11867
12142
  );
11868
12143
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
11869
12144
 
11870
- if (il == n_layer - 1) {
11871
- // skip computing output for unused tokens
11872
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
11873
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
11874
- ffn_norm = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens), inp_out_ids);
11875
- x_prev = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids);
11876
- cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
12145
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
12146
+ ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
12147
+ x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
12148
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
12149
+
12150
+ if (il == n_layer - 1 && inp_out_ids) {
12151
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
12152
+ ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
12153
+ x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
12154
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11877
12155
  }
11878
12156
 
11879
12157
  cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
@@ -11908,27 +12186,26 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11908
12186
  // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
11909
12187
  struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
11910
12188
  llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
11911
- GGML_ASSERT(n_embd == hparams.n_embd_k_s());
12189
+ GGML_ASSERT(n_embd == hparams.n_embd_r());
11912
12190
 
11913
12191
  ggml_tensor * cur;
11914
12192
  ggml_tensor * inpL;
11915
12193
 
11916
12194
  inpL = build_inp_embd(model.tok_embd);
11917
12195
 
11918
- ggml_tensor * state_copy = build_inp_s_copy();
11919
- ggml_tensor * state_mask = build_inp_s_mask();
12196
+ auto * rs_inp = build_rs_inp();
11920
12197
 
11921
12198
  const auto n_embd = hparams.n_embd;
11922
12199
  const auto n_seq_tokens = ubatch.n_seq_tokens;
11923
12200
  const auto n_seqs = ubatch.n_seqs;
11924
12201
 
12202
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12203
+
11925
12204
  for (int il = 0; il < n_layer; ++il) {
11926
12205
  const llama_layer * layer = &model.layers[il];
11927
12206
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
11928
12207
 
11929
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
11930
- gf, state_copy, state_mask, ubatch, il
11931
- );
12208
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
11932
12209
 
11933
12210
  ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
11934
12211
  cb(att_norm, "attn_norm", il);
@@ -11940,7 +12217,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
11940
12217
  1
11941
12218
  );
11942
12219
 
11943
- cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
12220
+ cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
11944
12221
 
11945
12222
  token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
11946
12223
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -11948,11 +12225,12 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
11948
12225
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
11949
12226
  cb(ffn_inp, "ffn_inp", il);
11950
12227
 
11951
- if (il == n_layer - 1) {
11952
- // skip computing output for unused tokens
11953
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
11954
- cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
11955
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
12228
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
12229
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
12230
+
12231
+ if (il == n_layer - 1 && inp_out_ids) {
12232
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12233
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
11956
12234
  }
11957
12235
 
11958
12236
  // feed-forward network
@@ -12028,15 +12306,14 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12028
12306
  }
12029
12307
 
12030
12308
  ggml_tensor * build_rwkv7_time_mix(
12309
+ llm_graph_input_rs * inp,
12031
12310
  ggml_cgraph * gf,
12032
12311
  ggml_tensor * cur,
12033
12312
  ggml_tensor * x_prev,
12034
- ggml_tensor * state_copy,
12035
- ggml_tensor * state_mask,
12036
12313
  ggml_tensor *& first_layer_value,
12037
12314
  const llama_ubatch & ubatch,
12038
12315
  int il) const {
12039
- const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
12316
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
12040
12317
 
12041
12318
  const auto n_tokens = ubatch.n_tokens;
12042
12319
  const auto n_seqs = ubatch.n_seqs;
@@ -12045,7 +12322,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12045
12322
  const auto head_count = n_embd / head_size;
12046
12323
  const auto n_seq_tokens = ubatch.n_seq_tokens;
12047
12324
 
12048
- const auto kv_head = kv_self->head;
12325
+ const auto kv_head = mctx_cur->get_head();
12049
12326
 
12050
12327
  const auto & layer = model.layers[il];
12051
12328
 
@@ -12115,9 +12392,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12115
12392
  v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
12116
12393
  a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
12117
12394
 
12118
- ggml_tensor * wkv_state = build_copy_mask_state(
12119
- gf, kv_self->v_l[il], state_copy, state_mask,
12120
- hparams.n_embd_v_s(), n_seqs);
12395
+ ggml_tensor * wkv_state = build_rs(
12396
+ inp, gf, mctx_cur->get_s_l(il),
12397
+ hparams.n_embd_s(), n_seqs);
12121
12398
 
12122
12399
  ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
12123
12400
  cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
@@ -12130,9 +12407,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12130
12407
  wkv_state,
12131
12408
  ggml_view_1d(
12132
12409
  ctx0,
12133
- kv_self->v_l[il],
12134
- hparams.n_embd_v_s() * n_seqs,
12135
- hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il])
12410
+ mctx_cur->get_s_l(il),
12411
+ hparams.n_embd_s() * n_seqs,
12412
+ hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
12136
12413
  )
12137
12414
  )
12138
12415
  );
@@ -12173,20 +12450,19 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12173
12450
  inpL = build_inp_embd(model.tok_embd);
12174
12451
  inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
12175
12452
 
12176
- ggml_tensor * state_copy = build_inp_s_copy();
12177
- ggml_tensor * state_mask = build_inp_s_mask();
12453
+ auto * rs_inp = build_rs_inp();
12178
12454
 
12179
12455
  const auto n_embd = hparams.n_embd;
12180
12456
  const auto n_seq_tokens = ubatch.n_seq_tokens;
12181
12457
  const auto n_seqs = ubatch.n_seqs;
12182
12458
 
12459
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12460
+
12183
12461
  for (int il = 0; il < n_layer; ++il) {
12184
12462
  const llama_layer * layer = &model.layers[il];
12185
12463
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
12186
12464
 
12187
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
12188
- gf, state_copy, state_mask, ubatch, il
12189
- );
12465
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
12190
12466
 
12191
12467
  ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
12192
12468
  ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
@@ -12201,7 +12477,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12201
12477
  1
12202
12478
  );
12203
12479
 
12204
- cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
12480
+ cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
12205
12481
 
12206
12482
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
12207
12483
  cb(ffn_inp, "ffn_inp", il);
@@ -12223,12 +12499,14 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12223
12499
  );
12224
12500
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
12225
12501
 
12226
- if (il == n_layer - 1) {
12227
- // skip computing output for unused tokens
12228
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
12229
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
12230
- ffn_norm = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens), inp_out_ids);
12231
- x_prev = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids);
12502
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
12503
+ ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
12504
+ x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
12505
+
12506
+ if (il == n_layer - 1 && inp_out_ids) {
12507
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
12508
+ ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
12509
+ x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
12232
12510
  }
12233
12511
 
12234
12512
  cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
@@ -12259,7 +12537,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12259
12537
 
12260
12538
  struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12261
12539
  llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
12262
- GGML_ASSERT(n_embd == hparams.n_embd_k_s());
12540
+ GGML_ASSERT(n_embd == hparams.n_embd_r());
12263
12541
 
12264
12542
  ggml_tensor * cur;
12265
12543
  ggml_tensor * inpL;
@@ -12267,20 +12545,19 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12267
12545
 
12268
12546
  inpL = build_inp_embd(model.tok_embd);
12269
12547
 
12270
- ggml_tensor * state_copy = build_inp_s_copy();
12271
- ggml_tensor * state_mask = build_inp_s_mask();
12548
+ auto * rs_inp = build_rs_inp();
12272
12549
 
12273
12550
  const auto n_embd = hparams.n_embd;
12274
12551
  const auto n_seq_tokens = ubatch.n_seq_tokens;
12275
12552
  const auto n_seqs = ubatch.n_seqs;
12276
12553
 
12554
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12555
+
12277
12556
  for (int il = 0; il < n_layer; ++il) {
12278
12557
  const llama_layer * layer = &model.layers[il];
12279
12558
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
12280
12559
 
12281
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
12282
- gf, state_copy, state_mask, ubatch, il
12283
- );
12560
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
12284
12561
 
12285
12562
  ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
12286
12563
  cb(att_norm, "attn_norm", il);
@@ -12292,7 +12569,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12292
12569
  1
12293
12570
  );
12294
12571
 
12295
- cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
12572
+ cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
12296
12573
 
12297
12574
  token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
12298
12575
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -12300,11 +12577,12 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12300
12577
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
12301
12578
  cb(ffn_inp, "ffn_inp", il);
12302
12579
 
12303
- if (il == n_layer - 1) {
12304
- // skip computing output for unused tokens
12305
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
12306
- cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
12307
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
12580
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
12581
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
12582
+
12583
+ if (il == n_layer - 1 && inp_out_ids) {
12584
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12585
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
12308
12586
  }
12309
12587
 
12310
12588
  // feed-forward network
@@ -12373,6 +12651,9 @@ struct llm_build_granite : public llm_graph_context {
12373
12651
  auto * inp_attn = build_attn_inp_kv_unified();
12374
12652
 
12375
12653
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
12654
+
12655
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12656
+
12376
12657
  for (int il = 0; il < n_layer; ++il) {
12377
12658
  ggml_tensor * inpSA = inpL;
12378
12659
 
@@ -12435,9 +12716,7 @@ struct llm_build_granite : public llm_graph_context {
12435
12716
  cb(cur, "attn_out", il);
12436
12717
  }
12437
12718
 
12438
- if (il == n_layer - 1) {
12439
- // skip computing output for unused tokens
12440
- ggml_tensor * inp_out_ids = build_inp_out_ids();
12719
+ if (il == n_layer - 1 && inp_out_ids) {
12441
12720
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12442
12721
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
12443
12722
  }
@@ -12556,6 +12835,8 @@ struct llm_build_chameleon : public llm_graph_context {
12556
12835
 
12557
12836
  auto * inp_attn = build_attn_inp_kv_unified();
12558
12837
 
12838
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12839
+
12559
12840
  for (int il = 0; il < n_layer; ++il) {
12560
12841
  ggml_tensor * inpSA = inpL;
12561
12842
 
@@ -12632,21 +12913,19 @@ struct llm_build_chameleon : public llm_graph_context {
12632
12913
  cur = build_attn(inp_attn, gf,
12633
12914
  model.layers[il].wo, nullptr,
12634
12915
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12635
-
12636
- if (hparams.swin_norm) {
12637
- cur = build_norm(cur,
12638
- model.layers[il].attn_norm, NULL,
12639
- LLM_NORM_RMS, il);
12640
- }
12641
12916
  }
12642
12917
 
12643
- if (il == n_layer - 1) {
12644
- // skip computing output for unused tokens
12645
- ggml_tensor * inp_out_ids = build_inp_out_ids();
12918
+ if (il == n_layer - 1 && inp_out_ids) {
12646
12919
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12647
12920
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
12648
12921
  }
12649
12922
 
12923
+ if (hparams.swin_norm) {
12924
+ cur = build_norm(cur,
12925
+ model.layers[il].attn_norm, NULL,
12926
+ LLM_NORM_RMS, il);
12927
+ }
12928
+
12650
12929
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
12651
12930
  cb(ffn_inp, "ffn_inp", il);
12652
12931
 
@@ -12887,6 +13166,8 @@ struct llm_build_plm : public llm_graph_context {
12887
13166
 
12888
13167
  auto * inp_attn = build_attn_inp_kv_unified();
12889
13168
 
13169
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13170
+
12890
13171
  for (int il = 0; il < n_layer; ++il) {
12891
13172
  ggml_tensor * inpSA = inpL;
12892
13173
 
@@ -12990,9 +13271,7 @@ struct llm_build_plm : public llm_graph_context {
12990
13271
  q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
12991
13272
  }
12992
13273
 
12993
- if (il == n_layer - 1) {
12994
- // skip computing output for unused tokens
12995
- ggml_tensor * inp_out_ids = build_inp_out_ids();
13274
+ if (il == n_layer - 1 && inp_out_ids) {
12996
13275
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12997
13276
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
12998
13277
  }
@@ -13052,6 +13331,8 @@ struct llm_build_bailingmoe : public llm_graph_context {
13052
13331
 
13053
13332
  auto * inp_attn = build_attn_inp_kv_unified();
13054
13333
 
13334
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13335
+
13055
13336
  for (int il = 0; il < n_layer; ++il) {
13056
13337
  ggml_tensor * inpSA = inpL;
13057
13338
 
@@ -13113,9 +13394,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
13113
13394
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
13114
13395
  }
13115
13396
 
13116
- if (il == n_layer - 1) {
13117
- // skip computing output for unused tokens
13118
- ggml_tensor * inp_out_ids = build_inp_out_ids();
13397
+ if (il == n_layer - 1 && inp_out_ids) {
13119
13398
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13120
13399
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13121
13400
  }
@@ -13184,69 +13463,375 @@ struct llm_build_bailingmoe : public llm_graph_context {
13184
13463
  }
13185
13464
  };
13186
13465
 
13466
+ struct llm_build_dots1 : public llm_graph_context {
13467
+ llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
13468
+ const int64_t n_embd_head = hparams.n_embd_head_v;
13469
+
13470
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13471
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
13472
+
13473
+ ggml_tensor * cur;
13474
+ ggml_tensor * inpL;
13475
+
13476
+ inpL = build_inp_embd(model.tok_embd);
13477
+
13478
+ // inp_pos - contains the positions
13479
+ ggml_tensor * inp_pos = build_inp_pos();
13480
+
13481
+ auto * inp_attn = build_attn_inp_kv_unified();
13482
+
13483
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13484
+
13485
+ for (int il = 0; il < n_layer; ++il) {
13486
+ ggml_tensor * inpSA = inpL;
13487
+
13488
+ // norm
13489
+ cur = build_norm(inpL,
13490
+ model.layers[il].attn_norm, NULL,
13491
+ LLM_NORM_RMS, il);
13492
+ cb(cur, "attn_norm", il);
13493
+
13494
+ // self_attention
13495
+ {
13496
+ // compute Q and K and RoPE them
13497
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
13498
+ cb(Qcur, "Qcur", il);
13499
+
13500
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
13501
+ cb(Kcur, "Kcur", il);
13502
+
13503
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
13504
+ cb(Vcur, "Vcur", il);
13505
+
13506
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13507
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13508
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13509
+
13510
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
13511
+ cb(Qcur, "Qcur_normed", il);
13512
+
13513
+ Qcur = ggml_rope_ext(
13514
+ ctx0, Qcur, inp_pos, nullptr,
13515
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13516
+ ext_factor, attn_factor, beta_fast, beta_slow
13517
+ );
13518
+
13519
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
13520
+ cb(Kcur, "Kcur_normed", il);
13521
+
13522
+ Kcur = ggml_rope_ext(
13523
+ ctx0, Kcur, inp_pos, nullptr,
13524
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13525
+ ext_factor, attn_factor, beta_fast, beta_slow
13526
+ );
13527
+
13528
+ cb(Qcur, "Qcur", il);
13529
+ cb(Kcur, "Kcur", il);
13530
+ cb(Vcur, "Vcur", il);
13531
+
13532
+ cur = build_attn(inp_attn, gf,
13533
+ model.layers[il].wo, model.layers[il].bo,
13534
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13535
+ }
13536
+
13537
+ if (il == n_layer - 1 && inp_out_ids) {
13538
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13539
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13540
+ }
13541
+
13542
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13543
+ cb(ffn_inp, "ffn_inp", il);
13544
+
13545
+ // MoE branch
13546
+ cur = build_norm(ffn_inp,
13547
+ model.layers[il].ffn_norm, NULL,
13548
+ LLM_NORM_RMS, il);
13549
+ cb(cur, "ffn_norm", il);
13550
+
13551
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
13552
+ cur = build_ffn(cur,
13553
+ model.layers[il].ffn_up, NULL, NULL,
13554
+ model.layers[il].ffn_gate, NULL, NULL,
13555
+ model.layers[il].ffn_down, NULL, NULL,
13556
+ NULL,
13557
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
13558
+ cb(cur, "ffn_out", il);
13559
+ } else {
13560
+ ggml_tensor * moe_out =
13561
+ build_moe_ffn(cur,
13562
+ model.layers[il].ffn_gate_inp,
13563
+ model.layers[il].ffn_up_exps,
13564
+ model.layers[il].ffn_gate_exps,
13565
+ model.layers[il].ffn_down_exps,
13566
+ model.layers[il].ffn_exp_probs_b,
13567
+ n_expert, n_expert_used,
13568
+ LLM_FFN_SILU, hparams.expert_weights_norm,
13569
+ true, hparams.expert_weights_scale,
13570
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
13571
+ il);
13572
+ cb(moe_out, "ffn_moe_out", il);
13573
+
13574
+ {
13575
+ ggml_tensor * ffn_shexp = build_ffn(cur,
13576
+ model.layers[il].ffn_up_shexp, NULL, NULL,
13577
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
13578
+ model.layers[il].ffn_down_shexp, NULL, NULL,
13579
+ NULL,
13580
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
13581
+ cb(ffn_shexp, "ffn_shexp", il);
13582
+
13583
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
13584
+ cb(cur, "ffn_out", il);
13585
+ }
13586
+ }
13587
+
13588
+ cur = ggml_add(ctx0, cur, ffn_inp);
13589
+
13590
+ cur = build_cvec(cur, il);
13591
+ cb(cur, "l_out", il);
13592
+
13593
+ // input for next layer
13594
+ inpL = cur;
13595
+ }
13596
+
13597
+ cur = inpL;
13598
+
13599
+ cur = build_norm(cur,
13600
+ model.output_norm, NULL,
13601
+ LLM_NORM_RMS, -1);
13602
+
13603
+ cb(cur, "result_norm", -1);
13604
+ res->t_embd = cur;
13605
+
13606
+ // lm_head
13607
+ cur = build_lora_mm(model.output, cur);
13608
+
13609
+ cb(cur, "result_output", -1);
13610
+ res->t_logits = cur;
13611
+
13612
+ ggml_build_forward_expand(gf, cur);
13613
+ }
13614
+ };
13615
+
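
The new dots1 builder mixes dense FFN layers (the first hparams.n_layer_dense_lead layers) with MoE layers whose routed-expert output is summed with a shared expert before the residual add. A toy standalone sketch of that combination step; the stand-in "experts" are placeholders, not the real build_moe_ffn/build_ffn graph calls:

    #include <cstdio>
    #include <vector>

    static std::vector<float> routed_experts(const std::vector<float> & x) {
        std::vector<float> out(x.size());
        for (size_t i = 0; i < x.size(); ++i) out[i] = 0.5f * x[i]; // stand-in for build_moe_ffn
        return out;
    }

    static std::vector<float> shared_expert(const std::vector<float> & x) {
        std::vector<float> out(x.size());
        for (size_t i = 0; i < x.size(); ++i) out[i] = 0.25f * x[i]; // stand-in for the ffn_shexp branch
        return out;
    }

    int main() {
        std::vector<float> ffn_inp = {1.f, 2.f, 3.f};
        std::vector<float> moe   = routed_experts(ffn_inp);
        std::vector<float> shexp = shared_expert(ffn_inp);

        std::vector<float> cur(ffn_inp.size());
        for (size_t i = 0; i < cur.size(); ++i) {
            cur[i]  = moe[i] + shexp[i];   // cur = ggml_add(moe_out, ffn_shexp)
            cur[i] += ffn_inp[i];          // residual: cur = ggml_add(cur, ffn_inp)
        }
        std::printf("%f %f %f\n", cur[0], cur[1], cur[2]);
        return 0;
    }
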
13616
+ struct llm_build_arcee : public llm_graph_context {
13617
+ llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
13618
+ const int64_t n_embd_head = hparams.n_embd_head_v;
13619
+
13620
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13621
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
13622
+
13623
+ ggml_tensor * cur;
13624
+ ggml_tensor * inpL;
13625
+
13626
+ inpL = build_inp_embd(model.tok_embd);
13627
+
13628
+ // inp_pos - contains the positions
13629
+ ggml_tensor * inp_pos = build_inp_pos();
13630
+
13631
+ auto * inp_attn = build_attn_inp_kv_unified();
13632
+
13633
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
13634
+
13635
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13636
+
13637
+ for (int il = 0; il < n_layer; ++il) {
13638
+ ggml_tensor * inpSA = inpL;
13639
+
13640
+ // norm
13641
+ cur = build_norm(inpL,
13642
+ model.layers[il].attn_norm, NULL,
13643
+ LLM_NORM_RMS, il);
13644
+ cb(cur, "attn_norm", il);
13645
+
13646
+ // self-attention
13647
+ {
13648
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
13649
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
13650
+
13651
+ // compute Q and K and RoPE them
13652
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
13653
+ cb(Qcur, "Qcur", il);
13654
+ if (model.layers[il].bq) {
13655
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
13656
+ cb(Qcur, "Qcur", il);
13657
+ }
13658
+
13659
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
13660
+ cb(Kcur, "Kcur", il);
13661
+ if (model.layers[il].bk) {
13662
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
13663
+ cb(Kcur, "Kcur", il);
13664
+ }
13665
+
13666
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
13667
+ cb(Vcur, "Vcur", il);
13668
+ if (model.layers[il].bv) {
13669
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
13670
+ cb(Vcur, "Vcur", il);
13671
+ }
13672
+
13673
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13674
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13675
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13676
+
13677
+ Qcur = ggml_rope_ext(
13678
+ ctx0, Qcur, inp_pos, rope_factors,
13679
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13680
+ ext_factor, attn_factor, beta_fast, beta_slow
13681
+ );
13682
+
13683
+ Kcur = ggml_rope_ext(
13684
+ ctx0, Kcur, inp_pos, rope_factors,
13685
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13686
+ ext_factor, attn_factor, beta_fast, beta_slow
13687
+ );
13688
+
13689
+ cb(Qcur, "Qcur", il);
13690
+ cb(Kcur, "Kcur", il);
13691
+ cb(Vcur, "Vcur", il);
13692
+
13693
+ cur = build_attn(inp_attn, gf,
13694
+ model.layers[il].wo, model.layers[il].bo,
13695
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
13696
+ cb(cur, "attn_out", il);
13697
+ }
13698
+
13699
+ if (il == n_layer - 1 && inp_out_ids) {
13700
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13701
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13702
+ }
13703
+
13704
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13705
+ cb(ffn_inp, "ffn_inp", il);
13706
+
13707
+ // feed-forward network
13708
+ // ARCEE uses relu^2 instead of silu
13709
+ cur = build_norm(ffn_inp,
13710
+ model.layers[il].ffn_norm, NULL,
13711
+ LLM_NORM_RMS, il);
13712
+ cb(cur, "ffn_norm", il);
13713
+
13714
+ cur = build_ffn(cur,
13715
+ model.layers[il].ffn_up, NULL, NULL,
13716
+ NULL, NULL, NULL,
13717
+ model.layers[il].ffn_down, NULL, NULL,
13718
+ NULL,
13719
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
13720
+ cb(cur, "ffn_out", il);
13721
+
13722
+ cur = ggml_add(ctx0, cur, ffn_inp);
13723
+ cb(cur, "ffn_out", il);
13724
+
13725
+ cur = build_cvec(cur, il);
13726
+ cb(cur, "l_out", il);
13727
+
13728
+ // input for next layer
13729
+ inpL = cur;
13730
+ }
13731
+
13732
+ cur = inpL;
13733
+
13734
+ cur = build_norm(cur,
13735
+ model.output_norm, NULL,
13736
+ LLM_NORM_RMS, -1);
13737
+
13738
+ cb(cur, "result_norm", -1);
13739
+ res->t_embd = cur;
13740
+
13741
+ // lm_head
13742
+ cur = build_lora_mm(model.output, cur);
13743
+
13744
+ cb(cur, "result_output", -1);
13745
+ res->t_logits = cur;
13746
+
13747
+ ggml_build_forward_expand(gf, cur);
13748
+ }
13749
+ };
13750
+
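
As the comment in the builder notes, Arcee's feed-forward uses a squared-ReLU activation in a sequential (ungated) FFN, which is why only ffn_up and ffn_down are passed and the gate slots are NULL (LLM_FFN_RELU_SQR with LLM_FFN_SEQ). A small standalone comparison of the two activations, toy scalars only:

    #include <cmath>
    #include <cstdio>

    static float silu(float x)     { return x / (1.f + std::exp(-x)); } // gated path used by most builders
    static float relu_sqr(float x) { float r = x > 0.f ? x : 0.f; return r * r; } // Arcee's activation

    int main() {
        for (float x : {-2.f, -0.5f, 0.5f, 2.f}) {
            std::printf("x=% .2f  silu=% .4f  relu^2=% .4f\n", x, silu(x), relu_sqr(x));
        }
        return 0;
    }
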
13187
13751
  llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
13188
13752
  llama_memory_i * res;
13189
13753
 
13190
13754
  switch (arch) {
13755
+ // Models that need specific instantiation should be handled in the
13756
+ // switch statement
13191
13757
  case LLM_ARCH_BERT:
13192
13758
  case LLM_ARCH_JINA_BERT_V2:
13193
13759
  case LLM_ARCH_NOMIC_BERT:
13194
13760
  case LLM_ARCH_NOMIC_BERT_MOE:
13761
+ case LLM_ARCH_NEO_BERT:
13195
13762
  case LLM_ARCH_WAVTOKENIZER_DEC:
13196
13763
  {
13197
13764
  res = nullptr;
13198
13765
  } break;
13199
- case LLM_ARCH_MAMBA:
13200
- case LLM_ARCH_RWKV6:
13201
- case LLM_ARCH_RWKV6QWEN2:
13202
- case LLM_ARCH_RWKV7:
13203
- case LLM_ARCH_ARWKV7:
13204
- {
13205
- res = new llama_kv_cache_recurrent(
13206
- *this,
13207
- GGML_TYPE_F32,
13208
- GGML_TYPE_F32,
13209
- cparams.offload_kqv,
13210
- std::max((uint32_t) 1, cparams.n_seq_max),
13211
- cparams.n_seq_max);
13212
- } break;
13766
+ // Models that need standard caching should rely on recurrent/hybrid
13767
+ // checks
13213
13768
  default:
13214
13769
  {
13215
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
13216
-
13217
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
13218
-
13219
- LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
13220
-
13221
- if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
13222
- GGML_ASSERT(hparams.is_swa_any());
13223
-
13224
- res = new llama_kv_cache_unified_iswa(
13225
- *this,
13226
- params.type_k,
13227
- params.type_v,
13228
- !cparams.flash_attn,
13229
- cparams.offload_kqv,
13230
- params.swa_full,
13231
- cparams.n_ctx,
13232
- cparams.n_seq_max,
13233
- cparams.n_batch,
13234
- padding);
13235
- } else {
13236
- GGML_ASSERT(!hparams.is_swa_any());
13237
-
13238
- res = new llama_kv_cache_unified(
13770
+ if (llm_arch_is_recurrent(arch)) {
13771
+ res = new llama_memory_recurrent(
13239
13772
  *this,
13240
13773
  nullptr,
13241
- params.type_k,
13242
- params.type_v,
13243
- !cparams.flash_attn,
13774
+ GGML_TYPE_F32,
13775
+ GGML_TYPE_F32,
13244
13776
  cparams.offload_kqv,
13245
- cparams.n_ctx,
13246
- cparams.n_seq_max,
13247
- padding,
13248
- hparams.n_swa,
13249
- hparams.swa_type);
13777
+ std::max((uint32_t) 1, cparams.n_seq_max),
13778
+ cparams.n_seq_max);
13779
+ } else if (llm_arch_is_hybrid(arch)) {
13780
+ const auto padding = llama_kv_cache_unified::get_padding(cparams);
13781
+
13782
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
13783
+
13784
+ res = new llama_memory_hybrid(
13785
+ /* model */ *this,
13786
+ /* attn_type_k */ params.type_k,
13787
+ /* attn_type_v */ params.type_v,
13788
+ /* attn_v_trans */ !cparams.flash_attn,
13789
+ /* attn_kv_size */ cparams.n_ctx,
13790
+ /* attn_n_pad */ padding,
13791
+ /* attn_n_swa */ hparams.n_swa,
13792
+ /* attn_swa_type */ hparams.swa_type,
13793
+ /* recurrent_type_k */ GGML_TYPE_F32,
13794
+ /* recurrent_type_v */ GGML_TYPE_F32,
13795
+ /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
13796
+ /* n_seq_max */ cparams.n_seq_max,
13797
+ /* offload */ cparams.offload_kqv);
13798
+ } else {
13799
+ const auto padding = llama_kv_cache_unified::get_padding(cparams);
13800
+
13801
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
13802
+
13803
+ LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
13804
+
13805
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
13806
+ GGML_ASSERT(hparams.is_swa_any());
13807
+
13808
+ res = new llama_kv_cache_unified_iswa(
13809
+ *this,
13810
+ params.type_k,
13811
+ params.type_v,
13812
+ !cparams.flash_attn,
13813
+ cparams.offload_kqv,
13814
+ params.swa_full,
13815
+ cparams.n_ctx,
13816
+ cparams.n_seq_max,
13817
+ cparams.n_ubatch,
13818
+ padding);
13819
+ } else {
13820
+ GGML_ASSERT(!hparams.is_swa_any());
13821
+
13822
+ res = new llama_kv_cache_unified(
13823
+ *this,
13824
+ nullptr,
13825
+ params.type_k,
13826
+ params.type_v,
13827
+ !cparams.flash_attn,
13828
+ cparams.offload_kqv,
13829
+ cparams.n_ctx,
13830
+ cparams.n_seq_max,
13831
+ padding,
13832
+ hparams.n_swa,
13833
+ hparams.swa_type);
13834
+ }
13250
13835
  }
13251
13836
  }
13252
13837
  }
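
create_memory no longer special-cases the Mamba/RWKV architectures; the default branch now asks llm_arch_is_recurrent() and llm_arch_is_hybrid() and picks between the recurrent cache, the new llama_memory_hybrid, and the unified KV cache (iSWA or plain). A minimal sketch of that decision order, with illustrative stand-in types rather than the real llama.cpp classes:

    #include <cstdio>

    enum class MemKind { Recurrent, Hybrid, UnifiedSWA, Unified };

    static MemKind pick_memory(bool is_recurrent, bool is_hybrid, bool has_swa) {
        if (is_recurrent) return MemKind::Recurrent;   // llama_memory_recurrent
        if (is_hybrid)    return MemKind::Hybrid;      // llama_memory_hybrid
        if (has_swa)      return MemKind::UnifiedSWA;  // llama_kv_cache_unified_iswa
        return MemKind::Unified;                       // llama_kv_cache_unified
    }

    int main() {
        std::printf("%d\n", (int) pick_memory(false, true, false)); // -> Hybrid
        return 0;
    }

Note also that the iSWA cache is now sized by cparams.n_ubatch rather than cparams.n_batch, as the changed constructor argument above shows.
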
@@ -13262,7 +13847,6 @@ llm_graph_result_ptr llama_model::build_graph(
13262
13847
 
13263
13848
  switch (arch) {
13264
13849
  case LLM_ARCH_LLAMA:
13265
- case LLM_ARCH_MINICPM:
13266
13850
  {
13267
13851
  llm = std::make_unique<llm_build_llama>(*this, params, gf);
13268
13852
  } break;
@@ -13301,6 +13885,10 @@ llm_graph_result_ptr llama_model::build_graph(
13301
13885
  {
13302
13886
  llm = std::make_unique<llm_build_bert>(*this, params, gf);
13303
13887
  } break;
13888
+ case LLM_ARCH_NEO_BERT:
13889
+ {
13890
+ llm = std::make_unique<llm_build_neo_bert>(*this, params, gf);
13891
+ } break;
13304
13892
  case LLM_ARCH_BLOOM:
13305
13893
  {
13306
13894
  llm = std::make_unique<llm_build_bloom>(*this, params, gf);
@@ -13503,6 +14091,7 @@ llm_graph_result_ptr llama_model::build_graph(
13503
14091
  } break;
13504
14092
  case LLM_ARCH_GRANITE:
13505
14093
  case LLM_ARCH_GRANITE_MOE:
14094
+ case LLM_ARCH_MINICPM:
13506
14095
  {
13507
14096
  llm = std::make_unique<llm_build_granite>(*this, params, gf);
13508
14097
  } break;
@@ -13522,6 +14111,14 @@ llm_graph_result_ptr llama_model::build_graph(
13522
14111
  {
13523
14112
  llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
13524
14113
  } break;
14114
+ case LLM_ARCH_DOTS1:
14115
+ {
14116
+ llm = std::make_unique<llm_build_dots1>(*this, params, gf);
14117
+ } break;
14118
+ case LLM_ARCH_ARCEE:
14119
+ {
14120
+ llm = std::make_unique<llm_build_arcee>(*this, params, gf);
14121
+ } break;
13525
14122
  default:
13526
14123
  GGML_ABORT("fatal error");
13527
14124
  }
@@ -13593,6 +14190,22 @@ int32_t llama_model_n_head_kv(const llama_model * model) {
13593
14190
  return model->hparams.n_head_kv();
13594
14191
  }
13595
14192
 
14193
+ int32_t llama_model_n_swa(const llama_model * model) {
14194
+ return model->hparams.n_swa;
14195
+ }
14196
+
14197
+ uint32_t llama_model_n_cls_out(const struct llama_model * model) {
14198
+ return model->hparams.n_cls_out;
14199
+ }
14200
+
14201
+ const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
14202
+ if (i < model->classifier_labels.size()) {
14203
+ return model->classifier_labels[i].c_str();
14204
+ }
14205
+
14206
+ return nullptr;
14207
+ }
14208
+
13596
14209
  // deprecated
13597
14210
  int32_t llama_n_ctx_train(const llama_model * model) {
13598
14211
  return llama_model_n_ctx_train(model);
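
Hedged usage sketch for the getters added in this hunk (llama_model_n_swa, llama_model_n_cls_out, llama_model_cls_label); it assumes a llama_model that was loaded elsewhere with the existing loading API and is only a fragment, not a full program:

    #include <cstdio>
    #include "llama.h"

    static void print_model_info(const llama_model * model) {
        std::printf("SWA window size : %d\n", llama_model_n_swa(model));

        const uint32_t n_cls = llama_model_n_cls_out(model);
        std::printf("classifier heads: %u\n", n_cls);
        for (uint32_t i = 0; i < n_cls; ++i) {
            const char * label = llama_model_cls_label(model, i); // nullptr when no label is stored
            std::printf("  label %u: %s\n", i, label ? label : "(none)");
        }
    }
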
@@ -13655,6 +14268,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
13655
14268
  case LLM_ARCH_GRANITE_MOE:
13656
14269
  case LLM_ARCH_CHAMELEON:
13657
14270
  case LLM_ARCH_BAILINGMOE:
14271
+ case LLM_ARCH_NEO_BERT:
14272
+ case LLM_ARCH_ARCEE:
13658
14273
  return LLAMA_ROPE_TYPE_NORM;
13659
14274
 
13660
14275
  // the pairs of head values are offset by n_rot/2
@@ -13688,6 +14303,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
13688
14303
  case LLM_ARCH_NEMOTRON:
13689
14304
  case LLM_ARCH_EXAONE:
13690
14305
  case LLM_ARCH_MINICPM3:
14306
+ case LLM_ARCH_DOTS1:
13691
14307
  return LLAMA_ROPE_TYPE_NEOX;
13692
14308
 
13693
14309
  case LLM_ARCH_QWEN2VL:
@@ -13753,7 +14369,7 @@ uint64_t llama_model_size(const llama_model * model) {
13753
14369
  }
13754
14370
 
13755
14371
  const char * llama_model_chat_template(const llama_model * model, const char * name) {
13756
- const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N)
14372
+ const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
13757
14373
  : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
13758
14374
  const auto & it = model->gguf_kv.find(key);
13759
14375
  if (it == model->gguf_kv.end()) {
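
This hunk fixes the named chat-template lookup to format the requested name onto the base LLM_KV_TOKENIZER_CHAT_TEMPLATE key. A hedged usage sketch of the public accessor; "tool_use" is only an example of a named template and may not exist in a given GGUF, and the sketch assumes an already loaded model:

    #include <cstdio>
    #include "llama.h"

    static void show_templates(const llama_model * model) {
        const char * def  = llama_model_chat_template(model, nullptr);    // default template
        const char * tool = llama_model_chat_template(model, "tool_use"); // named variant, if present
        std::printf("default : %s\n", def  ? def  : "(none)");
        std::printf("tool_use: %s\n", tool ? tool : "(none)");
    }
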
@@ -13795,14 +14411,7 @@ llama_token llama_model_decoder_start_token(const llama_model * model) {
13795
14411
  }
13796
14412
 
13797
14413
  bool llama_model_is_recurrent(const llama_model * model) {
13798
- switch (model->arch) {
13799
- case LLM_ARCH_MAMBA: return true;
13800
- case LLM_ARCH_RWKV6: return true;
13801
- case LLM_ARCH_RWKV6QWEN2: return true;
13802
- case LLM_ARCH_RWKV7: return true;
13803
- case LLM_ARCH_ARWKV7: return true;
13804
- default: return false;
13805
- }
14414
+ return llm_arch_is_recurrent(model->arch);
13806
14415
  }
13807
14416
 
13808
14417
  const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
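
llama_model_is_recurrent now defers to the shared llm_arch_is_recurrent() helper instead of keeping its own architecture list, matching the check used by create_memory above. A hedged usage sketch, assuming an already loaded model:

    #include <cstdio>
    #include "llama.h"

    static void report_memory_style(const llama_model * model) {
        if (llama_model_is_recurrent(model)) {
            std::printf("model uses a recurrent state cache (e.g. the Mamba/RWKV families)\n");
        } else {
            std::printf("model uses a KV cache\n");
        }
    }
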