@novastera-oss/llamarn 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/android/src/main/cpp/include/llama.h +8 -3
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +56 -22
  11. package/cpp/build-info.cpp +2 -2
  12. package/cpp/llama.cpp/CMakeLists.txt +1 -1
  13. package/cpp/llama.cpp/common/arg.cpp +7 -0
  14. package/cpp/llama.cpp/common/common.cpp +3 -0
  15. package/cpp/llama.cpp/common/common.h +1 -0
  16. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  17. package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
  18. package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
  19. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  20. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  21. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
  22. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
  23. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
  24. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  25. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  26. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
  27. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  28. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  29. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  30. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  31. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  32. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  33. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  34. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  35. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  48. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  62. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
  64. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
  65. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  66. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
  67. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  68. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
  69. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  70. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  71. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
  72. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
  73. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  74. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  76. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
  77. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
  78. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  79. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  80. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  81. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
  82. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  83. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  84. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  89. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  90. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
  91. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  92. package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
  93. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  94. package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
  95. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
  96. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
  97. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  98. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  99. package/cpp/llama.cpp/include/llama.h +8 -3
  100. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  101. package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
  102. package/cpp/llama.cpp/src/llama-arch.h +18 -0
  103. package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
  104. package/cpp/llama.cpp/src/llama-batch.h +98 -70
  105. package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
  106. package/cpp/llama.cpp/src/llama-context.cpp +101 -107
  107. package/cpp/llama.cpp/src/llama-context.h +13 -13
  108. package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
  109. package/cpp/llama.cpp/src/llama-graph.h +44 -32
  110. package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
  111. package/cpp/llama.cpp/src/llama-hparams.h +8 -0
  112. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
  113. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
  114. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
  115. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
  116. package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
  117. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
  118. package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
  119. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
  120. package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
  121. package/cpp/llama.cpp/src/llama-memory.h +18 -22
  122. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  123. package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
  124. package/cpp/llama.cpp/src/llama-model.h +22 -0
  125. package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
  126. package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
  127. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  128. package/cpp/rn-utils.h +3 -0
  129. package/ios/include/common.h +1 -0
  130. package/ios/include/llama.h +8 -3
  131. package/ios/libs/llama.xcframework/Info.plist +19 -19
  132. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  133. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  134. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  135. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  136. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
  137. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  138. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  139. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  140. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  141. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  142. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  143. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  144. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  145. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  146. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  147. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
  148. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  149. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  150. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
  151. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  152. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  153. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
  154. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  155. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  160. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  161. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  162. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  163. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  164. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
  165. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  167. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  168. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  173. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
  175. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
  178. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  183. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  184. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  185. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  186. package/package.json +1 -1
@@ -103,6 +103,8 @@ const char * llm_type_name(llm_type type) {
103
103
  case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
104
104
  case LLM_TYPE_30B_A3B: return "30B.A3B";
105
105
  case LLM_TYPE_235B_A22B: return "235B.A22B";
106
+ case LLM_TYPE_E2B: return "E2B";
107
+ case LLM_TYPE_E4B: return "E4B";
106
108
  default: return "?B";
107
109
  }
108
110
  }
@@ -1017,6 +1019,24 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1017
1019
  ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
1018
1020
  : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
1019
1021
  } break;
1022
+ case LLM_ARCH_GEMMA3N:
1023
+ {
1024
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1025
+ hparams.set_swa_pattern(5);
1026
+
1027
+ hparams.rope_freq_base_train_swa = 10000.0f;
1028
+ hparams.rope_freq_scale_train_swa = 1.0f;
1029
+ hparams.f_attention_scale = 1.0f;
1030
+
1031
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1032
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1033
+
1034
+ switch (hparams.n_layer) {
1035
+ case 30: type = LLM_TYPE_E2B; break;
1036
+ case 35: type = LLM_TYPE_E4B; break;
1037
+ default: type = LLM_TYPE_UNKNOWN;
1038
+ }
1039
+ } break;
1020
1040
  case LLM_ARCH_STARCODER2:
1021
1041
  {
1022
1042
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -2950,6 +2970,62 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2950
2970
  layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
2951
2971
  }
2952
2972
  } break;
2973
+ case LLM_ARCH_GEMMA3N:
2974
+ {
2975
+ const int64_t n_altup = hparams.n_altup;
2976
+ const int64_t laurel_rank = hparams.laurel_rank;
2977
+ const int64_t n_embd_altup = hparams.n_embd_altup;
2978
+
2979
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2980
+ // if output is NULL, init from the input tok embed
2981
+ if (output == NULL) {
2982
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2983
+ }
2984
+
2985
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2986
+ tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
2987
+
2988
+ altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
2989
+ altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
2990
+ per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
2991
+ per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);
2992
+
2993
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2994
+
2995
+ for (int i = 0; i < n_layer; ++i) {
2996
+ auto & layer = layers[i];
2997
+
2998
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2999
+
3000
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3001
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
3002
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
3003
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3004
+
3005
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
3006
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
3007
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
3008
+
3009
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3010
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3011
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3012
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3013
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
3014
+
3015
+ // altup & laurel
3016
+ layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
3017
+ layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
3018
+ layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
3019
+ layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
3020
+ layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
3021
+ layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
3022
+ layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
3023
+ layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
3024
+ layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
3025
+ layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
3026
+ layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
3027
+ }
3028
+ } break;
2953
3029
  case LLM_ARCH_STARCODER2:
2954
3030
  {
2955
3031
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4707,6 +4783,8 @@ struct llm_build_llama : public llm_graph_context {
4707
4783
 
4708
4784
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
4709
4785
 
4786
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
4787
+
4710
4788
  for (int il = 0; il < n_layer; ++il) {
4711
4789
  ggml_tensor * inpSA = inpL;
4712
4790
 
@@ -4769,9 +4847,7 @@ struct llm_build_llama : public llm_graph_context {
4769
4847
  cb(cur, "attn_out", il);
4770
4848
  }
4771
4849
 
4772
- if (il == n_layer - 1) {
4773
- // skip computing output for unused tokens
4774
- ggml_tensor * inp_out_ids = build_inp_out_ids();
4850
+ if (il == n_layer - 1 && inp_out_ids) {
4775
4851
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
4776
4852
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
4777
4853
  }
@@ -4867,6 +4943,8 @@ struct llm_build_llama_iswa : public llm_graph_context {
4867
4943
 
4868
4944
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
4869
4945
 
4946
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
4947
+
4870
4948
  for (int il = 0; il < n_layer; ++il) {
4871
4949
  ggml_tensor * inpSA = inpL;
4872
4950
 
@@ -4943,9 +5021,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
4943
5021
  cb(cur, "attn_out", il);
4944
5022
  }
4945
5023
 
4946
- if (il == n_layer - 1) {
4947
- // skip computing output for unused tokens
4948
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5024
+ if (il == n_layer - 1 && inp_out_ids) {
4949
5025
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
4950
5026
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
4951
5027
  }
@@ -5045,6 +5121,9 @@ struct llm_build_deci : public llm_graph_context {
5045
5121
  auto * inp_attn = build_attn_inp_kv_unified();
5046
5122
 
5047
5123
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
5124
+
5125
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5126
+
5048
5127
  for (int il = 0; il < n_layer; ++il) {
5049
5128
  ggml_tensor * inpSA = inpL;
5050
5129
  const int64_t n_head_kv = hparams.n_head_kv(il);
@@ -5118,9 +5197,7 @@ struct llm_build_deci : public llm_graph_context {
5118
5197
  Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
5119
5198
  }
5120
5199
 
5121
- if (il == n_layer - 1) {
5122
- // skip computing output for unused tokens
5123
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5200
+ if (il == n_layer - 1 && inp_out_ids) {
5124
5201
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5125
5202
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5126
5203
  }
@@ -5199,6 +5276,8 @@ struct llm_build_baichuan : public llm_graph_context {
5199
5276
 
5200
5277
  auto * inp_attn = build_attn_inp_kv_unified();
5201
5278
 
5279
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5280
+
5202
5281
  for (int il = 0; il < n_layer; ++il) {
5203
5282
  ggml_tensor * inpSA = inpL;
5204
5283
 
@@ -5250,9 +5329,7 @@ struct llm_build_baichuan : public llm_graph_context {
5250
5329
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5251
5330
  }
5252
5331
 
5253
- if (il == n_layer - 1) {
5254
- // skip computing output for unused tokens
5255
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5332
+ if (il == n_layer - 1 && inp_out_ids) {
5256
5333
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5257
5334
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5258
5335
  }
@@ -5321,6 +5398,8 @@ struct llm_build_xverse : public llm_graph_context {
5321
5398
 
5322
5399
  auto * inp_attn = build_attn_inp_kv_unified();
5323
5400
 
5401
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5402
+
5324
5403
  for (int il = 0; il < n_layer; ++il) {
5325
5404
  ggml_tensor * inpSA = inpL;
5326
5405
 
@@ -5365,9 +5444,7 @@ struct llm_build_xverse : public llm_graph_context {
5365
5444
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5366
5445
  }
5367
5446
 
5368
- if (il == n_layer - 1) {
5369
- // skip computing output for unused tokens
5370
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5447
+ if (il == n_layer - 1 && inp_out_ids) {
5371
5448
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5372
5449
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5373
5450
  }
@@ -5435,6 +5512,8 @@ struct llm_build_falcon : public llm_graph_context {
5435
5512
 
5436
5513
  auto * inp_attn = build_attn_inp_kv_unified();
5437
5514
 
5515
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5516
+
5438
5517
  for (int il = 0; il < n_layer; ++il) {
5439
5518
  ggml_tensor * attn_norm;
5440
5519
 
@@ -5490,9 +5569,7 @@ struct llm_build_falcon : public llm_graph_context {
5490
5569
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5491
5570
  }
5492
5571
 
5493
- if (il == n_layer - 1) {
5494
- // skip computing output for unused tokens
5495
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5572
+ if (il == n_layer - 1 && inp_out_ids) {
5496
5573
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5497
5574
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
5498
5575
  attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
@@ -5561,6 +5638,8 @@ struct llm_build_grok : public llm_graph_context {
5561
5638
 
5562
5639
  auto * inp_attn = build_attn_inp_kv_unified();
5563
5640
 
5641
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5642
+
5564
5643
  for (int il = 0; il < n_layer; ++il) {
5565
5644
  ggml_tensor * inpSA = inpL;
5566
5645
 
@@ -5620,9 +5699,7 @@ struct llm_build_grok : public llm_graph_context {
5620
5699
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
5621
5700
  }
5622
5701
 
5623
- if (il == n_layer - 1) {
5624
- // skip computing output for unused tokens
5625
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5702
+ if (il == n_layer - 1 && inp_out_ids) {
5626
5703
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5627
5704
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5628
5705
  }
@@ -5721,6 +5798,8 @@ struct llm_build_dbrx : public llm_graph_context {
5721
5798
 
5722
5799
  auto * inp_attn = build_attn_inp_kv_unified();
5723
5800
 
5801
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5802
+
5724
5803
  for (int il = 0; il < n_layer; ++il) {
5725
5804
  ggml_tensor * inpSA = inpL;
5726
5805
 
@@ -5771,9 +5850,7 @@ struct llm_build_dbrx : public llm_graph_context {
5771
5850
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5772
5851
  }
5773
5852
 
5774
- if (il == n_layer - 1) {
5775
- // skip computing output for unused tokens
5776
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5853
+ if (il == n_layer - 1 && inp_out_ids) {
5777
5854
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5778
5855
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5779
5856
  }
@@ -5853,6 +5930,8 @@ struct llm_build_starcoder : public llm_graph_context {
5853
5930
  inpL = ggml_add(ctx0, inpL, pos);
5854
5931
  cb(inpL, "inpL", -1);
5855
5932
 
5933
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5934
+
5856
5935
  for (int il = 0; il < n_layer; ++il) {
5857
5936
  cur = build_norm(inpL,
5858
5937
  model.layers[il].attn_norm,
@@ -5885,9 +5964,7 @@ struct llm_build_starcoder : public llm_graph_context {
5885
5964
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5886
5965
  }
5887
5966
 
5888
- if (il == n_layer - 1) {
5889
- // skip computing output for unused tokens
5890
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5967
+ if (il == n_layer - 1 && inp_out_ids) {
5891
5968
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5892
5969
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
5893
5970
  }
@@ -5952,6 +6029,8 @@ struct llm_build_refact : public llm_graph_context {
5952
6029
 
5953
6030
  auto * inp_attn = build_attn_inp_kv_unified();
5954
6031
 
6032
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6033
+
5955
6034
  for (int il = 0; il < n_layer; ++il) {
5956
6035
  ggml_tensor * inpSA = inpL;
5957
6036
 
@@ -5984,9 +6063,7 @@ struct llm_build_refact : public llm_graph_context {
5984
6063
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5985
6064
  }
5986
6065
 
5987
- if (il == n_layer - 1) {
5988
- // skip computing output for unused tokens
5989
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6066
+ if (il == n_layer - 1 && inp_out_ids) {
5990
6067
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5991
6068
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5992
6069
  }
@@ -6072,78 +6149,79 @@ struct llm_build_bert : public llm_graph_context {
6072
6149
 
6073
6150
  auto * inp_attn = build_attn_inp_no_cache();
6074
6151
 
6075
- // iterate layers
6152
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6153
+
6076
6154
  for (int il = 0; il < n_layer; ++il) {
6077
6155
  ggml_tensor * cur = inpL;
6078
6156
 
6079
- ggml_tensor * Qcur;
6080
- ggml_tensor * Kcur;
6081
- ggml_tensor * Vcur;
6157
+ {
6158
+ ggml_tensor * Qcur;
6159
+ ggml_tensor * Kcur;
6160
+ ggml_tensor * Vcur;
6082
6161
 
6083
- // self-attention
6084
- if (model.layers[il].wqkv) {
6085
- cur = build_lora_mm(model.layers[il].wqkv, cur);
6086
- cb(cur, "wqkv", il);
6162
+ // self-attention
6163
+ if (model.layers[il].wqkv) {
6164
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
6165
+ cb(cur, "wqkv", il);
6087
6166
 
6088
- if (model.layers[il].bqkv) {
6089
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6090
- cb(cur, "bqkv", il);
6091
- }
6167
+ if (model.layers[il].bqkv) {
6168
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6169
+ cb(cur, "bqkv", il);
6170
+ }
6092
6171
 
6093
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6094
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6095
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6096
- } else {
6097
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
6098
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
6099
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
6100
- }
6172
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6173
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6174
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6175
+ } else {
6176
+ Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
6177
+ Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
6178
+ Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
6179
+ }
6101
6180
 
6102
- if (model.layers[il].attn_q_norm) {
6103
- Qcur = build_norm(Qcur,
6104
- model.layers[il].attn_q_norm,
6105
- model.layers[il].attn_q_norm_b,
6106
- LLM_NORM, il);
6107
- }
6181
+ if (model.layers[il].attn_q_norm) {
6182
+ Qcur = build_norm(Qcur,
6183
+ model.layers[il].attn_q_norm,
6184
+ model.layers[il].attn_q_norm_b,
6185
+ LLM_NORM, il);
6186
+ }
6108
6187
 
6109
- if (model.layers[il].attn_k_norm) {
6110
- Kcur = build_norm(Kcur,
6111
- model.layers[il].attn_k_norm,
6112
- model.layers[il].attn_k_norm_b,
6113
- LLM_NORM, il);
6114
- }
6188
+ if (model.layers[il].attn_k_norm) {
6189
+ Kcur = build_norm(Kcur,
6190
+ model.layers[il].attn_k_norm,
6191
+ model.layers[il].attn_k_norm_b,
6192
+ LLM_NORM, il);
6193
+ }
6115
6194
 
6116
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6117
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6118
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6195
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6196
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6197
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6119
6198
 
6120
- // RoPE
6121
- if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
6122
- Qcur = ggml_rope_ext(
6123
- ctx0, Qcur, inp_pos, nullptr,
6124
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6125
- ext_factor, attn_factor, beta_fast, beta_slow
6126
- );
6199
+ // RoPE
6200
+ if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
6201
+ Qcur = ggml_rope_ext(
6202
+ ctx0, Qcur, inp_pos, nullptr,
6203
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6204
+ ext_factor, attn_factor, beta_fast, beta_slow
6205
+ );
6127
6206
 
6128
- Kcur = ggml_rope_ext(
6129
- ctx0, Kcur, inp_pos, nullptr,
6130
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6131
- ext_factor, attn_factor, beta_fast, beta_slow
6132
- );
6133
- }
6207
+ Kcur = ggml_rope_ext(
6208
+ ctx0, Kcur, inp_pos, nullptr,
6209
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6210
+ ext_factor, attn_factor, beta_fast, beta_slow
6211
+ );
6212
+ }
6134
6213
 
6135
- cb(Qcur, "Qcur", il);
6136
- cb(Kcur, "Kcur", il);
6137
- cb(Vcur, "Vcur", il);
6214
+ cb(Qcur, "Qcur", il);
6215
+ cb(Kcur, "Kcur", il);
6216
+ cb(Vcur, "Vcur", il);
6138
6217
 
6139
- cur = build_attn(inp_attn, gf,
6140
- model.layers[il].wo, model.layers[il].bo,
6141
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6142
- cb(cur, "kqv_out", il);
6218
+ cur = build_attn(inp_attn, gf,
6219
+ model.layers[il].wo, model.layers[il].bo,
6220
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6221
+ cb(cur, "kqv_out", il);
6222
+ }
6143
6223
 
6144
- if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
6145
- // skip computing output for unused tokens
6146
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6224
+ if (il == n_layer - 1 && inp_out_ids) {
6147
6225
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6148
6226
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6149
6227
  }
@@ -6240,56 +6318,57 @@ struct llm_build_neo_bert : public llm_graph_context {
6240
6318
 
6241
6319
  auto * inp_attn = build_attn_inp_no_cache();
6242
6320
 
6243
- // iterate layers
6321
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6322
+
6244
6323
  for (int il = 0; il < n_layer; ++il) {
6245
6324
  ggml_tensor * cur = inpL;
6246
6325
 
6247
- ggml_tensor * Qcur;
6248
- ggml_tensor * Kcur;
6249
- ggml_tensor * Vcur;
6250
-
6251
6326
  // pre-norm
6252
6327
  cur = build_norm(inpL,
6253
6328
  model.layers[il].attn_norm, NULL,
6254
6329
  LLM_NORM_RMS, il);
6255
6330
 
6256
- // self-attention
6257
- cur = build_lora_mm(model.layers[il].wqkv, cur);
6258
- cb(cur, "wqkv", il);
6259
-
6260
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6261
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6262
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6263
-
6264
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6265
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6266
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6267
-
6268
- // RoPE
6269
- Qcur = ggml_rope_ext(
6270
- ctx0, Qcur, inp_pos, nullptr,
6271
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6272
- ext_factor, attn_factor, beta_fast, beta_slow
6273
- );
6331
+ {
6332
+ ggml_tensor * Qcur;
6333
+ ggml_tensor * Kcur;
6334
+ ggml_tensor * Vcur;
6274
6335
 
6275
- Kcur = ggml_rope_ext(
6276
- ctx0, Kcur, inp_pos, nullptr,
6277
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6278
- ext_factor, attn_factor, beta_fast, beta_slow
6279
- );
6336
+ // self-attention
6337
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
6338
+ cb(cur, "wqkv", il);
6339
+
6340
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6341
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6342
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6343
+
6344
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6345
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6346
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6347
+
6348
+ // RoPE
6349
+ Qcur = ggml_rope_ext(
6350
+ ctx0, Qcur, inp_pos, nullptr,
6351
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6352
+ ext_factor, attn_factor, beta_fast, beta_slow
6353
+ );
6354
+
6355
+ Kcur = ggml_rope_ext(
6356
+ ctx0, Kcur, inp_pos, nullptr,
6357
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6358
+ ext_factor, attn_factor, beta_fast, beta_slow
6359
+ );
6280
6360
 
6281
- cb(Qcur, "Qcur", il);
6282
- cb(Kcur, "Kcur", il);
6283
- cb(Vcur, "Vcur", il);
6361
+ cb(Qcur, "Qcur", il);
6362
+ cb(Kcur, "Kcur", il);
6363
+ cb(Vcur, "Vcur", il);
6284
6364
 
6285
- cur = build_attn(inp_attn, gf,
6286
- model.layers[il].wo, nullptr,
6287
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6288
- cb(cur, "kqv_out", il);
6365
+ cur = build_attn(inp_attn, gf,
6366
+ model.layers[il].wo, nullptr,
6367
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6368
+ cb(cur, "kqv_out", il);
6369
+ }
6289
6370
 
6290
- if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
6291
- // skip computing output for unused tokens
6292
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6371
+ if (il == n_layer - 1 && inp_out_ids) {
6293
6372
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6294
6373
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6295
6374
  }
@@ -6354,6 +6433,8 @@ struct llm_build_bloom : public llm_graph_context {
6354
6433
  LLM_NORM, -1);
6355
6434
  cb(inpL, "inp_norm", -1);
6356
6435
 
6436
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6437
+
6357
6438
  for (int il = 0; il < n_layer; ++il) {
6358
6439
  cur = build_norm(inpL,
6359
6440
  model.layers[il].attn_norm,
@@ -6386,9 +6467,7 @@ struct llm_build_bloom : public llm_graph_context {
6386
6467
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6387
6468
  }
6388
6469
 
6389
- if (il == n_layer - 1) {
6390
- // skip computing output for unused tokens
6391
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6470
+ if (il == n_layer - 1 && inp_out_ids) {
6392
6471
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6393
6472
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6394
6473
  }
@@ -6465,6 +6544,8 @@ struct llm_build_mpt : public llm_graph_context {
6465
6544
  cb(inpL, "inpL", -1);
6466
6545
  }
6467
6546
 
6547
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6548
+
6468
6549
  for (int il = 0; il < n_layer; ++il) {
6469
6550
  ggml_tensor * attn_norm;
6470
6551
 
@@ -6527,9 +6608,7 @@ struct llm_build_mpt : public llm_graph_context {
6527
6608
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6528
6609
  }
6529
6610
 
6530
- if (il == n_layer - 1) {
6531
- // skip computing output for unused tokens
6532
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6611
+ if (il == n_layer - 1 && inp_out_ids) {
6533
6612
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6534
6613
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6535
6614
  }
@@ -6598,6 +6677,8 @@ struct llm_build_stablelm : public llm_graph_context {
6598
6677
 
6599
6678
  auto * inp_attn = build_attn_inp_kv_unified();
6600
6679
 
6680
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6681
+
6601
6682
  for (int il = 0; il < n_layer; ++il) {
6602
6683
  // norm
6603
6684
  cur = build_norm(inpL,
@@ -6673,9 +6754,7 @@ struct llm_build_stablelm : public llm_graph_context {
6673
6754
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6674
6755
  }
6675
6756
 
6676
- if (il == n_layer - 1) {
6677
- // skip computing output for unused tokens
6678
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6757
+ if (il == n_layer - 1 && inp_out_ids) {
6679
6758
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6680
6759
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6681
6760
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -6750,6 +6829,8 @@ struct llm_build_qwen : public llm_graph_context {
6750
6829
 
6751
6830
  auto * inp_attn = build_attn_inp_kv_unified();
6752
6831
 
6832
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6833
+
6753
6834
  for (int il = 0; il < n_layer; ++il) {
6754
6835
  ggml_tensor * inpSA = inpL;
6755
6836
 
@@ -6796,9 +6877,7 @@ struct llm_build_qwen : public llm_graph_context {
6796
6877
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6797
6878
  }
6798
6879
 
6799
- if (il == n_layer - 1) {
6800
- // skip computing output for unused tokens
6801
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6880
+ if (il == n_layer - 1 && inp_out_ids) {
6802
6881
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6803
6882
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6804
6883
  }
@@ -6867,6 +6946,8 @@ struct llm_build_qwen2 : public llm_graph_context {
6867
6946
 
6868
6947
  auto * inp_attn = build_attn_inp_kv_unified();
6869
6948
 
6949
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6950
+
6870
6951
  for (int il = 0; il < n_layer; ++il) {
6871
6952
  ggml_tensor * inpSA = inpL;
6872
6953
 
@@ -6916,9 +6997,7 @@ struct llm_build_qwen2 : public llm_graph_context {
6916
6997
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6917
6998
  }
6918
6999
 
6919
- if (il == n_layer - 1) {
6920
- // skip computing output for unused tokens
6921
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7000
+ if (il == n_layer - 1 && inp_out_ids) {
6922
7001
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6923
7002
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6924
7003
  }
@@ -6988,6 +7067,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
6988
7067
  int sections[4];
6989
7068
  std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
6990
7069
 
7070
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7071
+
6991
7072
  for (int il = 0; il < n_layer; ++il) {
6992
7073
  ggml_tensor * inpSA = inpL;
6993
7074
 
@@ -7037,9 +7118,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
7037
7118
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7038
7119
  }
7039
7120
 
7040
- if (il == n_layer - 1) {
7041
- // skip computing output for unused tokens
7042
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7121
+ if (il == n_layer - 1 && inp_out_ids) {
7043
7122
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7044
7123
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7045
7124
  }
@@ -7106,6 +7185,8 @@ struct llm_build_qwen2moe : public llm_graph_context {
7106
7185
 
7107
7186
  auto * inp_attn = build_attn_inp_kv_unified();
7108
7187
 
7188
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7189
+
7109
7190
  for (int il = 0; il < n_layer; ++il) {
7110
7191
  ggml_tensor * inpSA = inpL;
7111
7192
 
@@ -7164,9 +7245,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
7164
7245
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7165
7246
  }
7166
7247
 
7167
- if (il == n_layer - 1) {
7168
- // skip computing output for unused tokens
7169
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7248
+ if (il == n_layer - 1 && inp_out_ids) {
7170
7249
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7171
7250
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7172
7251
  }
@@ -7265,6 +7344,8 @@ struct llm_build_qwen3 : public llm_graph_context {
7265
7344
 
7266
7345
  auto * inp_attn = build_attn_inp_kv_unified();
7267
7346
 
7347
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7348
+
7268
7349
  for (int il = 0; il < n_layer; ++il) {
7269
7350
  ggml_tensor * inpSA = inpL;
7270
7351
 
@@ -7317,9 +7398,7 @@ struct llm_build_qwen3 : public llm_graph_context {
7317
7398
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7318
7399
  }
7319
7400
 
7320
- if (il == n_layer - 1) {
7321
- // skip computing output for unused tokens
7322
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7401
+ if (il == n_layer - 1 && inp_out_ids) {
7323
7402
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7324
7403
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7325
7404
  }
@@ -7386,6 +7465,8 @@ struct llm_build_qwen3moe : public llm_graph_context {
7386
7465
 
7387
7466
  auto * inp_attn = build_attn_inp_kv_unified();
7388
7467
 
7468
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7469
+
7389
7470
  for (int il = 0; il < n_layer; ++il) {
7390
7471
  ggml_tensor * inpSA = inpL;
7391
7472
 
@@ -7438,9 +7519,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
7438
7519
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7439
7520
  }
7440
7521
 
7441
- if (il == n_layer - 1) {
7442
- // skip computing output for unused tokens
7443
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7522
+ if (il == n_layer - 1 && inp_out_ids) {
7444
7523
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7445
7524
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7446
7525
  }
@@ -7516,6 +7595,8 @@ struct llm_build_phi2 : public llm_graph_context {
7516
7595
 
7517
7596
  auto * inp_attn = build_attn_inp_kv_unified();
7518
7597
 
7598
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7599
+
7519
7600
  for (int il = 0; il < n_layer; ++il) {
7520
7601
  attn_norm_output = build_norm(inpL,
7521
7602
  model.layers[il].attn_norm,
@@ -7578,9 +7659,7 @@ struct llm_build_phi2 : public llm_graph_context {
7578
7659
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7579
7660
  }
7580
7661
 
7581
- if (il == n_layer - 1) {
7582
- // skip computing output for unused tokens
7583
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7662
+ if (il == n_layer - 1 && inp_out_ids) {
7584
7663
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7585
7664
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7586
7665
  attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
@@ -7652,6 +7731,8 @@ struct llm_build_phi3 : public llm_graph_context {
7652
7731
  inp_attn = build_attn_inp_kv_unified();
7653
7732
  }
7654
7733
 
7734
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7735
+
7655
7736
  for (int il = 0; il < n_layer; ++il) {
7656
7737
  auto * residual = inpL;
7657
7738
 
@@ -7715,9 +7796,7 @@ struct llm_build_phi3 : public llm_graph_context {
7715
7796
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7716
7797
  }
7717
7798
 
7718
- if (il == n_layer - 1) {
7719
- // skip computing output for unused tokens
7720
- ggml_tensor* inp_out_ids = build_inp_out_ids();
7799
+ if (il == n_layer - 1 && inp_out_ids) {
7721
7800
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7722
7801
  residual = ggml_get_rows(ctx0, residual, inp_out_ids);
7723
7802
  }
@@ -7803,15 +7882,16 @@ struct llm_build_plamo : public llm_graph_context {
7803
7882
 
7804
7883
  auto * inp_attn = build_attn_inp_kv_unified();
7805
7884
 
7806
- for (int il = 0; il < n_layer; ++il) {
7885
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7807
7886
 
7887
+ for (int il = 0; il < n_layer; ++il) {
7808
7888
  // norm
7809
7889
  cur = build_norm(inpL,
7810
7890
  model.layers[il].attn_norm, NULL,
7811
7891
  LLM_NORM_RMS, il);
7812
7892
  cb(cur, "attn_norm", il);
7813
7893
 
7814
- ggml_tensor * attention_norm = cur;
7894
+ ggml_tensor * sa_inp = cur;
7815
7895
 
7816
7896
  // self-attention
7817
7897
  {
@@ -7849,18 +7929,17 @@ struct llm_build_plamo : public llm_graph_context {
7849
7929
  model.layers[il].wo, NULL,
7850
7930
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7851
7931
  }
7852
- ggml_tensor * sa_out = cur;
7853
7932
 
7854
- cur = attention_norm;
7855
-
7856
- if (il == n_layer - 1) {
7857
- // skip computing output for unused tokens
7858
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7933
+ if (il == n_layer - 1 && inp_out_ids) {
7859
7934
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7860
- sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
7935
+ sa_inp = ggml_get_rows(ctx0, sa_inp, inp_out_ids);
7861
7936
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7862
7937
  }
7863
7938
 
7939
+ ggml_tensor * sa_out = cur;
7940
+
7941
+ cur = sa_inp;
7942
+
7864
7943
  // feed-forward network
7865
7944
  {
7866
7945
  cur = build_ffn(cur,
@@ -7925,6 +8004,8 @@ struct llm_build_gpt2 : public llm_graph_context {
7925
8004
  inpL = ggml_add(ctx0, inpL, pos);
7926
8005
  cb(inpL, "inpL", -1);
7927
8006
 
8007
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8008
+
7928
8009
  for (int il = 0; il < n_layer; ++il) {
7929
8010
  cur = build_norm(inpL,
7930
8011
  model.layers[il].attn_norm,
@@ -7957,9 +8038,7 @@ struct llm_build_gpt2 : public llm_graph_context {
7957
8038
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7958
8039
  }
7959
8040
 
7960
- if (il == n_layer - 1) {
7961
- // skip computing output for unused tokens
7962
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8041
+ if (il == n_layer - 1 && inp_out_ids) {
7963
8042
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7964
8043
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7965
8044
  }
@@ -8029,6 +8108,8 @@ struct llm_build_codeshell : public llm_graph_context {
8029
8108
 
8030
8109
  auto * inp_attn = build_attn_inp_kv_unified();
8031
8110
 
8111
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8112
+
8032
8113
  for (int il = 0; il < n_layer; ++il) {
8033
8114
  cur = build_norm(inpL,
8034
8115
  model.layers[il].attn_norm,
@@ -8073,9 +8154,7 @@ struct llm_build_codeshell : public llm_graph_context {
8073
8154
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8074
8155
  }
8075
8156
 
8076
- if (il == n_layer - 1) {
8077
- // skip computing output for unused tokens
8078
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8157
+ if (il == n_layer - 1 && inp_out_ids) {
8079
8158
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8080
8159
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8081
8160
  }
@@ -8129,133 +8208,6 @@ struct llm_build_codeshell : public llm_graph_context {
8129
8208
 
8130
8209
  struct llm_build_orion : public llm_graph_context {
8131
8210
  llm_build_orion(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8132
- const int64_t n_embd_head = hparams.n_embd_head_v;
8133
-
8134
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8135
- GGML_ASSERT(n_embd_head == hparams.n_rot);
8136
-
8137
- ggml_tensor * cur;
8138
- ggml_tensor * inpL;
8139
-
8140
- inpL = build_inp_embd(model.tok_embd);
8141
-
8142
- // inp_pos - contains the positions
8143
- ggml_tensor * inp_pos = build_inp_pos();
8144
-
8145
- auto * inp_attn = build_attn_inp_kv_unified();
8146
-
8147
- for (int il = 0; il < n_layer; ++il) {
8148
- ggml_tensor * inpSA = inpL;
8149
-
8150
- // norm
8151
- cur = build_norm(inpL,
8152
- model.layers[il].attn_norm, model.layers[il].attn_norm_b,
8153
- LLM_NORM, il);
8154
- cb(cur, "attn_norm", il);
8155
-
8156
- // self-attention
8157
- {
8158
- // compute Q and K and RoPE them
8159
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
8160
- cb(Qcur, "Qcur", il);
8161
- // if (model.layers[il].bq) {
8162
- // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8163
- // cb(Qcur, "Qcur", il);
8164
- // }
8165
-
8166
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
8167
- cb(Kcur, "Kcur", il);
8168
- // if (model.layers[il].bk) {
8169
- // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8170
- // cb(Kcur, "Kcur", il);
8171
- // }
8172
-
8173
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
8174
- cb(Vcur, "Vcur", il);
8175
- // if (model.layers[il].bv) {
8176
- // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8177
- // cb(Vcur, "Vcur", il);
8178
- // }
8179
-
8180
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8181
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8182
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8183
-
8184
- Qcur = ggml_rope_ext(
8185
- ctx0, Qcur, inp_pos, nullptr,
8186
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8187
- ext_factor, attn_factor, beta_fast, beta_slow
8188
- );
8189
-
8190
- Kcur = ggml_rope_ext(
8191
- ctx0, Kcur, inp_pos, nullptr,
8192
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8193
- ext_factor, attn_factor, beta_fast, beta_slow
8194
- );
8195
-
8196
- cb(Qcur, "Qcur", il);
8197
- cb(Kcur, "Kcur", il);
8198
- cb(Vcur, "Vcur", il);
8199
-
8200
- cur = build_attn(inp_attn, gf,
8201
- model.layers[il].wo, NULL,
8202
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8203
- }
8204
-
8205
- if (il == n_layer - 1) {
8206
- // skip computing output for unused tokens
8207
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8208
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8209
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8210
- }
8211
-
8212
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8213
- cb(ffn_inp, "ffn_inp", il);
8214
-
8215
- // feed-forward network
8216
- cur = build_norm(ffn_inp,
8217
- model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
8218
- LLM_NORM, il);
8219
- cb(cur, "ffn_norm", il);
8220
-
8221
- cur = build_ffn(cur,
8222
- model.layers[il].ffn_up, NULL, NULL,
8223
- model.layers[il].ffn_gate, NULL, NULL,
8224
- model.layers[il].ffn_down, NULL, NULL,
8225
- NULL,
8226
- LLM_FFN_SILU, LLM_FFN_PAR, il);
8227
- cb(cur, "ffn_out", il);
8228
-
8229
- cur = ggml_add(ctx0, cur, ffn_inp);
8230
-
8231
- cur = build_cvec(cur, il);
8232
- cb(cur, "l_out", il);
8233
-
8234
- // input for next layer
8235
- inpL = cur;
8236
- }
8237
-
8238
- cur = inpL;
8239
-
8240
- cur = build_norm(cur,
8241
- model.output_norm, model.output_norm_b,
8242
- LLM_NORM, -1);
8243
-
8244
- cb(cur, "result_norm", -1);
8245
- res->t_embd = cur;
8246
-
8247
- // lm_head
8248
- cur = build_lora_mm(model.output, cur);
8249
-
8250
- cb(cur, "result_output", -1);
8251
- res->t_logits = cur;
8252
-
8253
- ggml_build_forward_expand(gf, cur);
8254
- }
8255
- };
8256
-
8257
- struct llm_build_internlm2 : public llm_graph_context {
8258
- llm_build_internlm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8259
8211
  const int64_t n_embd_head = hparams.n_embd_head_v;
8260
8212
 
8261
8213
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8271,13 +8223,15 @@ struct llm_build_internlm2 : public llm_graph_context {
8271
8223
 
8272
8224
  auto * inp_attn = build_attn_inp_kv_unified();
8273
8225
 
8226
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8227
+
8274
8228
  for (int il = 0; il < n_layer; ++il) {
8275
8229
  ggml_tensor * inpSA = inpL;
8276
8230
 
8277
8231
  // norm
8278
8232
  cur = build_norm(inpL,
8279
- model.layers[il].attn_norm, NULL,
8280
- LLM_NORM_RMS, il);
8233
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
8234
+ LLM_NORM, il);
8281
8235
  cb(cur, "attn_norm", il);
8282
8236
 
8283
8237
  // self-attention
@@ -8285,24 +8239,24 @@ struct llm_build_internlm2 : public llm_graph_context {
8285
8239
  // compute Q and K and RoPE them
8286
8240
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
8287
8241
  cb(Qcur, "Qcur", il);
8288
- if (model.layers[il].bq) {
8289
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8290
- cb(Qcur, "Qcur", il);
8291
- }
8242
+ // if (model.layers[il].bq) {
8243
+ // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8244
+ // cb(Qcur, "Qcur", il);
8245
+ // }
8292
8246
 
8293
8247
  ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
8294
8248
  cb(Kcur, "Kcur", il);
8295
- if (model.layers[il].bk) {
8296
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8297
- cb(Kcur, "Kcur", il);
8298
- }
8249
+ // if (model.layers[il].bk) {
8250
+ // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8251
+ // cb(Kcur, "Kcur", il);
8252
+ // }
8299
8253
 
8300
8254
  ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
8301
8255
  cb(Vcur, "Vcur", il);
8302
- if (model.layers[il].bv) {
8303
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8304
- cb(Vcur, "Vcur", il);
8305
- }
8256
+ // if (model.layers[il].bv) {
8257
+ // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8258
+ // cb(Vcur, "Vcur", il);
8259
+ // }
8306
8260
 
8307
8261
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8308
8262
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
@@ -8325,13 +8279,11 @@ struct llm_build_internlm2 : public llm_graph_context {
8325
8279
  cb(Vcur, "Vcur", il);
8326
8280
 
8327
8281
  cur = build_attn(inp_attn, gf,
8328
- model.layers[il].wo, model.layers[il].bo,
8282
+ model.layers[il].wo, NULL,
8329
8283
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8330
8284
  }
8331
8285
 
8332
- if (il == n_layer - 1) {
8333
- // skip computing output for unused tokens
8334
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8286
+ if (il == n_layer - 1 && inp_out_ids) {
8335
8287
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8336
8288
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8337
8289
  }
@@ -8341,8 +8293,135 @@ struct llm_build_internlm2 : public llm_graph_context {
8341
8293
 
8342
8294
  // feed-forward network
8343
8295
  cur = build_norm(ffn_inp,
8344
- model.layers[il].ffn_norm, NULL,
8345
- LLM_NORM_RMS, il);
8296
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
8297
+ LLM_NORM, il);
8298
+ cb(cur, "ffn_norm", il);
8299
+
8300
+ cur = build_ffn(cur,
8301
+ model.layers[il].ffn_up, NULL, NULL,
8302
+ model.layers[il].ffn_gate, NULL, NULL,
8303
+ model.layers[il].ffn_down, NULL, NULL,
8304
+ NULL,
8305
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
8306
+ cb(cur, "ffn_out", il);
8307
+
8308
+ cur = ggml_add(ctx0, cur, ffn_inp);
8309
+
8310
+ cur = build_cvec(cur, il);
8311
+ cb(cur, "l_out", il);
8312
+
8313
+ // input for next layer
8314
+ inpL = cur;
8315
+ }
8316
+
8317
+ cur = inpL;
8318
+
8319
+ cur = build_norm(cur,
8320
+ model.output_norm, model.output_norm_b,
8321
+ LLM_NORM, -1);
8322
+
8323
+ cb(cur, "result_norm", -1);
8324
+ res->t_embd = cur;
8325
+
8326
+ // lm_head
8327
+ cur = build_lora_mm(model.output, cur);
8328
+
8329
+ cb(cur, "result_output", -1);
8330
+ res->t_logits = cur;
8331
+
8332
+ ggml_build_forward_expand(gf, cur);
8333
+ }
8334
+ };
8335
+
8336
+ struct llm_build_internlm2 : public llm_graph_context {
8337
+ llm_build_internlm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8338
+ const int64_t n_embd_head = hparams.n_embd_head_v;
8339
+
8340
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8341
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
8342
+
8343
+ ggml_tensor * cur;
8344
+ ggml_tensor * inpL;
8345
+
8346
+ inpL = build_inp_embd(model.tok_embd);
8347
+
8348
+ // inp_pos - contains the positions
8349
+ ggml_tensor * inp_pos = build_inp_pos();
8350
+
8351
+ auto * inp_attn = build_attn_inp_kv_unified();
8352
+
8353
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8354
+
8355
+ for (int il = 0; il < n_layer; ++il) {
8356
+ ggml_tensor * inpSA = inpL;
8357
+
8358
+ // norm
8359
+ cur = build_norm(inpL,
8360
+ model.layers[il].attn_norm, NULL,
8361
+ LLM_NORM_RMS, il);
8362
+ cb(cur, "attn_norm", il);
8363
+
8364
+ // self-attention
8365
+ {
8366
+ // compute Q and K and RoPE them
8367
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
8368
+ cb(Qcur, "Qcur", il);
8369
+ if (model.layers[il].bq) {
8370
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8371
+ cb(Qcur, "Qcur", il);
8372
+ }
8373
+
8374
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
8375
+ cb(Kcur, "Kcur", il);
8376
+ if (model.layers[il].bk) {
8377
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8378
+ cb(Kcur, "Kcur", il);
8379
+ }
8380
+
8381
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
8382
+ cb(Vcur, "Vcur", il);
8383
+ if (model.layers[il].bv) {
8384
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8385
+ cb(Vcur, "Vcur", il);
8386
+ }
8387
+
8388
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8389
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8390
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8391
+
8392
+ Qcur = ggml_rope_ext(
8393
+ ctx0, Qcur, inp_pos, nullptr,
8394
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8395
+ ext_factor, attn_factor, beta_fast, beta_slow
8396
+ );
8397
+
8398
+ Kcur = ggml_rope_ext(
8399
+ ctx0, Kcur, inp_pos, nullptr,
8400
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8401
+ ext_factor, attn_factor, beta_fast, beta_slow
8402
+ );
8403
+
8404
+ cb(Qcur, "Qcur", il);
8405
+ cb(Kcur, "Kcur", il);
8406
+ cb(Vcur, "Vcur", il);
8407
+
8408
+ cur = build_attn(inp_attn, gf,
8409
+ model.layers[il].wo, model.layers[il].bo,
8410
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8411
+ }
8412
+
8413
+ if (il == n_layer - 1 && inp_out_ids) {
8414
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8415
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8416
+ }
8417
+
8418
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8419
+ cb(ffn_inp, "ffn_inp", il);
8420
+
8421
+ // feed-forward network
8422
+ cur = build_norm(ffn_inp,
8423
+ model.layers[il].ffn_norm, NULL,
8424
+ LLM_NORM_RMS, il);
8346
8425
  cb(cur, "ffn_norm", il);
8347
8426
 
8348
8427
  cur = build_ffn(cur,
@@ -8407,6 +8486,8 @@ struct llm_build_minicpm3 : public llm_graph_context {
8407
8486
 
8408
8487
  auto * inp_attn = build_attn_inp_kv_unified();
8409
8488
 
8489
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8490
+
8410
8491
  for (int il = 0; il < n_layer; ++il) {
8411
8492
  ggml_tensor * inpSA = inpL;
8412
8493
 
@@ -8526,15 +8607,13 @@ struct llm_build_minicpm3 : public llm_graph_context {
8526
8607
  q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
8527
8608
  }
8528
8609
 
8529
- if (il == n_layer - 1) {
8530
- // skip computing output for unused tokens
8531
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8610
+ if (il == n_layer - 1 && inp_out_ids) {
8532
8611
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8533
8612
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8534
8613
  }
8535
8614
 
8536
8615
  // scale_res - scale the hidden states for residual connection
8537
- const float scale_res = scale_depth/sqrtf(float(n_layer));
8616
+ const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
8538
8617
  cur = ggml_scale(ctx0, cur, scale_res);
8539
8618
  cb(cur, "hidden_scaled", il);
8540
8619
 
@@ -8611,6 +8690,8 @@ struct llm_build_gemma : public llm_graph_context {
8611
8690
 
8612
8691
  auto * inp_attn = build_attn_inp_kv_unified();
8613
8692
 
8693
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8694
+
8614
8695
  for (int il = 0; il < n_layer; ++il) {
8615
8696
  // norm
8616
8697
  cur = build_norm(inpL,
@@ -8656,9 +8737,7 @@ struct llm_build_gemma : public llm_graph_context {
8656
8737
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
8657
8738
  }
8658
8739
 
8659
- if (il == n_layer - 1) {
8660
- // skip computing output for unused tokens
8661
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8740
+ if (il == n_layer - 1 && inp_out_ids) {
8662
8741
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8663
8742
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8664
8743
  }
@@ -8727,6 +8806,8 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
8727
8806
 
8728
8807
  auto * inp_attn = build_attn_inp_kv_unified_iswa();
8729
8808
 
8809
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8810
+
8730
8811
  for (int il = 0; il < n_layer; ++il) {
8731
8812
  // norm
8732
8813
  cur = build_norm(inpL,
@@ -8771,18 +8852,16 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
8771
8852
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
8772
8853
  }
8773
8854
 
8855
+ if (il == n_layer - 1 && inp_out_ids) {
8856
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8857
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8858
+ }
8859
+
8774
8860
  cur = build_norm(cur,
8775
8861
  model.layers[il].attn_post_norm, NULL,
8776
8862
  LLM_NORM_RMS, il);
8777
8863
  cb(cur, "attn_post_norm", il);
8778
8864
 
8779
- if (il == n_layer - 1) {
8780
- // skip computing output for unused tokens
8781
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8782
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8783
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8784
- }
8785
-
8786
8865
  ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
8787
8866
  cb(sa_out, "sa_out", il);
8788
8867
 
@@ -8861,6 +8940,8 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
8861
8940
  // TODO: is causal == true correct? might need some changes
8862
8941
  auto * inp_attn = build_attn_inp_kv_unified_iswa();
8863
8942
 
8943
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8944
+
8864
8945
  for (int il = 0; il < n_layer; ++il) {
8865
8946
  const float freq_base_l = model.get_rope_freq_base (cparams, il);
8866
8947
  const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
@@ -8913,18 +8994,16 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
8913
8994
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
8914
8995
  }
8915
8996
 
8997
+ if (il == n_layer - 1 && inp_out_ids) {
8998
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8999
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9000
+ }
9001
+
8916
9002
  cur = build_norm(cur,
8917
9003
  model.layers[il].attn_post_norm, NULL,
8918
9004
  LLM_NORM_RMS, il);
8919
9005
  cb(cur, "attn_post_norm", il);
8920
9006
 
8921
- if (il == n_layer - 1) {
8922
- // skip computing output for unused tokens
8923
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8924
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8925
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8926
- }
8927
-
8928
9007
  ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
8929
9008
  cb(sa_out, "sa_out", il);
8930
9009
 
@@ -8977,6 +9056,442 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
8977
9056
  }
8978
9057
  };
8979
9058
 
9059
+ struct llm_build_gemma3n_iswa : public llm_graph_context {
9060
+ const llama_model & model;
9061
+ ggml_cgraph * gf;
9062
+
9063
+ const int64_t n_embd_head;
9064
+ const int64_t n_embd_altup;
9065
+ const int64_t n_altup;
9066
+ const int i_altup_act;
9067
+ const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
9068
+ const int n_layer_sparsity = 10; // number of layers using activation sparsity
9069
+ const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
9070
+
9071
+ ggml_tensor * one; // containing single element 1.0f
9072
+
9073
+ llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
9074
+ : llm_graph_context(params),
9075
+ model(model),
9076
+ gf(gf),
9077
+ n_embd_head(model.hparams.n_embd_head_k),
9078
+ n_embd_altup(model.hparams.n_embd_altup),
9079
+ n_altup(model.hparams.n_altup),
9080
+ i_altup_act(model.hparams.i_altup_act) {
9081
+ ggml_tensor * cur;
9082
+ ggml_tensor * inpL;
9083
+
9084
+ // TODO: remove this when ggml_scale_add is implemented
9085
+ one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
9086
+ {
9087
+ auto inp = std::make_unique<llm_graph_input_one>();
9088
+ inp->one = one;
9089
+ res->add_input(std::move(inp));
9090
+ }
9091
+
9092
+ inpL = build_inp_embd(model.tok_embd);
9093
+
9094
+ // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
9095
+ if (ubatch.token) {
9096
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
9097
+ cb(inpL, "inp_scaled", -1);
9098
+ }
9099
+
9100
+ // inp_pos - contains the positions
9101
+ ggml_tensor * inp_pos = build_inp_pos();
9102
+
9103
+ // TODO: is causal == true correct? might need some changes
9104
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
9105
+
9106
+ // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
9107
+ ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
9108
+
9109
+ // inpL now has only 1 altup, project it to the rest of the altups
9110
+ // these "added" altups will be concatenated along the last dim of inpL
9111
+ {
9112
+ ggml_tensor * target_magnitude = calc_magnitude(inpL);
9113
+ ggml_tensor * inp_repeated = ggml_repeat_4d(ctx0, inpL, n_embd, n_tokens, n_altup - 1, 1);
9114
+ ggml_tensor * altup_added = ggml_mul_mat(ctx0, model.altup_proj, inp_repeated); // shape: [n_embd, n_tokens, n_altup - 1]
9115
+ ggml_tensor * new_magnitude = calc_magnitude(altup_added);
9116
+ altup_added = ggml_div(ctx0,
9117
+ ggml_mul(ctx0, altup_added, target_magnitude),
9118
+ new_magnitude);
9119
+ inpL = ggml_concat(ctx0, inpL, altup_added, 2); // shape: [n_embd, n_tokens, n_altup]
9120
+ cb(inpL, "inp_stacked", -1);
9121
+ }
9122
+
9123
+ // inpL now has shape: [n_embd, n_tokens, n_altup]
9124
+ // inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer]
9125
+
9126
+ for (int il = 0; il < n_layer; ++il) {
9127
+ // this block is written to closely resemble Gemma3p5DecoderLayer in the python code
9128
+ const bool has_kv = (il < n_layer_kv);
9129
+
9130
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
9131
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
9132
+
9133
+ ggml_tensor * cur = inpL; // [n_embd, n_tokens, n_altup]
9134
+ ggml_tensor * predictions = altup_predict(cur, il); // [n_embd, n_tokens, n_altup]
9135
+
9136
+ // predicted value will go through self-attention and laurel
9137
+ ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); // [n_embd, n_tokens]
9138
+ cur = active_prediction;
9139
+ cb(cur, "active_prediction", il);
9140
+
9141
+ // norm
9142
+ cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
9143
+ cb(cur, "attn_norm", il);
9144
+
9145
+ // laurel
9146
+ ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
9147
+
9148
+ // self-attention
9149
+ if (has_kv) {
9150
+ // compute Q and K and RoPE them
9151
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
9152
+ cb(Qcur, "Qcur", il);
9153
+
9154
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
9155
+ cb(Kcur, "Kcur", il);
9156
+
9157
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
9158
+ cb(Vcur, "Vcur", il);
9159
+
9160
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9161
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9162
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
9163
+
9164
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
9165
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
9166
+ Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps);
9167
+
9168
+ cb(Qcur, "Qcur_normed", il);
9169
+ cb(Kcur, "Kcur_normed", il);
9170
+ cb(Vcur, "Vcur_normed", il);
9171
+
9172
+ Qcur = ggml_rope_ext(
9173
+ ctx0, Qcur, inp_pos, nullptr,
9174
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
9175
+ ext_factor, attn_factor, beta_fast, beta_slow);
9176
+
9177
+ Kcur = ggml_rope_ext(
9178
+ ctx0, Kcur, inp_pos, nullptr,
9179
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
9180
+ ext_factor, attn_factor, beta_fast, beta_slow);
9181
+
9182
+ cb(Qcur, "Qcur_pos", il);
9183
+ cb(Kcur, "Kcur_pos", il);
9184
+
9185
+ cur = build_attn(inp_attn, gf,
9186
+ model.layers[il].wo, NULL,
9187
+ Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
9188
+ } else {
9189
+ // no KV layers
9190
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
9191
+ cb(Qcur, "Qcur", il);
9192
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9193
+
9194
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
9195
+ cb(Qcur, "Qcur_normed", il);
9196
+
9197
+ Qcur = ggml_rope_ext(
9198
+ ctx0, Qcur, inp_pos, nullptr,
9199
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
9200
+ ext_factor, attn_factor, beta_fast, beta_slow);
9201
+ cb(Qcur, "Qcur_pos", il);
9202
+
9203
+ cur = build_attn(inp_attn, gf,
9204
+ model.layers[il].wo, NULL,
9205
+ Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
9206
+ }
9207
+
9208
+ cur = build_norm(cur,
9209
+ model.layers[il].attn_post_norm, NULL,
9210
+ LLM_NORM_RMS, il);
9211
+ cb(cur, "attn_post_norm", il);
9212
+
9213
+ cur = ggml_add(ctx0, cur, active_prediction); // [n_embd, n_tokens]
9214
+ cb(cur, "attn_gated", il);
9215
+
9216
+ ggml_tensor * attn_laurel = ggml_scale(ctx0,
9217
+ ggml_add(ctx0, cur, laurel_out),
9218
+ 1.0f / sqrtf(2.0f)); // [n_embd, n_tokens]
9219
+ cb(attn_laurel, "attn_laurel", il);
9220
+
9221
+ cur = build_norm(attn_laurel,
9222
+ model.layers[il].ffn_norm, NULL,
9223
+ LLM_NORM_RMS, il);
9224
+ cb(cur, "ffn_norm", il);
9225
+
9226
+ // feed-forward network
9227
+ {
9228
+ ggml_tensor * up_proj = build_lora_mm(model.layers[il].ffn_up, cur);
9229
+ ggml_tensor * gate_proj = build_lora_mm(model.layers[il].ffn_gate, cur);
9230
+
9231
+ if (il < n_layer_sparsity) {
9232
+ // apply activation sparsity
9233
+ gate_proj = gaussian_topk(gate_proj);
9234
+ }
9235
+ gate_proj = ggml_gelu(ctx0, gate_proj);
9236
+
9237
+ cur = ggml_mul(ctx0, up_proj, gate_proj);
9238
+ cur = build_lora_mm(model.layers[il].ffn_down, cur);
9239
+ cb(cur, "ffn_out", il);
9240
+ }
9241
+
9242
+ cur = build_norm(cur,
9243
+ model.layers[il].ffn_post_norm, NULL,
9244
+ LLM_NORM_RMS, -1);
9245
+ cb(cur, "ffn_post_norm", il);
9246
+
9247
+ ggml_tensor * attn_ffw_laurel_gated = ggml_add(ctx0, cur, attn_laurel); // [n_embd, n_tokens]
9248
+ cb(attn_ffw_laurel_gated, "attn_ffw_laurel_gated", il);
9249
+
9250
+ ggml_tensor * corrected = altup_correct(predictions, attn_ffw_laurel_gated, il); // [n_embd, n_tokens, n_altup]
9251
+
9252
+ ggml_tensor * first_prediction; // [n_embd, n_tokens]
9253
+ {
9254
+ first_prediction = view_2d_slice(corrected, i_altup_act); // [n_embd, n_tokens]
9255
+ first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale);
9256
+ first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction);
9257
+ first_prediction = ggml_gelu(ctx0, first_prediction); // [n_embd_altup, n_tokens]
9258
+ cb(first_prediction, "first_prediction_gated", il);
9259
+ ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_altup, n_tokens]
9260
+ first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer); // [n_embd_altup, n_tokens]
9261
+ cb(first_prediction, "first_prediction_scaled", il);
9262
+
9263
+ first_prediction = build_lora_mm(model.layers[il].per_layer_proj, first_prediction); // [n_embd, n_tokens]
9264
+ first_prediction = build_norm(first_prediction,
9265
+ model.layers[il].per_layer_post_norm, NULL,
9266
+ LLM_NORM_RMS, il);
9267
+ cb(first_prediction, "first_prediction_out", il);
9268
+ }
9269
+
9270
+ // equivalent to python code: corrected_predictions[1:] += first_prediction
9271
+ {
9272
+ ggml_tensor * slice_first = view_2d_slice(corrected, 0);
9273
+ ggml_tensor * slice_rest = ggml_view_3d(ctx0, corrected, n_embd, n_tokens, n_altup - 1,
9274
+ ggml_row_size(corrected->type, n_embd),
9275
+ ggml_row_size(corrected->type, n_embd*n_tokens),
9276
+ n_embd*n_tokens*ggml_element_size(corrected));
9277
+ ggml_tensor * tmp = ggml_add(ctx0, slice_rest, first_prediction); // [n_embd, n_tokens, n_altup - 1]
9278
+ corrected = ggml_concat(ctx0, slice_first, tmp, 2); // [n_embd, n_tokens, n_altup]
9279
+ }
9280
+
9281
+ cur = corrected; // [n_embd, n_tokens, n_altup]
9282
+ cur = build_cvec(cur, il);
9283
+ cb(cur, "l_out", il);
9284
+
9285
+ // input for next layer
9286
+ inpL = cur;
9287
+ }
9288
+
9289
+ cur = inpL; // [n_embd, n_tokens, n_altup]
9290
+
9291
+ // cur now has multiple altup(s), we want to merge them back to 1 altup
9292
+ {
9293
+ ggml_tensor * target_magnitude = calc_magnitude(view_2d_slice(cur, i_altup_act)); // [n_embd, n_tokens]
9294
+ // do a view to skip the first slice (active altup)
9295
+ ggml_tensor * alt_slice = ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1,
9296
+ ggml_row_size(cur->type, n_embd),
9297
+ ggml_row_size(cur->type, n_embd*n_tokens),
9298
+ n_embd*n_tokens*ggml_element_size(cur));
9299
+ ggml_tensor * altup_unembd = ggml_mul_mat(ctx0, model.altup_unembd_proj, alt_slice); // shape: [n_embd, n_tokens, n_altup - 1]
9300
+ ggml_tensor * new_magnitude = calc_magnitude(altup_unembd);
9301
+ altup_unembd = ggml_div(ctx0,
9302
+ ggml_mul(ctx0, altup_unembd, target_magnitude),
9303
+ new_magnitude);
9304
+ cb(altup_unembd, "altup_unembd", -1);
9305
+
9306
+ // equivalent to torch.mean(hidden_states, dim=0)
9307
+ cur = view_2d_slice(cur, 0); // [n_embd, n_tokens]
9308
+ for (int i = 0; i < n_altup - 1; ++i) {
9309
+ cur = ggml_add(ctx0, cur, view_2d_slice(altup_unembd, i));
9310
+ }
9311
+ cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup)); // [n_embd, n_tokens]
9312
+ cb(cur, "unembd_merged", -1);
9313
+ }
9314
+
9315
+ // cur now has shape: [n_embd, n_tokens]
9316
+
9317
+ // TODO: move this to right after the last KV layer
9318
+ {
9319
+ // skip computing output for unused tokens
9320
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9321
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9322
+ }
9323
+
9324
+ cur = build_norm(cur,
9325
+ model.output_norm, NULL,
9326
+ LLM_NORM_RMS, -1);
9327
+
9328
+ cb(cur, "result_norm", -1);
9329
+ res->t_embd = cur;
9330
+
9331
+ cur = build_lora_mm(model.output, cur);
9332
+
9333
+ {
9334
+ // final logit soft-capping
9335
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
9336
+ cur = ggml_tanh(ctx0, cur);
9337
+ cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
9338
+ }
9339
+
9340
+ cb(cur, "result_output", -1);
9341
+ res->t_logits = cur;
9342
+
9343
+ ggml_build_forward_expand(gf, cur);
9344
+ }
9345
+
9346
+ ggml_tensor * calc_magnitude(ggml_tensor * x) {
9347
+ return ggml_sqrt(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, x)));
9348
+ }
9349
+
9350
+ // get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
9351
+ ggml_tensor * view_2d_slice(ggml_tensor * x, int idx) {
9352
+ GGML_ASSERT(idx < (int)x->ne[2]);
9353
+ return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1],
9354
+ ggml_row_size(x->type, x->ne[0]),
9355
+ idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
9356
+ }
9357
+
9358
+ // equivalent to get_per_layer_inputs() in python code
9359
+ // output shape: [n_embd_altup, n_layer, n_tokens]
9360
+ ggml_tensor * get_per_layer_inputs() {
9361
+ auto inp = std::make_unique<llm_graph_input_embd>();
9362
+ ggml_tensor * inp_per_layer;
9363
+ if (ubatch.token) {
9364
+ inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
9365
+ ggml_set_input(inp->tokens);
9366
+ res->t_tokens = inp->tokens;
9367
+ inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
9368
+ inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
9369
+ inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float)n_embd_altup));
9370
+ cb(inp_per_layer, "inp_per_layer_selected", -1);
9371
+ } else {
9372
+ GGML_ABORT("TODO: support embd input");
9373
+ }
9374
+ res->add_input(std::move(inp));
9375
+ return inp_per_layer;
9376
+ }
9377
+
9378
+ // equivalent to project_per_layer_inputs() in python code
9379
+ // this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
9380
+ // output shape: [n_embd_altup, n_tokens, n_layer]
9381
+ ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
9382
+ const float per_layer_projection_scale = 1.0f / sqrtf((float)n_embd);
9383
+ const float per_layer_input_scale = 1.0f / sqrtf(2.0f);
9384
+
9385
+ ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
9386
+ per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale);
9387
+ per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_altup, n_layer, n_tokens);
9388
+ per_layer_proj = build_norm(per_layer_proj,
9389
+ model.per_layer_proj_norm, NULL,
9390
+ LLM_NORM_RMS, -1); // [n_embd_altup, n_layer, n_tokens]
9391
+ cb(per_layer_proj, "per_layer_proj", -1);
9392
+
9393
+ inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
9394
+ inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
9395
+ cb(inp_per_layer, "inp_per_layer", -1);
9396
+
9397
+ // permute to shape: [n_embd_altup, n_tokens, n_layer]
9398
+ inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3));
9399
+ return inp_per_layer;
9400
+ }
9401
+
9402
+ // input cur shape: [n_embd, n_tokens]
9403
+ // output shape: [n_embd, n_tokens]
9404
+ ggml_tensor * laurel(ggml_tensor * cur, int il) {
9405
+ ggml_tensor * tmp = cur;
9406
+ tmp = build_lora_mm(model.layers[il].laurel_l, tmp);
9407
+ tmp = build_lora_mm(model.layers[il].laurel_r, tmp);
9408
+ tmp = build_norm(tmp, model.layers[il].laurel_post_norm, NULL, LLM_NORM_RMS, il);
9409
+ tmp = ggml_add(ctx0, tmp, cur);
9410
+ cb(tmp, "laurel_out", il);
9411
+ return tmp;
9412
+ }
9413
+
9414
+ // input x shape: [n_embd, n_tokens]
9415
+ // output shape: [n_embd, n_tokens]
9416
+ ggml_tensor * gaussian_topk(ggml_tensor * x) {
9417
+ ggml_tensor * mean = ggml_mean(ctx0, x);
9418
+ ggml_tensor * std = ggml_sqrt(ctx0, ggml_scale(ctx0,
9419
+ ggml_sum_rows(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x, mean))),
9420
+ 1.0f / (float)(x->ne[0] - 1)
9421
+ ));
9422
+ ggml_tensor * cutoff_x = ggml_add(ctx0, mean, ggml_scale(ctx0, std, f_sparsity_std_mul));
9423
+ return ggml_relu(ctx0, ggml_sub(ctx0, x, cutoff_x));
9424
+ }
9425
+
9426
+ //
9427
+ // altup functions
9428
+ //
9429
+
9430
+ // equivalent to compute_router_modalities() in python code
9431
+ // input x shape: [n_embd, n_tokens]
9432
+ // output shape: [n_altup, n_tokens]
9433
+ ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il) {
9434
+ ggml_tensor * router_inputs = build_norm(x,
9435
+ model.layers[il].altup_router_norm, NULL,
9436
+ LLM_NORM_RMS, il);
9437
+
9438
+ // router_input_scale
9439
+ router_inputs = ggml_scale(ctx0, router_inputs, 1.0f / (float)n_embd);
9440
+
9441
+ ggml_tensor * output = ggml_mul_mat(ctx0, model.layers[il].altup_router, router_inputs);
9442
+ return ggml_tanh(ctx0, output); // [n_altup, n_tokens]
9443
+ }
9444
+
9445
+ // input cur shape: [n_embd, n_tokens, n_altup]
9446
+ // output shape: [n_embd, n_tokens, n_altup]
9447
+ ggml_tensor * altup_predict(ggml_tensor * cur, int il) {
9448
+ ggml_tensor * activated = view_2d_slice(cur, i_altup_act); // [n_embd, n_tokens]
9449
+ ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
9450
+ cb(modalities, "modalities", il);
9451
+
9452
+ ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_predict_coef, modalities);
9453
+ cb(all_coefs, "all_coefs", il);
9454
+ // the first dim now has n_altup^2 elements; reshape it to 2D (so we end up with a 3D tensor)
9455
+ all_coefs = ggml_reshape_3d(ctx0, all_coefs, n_altup, n_altup, n_tokens);
9456
+
9457
+ // permute to [n_altup, n_embd, n_tokens]
9458
+ ggml_tensor * cur_permuted = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
9459
+ ggml_tensor * predictions = ggml_mul_mat(ctx0, cur_permuted, all_coefs); // [n_embd, n_altup, n_tokens]
9460
+
9461
+ // final shape must be the same as cur: [n_embd, n_tokens, n_altup]
9462
+ predictions = ggml_cont(ctx0, ggml_permute(ctx0, predictions, 0, 2, 1, 3));
9463
+ predictions = ggml_add(ctx0, predictions, cur);
9464
+ cb(predictions, "predictions", il);
9465
+
9466
+ return predictions;
9467
+ }
9468
+
9469
+ // input predictions shape: [n_embd, n_tokens, n_altup]
9470
+ // input activated shape: [n_embd, n_tokens]
9471
+ // output shape: [n_embd, n_tokens, n_altup]
9472
+ ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) {
9473
+ ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
9474
+ cb(modalities, "modalities", il);
9475
+
9476
+ ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act);
9477
+ ggml_tensor * innovation = ggml_sub(ctx0, activated, active_prediction); // [n_embd, n_tokens]
9478
+ cb(innovation, "innovation", il);
9479
+
9480
+ ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
9481
+ all_coefs = ggml_add(ctx0, all_coefs, one);
9482
+ cb(all_coefs, "all_coefs", il);
9483
+ all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
9484
+ all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
9485
+
9486
+ innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
9487
+ ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
9488
+ corrected = ggml_add(ctx0, corrected, predictions); // [n_embd, n_tokens, n_altup]
9489
+ cb(corrected, "corrected", il);
9490
+
9491
+ return corrected;
9492
+ }
9493
+ };
9494
+
8980
9495
  // TODO: move up next to build_starcoder
8981
9496
  struct llm_build_starcoder2 : public llm_graph_context {
8982
9497
  llm_build_starcoder2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
@@ -8995,6 +9510,8 @@ struct llm_build_starcoder2 : public llm_graph_context {
8995
9510
 
8996
9511
  auto * inp_attn = build_attn_inp_kv_unified();
8997
9512
 
9513
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9514
+
8998
9515
  for (int il = 0; il < n_layer; ++il) {
8999
9516
  ggml_tensor * inpSA = inpL;
9000
9517
 
@@ -9053,9 +9570,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
9053
9570
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9054
9571
  }
9055
9572
 
9056
- if (il == n_layer - 1) {
9057
- // skip computing output for unused tokens
9058
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9573
+ if (il == n_layer - 1 && inp_out_ids) {
9059
9574
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9060
9575
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
9061
9576
  }
@@ -9118,6 +9633,8 @@ struct llm_build_mamba : public llm_graph_context {
9118
9633
 
9119
9634
  auto * rs_inp = build_rs_inp();
9120
9635
 
9636
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9637
+
9121
9638
  for (int il = 0; il < n_layer; ++il) {
9122
9639
  // norm
9123
9640
  cur = build_norm(inpL,
@@ -9127,9 +9644,7 @@ struct llm_build_mamba : public llm_graph_context {
9127
9644
 
9128
9645
  cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
9129
9646
 
9130
- if (il == n_layer - 1) {
9131
- // skip computing output for unused tokens
9132
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9647
+ if (il == n_layer - 1 && inp_out_ids) {
9133
9648
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9134
9649
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9135
9650
  }
@@ -9168,9 +9683,9 @@ struct llm_build_mamba : public llm_graph_context {
9168
9683
  ggml_tensor * cur,
9169
9684
  const llama_ubatch & ubatch,
9170
9685
  int il) const {
9171
- const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
9686
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
9172
9687
 
9173
- const auto kv_head = kv_state->get_head();
9688
+ const auto kv_head = mctx_cur->get_head();
9174
9689
 
9175
9690
  const int64_t d_conv = hparams.ssm_d_conv;
9176
9691
  const int64_t d_inner = hparams.ssm_d_inner;
@@ -9188,8 +9703,8 @@ struct llm_build_mamba : public llm_graph_context {
9188
9703
  GGML_ASSERT(ubatch.equal_seqs);
9189
9704
  GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
9190
9705
 
9191
- ggml_tensor * conv_states_all = kv_state->get_r_l(il);
9192
- ggml_tensor * ssm_states_all = kv_state->get_s_l(il);
9706
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
9707
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
9193
9708
 
9194
9709
  // (ab)using the KV cache to store the states
9195
9710
  ggml_tensor * conv = build_rs(
@@ -9311,13 +9826,15 @@ struct llm_build_command_r : public llm_graph_context {
9311
9826
 
9312
9827
  auto * inp_attn = build_attn_inp_kv_unified();
9313
9828
 
9314
- for (int il = 0; il < n_layer; ++il) {
9829
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9315
9830
 
9831
+ for (int il = 0; il < n_layer; ++il) {
9316
9832
  // norm
9317
9833
  cur = build_norm(inpL,
9318
9834
  model.layers[il].attn_norm, NULL,
9319
9835
  LLM_NORM, il);
9320
9836
  cb(cur, "attn_norm", il);
9837
+
9321
9838
  ggml_tensor * ffn_inp = cur;
9322
9839
 
9323
9840
  // self-attention
@@ -9385,9 +9902,7 @@ struct llm_build_command_r : public llm_graph_context {
9385
9902
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9386
9903
  }
9387
9904
 
9388
- if (il == n_layer - 1) {
9389
- // skip computing output for unused tokens
9390
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9905
+ if (il == n_layer - 1 && inp_out_ids) {
9391
9906
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9392
9907
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9393
9908
  ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
@@ -9458,6 +9973,8 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
9458
9973
 
9459
9974
  auto * inp_attn = build_attn_inp_kv_unified_iswa();
9460
9975
 
9976
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9977
+
9461
9978
  for (int il = 0; il < n_layer; ++il) {
9462
9979
  const bool is_swa = hparams.is_swa(il);
9463
9980
 
@@ -9520,9 +10037,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
9520
10037
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9521
10038
  }
9522
10039
 
9523
- if (il == n_layer - 1) {
9524
- // skip computing output for unused tokens
9525
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10040
+ if (il == n_layer - 1 && inp_out_ids) {
9526
10041
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9527
10042
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9528
10043
  ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
@@ -9593,6 +10108,8 @@ struct llm_build_olmo : public llm_graph_context {
9593
10108
 
9594
10109
  auto * inp_attn = build_attn_inp_kv_unified();
9595
10110
 
10111
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10112
+
9596
10113
  for (int il = 0; il < n_layer; ++il) {
9597
10114
  ggml_tensor * inpSA = inpL;
9598
10115
 
@@ -9651,9 +10168,7 @@ struct llm_build_olmo : public llm_graph_context {
9651
10168
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9652
10169
  }
9653
10170
 
9654
- if (il == n_layer - 1) {
9655
- // skip computing output for unused tokens
9656
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10171
+ if (il == n_layer - 1 && inp_out_ids) {
9657
10172
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9658
10173
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
9659
10174
  }
@@ -9721,6 +10236,8 @@ struct llm_build_olmo2 : public llm_graph_context {
9721
10236
 
9722
10237
  auto * inp_attn = build_attn_inp_kv_unified();
9723
10238
 
10239
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10240
+
9724
10241
  for (int il = 0; il < n_layer; ++il) {
9725
10242
  ggml_tensor * inpSA = inpL;
9726
10243
 
@@ -9771,18 +10288,16 @@ struct llm_build_olmo2 : public llm_graph_context {
9771
10288
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9772
10289
  }
9773
10290
 
10291
+ if (il == n_layer - 1 && inp_out_ids) {
10292
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10293
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10294
+ }
10295
+
9774
10296
  cur = build_norm(cur,
9775
10297
  model.layers[il].attn_post_norm, NULL,
9776
10298
  LLM_NORM_RMS, il);
9777
10299
  cb(cur, "attn_post_norm", il);
9778
10300
 
9779
- if (il == n_layer - 1) {
9780
- // skip computing output for unused tokens
9781
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9782
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9783
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
9784
- }
9785
-
9786
10301
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
9787
10302
  cb(ffn_inp, "ffn_inp", il);
9788
10303
 
@@ -9850,6 +10365,8 @@ struct llm_build_olmoe : public llm_graph_context {
9850
10365
 
9851
10366
  auto * inp_attn = build_attn_inp_kv_unified();
9852
10367
 
10368
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10369
+
9853
10370
  for (int il = 0; il < n_layer; ++il) {
9854
10371
  ggml_tensor * inpSA = inpL;
9855
10372
 
@@ -9904,9 +10421,7 @@ struct llm_build_olmoe : public llm_graph_context {
9904
10421
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9905
10422
  }
9906
10423
 
9907
- if (il == n_layer - 1) {
9908
- // skip computing output for unused tokens
9909
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10424
+ if (il == n_layer - 1 && inp_out_ids) {
9910
10425
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9911
10426
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
9912
10427
  }
@@ -9976,6 +10491,8 @@ struct llm_build_openelm : public llm_graph_context {
9976
10491
 
9977
10492
  auto * inp_attn = build_attn_inp_kv_unified();
9978
10493
 
10494
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10495
+
9979
10496
  for (int il = 0; il < n_layer; ++il) {
9980
10497
  const int64_t n_head = hparams.n_head(il);
9981
10498
  const int64_t n_head_kv = hparams.n_head_kv(il);
@@ -10037,11 +10554,9 @@ struct llm_build_openelm : public llm_graph_context {
10037
10554
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10038
10555
  }
10039
10556
 
10040
- if (il == n_layer - 1) {
10041
- // skip computing output for unused tokens
10042
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10557
+ if (il == n_layer - 1 && inp_out_ids) {
10043
10558
  residual = ggml_get_rows(ctx0, residual, inp_out_ids);
10044
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10559
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10045
10560
  }
10046
10561
 
10047
10562
  ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
@@ -10107,6 +10622,8 @@ struct llm_build_gptneox : public llm_graph_context {
10107
10622
 
10108
10623
  auto * inp_attn = build_attn_inp_kv_unified();
10109
10624
 
10625
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10626
+
10110
10627
  for (int il = 0; il < n_layer; ++il) {
10111
10628
  cur = build_norm(inpL,
10112
10629
  model.layers[il].attn_norm,
@@ -10151,9 +10668,7 @@ struct llm_build_gptneox : public llm_graph_context {
10151
10668
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10152
10669
  }
10153
10670
 
10154
- if (il == n_layer - 1) {
10155
- // skip computing output for unused tokens
10156
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10671
+ if (il == n_layer - 1 && inp_out_ids) {
10157
10672
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10158
10673
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
10159
10674
  }
@@ -10255,6 +10770,8 @@ struct llm_build_arctic : public llm_graph_context {
10255
10770
 
10256
10771
  auto * inp_attn = build_attn_inp_kv_unified();
10257
10772
 
10773
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10774
+
10258
10775
  for (int il = 0; il < n_layer; ++il) {
10259
10776
  ggml_tensor * inpSA = inpL;
10260
10777
 
@@ -10301,9 +10818,7 @@ struct llm_build_arctic : public llm_graph_context {
10301
10818
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10302
10819
  }
10303
10820
 
10304
- if (il == n_layer - 1) {
10305
- // skip computing output for unused tokens
10306
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10821
+ if (il == n_layer - 1 && inp_out_ids) {
10307
10822
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10308
10823
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10309
10824
  }
@@ -10395,6 +10910,8 @@ struct llm_build_deepseek : public llm_graph_context {
10395
10910
 
10396
10911
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
10397
10912
 
10913
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10914
+
10398
10915
  for (int il = 0; il < n_layer; ++il) {
10399
10916
  ggml_tensor * inpSA = inpL;
10400
10917
 
@@ -10456,14 +10973,11 @@ struct llm_build_deepseek : public llm_graph_context {
10456
10973
  Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
10457
10974
  }
10458
10975
 
10459
- if (il == n_layer - 1) {
10460
- // skip computing output for unused tokens
10461
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10976
+ if (il == n_layer - 1 && inp_out_ids) {
10462
10977
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10463
10978
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10464
10979
  }
10465
10980
 
10466
-
10467
10981
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
10468
10982
  cb(ffn_inp, "ffn_inp", il);
10469
10983
 
@@ -10571,6 +11085,8 @@ struct llm_build_deepseek2 : public llm_graph_context {
10571
11085
 
10572
11086
  auto * inp_attn = build_attn_inp_kv_unified();
10573
11087
 
11088
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11089
+
10574
11090
  for (int il = 0; il < n_layer; ++il) {
10575
11091
  ggml_tensor * inpSA = inpL;
10576
11092
 
@@ -10720,9 +11236,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
10720
11236
  }
10721
11237
  }
10722
11238
 
10723
- if (il == n_layer - 1) {
10724
- // skip computing output for unused tokens
10725
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11239
+ if (il == n_layer - 1 && inp_out_ids) {
10726
11240
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10727
11241
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10728
11242
  }
@@ -10818,6 +11332,8 @@ struct llm_build_bitnet : public llm_graph_context {
10818
11332
 
10819
11333
  auto * inp_attn = build_attn_inp_kv_unified();
10820
11334
 
11335
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11336
+
10821
11337
  for (int il = 0; il < n_layer; ++il) {
10822
11338
  ggml_tensor * inpSA = inpL;
10823
11339
 
@@ -10900,9 +11416,7 @@ struct llm_build_bitnet : public llm_graph_context {
10900
11416
  cb(cur, "attn_o_out", il);
10901
11417
  }
10902
11418
 
10903
- if (il == n_layer - 1) {
10904
- // skip computing output for unused tokens
10905
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11419
+ if (il == n_layer - 1 && inp_out_ids) {
10906
11420
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10907
11421
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10908
11422
  }
@@ -10977,6 +11491,8 @@ struct llm_build_t5_enc : public llm_graph_context {
10977
11491
 
10978
11492
  auto * inp_attn = build_attn_inp_no_cache();
10979
11493
 
11494
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11495
+
10980
11496
  for (int il = 0; il < n_layer; ++il) {
10981
11497
  ggml_tensor * inpSA = inpL;
10982
11498
 
@@ -11010,9 +11526,7 @@ struct llm_build_t5_enc : public llm_graph_context {
11010
11526
  cb(cur, "kqv_out", il);
11011
11527
  }
11012
11528
 
11013
- if (il == n_layer - 1) {
11014
- // skip computing output for unused tokens
11015
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11529
+ if (il == n_layer - 1 && inp_out_ids) {
11016
11530
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11017
11531
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11018
11532
  }
@@ -11083,6 +11597,8 @@ struct llm_build_t5_dec : public llm_graph_context {
11083
11597
  auto * inp_attn_self = build_attn_inp_kv_unified();
11084
11598
  auto * inp_attn_cross = build_attn_inp_cross();
11085
11599
 
11600
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11601
+
11086
11602
  for (int il = 0; il < n_layer; ++il) {
11087
11603
  ggml_tensor * inpSA = inpL;
11088
11604
 
@@ -11174,11 +11690,8 @@ struct llm_build_t5_dec : public llm_graph_context {
11174
11690
  //cb(cur, "kqv_out", il);
11175
11691
  }
11176
11692
 
11177
- if (il == n_layer - 1) {
11178
- // skip computing output for unused tokens
11179
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11693
+ if (il == n_layer - 1 && inp_out_ids) {
11180
11694
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11181
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11182
11695
  inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
11183
11696
  }
11184
11697
 
@@ -11248,6 +11761,8 @@ struct llm_build_jais : public llm_graph_context {
11248
11761
 
11249
11762
  auto * inp_attn = build_attn_inp_kv_unified();
11250
11763
 
11764
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11765
+
11251
11766
  for (int il = 0; il < n_layer; ++il) {
11252
11767
  cur = build_norm(inpL,
11253
11768
  model.layers[il].attn_norm,
@@ -11280,9 +11795,7 @@ struct llm_build_jais : public llm_graph_context {
11280
11795
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
11281
11796
  }
11282
11797
 
11283
- if (il == n_layer - 1) {
11284
- // skip computing output for unused tokens
11285
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11798
+ if (il == n_layer - 1 && inp_out_ids) {
11286
11799
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11287
11800
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
11288
11801
  }
@@ -11346,6 +11859,8 @@ struct llm_build_chatglm : public llm_graph_context {
11346
11859
 
11347
11860
  auto * inp_attn = build_attn_inp_kv_unified();
11348
11861
 
11862
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11863
+
11349
11864
  for (int il = 0; il < n_layer; ++il) {
11350
11865
  ggml_tensor * inpSA = inpL;
11351
11866
 
@@ -11412,9 +11927,7 @@ struct llm_build_chatglm : public llm_graph_context {
11412
11927
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11413
11928
  }
11414
11929
 
11415
- if (il == n_layer - 1) {
11416
- // skip computing output for unused tokens
11417
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11930
+ if (il == n_layer - 1 && inp_out_ids) {
11418
11931
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11419
11932
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11420
11933
  }
@@ -11479,6 +11992,8 @@ struct llm_build_glm4 : public llm_graph_context {
11479
11992
 
11480
11993
  auto * inp_attn = build_attn_inp_kv_unified();
11481
11994
 
11995
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11996
+
11482
11997
  for (int il = 0; il < n_layer; ++il) {
11483
11998
  ggml_tensor * inpSA = inpL;
11484
11999
 
@@ -11545,9 +12060,7 @@ struct llm_build_glm4 : public llm_graph_context {
11545
12060
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11546
12061
  }
11547
12062
 
11548
- if (il == n_layer - 1) {
11549
- // skip computing output for unused tokens
11550
- ggml_tensor * inp_out_ids = build_inp_out_ids();
12063
+ if (il == n_layer - 1 && inp_out_ids) {
11551
12064
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11552
12065
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11553
12066
  }
@@ -11630,6 +12143,8 @@ struct llm_build_nemotron : public llm_graph_context {
11630
12143
 
11631
12144
  auto * inp_attn = build_attn_inp_kv_unified();
11632
12145
 
12146
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12147
+
11633
12148
  for (int il = 0; il < n_layer; ++il) {
11634
12149
  ggml_tensor * inpSA = inpL;
11635
12150
 
@@ -11689,9 +12204,7 @@ struct llm_build_nemotron : public llm_graph_context {
11689
12204
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11690
12205
  }
11691
12206
 
11692
- if (il == n_layer - 1) {
11693
- // skip computing output for unused tokens
11694
- ggml_tensor * inp_out_ids = build_inp_out_ids();
12207
+ if (il == n_layer - 1 && inp_out_ids) {
11695
12208
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11696
12209
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11697
12210
  }
@@ -11759,6 +12272,8 @@ struct llm_build_exaone : public llm_graph_context {
11759
12272
 
11760
12273
  auto * inp_attn = build_attn_inp_kv_unified();
11761
12274
 
12275
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12276
+
11762
12277
  for (int il = 0; il < n_layer; ++il) {
11763
12278
  ggml_tensor * inpSA = inpL;
11764
12279
 
@@ -11820,9 +12335,7 @@ struct llm_build_exaone : public llm_graph_context {
11820
12335
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11821
12336
  }
11822
12337
 
11823
- if (il == n_layer - 1) {
11824
- // skip computing output for unused tokens
11825
- ggml_tensor * inp_out_ids = build_inp_out_ids();
12338
+ if (il == n_layer - 1 && inp_out_ids) {
11826
12339
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11827
12340
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11828
12341
  }
@@ -11915,7 +12428,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11915
12428
  ggml_tensor * x_prev,
11916
12429
  const llama_ubatch & ubatch,
11917
12430
  int il) const {
11918
- const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
12431
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
11919
12432
 
11920
12433
  const auto n_tokens = ubatch.n_tokens;
11921
12434
  const auto n_seqs = ubatch.n_seqs;
@@ -11925,7 +12438,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11925
12438
  const auto n_head = n_embd / head_size;
11926
12439
  const auto n_head_kv = hparams.n_head_kv(il);
11927
12440
 
11928
- const auto kv_head = kv_state->get_head();
12441
+ const auto kv_head = mctx_cur->get_head();
11929
12442
 
11930
12443
  const auto & layer = model.layers[il];
11931
12444
 
@@ -12037,7 +12550,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
12037
12550
  }
12038
12551
 
12039
12552
  ggml_tensor * wkv_state = build_rs(
12040
- inp, gf, kv_state->get_s_l(il),
12553
+ inp, gf, mctx_cur->get_s_l(il),
12041
12554
  hparams.n_embd_s(), n_seqs);
12042
12555
 
12043
12556
  ggml_tensor * wkv_output;
@@ -12056,9 +12569,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
12056
12569
  wkv_state,
12057
12570
  ggml_view_1d(
12058
12571
  ctx0,
12059
- kv_state->get_s_l(il),
12572
+ mctx_cur->get_s_l(il),
12060
12573
  hparams.n_embd_s() * n_seqs,
12061
- hparams.n_embd_s() * kv_head * ggml_element_size(kv_state->get_s_l(il))
12574
+ hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
12062
12575
  )
12063
12576
  )
12064
12577
  );
@@ -12098,6 +12611,8 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
12098
12611
  const auto n_seq_tokens = ubatch.n_seq_tokens;
12099
12612
  const auto n_seqs = ubatch.n_seqs;
12100
12613
 
12614
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12615
+
12101
12616
  for (int il = 0; il < n_layer; ++il) {
12102
12617
  const llama_layer * layer = &model.layers[il];
12103
12618
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
@@ -12139,13 +12654,16 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
12139
12654
  );
12140
12655
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
12141
12656
 
12142
- if (il == n_layer - 1) {
12143
- // skip computing output for unused tokens
12144
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
12145
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
12146
- ffn_norm = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens), inp_out_ids);
12147
- x_prev = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids);
12148
- cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
12657
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
12658
+ ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
12659
+ x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
12660
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
12661
+
12662
+ if (il == n_layer - 1 && inp_out_ids) {
12663
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
12664
+ ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
12665
+ x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
12666
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12149
12667
  }
12150
12668
 
12151
12669
  cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
@@ -12193,6 +12711,8 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
12193
12711
  const auto n_seq_tokens = ubatch.n_seq_tokens;
12194
12712
  const auto n_seqs = ubatch.n_seqs;
12195
12713
 
12714
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12715
+
12196
12716
  for (int il = 0; il < n_layer; ++il) {
12197
12717
  const llama_layer * layer = &model.layers[il];
12198
12718
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
@@ -12217,11 +12737,12 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
12217
12737
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
12218
12738
  cb(ffn_inp, "ffn_inp", il);
12219
12739
 
12220
- if (il == n_layer - 1) {
12221
- // skip computing output for unused tokens
12222
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
12223
- cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
12224
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
12740
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
12741
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
12742
+
12743
+ if (il == n_layer - 1 && inp_out_ids) {
12744
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12745
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
12225
12746
  }
12226
12747
 
12227
12748
  // feed-forward network
@@ -12304,7 +12825,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12304
12825
  ggml_tensor *& first_layer_value,
12305
12826
  const llama_ubatch & ubatch,
12306
12827
  int il) const {
12307
- const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
12828
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
12308
12829
 
12309
12830
  const auto n_tokens = ubatch.n_tokens;
12310
12831
  const auto n_seqs = ubatch.n_seqs;
@@ -12313,7 +12834,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12313
12834
  const auto head_count = n_embd / head_size;
12314
12835
  const auto n_seq_tokens = ubatch.n_seq_tokens;
12315
12836
 
12316
- const auto kv_head = kv_state->get_head();
12837
+ const auto kv_head = mctx_cur->get_head();
12317
12838
 
12318
12839
  const auto & layer = model.layers[il];
12319
12840
 
@@ -12384,7 +12905,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12384
12905
  a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
12385
12906
 
12386
12907
  ggml_tensor * wkv_state = build_rs(
12387
- inp, gf, kv_state->get_s_l(il),
12908
+ inp, gf, mctx_cur->get_s_l(il),
12388
12909
  hparams.n_embd_s(), n_seqs);
12389
12910
 
12390
12911
  ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
@@ -12398,9 +12919,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12398
12919
  wkv_state,
12399
12920
  ggml_view_1d(
12400
12921
  ctx0,
12401
- kv_state->get_s_l(il),
12922
+ mctx_cur->get_s_l(il),
12402
12923
  hparams.n_embd_s() * n_seqs,
12403
- hparams.n_embd_s() * kv_head * ggml_element_size(kv_state->get_s_l(il))
12924
+ hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
12404
12925
  )
12405
12926
  )
12406
12927
  );
@@ -12447,6 +12968,8 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12447
12968
  const auto n_seq_tokens = ubatch.n_seq_tokens;
12448
12969
  const auto n_seqs = ubatch.n_seqs;
12449
12970
 
12971
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12972
+
12450
12973
  for (int il = 0; il < n_layer; ++il) {
12451
12974
  const llama_layer * layer = &model.layers[il];
12452
12975
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
@@ -12488,12 +13011,14 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12488
13011
  );
12489
13012
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
12490
13013
 
12491
- if (il == n_layer - 1) {
12492
- // skip computing output for unused tokens
12493
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
12494
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
12495
- ffn_norm = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens), inp_out_ids);
12496
- x_prev = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids);
13014
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
13015
+ ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
13016
+ x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
13017
+
13018
+ if (il == n_layer - 1 && inp_out_ids) {
13019
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
13020
+ ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
13021
+ x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
12497
13022
  }
12498
13023
 
12499
13024
  cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
@@ -12538,6 +13063,8 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12538
13063
  const auto n_seq_tokens = ubatch.n_seq_tokens;
12539
13064
  const auto n_seqs = ubatch.n_seqs;
12540
13065
 
13066
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13067
+
12541
13068
  for (int il = 0; il < n_layer; ++il) {
12542
13069
  const llama_layer * layer = &model.layers[il];
12543
13070
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
@@ -12562,11 +13089,12 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12562
13089
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
12563
13090
  cb(ffn_inp, "ffn_inp", il);
12564
13091
 
12565
- if (il == n_layer - 1) {
12566
- // skip computing output for unused tokens
12567
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
12568
- cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
12569
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
13092
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
13093
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
13094
+
13095
+ if (il == n_layer - 1 && inp_out_ids) {
13096
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13097
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
12570
13098
  }
12571
13099
 
12572
13100
  // feed-forward network
@@ -12635,6 +13163,9 @@ struct llm_build_granite : public llm_graph_context {
12635
13163
  auto * inp_attn = build_attn_inp_kv_unified();
12636
13164
 
12637
13165
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
13166
+
13167
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13168
+
12638
13169
  for (int il = 0; il < n_layer; ++il) {
12639
13170
  ggml_tensor * inpSA = inpL;
12640
13171
 
@@ -12697,9 +13228,7 @@ struct llm_build_granite : public llm_graph_context {
12697
13228
  cb(cur, "attn_out", il);
12698
13229
  }
12699
13230
 
12700
- if (il == n_layer - 1) {
12701
- // skip computing output for unused tokens
12702
- ggml_tensor * inp_out_ids = build_inp_out_ids();
13231
+ if (il == n_layer - 1 && inp_out_ids) {
12703
13232
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12704
13233
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
12705
13234
  }
@@ -12818,6 +13347,8 @@ struct llm_build_chameleon : public llm_graph_context {
12818
13347
 
12819
13348
  auto * inp_attn = build_attn_inp_kv_unified();
12820
13349
 
13350
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13351
+
12821
13352
  for (int il = 0; il < n_layer; ++il) {
12822
13353
  ggml_tensor * inpSA = inpL;
12823
13354
 
@@ -12894,21 +13425,19 @@ struct llm_build_chameleon : public llm_graph_context {
12894
13425
  cur = build_attn(inp_attn, gf,
12895
13426
  model.layers[il].wo, nullptr,
12896
13427
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12897
-
12898
- if (hparams.swin_norm) {
12899
- cur = build_norm(cur,
12900
- model.layers[il].attn_norm, NULL,
12901
- LLM_NORM_RMS, il);
12902
- }
12903
13428
  }
12904
13429
 
12905
- if (il == n_layer - 1) {
12906
- // skip computing output for unused tokens
12907
- ggml_tensor * inp_out_ids = build_inp_out_ids();
13430
+ if (il == n_layer - 1 && inp_out_ids) {
12908
13431
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12909
13432
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
12910
13433
  }
12911
13434
 
13435
+ if (hparams.swin_norm) {
13436
+ cur = build_norm(cur,
13437
+ model.layers[il].attn_norm, NULL,
13438
+ LLM_NORM_RMS, il);
13439
+ }
13440
+
12912
13441
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
12913
13442
  cb(ffn_inp, "ffn_inp", il);
12914
13443
 
@@ -13149,6 +13678,8 @@ struct llm_build_plm : public llm_graph_context {
13149
13678
 
13150
13679
  auto * inp_attn = build_attn_inp_kv_unified();
13151
13680
 
13681
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13682
+
13152
13683
  for (int il = 0; il < n_layer; ++il) {
13153
13684
  ggml_tensor * inpSA = inpL;
13154
13685
 
@@ -13252,9 +13783,7 @@ struct llm_build_plm : public llm_graph_context {
13252
13783
  q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
13253
13784
  }
13254
13785
 
13255
- if (il == n_layer - 1) {
13256
- // skip computing output for unused tokens
13257
- ggml_tensor * inp_out_ids = build_inp_out_ids();
13786
+ if (il == n_layer - 1 && inp_out_ids) {
13258
13787
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13259
13788
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13260
13789
  }
@@ -13314,6 +13843,8 @@ struct llm_build_bailingmoe : public llm_graph_context {
13314
13843
 
13315
13844
  auto * inp_attn = build_attn_inp_kv_unified();
13316
13845
 
13846
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13847
+
13317
13848
  for (int il = 0; il < n_layer; ++il) {
13318
13849
  ggml_tensor * inpSA = inpL;
13319
13850
 
@@ -13375,9 +13906,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
13375
13906
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
13376
13907
  }
13377
13908
 
13378
- if (il == n_layer - 1) {
13379
- // skip computing output for unused tokens
13380
- ggml_tensor * inp_out_ids = build_inp_out_ids();
13909
+ if (il == n_layer - 1 && inp_out_ids) {
13381
13910
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13382
13911
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13383
13912
  }
@@ -13463,6 +13992,8 @@ struct llm_build_dots1 : public llm_graph_context {
13463
13992
 
13464
13993
  auto * inp_attn = build_attn_inp_kv_unified();
13465
13994
 
13995
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13996
+
13466
13997
  for (int il = 0; il < n_layer; ++il) {
13467
13998
  ggml_tensor * inpSA = inpL;
13468
13999
 
@@ -13515,9 +14046,7 @@ struct llm_build_dots1 : public llm_graph_context {
13515
14046
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13516
14047
  }
13517
14048
 
13518
- if (il == n_layer - 1) {
13519
- // skip computing output for unused tokens
13520
- ggml_tensor * inp_out_ids = build_inp_out_ids();
14049
+ if (il == n_layer - 1 && inp_out_ids) {
13521
14050
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13522
14051
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13523
14052
  }
@@ -13615,6 +14144,8 @@ struct llm_build_arcee : public llm_graph_context {
13615
14144
 
13616
14145
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
13617
14146
 
14147
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
14148
+
13618
14149
  for (int il = 0; il < n_layer; ++il) {
13619
14150
  ggml_tensor * inpSA = inpL;
13620
14151
 
@@ -13677,9 +14208,7 @@ struct llm_build_arcee : public llm_graph_context {
13677
14208
  cb(cur, "attn_out", il);
13678
14209
  }
13679
14210
 
13680
- if (il == n_layer - 1) {
13681
- // skip computing output for unused tokens
13682
- ggml_tensor * inp_out_ids = build_inp_out_ids();
14211
+ if (il == n_layer - 1 && inp_out_ids) {
13683
14212
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13684
14213
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13685
14214
  }
@@ -13957,6 +14486,10 @@ llm_graph_result_ptr llama_model::build_graph(
13957
14486
  {
13958
14487
  llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
13959
14488
  } break;
14489
+ case LLM_ARCH_GEMMA3N:
14490
+ {
14491
+ llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params, gf);
14492
+ } break;
13960
14493
  case LLM_ARCH_STARCODER2:
13961
14494
  {
13962
14495
  llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
@@ -14278,6 +14811,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
14278
14811
  case LLM_ARCH_GEMMA:
14279
14812
  case LLM_ARCH_GEMMA2:
14280
14813
  case LLM_ARCH_GEMMA3:
14814
+ case LLM_ARCH_GEMMA3N:
14281
14815
  case LLM_ARCH_STARCODER2:
14282
14816
  case LLM_ARCH_OPENELM:
14283
14817
  case LLM_ARCH_GPTNEOX:
@@ -14360,7 +14894,7 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
14360
14894
  // do not extend this list unless absolutely necessary
14361
14895
  // Mistral-Small-2503 does not have built-in chat template
14362
14896
  llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
14363
- if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
14897
+ if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
14364
14898
  return "mistral-v7-tekken";
14365
14899
  }
14366
14900