@novastera-oss/llamarn 0.3.1 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (347)
  1. package/README.md +86 -3
  2. package/RNLlamaCpp.podspec +1 -1
  3. package/android/CMakeLists.txt +11 -3
  4. package/android/generated/jni/react/renderer/components/RNLlamaCppSpec/RNLlamaCppSpecJSI.h +49 -4
  5. package/android/src/main/cpp/include/llama.h +53 -114
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  12. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  13. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  15. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  16. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  17. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  20. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  21. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  22. package/cpp/LlamaCppModel.cpp +2 -10
  23. package/cpp/PureCppImpl.cpp +71 -4
  24. package/cpp/SystemUtils.cpp +3 -7
  25. package/cpp/build-info.cpp +2 -2
  26. package/cpp/llama.cpp/CMakeLists.txt +2 -0
  27. package/cpp/llama.cpp/CODEOWNERS +1 -1
  28. package/cpp/llama.cpp/Makefile +6 -1605
  29. package/cpp/llama.cpp/README.md +5 -1
  30. package/cpp/llama.cpp/common/arg.cpp +230 -51
  31. package/cpp/llama.cpp/common/chat-parser.cpp +9 -1
  32. package/cpp/llama.cpp/common/chat.cpp +539 -8
  33. package/cpp/llama.cpp/common/chat.h +8 -1
  34. package/cpp/llama.cpp/common/common.cpp +60 -15
  35. package/cpp/llama.cpp/common/common.h +64 -15
  36. package/cpp/llama.cpp/common/speculative.cpp +135 -54
  37. package/cpp/llama.cpp/common/speculative.h +8 -1
  38. package/cpp/llama.cpp/convert_hf_to_gguf.py +1216 -109
  39. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +19 -6
  40. package/cpp/llama.cpp/convert_lora_to_gguf.py +1 -1
  41. package/cpp/llama.cpp/flake.nix +0 -5
  42. package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -3
  43. package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +71 -70
  44. package/cpp/llama.cpp/ggml/include/ggml-opt.h +25 -6
  45. package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
  46. package/cpp/llama.cpp/ggml/include/ggml.h +90 -3
  47. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +13 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +1 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +10 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +113 -17
  51. package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +4 -4
  52. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +701 -585
  54. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +13 -3
  55. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +52 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +274 -91
  57. package/cpp/llama.cpp/ggml/src/ggml-common.h +17 -0
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +2 -2
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +371 -298
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +33 -2
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +26 -1
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
  73. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
  74. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  75. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +428 -23
  76. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -8
  77. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +458 -46
  80. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +22 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
  82. package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
  83. package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
  84. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
  85. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +122 -5
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +9 -11
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cu +58 -0
  88. package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cuh +3 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +275 -170
  90. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +103 -65
  92. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  93. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +171 -0
  94. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  95. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +33 -7
  96. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +13 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +2 -10
  98. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +3 -4
  99. package/cpp/llama.cpp/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  100. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +83 -27
  101. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +116 -57
  102. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +45 -18
  103. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +56 -29
  104. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +61 -39
  105. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +70 -49
  106. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +70 -21
  107. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +162 -50
  108. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cuh +2 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +5 -4
  110. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +208 -97
  111. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +46 -35
  112. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +56 -2
  113. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +95 -51
  114. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +427 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +5 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +204 -57
  117. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +252 -168
  118. package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
  119. package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
  120. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +10 -5
  121. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +192 -19
  122. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +5 -0
  123. package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  124. package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  125. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +82 -0
  126. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  127. package/cpp/llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  128. package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cu +67 -0
  129. package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cuh +5 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +1 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cu +34 -0
  132. package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cuh +5 -0
  133. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +16 -10
  134. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +153 -71
  135. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +6 -10
  136. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +21 -4
  137. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  138. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +75 -0
  139. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  140. package/cpp/llama.cpp/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  141. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  142. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -25
  143. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -1
  144. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +10 -2
  145. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +31 -20
  147. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +342 -131
  148. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +464 -134
  149. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
  150. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +8 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1108 -176
  152. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  153. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  154. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +343 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +343 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +346 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +41 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  161. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  164. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  166. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +10 -2
  167. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +10 -2
  168. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +10 -2
  169. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +10 -2
  170. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +97 -41
  173. package/cpp/llama.cpp/ggml/src/ggml-quants.c +110 -16
  174. package/cpp/llama.cpp/ggml/src/ggml-quants.h +6 -0
  175. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +22 -9
  176. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  177. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -212
  178. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +213 -1
  179. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +117 -238
  180. package/cpp/llama.cpp/ggml/src/ggml-sycl/quantize.hpp +133 -0
  181. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +94 -0
  182. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1666 -633
  183. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  184. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  185. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  186. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  187. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +107 -43
  188. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  189. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +18 -0
  190. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  191. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  192. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
  193. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +21 -0
  194. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +16 -1
  195. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +44 -8
  196. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +44 -16
  197. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +26 -1
  198. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -17
  199. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
  200. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +37 -1
  201. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  202. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +109 -55
  203. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +71 -41
  204. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +6 -0
  205. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  206. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  207. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -11
  208. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  209. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +9 -3
  210. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  211. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  212. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  213. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +55 -0
  214. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  215. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +75 -20
  216. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +2 -2
  217. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +807 -412
  218. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +72 -22
  219. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +8 -8
  220. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +1794 -0
  221. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +82 -0
  222. package/cpp/llama.cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  223. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +97 -0
  224. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +846 -0
  225. package/cpp/llama.cpp/ggml/src/ggml.c +204 -50
  226. package/cpp/llama.cpp/gguf-py/gguf/constants.py +187 -2
  227. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +11 -2
  228. package/cpp/llama.cpp/gguf-py/gguf/quants.py +53 -4
  229. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +67 -63
  230. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +7 -1
  231. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +120 -16
  232. package/cpp/llama.cpp/gguf-py/gguf/utility.py +5 -1
  233. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +284 -1
  234. package/cpp/llama.cpp/gguf-py/tests/test_quants.py +14 -5
  235. package/cpp/llama.cpp/include/llama.h +53 -114
  236. package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +171 -0
  237. package/cpp/llama.cpp/models/templates/README.md +2 -1
  238. package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +59 -0
  239. package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +331 -0
  240. package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +105 -0
  241. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -1
  242. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -6
  243. package/cpp/llama.cpp/requirements/requirements-pydantic.txt +1 -1
  244. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  245. package/cpp/llama.cpp/src/llama-adapter.cpp +68 -4
  246. package/cpp/llama.cpp/src/llama-adapter.h +3 -0
  247. package/cpp/llama.cpp/src/llama-arch.cpp +192 -2
  248. package/cpp/llama.cpp/src/llama-arch.h +18 -0
  249. package/cpp/llama.cpp/src/llama-batch.cpp +2 -2
  250. package/cpp/llama.cpp/src/llama-chat.cpp +47 -6
  251. package/cpp/llama.cpp/src/llama-chat.h +3 -0
  252. package/cpp/llama.cpp/src/llama-context.cpp +61 -252
  253. package/cpp/llama.cpp/src/llama-context.h +10 -15
  254. package/cpp/llama.cpp/src/llama-cparams.h +0 -1
  255. package/cpp/llama.cpp/src/llama-graph.cpp +180 -85
  256. package/cpp/llama.cpp/src/llama-graph.h +90 -51
  257. package/cpp/llama.cpp/src/llama-hparams.cpp +34 -3
  258. package/cpp/llama.cpp/src/llama-hparams.h +21 -6
  259. package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +79 -56
  260. package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +30 -28
  261. package/cpp/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +240 -632
  262. package/cpp/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +39 -74
  263. package/cpp/llama.cpp/src/llama-kv-cells.h +21 -21
  264. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +41 -35
  265. package/cpp/llama.cpp/src/llama-memory-hybrid.h +26 -29
  266. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +13 -9
  267. package/cpp/llama.cpp/src/llama-memory-recurrent.h +10 -14
  268. package/cpp/llama.cpp/src/llama-memory.h +13 -10
  269. package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
  270. package/cpp/llama.cpp/src/llama-model-loader.h +3 -2
  271. package/cpp/llama.cpp/src/llama-model.cpp +1959 -419
  272. package/cpp/llama.cpp/src/llama-model.h +28 -4
  273. package/cpp/llama.cpp/src/llama-quant.cpp +40 -4
  274. package/cpp/llama.cpp/src/llama-vocab.cpp +51 -2
  275. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  276. package/cpp/llama.cpp/vendor/minja/chat-template.hpp +16 -7
  277. package/cpp/llama.cpp/vendor/minja/minja.hpp +47 -12
  278. package/cpp/rn-completion.cpp +3 -27
  279. package/ios/generated/RNLlamaCppSpec/RNLlamaCppSpec.h +30 -0
  280. package/ios/generated/RNLlamaCppSpecJSI.h +49 -4
  281. package/ios/include/chat.h +8 -1
  282. package/ios/include/common/minja/chat-template.hpp +16 -7
  283. package/ios/include/common/minja/minja.hpp +47 -12
  284. package/ios/include/common.h +64 -15
  285. package/ios/include/llama.h +53 -114
  286. package/ios/include/speculative.h +8 -1
  287. package/ios/libs/llama.xcframework/Info.plist +18 -18
  288. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  289. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5557 -5267
  290. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +25 -6
  291. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +90 -3
  292. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +53 -114
  293. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  294. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  295. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5520 -5238
  296. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
  297. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
  298. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
  299. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
  300. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  301. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  302. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
  303. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4242 -4016
  304. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +25 -6
  305. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +90 -3
  306. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +53 -114
  307. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +25 -6
  308. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +90 -3
  309. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +53 -114
  310. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  311. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +25 -6
  312. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +90 -3
  313. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +53 -114
  314. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  315. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  316. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  317. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5556 -5267
  318. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +25 -6
  319. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +90 -3
  320. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +53 -114
  321. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  322. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  323. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
  324. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
  325. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
  326. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
  327. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
  328. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  329. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  330. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5553 -5303
  331. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +25 -6
  332. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +90 -3
  333. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +53 -114
  334. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  335. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  336. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5515 -5274
  337. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4238 -4044
  338. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
  339. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
  340. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
  341. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  342. package/lib/module/NativeRNLlamaCpp.js.map +1 -1
  343. package/lib/typescript/src/NativeRNLlamaCpp.d.ts +5 -0
  344. package/lib/typescript/src/NativeRNLlamaCpp.d.ts.map +1 -1
  345. package/package.json +1 -2
  346. package/src/NativeRNLlamaCpp.ts +7 -0
  347. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -56
@@ -6,8 +6,8 @@
6
6
  #include "llama-cparams.h"
7
7
  #include "llama-model-loader.h"
8
8
 
9
- #include "llama-kv-cache-unified.h"
10
- #include "llama-kv-cache-unified-iswa.h"
9
+ #include "llama-kv-cache.h"
10
+ #include "llama-kv-cache-iswa.h"
11
11
  #include "llama-memory-hybrid.h"
12
12
  #include "llama-memory-recurrent.h"
13
13
 
@@ -47,6 +47,7 @@ const char * llm_type_name(llm_type type) {
47
47
  case LLM_TYPE_410M: return "410M";
48
48
  case LLM_TYPE_450M: return "450M";
49
49
  case LLM_TYPE_475M: return "475M";
50
+ case LLM_TYPE_558M: return "558M";
50
51
  case LLM_TYPE_700M: return "700M";
51
52
  case LLM_TYPE_770M: return "770M";
52
53
  case LLM_TYPE_780M: return "780M";
@@ -83,9 +84,11 @@ const char * llm_type_name(llm_type type) {
83
84
  case LLM_TYPE_32B: return "32B";
84
85
  case LLM_TYPE_34B: return "34B";
85
86
  case LLM_TYPE_35B: return "35B";
87
+ case LLM_TYPE_36B: return "36B";
86
88
  case LLM_TYPE_40B: return "40B";
87
89
  case LLM_TYPE_65B: return "65B";
88
90
  case LLM_TYPE_70B: return "70B";
91
+ case LLM_TYPE_120B: return "120B";
89
92
  case LLM_TYPE_142B: return "142B";
90
93
  case LLM_TYPE_236B: return "236B";
91
94
  case LLM_TYPE_290B: return "290B";
@@ -109,8 +112,10 @@ const char * llm_type_name(llm_type type) {
109
112
  case LLM_TYPE_A13B: return "A13B";
110
113
  case LLM_TYPE_21B_A3B: return "21B.A3B";
111
114
  case LLM_TYPE_30B_A3B: return "30B.A3B";
115
+ case LLM_TYPE_106B_A12B: return "106B.A12B";
112
116
  case LLM_TYPE_235B_A22B: return "235B.A22B";
113
117
  case LLM_TYPE_300B_A47B: return "300B.A47B";
118
+ case LLM_TYPE_355B_A32B: return "355B.A32B";
114
119
  case LLM_TYPE_E2B: return "E2B";
115
120
  case LLM_TYPE_E4B: return "E4B";
116
121
  default: return "?B";
@@ -190,6 +195,13 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
190
195
  ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
191
196
  op_tensor = ggml_add(ctx, a, w);
192
197
  } break;
198
+ case GGML_OP_ADD_ID:
199
+ {
200
+ int n_expert_used = hparams.n_expert_used;
201
+ ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
202
+ ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
203
+ op_tensor = ggml_add_id(ctx, a, w, c);
204
+ } break;
193
205
  case GGML_OP_MUL:
194
206
  {
195
207
  ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
@@ -258,6 +270,10 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
258
270
  ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
259
271
  op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
260
272
  } break;
273
+ case GGML_OP_SCALE:
274
+ {
275
+ op_tensor = ggml_scale(ctx, w, 1.0f);
276
+ } break;
261
277
  default:
262
278
  GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
263
279
  }
@@ -290,7 +306,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
290
306
  }
291
307
 
292
308
  // CPU: ACCEL -> GPU host -> CPU extra -> CPU
293
- static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
309
+ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
294
310
  buft_list_t buft_list;
295
311
 
296
312
  // add ACCEL buffer types
@@ -319,21 +335,22 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
319
335
  }
320
336
  }
321
337
 
322
- // add extra buffer types, only if no GPU device is present
323
- // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
324
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
325
- if (cpu_dev == nullptr) {
326
- throw std::runtime_error(format("%s: no CPU backend found", __func__));
327
- }
338
+ // add extra buffer types
339
+ if (use_extra_bufts) {
340
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
341
+ if (cpu_dev == nullptr) {
342
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
343
+ }
328
344
 
329
- auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
330
- auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
331
- ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
332
- if (ggml_backend_dev_get_extra_bufts_fn) {
333
- ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
334
- while (extra_bufts && *extra_bufts) {
335
- buft_list.emplace_back(cpu_dev, *extra_bufts);
336
- ++extra_bufts;
345
+ auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
346
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
347
+ ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
348
+ if (ggml_backend_dev_get_extra_bufts_fn) {
349
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
350
+ while (extra_bufts && *extra_bufts) {
351
+ buft_list.emplace_back(cpu_dev, *extra_bufts);
352
+ ++extra_bufts;
353
+ }
337
354
  }
338
355
  }
339
356
 
@@ -756,6 +773,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
756
773
  default: type = LLM_TYPE_UNKNOWN;
757
774
  }
758
775
  } break;
776
+ case LLM_ARCH_JINA_BERT_V3:
777
+ {
778
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
779
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
780
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
781
+
782
+ switch (hparams.n_layer) {
783
+ case 24:
784
+ type = LLM_TYPE_558M; break;
785
+ default: type = LLM_TYPE_UNKNOWN;
786
+ }
787
+ } break;
759
788
  case LLM_ARCH_NOMIC_BERT:
760
789
  case LLM_ARCH_NOMIC_BERT_MOE:
761
790
  {
@@ -869,6 +898,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
869
898
  hparams.causal_attn = false;
870
899
  }
871
900
  break;
901
+ case LLM_ARCH_LLADA:
902
+ {
903
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
904
+ // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion
905
+ switch (hparams.n_layer) {
906
+ case 32:
907
+ type = LLM_TYPE_8B;
908
+ break;
909
+ default:
910
+ type = LLM_TYPE_UNKNOWN;
911
+ }
912
+ // Set non-causal attention for diffusion models
913
+ hparams.causal_attn = false;
914
+ }
915
+ break;
872
916
  case LLM_ARCH_QWEN2MOE:
873
917
  {
874
918
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -883,6 +927,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
883
927
  } break;
884
928
  case LLM_ARCH_QWEN3:
885
929
  {
930
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
886
931
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
887
932
  switch (hparams.n_layer) {
888
933
  case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
@@ -1065,6 +1110,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1065
1110
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1066
1111
 
1067
1112
  switch (hparams.n_layer) {
1113
+ case 18: type = LLM_TYPE_537M; break;
1068
1114
  case 26: type = LLM_TYPE_1B; break;
1069
1115
  case 34: type = LLM_TYPE_4B; break;
1070
1116
  case 48: type = LLM_TYPE_12B; break;
@@ -1082,6 +1128,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1082
1128
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1083
1129
  hparams.set_swa_pattern(5);
1084
1130
 
1131
+ hparams.n_layer_kv_from_start = 20;
1085
1132
  hparams.rope_freq_base_train_swa = 10000.0f;
1086
1133
  hparams.rope_freq_scale_train_swa = 1.0f;
1087
1134
  hparams.f_attention_scale = 1.0f;
@@ -1256,6 +1303,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1256
1303
  default: type = LLM_TYPE_UNKNOWN;
1257
1304
  }
1258
1305
  } break;
1306
+ case LLM_ARCH_SEED_OSS:
1307
+ {
1308
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1309
+ switch (hparams.n_layer) {
1310
+ case 64: type = LLM_TYPE_36B; break;
1311
+ default: type = LLM_TYPE_UNKNOWN;
1312
+ }
1313
+ } break;
1259
1314
  case LLM_ARCH_OLMOE:
1260
1315
  {
1261
1316
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1417,6 +1472,37 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1417
1472
  default: type = LLM_TYPE_UNKNOWN;
1418
1473
  }
1419
1474
  } break;
1475
+ case LLM_ARCH_GLM4_MOE:
1476
+ {
1477
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1478
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1479
+
1480
+ // MoE parameters
1481
+ ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
1482
+ ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
1483
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1484
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
1485
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
1486
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1487
+
1488
+ // Expert gating function (GLM-4.5 uses sigmoid)
1489
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1490
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1491
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1492
+ }
1493
+
1494
+ // NextN/MTP parameters
1495
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1496
+
1497
+ // TODO: when MTP is implemented, this should probably be updated if needed
1498
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
1499
+
1500
+ switch (hparams.n_layer) {
1501
+ case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
1502
+ case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
1503
+ default: type = LLM_TYPE_UNKNOWN;
1504
+ }
1505
+ } break;
1420
1506
  case LLM_ARCH_BITNET:
1421
1507
  {
1422
1508
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1484,6 +1570,27 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1484
1570
  default: type = LLM_TYPE_UNKNOWN;
1485
1571
  }
1486
1572
  } break;
1573
+ case LLM_ARCH_NEMOTRON_H:
1574
+ {
1575
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
1576
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
1577
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
1578
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1579
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
1580
+
1581
+ // A layer is recurrent IFF the n_head_kv value is set to 0 and
1582
+ // the n_ff value is set to 0
1583
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1584
+ hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
1585
+ }
1586
+
1587
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1588
+
1589
+ switch (hparams.n_layer) {
1590
+ case 56: type = LLM_TYPE_9B; break;
1591
+ default: type = LLM_TYPE_UNKNOWN;
1592
+ }
1593
+ } break;
1487
1594
  case LLM_ARCH_EXAONE:
1488
1595
  {
1489
1596
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1744,6 +1851,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1744
1851
  default: type = LLM_TYPE_UNKNOWN;
1745
1852
  }
1746
1853
  } break;
1854
+ case LLM_ARCH_HUNYUAN_DENSE:
1855
+ {
1856
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1857
+
1858
+ switch (hparams.n_embd) {
1859
+ case 1024: type = LLM_TYPE_0_5B; break;
1860
+ case 2048: type = LLM_TYPE_1_8B; break;
1861
+ case 3072: type = LLM_TYPE_4B; break;
1862
+ case 4096: type = LLM_TYPE_7B; break;
1863
+ default: type = LLM_TYPE_UNKNOWN;
1864
+ }
1865
+ } break;
1747
1866
  case LLM_ARCH_SMOLLM3:
1748
1867
  {
1749
1868
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1754,6 +1873,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1754
1873
  default: type = LLM_TYPE_UNKNOWN;
1755
1874
  }
1756
1875
  } break;
1876
+ case LLM_ARCH_OPENAI_MOE:
1877
+ {
1878
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1879
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1880
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1881
+
1882
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1883
+ hparams.set_swa_pattern(2);
1884
+
1885
+ switch (hparams.n_layer) {
1886
+ case 24: type = LLM_TYPE_20B; break;
1887
+ case 36: type = LLM_TYPE_120B; break;
1888
+ default: type = LLM_TYPE_UNKNOWN;
1889
+ }
1890
+ } break;
1757
1891
  case LLM_ARCH_LFM2:
1758
1892
  {
1759
1893
  ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
@@ -1768,6 +1902,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1768
1902
  default: type = LLM_TYPE_UNKNOWN;
1769
1903
  }
1770
1904
  } break;
1905
+ case LLM_ARCH_SMALLTHINKER:
1906
+ {
1907
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1908
+
1909
+ if (found_swa && hparams.n_swa > 0) {
1910
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1911
+ hparams.n_swa = 4096;
1912
+ hparams.set_swa_pattern(4, true);
1913
+ } else {
1914
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
1915
+ hparams.n_no_rope_layer_step = hparams.n_layer;
1916
+ }
1917
+
1918
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1919
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1920
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1921
+
1922
+ switch (hparams.n_layer) {
1923
+ case 32: type = LLM_TYPE_4B; break;
1924
+ case 52: type = LLM_TYPE_20B; break;
1925
+ default: type = LLM_TYPE_UNKNOWN;
1926
+ }
1927
+ } break;
1771
1928
  default: throw std::runtime_error("unsupported model architecture");
1772
1929
  }
1773
1930
 
@@ -1801,7 +1958,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
1801
1958
  LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
1802
1959
 
1803
1960
  // build a list of buffer types for the CPU and GPU devices
1804
- pimpl->cpu_buft_list = make_cpu_buft_list(devices);
1961
+ pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
1805
1962
  for (auto * dev : devices) {
1806
1963
  buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
1807
1964
  // add CPU buffer types as a fallback
@@ -1897,6 +2054,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
1897
2054
 
1898
2055
  const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
1899
2056
  const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
2057
+ const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
1900
2058
 
1901
2059
  // create tensors for the weights
1902
2060
  {
@@ -1952,7 +2110,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
1952
2110
  }
1953
2111
 
1954
2112
  // skip unused tensors
1955
- if (info.op == GGML_OP_NONE) {
2113
+ if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
1956
2114
  const size_t nbytes = ggml_nbytes(t_meta);
1957
2115
  LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
1958
2116
 
@@ -1962,11 +2120,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
1962
2120
  return nullptr;
1963
2121
  }
1964
2122
 
1965
- // tensors with "bias" suffix are always used with GGML_OP_ADD
2123
+ // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
1966
2124
  ggml_op op;
1967
2125
  bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
1968
2126
  if (bias) {
1969
- op = GGML_OP_ADD;
2127
+ if (info.op == GGML_OP_MUL_MAT_ID) {
2128
+ op = GGML_OP_ADD_ID;
2129
+ } else {
2130
+ op = GGML_OP_ADD;
2131
+ }
1970
2132
  } else {
1971
2133
  op = info.op;
1972
2134
  }
@@ -2006,7 +2168,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2006
2168
  for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
2007
2169
  std::regex pattern(overrides->pattern);
2008
2170
  if (std::regex_search(tensor_name, pattern)) {
2009
- buft = overrides->buft;
2171
+ if (overrides->buft == ggml_backend_cpu_buffer_type()) {
2172
+ // when overriding to a CPU buffer, consider the extra buffer types
2173
+ buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
2174
+ } else {
2175
+ buft = overrides->buft;
2176
+ }
2177
+
2010
2178
  LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
2011
2179
  tensor_name.c_str(),
2012
2180
  ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
@@ -2126,6 +2294,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2126
2294
  }
2127
2295
  }
2128
2296
  } break;
2297
+ case LLM_ARCH_LLADA:
2298
+ {
2299
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
2300
+
2301
+ // output
2302
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
2303
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
2304
+
2305
+ // if output is NULL, init from the input tok embed
2306
+ if (output == NULL) {
2307
+ output =
2308
+ create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
2309
+ }
2310
+
2311
+ for (int i = 0; i < n_layer; ++i) {
2312
+ auto & layer = layers[i];
2313
+
2314
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
2315
+
2316
+ // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock
2317
+ layer.wq =
2318
+ create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
2319
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
2320
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
2321
+ // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false
2322
+ layer.wo =
2323
+ create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
2324
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
2325
+
2326
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
2327
+
2328
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 },
2329
+ TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2330
+
2331
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
2332
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
2333
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
2334
+
2335
+ // optional MLP bias
2336
+ layer.ffn_gate_b =
2337
+ create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
2338
+ layer.ffn_down_b =
2339
+ create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
2340
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED);
2341
+ }
2342
+ }
2343
+ break;
2129
2344
  case LLM_ARCH_LLAMA4:
2130
2345
  {
2131
2346
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -2450,6 +2665,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2450
2665
  case LLM_ARCH_BERT:
2451
2666
  case LLM_ARCH_NOMIC_BERT:
2452
2667
  case LLM_ARCH_NOMIC_BERT_MOE:
2668
+ case LLM_ARCH_JINA_BERT_V3:
2453
2669
  {
2454
2670
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2455
2671
  type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
@@ -2485,24 +2701,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2485
2701
  }
2486
2702
 
2487
2703
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2704
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2488
2705
 
2489
2706
  layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
2490
2707
  layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
2491
2708
 
2492
2709
  if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
2493
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
2494
2710
  layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
2495
2711
  layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
2496
2712
  layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
2497
2713
  } else {
2498
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2499
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2500
-
2501
- if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
2502
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
2503
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
2504
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
2505
- } else {
2714
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2715
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
2716
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2717
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2718
+
2719
+ if (arch == LLM_ARCH_NOMIC_BERT) {
2506
2720
  layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2507
2721
  }
2508
2722
  }
@@ -3799,6 +4013,43 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3799
4013
  layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
3800
4014
  }
3801
4015
  } break;
4016
+ case LLM_ARCH_SEED_OSS:
4017
+ {
4018
+ const uint32_t head_dim = hparams.n_embd_head_k;
4019
+ const int64_t n_qo_dim = n_head * head_dim;
4020
+ const int64_t n_kv_dim = n_head_kv * head_dim;
4021
+
4022
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4023
+
4024
+ // output
4025
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4026
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4027
+ // if output is NULL, init from the input tok embed
4028
+ if (output == NULL) {
4029
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4030
+ }
4031
+
4032
+ for (int i = 0; i < n_layer; ++i) {
4033
+ auto & layer = layers[i];
4034
+
4035
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
4036
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
4037
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
4038
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
4039
+
4040
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
4041
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
4042
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
4043
+
4044
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4045
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
4046
+
4047
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4048
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4049
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4050
+ }
4051
+ } break;
4052
+
3802
4053
  case LLM_ARCH_OLMOE:
3803
4054
  {
3804
4055
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4322,6 +4573,105 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4322
4573
  layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
4323
4574
  }
4324
4575
  } break;
4576
+ case LLM_ARCH_GLM4_MOE:
4577
+ {
4578
+ const int64_t n_expert = hparams.n_expert;
4579
+ const int64_t n_expert_used = hparams.n_expert_used;
4580
+ const int64_t n_expert_shared = hparams.n_expert_shared;
4581
+
4582
+ GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
4583
+ GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
4584
+
4585
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
4586
+
4587
+ // output
4588
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
4589
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
4590
+ // if output is NULL, init from the input tok embed
4591
+ if (output == NULL) {
4592
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
4593
+ }
4594
+
4595
+ // Load ALL tensors including NextN layer to satisfy total tensor count
4596
+ // but only PROCESS up to last layer (skipping final NextN layer) in forward pass
4597
+ for (int i = 0; i < n_layer; ++i) {
4598
+ int flags = 0;
4599
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
4600
+ // skip all tensors in the NextN layers
4601
+ flags |= TENSOR_SKIP;
4602
+ }
4603
+
4604
+ auto & layer = layers[i];
4605
+
4606
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
4607
+
4608
+ // GLM-style attention with bias terms
4609
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
4610
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
4611
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
4612
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
4613
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
4614
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
4615
+
4616
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
4617
+
4618
+ // K/Q norm tensors (optional for GLM-4.5 355B variant)
4619
+ layer.attn_q_norm = create_tensor(
4620
+ tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
4621
+ layer.attn_k_norm = create_tensor(
4622
+ tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
4623
+
4624
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
4625
+
4626
+ // Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
4627
+ // GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
4628
+ const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
4629
+
4630
+ if (use_moe) {
4631
+ // MoE layers
4632
+ layer.ffn_gate_inp =
4633
+ create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
4634
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
4635
+
4636
+ // MoE branch
4637
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
4638
+
4639
+ layer.ffn_gate_exps = create_tensor(
4640
+ tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
4641
+ layer.ffn_down_exps = create_tensor(
4642
+ tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
4643
+ layer.ffn_up_exps = create_tensor(
4644
+ tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
4645
+
4646
+ // Shared expert
4647
+ if (n_expert_shared > 0) {
4648
+ const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
4649
+ layer.ffn_gate_shexp = create_tensor(
4650
+ tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
4651
+ layer.ffn_down_shexp = create_tensor(
4652
+ tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
4653
+ layer.ffn_up_shexp = create_tensor(
4654
+ tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
4655
+ }
4656
+ } else {
4657
+ // Dense layers (first k layers) - GLM uses separate gate/up projections
4658
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
4659
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
4660
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
4661
+ }
4662
+
4663
+ // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
4664
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
4665
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
4666
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
4667
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
4668
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
4669
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
4670
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
4671
+ }
4672
+ }
4673
+ }
4674
+ break;
4325
4675
  case LLM_ARCH_NEMOTRON:
4326
4676
  {
4327
4677
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4359,48 +4709,117 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4359
4709
  layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
4360
4710
  }
4361
4711
  } break;
4362
- case LLM_ARCH_EXAONE:
4712
+ case LLM_ARCH_NEMOTRON_H:
4363
4713
  {
4714
+ // mamba2 Mixer SSM params
4715
+ // NOTE: int64_t for tensor dimensions
4716
+ const int64_t d_conv = hparams.ssm_d_conv;
4717
+ const int64_t d_inner = hparams.ssm_d_inner;
4718
+ const int64_t d_state = hparams.ssm_d_state;
4719
+ const int64_t n_ssm_head = hparams.ssm_dt_rank;
4720
+ const int64_t n_group = hparams.ssm_n_group;
4721
+ const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
4722
+
4723
+ // embeddings
4364
4724
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4365
4725
 
4366
4726
  // output
4367
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4368
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4369
-
4370
- // if output is NULL, init from the input tok embed
4371
- if (output == NULL) {
4372
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4727
+ {
4728
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4729
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4730
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
4731
+ if (output == NULL) {
4732
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4733
+ }
4373
4734
  }
4374
4735
 
4375
4736
  for (int i = 0; i < n_layer; ++i) {
4376
4737
  auto & layer = layers[i];
4377
4738
 
4378
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4739
+ // all blocks use the attn norm
4740
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4379
4741
 
4380
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4381
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4382
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4383
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4742
+ if (hparams.is_recurrent(i)) {
4743
+ // ssm layers
4744
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
4384
4745
 
4385
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4386
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4387
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4388
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4389
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4390
- }
4391
- } break;
4392
- case LLM_ARCH_EXAONE4:
4393
- {
4394
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4746
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
4747
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
4395
4748
 
4396
- // output
4397
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4398
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4749
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
4399
4750
 
4400
- // if output is NULL, init from the input tok embed
4401
- if (output == NULL) {
4402
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4403
- }
4751
+ // no "weight" suffix for these
4752
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
4753
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
4754
+
4755
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
4756
+
4757
+ // out_proj
4758
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
4759
+ } else if (hparams.n_ff(i) == 0) {
4760
+ // attention layers (with optional bias); the Q bias is sized to wq's output width (n_embd_head_k * n_head_i), which need not equal n_embd
4761
+ const int64_t n_head_i = hparams.n_head(i);
4762
+ const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
4763
+ const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
4764
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
4765
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
4766
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
4767
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
4768
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd_head_k * n_head_i}, TENSOR_NOT_REQUIRED);
4769
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
4770
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
4771
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4772
+ } else {
4773
+ // mlp layers
4774
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
4775
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
4776
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4777
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
4778
+ }
4779
+ }
4780
+ } break;
4781
+ case LLM_ARCH_EXAONE:
4782
+ {
4783
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4784
+
4785
+ // output
4786
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4787
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4788
+
4789
+ // if output is NULL, init from the input tok embed
4790
+ if (output == NULL) {
4791
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4792
+ }
4793
+
4794
+ for (int i = 0; i < n_layer; ++i) {
4795
+ auto & layer = layers[i];
4796
+
4797
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4798
+
4799
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4800
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4801
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4802
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4803
+
4804
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4805
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4806
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4807
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4808
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4809
+ }
4810
+ } break;
4811
+ case LLM_ARCH_EXAONE4:
4812
+ {
4813
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4814
+
4815
+ // output
4816
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4817
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4818
+
4819
+ // if output is NULL, init from the input tok embed
4820
+ if (output == NULL) {
4821
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4822
+ }
4404
4823
 
4405
4824
  for (int i = 0; i < n_layer; ++i) {
4406
4825
  auto & layer = layers[i];
@@ -5103,6 +5522,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5103
5522
  layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
5104
5523
  }
5105
5524
  } break;
5525
+ case LLM_ARCH_HUNYUAN_DENSE:
5526
+ {
5527
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5528
+
5529
+ // output
5530
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5531
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5532
+ // if output is NULL, init from the input tok embed
5533
+ if (output == NULL) {
5534
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5535
+ }
5536
+
5537
+ for (int i = 0; i < n_layer; ++i) {
5538
+ auto & layer = layers[i];
5539
+
5540
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5541
+
5542
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5543
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5544
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5545
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5546
+
5547
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5548
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5549
+
5550
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5551
+
5552
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5553
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5554
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5555
+
5556
+ }
5557
+ } break;
5106
5558
  case LLM_ARCH_SMOLLM3:
5107
5559
  {
5108
5560
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5132,10 +5584,55 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5132
5584
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5133
5585
  }
5134
5586
  } break;
5135
- case LLM_ARCH_LFM2:
5587
+ case LLM_ARCH_OPENAI_MOE:
5136
5588
  {
5589
+ const int64_t n_ff_exp = hparams.n_ff_exp;
5590
+
5137
5591
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5592
+
5593
+ // output
5594
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5595
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5596
+
5597
+ for (int i = 0; i < n_layer; ++i) {
5598
+ auto & layer = layers[i];
5599
+
5600
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5601
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5602
+
5603
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
5604
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
5605
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
5606
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
5607
+
5608
+ layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, 0);
5609
+
5610
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
5611
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5612
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
5613
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5614
+
5615
+ // bias
5616
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_head * n_rot}, 0);
5617
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_head_kv * n_rot}, 0);
5618
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_head_kv * n_rot}, 0);
5619
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
5620
+
5621
+ layer.ffn_gate_inp_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_expert}, 0);
5622
+ layer.ffn_gate_exps_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
5623
+ layer.ffn_down_exps_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "bias", i), { n_embd, n_expert}, 0);
5624
+ layer.ffn_up_exps_b = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "bias", i), {n_ff_exp, n_expert}, 0);
5625
+ }
5626
+ } break;
5627
+ case LLM_ARCH_LFM2:
5628
+ {
5629
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5138
5630
  tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
5631
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5632
+
5633
+ if (output == NULL) {
5634
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5635
+ }
5139
5636
 
5140
5637
  for (int i = 0; i < n_layer; ++i) {
5141
5638
  auto & layer = layers[i];
@@ -5165,6 +5662,42 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5165
5662
  }
5166
5663
  }
5167
5664
  } break;
5665
+ case LLM_ARCH_SMALLTHINKER:
5666
+ {
5667
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
5668
+
5669
+ // output
5670
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
5671
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5672
+
5673
+ // if output is NULL, init from the input tok embed
5674
+ if (output == NULL) {
5675
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5676
+ }
5677
+
5678
+ for (int i = 0; i < n_layer; ++i) {
5679
+ auto & layer = layers[i];
5680
+
5681
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
5682
+
5683
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
5684
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
5685
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
5686
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
5687
+
5688
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
5689
+
5690
+ GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
5691
+ GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
5692
+
5693
+ // MoE branch
5694
+ const int64_t n_ff_exp = hparams.n_ff_exp;
5695
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
5696
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
5697
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
5698
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
5699
+ }
5700
+ } break;
5168
5701
  default:
5169
5702
  throw std::runtime_error("unknown architecture");
5170
5703
  }
@@ -5419,7 +5952,8 @@ void llama_model::print_info() const {
5419
5952
  arch == LLM_ARCH_JAMBA ||
5420
5953
  arch == LLM_ARCH_FALCON_H1 ||
5421
5954
  arch == LLM_ARCH_PLAMO2 ||
5422
- arch == LLM_ARCH_GRANITE_HYBRID) {
5955
+ arch == LLM_ARCH_GRANITE_HYBRID ||
5956
+ arch == LLM_ARCH_NEMOTRON_H) {
5423
5957
  LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
5424
5958
  LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
5425
5959
  LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -5468,7 +6002,7 @@ void llama_model::print_info() const {
5468
6002
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
5469
6003
  }
5470
6004
 
5471
- if (arch == LLM_ARCH_QWEN3MOE) {
6005
+ if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE) {
5472
6006
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
5473
6007
  }
5474
6008
 
@@ -5490,6 +6024,11 @@ void llama_model::print_info() const {
5490
6024
  LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
5491
6025
  }
5492
6026
 
6027
+ if (arch == LLM_ARCH_SMALLTHINKER) {
6028
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
6029
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
6030
+ }
6031
+
5493
6032
  vocab.print_info();
5494
6033
  }
5495
6034
 
@@ -5605,7 +6144,7 @@ struct llm_build_llama : public llm_graph_context {
5605
6144
  // inp_pos - contains the positions
5606
6145
  ggml_tensor * inp_pos = build_inp_pos();
5607
6146
 
5608
- auto * inp_attn = build_attn_inp_kv_unified();
6147
+ auto * inp_attn = build_attn_inp_kv();
5609
6148
 
5610
6149
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
5611
6150
 
@@ -5669,7 +6208,7 @@ struct llm_build_llama : public llm_graph_context {
5669
6208
 
5670
6209
  cur = build_attn(inp_attn,
5671
6210
  model.layers[il].wo, model.layers[il].bo,
5672
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
6211
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
5673
6212
  cb(cur, "attn_out", il);
5674
6213
  }
5675
6214
 
@@ -5765,7 +6304,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
5765
6304
  ggml_tensor * inp_attn_scale = nullptr;
5766
6305
  inp_attn_scale = build_inp_attn_scale();
5767
6306
 
5768
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
6307
+ auto * inp_attn = build_attn_inp_kv_iswa();
5769
6308
 
5770
6309
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
5771
6310
 
@@ -5843,7 +6382,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
5843
6382
 
5844
6383
  cur = build_attn(inp_attn,
5845
6384
  model.layers[il].wo, model.layers[il].bo,
5846
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
6385
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
5847
6386
  cb(cur, "attn_out", il);
5848
6387
  }
5849
6388
 
@@ -5944,7 +6483,7 @@ struct llm_build_deci : public llm_graph_context {
5944
6483
  // inp_pos - contains the positions
5945
6484
  ggml_tensor * inp_pos = build_inp_pos();
5946
6485
 
5947
- auto * inp_attn = build_attn_inp_kv_unified();
6486
+ auto * inp_attn = build_attn_inp_kv();
5948
6487
 
5949
6488
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
5950
6489
 
@@ -6020,7 +6559,7 @@ struct llm_build_deci : public llm_graph_context {
6020
6559
 
6021
6560
  cur = build_attn(inp_attn,
6022
6561
  model.layers[il].wo, model.layers[il].bo,
6023
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
6562
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
6024
6563
  }
6025
6564
 
6026
6565
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6100,7 +6639,7 @@ struct llm_build_baichuan : public llm_graph_context {
6100
6639
  // inp_pos - contains the positions
6101
6640
  ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
6102
6641
 
6103
- auto * inp_attn = build_attn_inp_kv_unified();
6642
+ auto * inp_attn = build_attn_inp_kv();
6104
6643
 
6105
6644
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6106
6645
 
@@ -6152,7 +6691,7 @@ struct llm_build_baichuan : public llm_graph_context {
6152
6691
 
6153
6692
  cur = build_attn(inp_attn,
6154
6693
  model.layers[il].wo, NULL,
6155
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6694
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6156
6695
  }
6157
6696
 
6158
6697
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6222,7 +6761,7 @@ struct llm_build_xverse : public llm_graph_context {
6222
6761
  // inp_pos - contains the positions
6223
6762
  ggml_tensor * inp_pos = build_inp_pos();
6224
6763
 
6225
- auto * inp_attn = build_attn_inp_kv_unified();
6764
+ auto * inp_attn = build_attn_inp_kv();
6226
6765
 
6227
6766
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6228
6767
 
@@ -6267,7 +6806,7 @@ struct llm_build_xverse : public llm_graph_context {
6267
6806
 
6268
6807
  cur = build_attn(inp_attn,
6269
6808
  model.layers[il].wo, NULL,
6270
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6809
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6271
6810
  }
6272
6811
 
6273
6812
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6336,7 +6875,7 @@ struct llm_build_falcon : public llm_graph_context {
6336
6875
  // inp_pos - contains the positions
6337
6876
  ggml_tensor * inp_pos = build_inp_pos();
6338
6877
 
6339
- auto * inp_attn = build_attn_inp_kv_unified();
6878
+ auto * inp_attn = build_attn_inp_kv();
6340
6879
 
6341
6880
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6342
6881
 
@@ -6367,9 +6906,9 @@ struct llm_build_falcon : public llm_graph_context {
6367
6906
 
6368
6907
  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
6369
6908
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
6370
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6909
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
6371
6910
 
6372
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6911
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6373
6912
 
6374
6913
  // using mode = 2 for neox mode
6375
6914
  Qcur = ggml_rope_ext(
@@ -6390,7 +6929,7 @@ struct llm_build_falcon : public llm_graph_context {
6390
6929
 
6391
6930
  cur = build_attn(inp_attn,
6392
6931
  model.layers[il].wo, NULL,
6393
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6932
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6394
6933
  }
6395
6934
 
6396
6935
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6460,7 +6999,7 @@ struct llm_build_grok : public llm_graph_context {
6460
6999
  // inp_pos - contains the positions
6461
7000
  ggml_tensor * inp_pos = build_inp_pos();
6462
7001
 
6463
- auto * inp_attn = build_attn_inp_kv_unified();
7002
+ auto * inp_attn = build_attn_inp_kv();
6464
7003
 
6465
7004
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6466
7005
 
@@ -6520,7 +7059,7 @@ struct llm_build_grok : public llm_graph_context {
6520
7059
 
6521
7060
  cur = build_attn(inp_attn,
6522
7061
  model.layers[il].wo, model.layers[il].bo,
6523
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7062
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
6524
7063
  }
6525
7064
 
6526
7065
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6620,7 +7159,7 @@ struct llm_build_dbrx : public llm_graph_context {
6620
7159
  // inp_pos - contains the positions
6621
7160
  ggml_tensor * inp_pos = build_inp_pos();
6622
7161
 
6623
- auto * inp_attn = build_attn_inp_kv_unified();
7162
+ auto * inp_attn = build_attn_inp_kv();
6624
7163
 
6625
7164
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6626
7165
 
@@ -6647,9 +7186,9 @@ struct llm_build_dbrx : public llm_graph_context {
6647
7186
 
6648
7187
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
6649
7188
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
6650
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7189
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
6651
7190
 
6652
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7191
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6653
7192
 
6654
7193
  Qcur = ggml_rope_ext(
6655
7194
  ctx0, Qcur, inp_pos, nullptr,
@@ -6669,7 +7208,7 @@ struct llm_build_dbrx : public llm_graph_context {
6669
7208
 
6670
7209
  cur = build_attn(inp_attn,
6671
7210
  model.layers[il].wo, NULL,
6672
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7211
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6673
7212
  }
6674
7213
 
6675
7214
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6744,7 +7283,7 @@ struct llm_build_starcoder : public llm_graph_context {
6744
7283
  // inp_pos - contains the positions
6745
7284
  ggml_tensor * inp_pos = build_inp_pos();
6746
7285
 
6747
- auto * inp_attn = build_attn_inp_kv_unified();
7286
+ auto * inp_attn = build_attn_inp_kv();
6748
7287
 
6749
7288
  ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
6750
7289
  cb(pos, "pos_embd", -1);
@@ -6769,13 +7308,13 @@ struct llm_build_starcoder : public llm_graph_context {
6769
7308
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6770
7309
  cb(cur, "bqkv", il);
6771
7310
 
6772
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6773
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6774
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7311
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
7312
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
7313
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
6775
7314
 
6776
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6777
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6778
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7315
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7316
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7317
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6779
7318
 
6780
7319
  cb(Qcur, "Qcur", il);
6781
7320
  cb(Kcur, "Kcur", il);
@@ -6783,7 +7322,7 @@ struct llm_build_starcoder : public llm_graph_context {
6783
7322
 
6784
7323
  cur = build_attn(inp_attn,
6785
7324
  model.layers[il].wo, model.layers[il].bo,
6786
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7325
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6787
7326
  }
6788
7327
 
6789
7328
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6849,7 +7388,7 @@ struct llm_build_refact : public llm_graph_context {
6849
7388
 
6850
7389
  inpL = build_inp_embd(model.tok_embd);
6851
7390
 
6852
- auto * inp_attn = build_attn_inp_kv_unified();
7391
+ auto * inp_attn = build_attn_inp_kv();
6853
7392
 
6854
7393
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6855
7394
 
@@ -6882,7 +7421,7 @@ struct llm_build_refact : public llm_graph_context {
6882
7421
 
6883
7422
  cur = build_attn(inp_attn,
6884
7423
  model.layers[il].wo, NULL,
6885
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7424
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6886
7425
  }
6887
7426
 
6888
7427
  if (il == n_layer - 1 && inp_out_ids) {
@@ -6991,13 +7530,15 @@ struct llm_build_bert : public llm_graph_context {
6991
7530
  cb(cur, "bqkv", il);
6992
7531
  }
6993
7532
 
6994
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6995
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6996
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7533
+ Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
7534
+ Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
7535
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
7536
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6997
7537
  } else {
6998
7538
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
6999
7539
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
7000
7540
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
7541
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7001
7542
  }
7002
7543
 
7003
7544
  if (model.layers[il].attn_q_norm) {
@@ -7005,6 +7546,10 @@ struct llm_build_bert : public llm_graph_context {
7005
7546
  model.layers[il].attn_q_norm,
7006
7547
  model.layers[il].attn_q_norm_b,
7007
7548
  LLM_NORM, il);
7549
+
7550
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7551
+ } else {
7552
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7008
7553
  }
7009
7554
 
7010
7555
  if (model.layers[il].attn_k_norm) {
@@ -7012,14 +7557,14 @@ struct llm_build_bert : public llm_graph_context {
7012
7557
  model.layers[il].attn_k_norm,
7013
7558
  model.layers[il].attn_k_norm_b,
7014
7559
  LLM_NORM, il);
7015
- }
7016
7560
 
7017
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7018
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7019
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7561
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7562
+ } else {
7563
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7564
+ }
7020
7565
 
7021
7566
  // RoPE
7022
- if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
7567
+ if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
7023
7568
  Qcur = ggml_rope_ext(
7024
7569
  ctx0, Qcur, inp_pos, nullptr,
7025
7570
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -7039,7 +7584,7 @@ struct llm_build_bert : public llm_graph_context {
7039
7584
 
7040
7585
  cur = build_attn(inp_attn,
7041
7586
  model.layers[il].wo, model.layers[il].bo,
7042
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7587
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7043
7588
  cb(cur, "kqv_out", il);
7044
7589
  }
7045
7590
 
@@ -7078,7 +7623,7 @@ struct llm_build_bert : public llm_graph_context {
7078
7623
  0.0f,
7079
7624
  LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
7080
7625
  cb(cur, "ffn_moe_out", il);
7081
- } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
7626
+ } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
7082
7627
  cur = build_ffn(cur,
7083
7628
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
7084
7629
  NULL, NULL, NULL,
@@ -7161,9 +7706,9 @@ struct llm_build_neo_bert : public llm_graph_context {
7161
7706
 
7162
7707
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
7163
7708
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
7164
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7709
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
7165
7710
 
7166
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7711
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7167
7712
 
7168
7713
  // RoPE
7169
7714
  Qcur = ggml_rope_ext(
@@ -7184,7 +7729,7 @@ struct llm_build_neo_bert : public llm_graph_context {
7184
7729
 
7185
7730
  cur = build_attn(inp_attn,
7186
7731
  model.layers[il].wo, nullptr,
7187
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7732
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7188
7733
  cb(cur, "kqv_out", il);
7189
7734
  }
7190
7735
 
@@ -7245,7 +7790,7 @@ struct llm_build_bloom : public llm_graph_context {
7245
7790
 
7246
7791
  inpL = build_inp_embd(model.tok_embd);
7247
7792
 
7248
- auto * inp_attn = build_attn_inp_kv_unified();
7793
+ auto * inp_attn = build_attn_inp_kv();
7249
7794
 
7250
7795
  inpL = build_norm(inpL,
7251
7796
  model.tok_norm,
@@ -7270,13 +7815,13 @@ struct llm_build_bloom : public llm_graph_context {
7270
7815
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7271
7816
  cb(cur, "bqkv", il);
7272
7817
 
7273
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
7274
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
7275
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7818
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
7819
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
7820
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
7276
7821
 
7277
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7278
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7279
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7822
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7823
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7824
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7280
7825
 
7281
7826
  cb(Qcur, "Qcur", il);
7282
7827
  cb(Kcur, "Kcur", il);
@@ -7284,7 +7829,7 @@ struct llm_build_bloom : public llm_graph_context {
7284
7829
 
7285
7830
  cur = build_attn(inp_attn,
7286
7831
  model.layers[il].wo, model.layers[il].bo,
7287
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7832
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7288
7833
  }
7289
7834
 
7290
7835
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7352,7 +7897,7 @@ struct llm_build_mpt : public llm_graph_context {
7352
7897
 
7353
7898
  inpL = build_inp_embd(model.tok_embd);
7354
7899
 
7355
- auto * inp_attn = build_attn_inp_kv_unified();
7900
+ auto * inp_attn = build_attn_inp_kv();
7356
7901
 
7357
7902
  if (model.pos_embd) {
7358
7903
  // inp_pos - contains the positions
@@ -7394,7 +7939,7 @@ struct llm_build_mpt : public llm_graph_context {
7394
7939
 
7395
7940
  ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
7396
7941
  ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
7397
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7942
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
7398
7943
 
7399
7944
  cb(Qcur, "Qcur", il);
7400
7945
  cb(Kcur, "Kcur", il);
@@ -7413,17 +7958,18 @@ struct llm_build_mpt : public llm_graph_context {
7413
7958
  model.layers[il].attn_k_norm_b,
7414
7959
  LLM_NORM, il);
7415
7960
  cb(Kcur, "Kcur", il);
7961
+
7962
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7963
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7416
7964
  } else {
7417
- Qcur = ggml_cont(ctx0, Qcur);
7965
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7418
7966
  cb(Qcur, "Qcur", il);
7419
7967
 
7420
- Kcur = ggml_cont(ctx0, Kcur);
7968
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7421
7969
  cb(Kcur, "Kcur", il);
7422
7970
  }
7423
7971
 
7424
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7425
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7426
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7972
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7427
7973
 
7428
7974
  cb(Qcur, "Qcur", il);
7429
7975
  cb(Kcur, "Kcur", il);
@@ -7431,7 +7977,7 @@ struct llm_build_mpt : public llm_graph_context {
7431
7977
 
7432
7978
  cur = build_attn(inp_attn,
7433
7979
  model.layers[il].wo, model.layers[il].bo,
7434
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7980
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7435
7981
  }
7436
7982
 
7437
7983
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7501,7 +8047,7 @@ struct llm_build_stablelm : public llm_graph_context {
7501
8047
  // inp_pos - contains the positions
7502
8048
  ggml_tensor * inp_pos = build_inp_pos();
7503
8049
 
7504
- auto * inp_attn = build_attn_inp_kv_unified();
8050
+ auto * inp_attn = build_attn_inp_kv();
7505
8051
 
7506
8052
  ggml_tensor * inp_out_ids = build_inp_out_ids();
7507
8053
 
@@ -7577,7 +8123,7 @@ struct llm_build_stablelm : public llm_graph_context {
7577
8123
 
7578
8124
  cur = build_attn(inp_attn,
7579
8125
  model.layers[il].wo, NULL,
7580
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8126
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7581
8127
  }
7582
8128
 
7583
8129
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7653,7 +8199,7 @@ struct llm_build_qwen : public llm_graph_context {
7653
8199
  // inp_pos - contains the positions
7654
8200
  ggml_tensor * inp_pos = build_inp_pos();
7655
8201
 
7656
- auto * inp_attn = build_attn_inp_kv_unified();
8202
+ auto * inp_attn = build_attn_inp_kv();
7657
8203
 
7658
8204
  ggml_tensor * inp_out_ids = build_inp_out_ids();
7659
8205
 
@@ -7675,9 +8221,9 @@ struct llm_build_qwen : public llm_graph_context {
7675
8221
 
7676
8222
  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
7677
8223
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
7678
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
8224
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd));
7679
8225
 
7680
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8226
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7681
8227
 
7682
8228
  // using mode = 2 for neox mode
7683
8229
  Qcur = ggml_rope_ext(
@@ -7698,7 +8244,7 @@ struct llm_build_qwen : public llm_graph_context {
7698
8244
 
7699
8245
  cur = build_attn(inp_attn,
7700
8246
  model.layers[il].wo, NULL,
7701
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8247
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7702
8248
  }
7703
8249
 
7704
8250
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7768,7 +8314,7 @@ struct llm_build_qwen2 : public llm_graph_context {
7768
8314
  // inp_pos - contains the positions
7769
8315
  ggml_tensor * inp_pos = build_inp_pos();
7770
8316
 
7771
- auto * inp_attn = build_attn_inp_kv_unified();
8317
+ auto * inp_attn = build_attn_inp_kv();
7772
8318
 
7773
8319
  ggml_tensor * inp_out_ids = build_inp_out_ids();
7774
8320
 
@@ -7818,7 +8364,7 @@ struct llm_build_qwen2 : public llm_graph_context {
7818
8364
 
7819
8365
  cur = build_attn(inp_attn,
7820
8366
  model.layers[il].wo, model.layers[il].bo,
7821
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8367
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7822
8368
  }
7823
8369
 
7824
8370
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7932,8 +8478,9 @@ struct llm_build_dream : public llm_graph_context {
7932
8478
  cb(Kcur, "Kcur", il);
7933
8479
  cb(Vcur, "Vcur", il);
7934
8480
 
7935
- cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr,
7936
- nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
8481
+ cur = build_attn(inp_attn,
8482
+ model.layers[il].wo, model.layers[il].bo,
8483
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
7937
8484
  }
7938
8485
 
7939
8486
  if (il == n_layer - 1 && inp_out_ids) {
@@ -7978,8 +8525,10 @@ struct llm_build_dream : public llm_graph_context {
7978
8525
  }
7979
8526
  };
7980
8527
 
7981
- struct llm_build_qwen2vl : public llm_graph_context {
7982
- llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8528
+ struct llm_build_llada : public llm_graph_context {
8529
+ llm_build_llada(const llama_model & model, const llm_graph_params & params) :
8530
+ llm_graph_context(params) {
8531
+ // LLaDA is similar to LLaMA but uses non-causal attention for diffusion
7983
8532
  const int64_t n_embd_head = hparams.n_embd_head_v;
7984
8533
 
7985
8534
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7993,10 +8542,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
7993
8542
  // inp_pos - contains the positions
7994
8543
  ggml_tensor * inp_pos = build_inp_pos();
7995
8544
 
7996
- auto * inp_attn = build_attn_inp_kv_unified();
7997
-
7998
- int sections[4];
7999
- std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
8545
+ // Non-causal attention for diffusion
8546
+ auto * inp_attn = build_attn_inp_no_cache();
8000
8547
 
8001
8548
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8002
8549
 
@@ -8004,53 +8551,41 @@ struct llm_build_qwen2vl : public llm_graph_context {
8004
8551
  ggml_tensor * inpSA = inpL;
8005
8552
 
8006
8553
  // norm
8007
- cur = build_norm(inpL,
8008
- model.layers[il].attn_norm, NULL,
8009
- LLM_NORM_RMS, il);
8554
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
8010
8555
  cb(cur, "attn_norm", il);
8011
8556
 
8012
8557
  // self-attention
8013
8558
  {
8014
- // compute Q and K and RoPE them
8559
+ // compute separate Q, K, V projections without bias, matching LLaDALlamaBlock
8015
8560
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
8016
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8017
- cb(Qcur, "Qcur", il);
8018
-
8019
8561
  ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
8020
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8021
- cb(Kcur, "Kcur", il);
8022
-
8023
8562
  ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
8024
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8563
+
8564
+ cb(Qcur, "Qcur", il);
8565
+ cb(Kcur, "Kcur", il);
8025
8566
  cb(Vcur, "Vcur", il);
8026
8567
 
8027
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8568
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8028
8569
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8029
8570
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8030
8571
 
8031
- Qcur = ggml_rope_multi(
8032
- ctx0, Qcur, inp_pos, nullptr,
8033
- n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
8034
- ext_factor, attn_factor, beta_fast, beta_slow
8035
- );
8572
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8573
+ ext_factor, attn_factor, beta_fast, beta_slow);
8036
8574
 
8037
- Kcur = ggml_rope_multi(
8038
- ctx0, Kcur, inp_pos, nullptr,
8039
- n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
8040
- ext_factor, attn_factor, beta_fast, beta_slow
8041
- );
8575
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8576
+ ext_factor, attn_factor, beta_fast, beta_slow);
8042
8577
 
8043
8578
  cb(Qcur, "Qcur", il);
8044
8579
  cb(Kcur, "Kcur", il);
8045
8580
  cb(Vcur, "Vcur", il);
8046
8581
 
8047
8582
  cur = build_attn(inp_attn,
8048
- model.layers[il].wo, model.layers[il].bo,
8049
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8583
+ model.layers[il].wo, NULL,
8584
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
8050
8585
  }
8051
8586
 
8052
8587
  if (il == n_layer - 1 && inp_out_ids) {
8053
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8588
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8054
8589
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8055
8590
  }
8056
8591
 
@@ -8058,17 +8593,11 @@ struct llm_build_qwen2vl : public llm_graph_context {
8058
8593
  cb(ffn_inp, "ffn_inp", il);
8059
8594
 
8060
8595
  // feed-forward network
8061
- cur = build_norm(ffn_inp,
8062
- model.layers[il].ffn_norm, NULL,
8063
- LLM_NORM_RMS, il);
8596
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
8064
8597
  cb(cur, "ffn_norm", il);
8065
8598
 
8066
- cur = build_ffn(cur,
8067
- model.layers[il].ffn_up, NULL, NULL,
8068
- model.layers[il].ffn_gate, NULL, NULL,
8069
- model.layers[il].ffn_down, NULL, NULL,
8070
- NULL,
8071
- LLM_FFN_SILU, LLM_FFN_PAR, il);
8599
+ cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL,
8600
+ model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
8072
8601
  cb(cur, "ffn_out", il);
8073
8602
 
8074
8603
  cur = ggml_add(ctx0, cur, ffn_inp);
@@ -8082,9 +8611,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
8082
8611
 
8083
8612
  cur = inpL;
8084
8613
 
8085
- cur = build_norm(cur,
8086
- model.output_norm, NULL,
8087
- LLM_NORM_RMS, -1);
8614
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
8088
8615
 
8089
8616
  cb(cur, "result_norm", -1);
8090
8617
  res->t_embd = cur;
@@ -8099,8 +8626,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
8099
8626
  }
8100
8627
  };
8101
8628
 
8102
- struct llm_build_qwen2moe : public llm_graph_context {
8103
- llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8629
+ struct llm_build_qwen2vl : public llm_graph_context {
8630
+ llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8104
8631
  const int64_t n_embd_head = hparams.n_embd_head_v;
8105
8632
 
8106
8633
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8114,7 +8641,10 @@ struct llm_build_qwen2moe : public llm_graph_context {
8114
8641
  // inp_pos - contains the positions
8115
8642
  ggml_tensor * inp_pos = build_inp_pos();
8116
8643
 
8117
- auto * inp_attn = build_attn_inp_kv_unified();
8644
+ auto * inp_attn = build_attn_inp_kv();
8645
+
8646
+ int sections[4];
8647
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
8118
8648
 
8119
8649
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8120
8650
 
@@ -8127,13 +8657,131 @@ struct llm_build_qwen2moe : public llm_graph_context {
8127
8657
  LLM_NORM_RMS, il);
8128
8658
  cb(cur, "attn_norm", il);
8129
8659
 
8130
- // self_attention
8660
+ // self-attention
8131
8661
  {
8132
8662
  // compute Q and K and RoPE them
8133
8663
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
8664
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8134
8665
  cb(Qcur, "Qcur", il);
8135
- if (model.layers[il].bq) {
8136
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8666
+
8667
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
8668
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8669
+ cb(Kcur, "Kcur", il);
8670
+
8671
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
8672
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8673
+ cb(Vcur, "Vcur", il);
8674
+
8675
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8676
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8677
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8678
+
8679
+ Qcur = ggml_rope_multi(
8680
+ ctx0, Qcur, inp_pos, nullptr,
8681
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
8682
+ ext_factor, attn_factor, beta_fast, beta_slow
8683
+ );
8684
+
8685
+ Kcur = ggml_rope_multi(
8686
+ ctx0, Kcur, inp_pos, nullptr,
8687
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
8688
+ ext_factor, attn_factor, beta_fast, beta_slow
8689
+ );
8690
+
8691
+ cb(Qcur, "Qcur", il);
8692
+ cb(Kcur, "Kcur", il);
8693
+ cb(Vcur, "Vcur", il);
8694
+
8695
+ cur = build_attn(inp_attn,
8696
+ model.layers[il].wo, model.layers[il].bo,
8697
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8698
+ }
8699
+
8700
+ if (il == n_layer - 1 && inp_out_ids) {
8701
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8702
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8703
+ }
8704
+
8705
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8706
+ cb(ffn_inp, "ffn_inp", il);
8707
+
8708
+ // feed-forward network
8709
+ cur = build_norm(ffn_inp,
8710
+ model.layers[il].ffn_norm, NULL,
8711
+ LLM_NORM_RMS, il);
8712
+ cb(cur, "ffn_norm", il);
8713
+
8714
+ cur = build_ffn(cur,
8715
+ model.layers[il].ffn_up, NULL, NULL,
8716
+ model.layers[il].ffn_gate, NULL, NULL,
8717
+ model.layers[il].ffn_down, NULL, NULL,
8718
+ NULL,
8719
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
8720
+ cb(cur, "ffn_out", il);
8721
+
8722
+ cur = ggml_add(ctx0, cur, ffn_inp);
8723
+
8724
+ cur = build_cvec(cur, il);
8725
+ cb(cur, "l_out", il);
8726
+
8727
+ // input for next layer
8728
+ inpL = cur;
8729
+ }
8730
+
8731
+ cur = inpL;
8732
+
8733
+ cur = build_norm(cur,
8734
+ model.output_norm, NULL,
8735
+ LLM_NORM_RMS, -1);
8736
+
8737
+ cb(cur, "result_norm", -1);
8738
+ res->t_embd = cur;
8739
+
8740
+ // lm_head
8741
+ cur = build_lora_mm(model.output, cur);
8742
+
8743
+ cb(cur, "result_output", -1);
8744
+ res->t_logits = cur;
8745
+
8746
+ ggml_build_forward_expand(gf, cur);
8747
+ }
8748
+ };
8749
+
8750
+ struct llm_build_qwen2moe : public llm_graph_context {
8751
+ llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
8752
+ const int64_t n_embd_head = hparams.n_embd_head_v;
8753
+
8754
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8755
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
8756
+
8757
+ ggml_tensor * cur;
8758
+ ggml_tensor * inpL;
8759
+
8760
+ inpL = build_inp_embd(model.tok_embd);
8761
+
8762
+ // inp_pos - contains the positions
8763
+ ggml_tensor * inp_pos = build_inp_pos();
8764
+
8765
+ auto * inp_attn = build_attn_inp_kv();
8766
+
8767
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8768
+
8769
+ for (int il = 0; il < n_layer; ++il) {
8770
+ ggml_tensor * inpSA = inpL;
8771
+
8772
+ // norm
8773
+ cur = build_norm(inpL,
8774
+ model.layers[il].attn_norm, NULL,
8775
+ LLM_NORM_RMS, il);
8776
+ cb(cur, "attn_norm", il);
8777
+
8778
+ // self_attention
8779
+ {
8780
+ // compute Q and K and RoPE them
8781
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
8782
+ cb(Qcur, "Qcur", il);
8783
+ if (model.layers[il].bq) {
8784
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8137
8785
  cb(Qcur, "Qcur", il);
8138
8786
  }
8139
8787
 
@@ -8173,7 +8821,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
8173
8821
 
8174
8822
  cur = build_attn(inp_attn,
8175
8823
  model.layers[il].wo, model.layers[il].bo,
8176
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8824
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8177
8825
  }
8178
8826
 
8179
8827
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8273,7 +8921,7 @@ struct llm_build_qwen3 : public llm_graph_context {
8273
8921
  // inp_pos - contains the positions
8274
8922
  ggml_tensor * inp_pos = build_inp_pos();
8275
8923
 
8276
- auto * inp_attn = build_attn_inp_kv_unified();
8924
+ auto * inp_attn = build_attn_inp_kv();
8277
8925
 
8278
8926
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8279
8927
 
@@ -8326,7 +8974,7 @@ struct llm_build_qwen3 : public llm_graph_context {
8326
8974
 
8327
8975
  cur = build_attn(inp_attn,
8328
8976
  model.layers[il].wo, model.layers[il].bo,
8329
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8977
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8330
8978
  }
8331
8979
 
8332
8980
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8394,7 +9042,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
8394
9042
  // inp_pos - contains the positions
8395
9043
  ggml_tensor * inp_pos = build_inp_pos();
8396
9044
 
8397
- auto * inp_attn = build_attn_inp_kv_unified();
9045
+ auto * inp_attn = build_attn_inp_kv();
8398
9046
 
8399
9047
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8400
9048
 
@@ -8447,7 +9095,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
8447
9095
 
8448
9096
  cur = build_attn(inp_attn,
8449
9097
  model.layers[il].wo, model.layers[il].bo,
8450
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9098
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8451
9099
  }
8452
9100
 
8453
9101
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8524,7 +9172,7 @@ struct llm_build_phi2 : public llm_graph_context {
8524
9172
  // inp_pos - contains the positions
8525
9173
  ggml_tensor * inp_pos = build_inp_pos();
8526
9174
 
8527
- auto * inp_attn = build_attn_inp_kv_unified();
9175
+ auto * inp_attn = build_attn_inp_kv();
8528
9176
 
8529
9177
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8530
9178
 
@@ -8550,21 +9198,21 @@ struct llm_build_phi2 : public llm_graph_context {
8550
9198
 
8551
9199
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
8552
9200
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
8553
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
9201
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
9202
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8554
9203
  } else {
8555
9204
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
8556
9205
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
8557
9206
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
8558
9207
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8559
9208
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9209
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8560
9210
  }
8561
9211
 
8562
9212
  cb(Qcur, "Qcur", il);
8563
9213
  cb(Kcur, "Kcur", il);
8564
9214
  cb(Vcur, "Vcur", il);
8565
9215
 
8566
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8567
-
8568
9216
  Qcur = ggml_rope_ext(
8569
9217
  ctx0, Qcur, inp_pos, nullptr,
8570
9218
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -8587,7 +9235,7 @@ struct llm_build_phi2 : public llm_graph_context {
8587
9235
 
8588
9236
  cur = build_attn(inp_attn,
8589
9237
  model.layers[il].wo, model.layers[il].bo,
8590
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
9238
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
8591
9239
  }
8592
9240
 
8593
9241
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8653,13 +9301,13 @@ struct llm_build_phi3 : public llm_graph_context {
8653
9301
  // inp_pos - contains the positions
8654
9302
  ggml_tensor * inp_pos = build_inp_pos();
8655
9303
 
8656
- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
9304
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
8657
9305
  inp_attn_type * inp_attn = nullptr;
8658
9306
 
8659
9307
  if constexpr (iswa) {
8660
- inp_attn = build_attn_inp_kv_unified_iswa();
9308
+ inp_attn = build_attn_inp_kv_iswa();
8661
9309
  } else {
8662
- inp_attn = build_attn_inp_kv_unified();
9310
+ inp_attn = build_attn_inp_kv();
8663
9311
  }
8664
9312
 
8665
9313
  ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -8688,21 +9336,21 @@ struct llm_build_phi3 : public llm_graph_context {
8688
9336
 
8689
9337
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
8690
9338
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
8691
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
9339
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
9340
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8692
9341
  } else {
8693
9342
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
8694
9343
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
8695
9344
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
8696
9345
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8697
9346
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9347
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8698
9348
  }
8699
9349
 
8700
9350
  cb(Qcur, "Qcur", il);
8701
9351
  cb(Kcur, "Kcur", il);
8702
9352
  cb(Vcur, "Vcur", il);
8703
9353
 
8704
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8705
-
8706
9354
  Qcur = ggml_rope_ext(
8707
9355
  ctx0, Qcur, inp_pos, rope_factors,
8708
9356
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -8724,7 +9372,7 @@ struct llm_build_phi3 : public llm_graph_context {
8724
9372
 
8725
9373
  cur = build_attn(inp_attn,
8726
9374
  model.layers[il].wo, model.layers[il].bo,
8727
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
9375
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
8728
9376
  }
8729
9377
 
8730
9378
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8811,7 +9459,7 @@ struct llm_build_plamo : public llm_graph_context {
8811
9459
  // inp_pos - contains the positions
8812
9460
  ggml_tensor * inp_pos = build_inp_pos();
8813
9461
 
8814
- auto * inp_attn = build_attn_inp_kv_unified();
9462
+ auto * inp_attn = build_attn_inp_kv();
8815
9463
 
8816
9464
  ggml_tensor * inp_out_ids = build_inp_out_ids();
8817
9465
 
@@ -8858,7 +9506,7 @@ struct llm_build_plamo : public llm_graph_context {
8858
9506
 
8859
9507
  cur = build_attn(inp_attn,
8860
9508
  model.layers[il].wo, NULL,
8861
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9509
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8862
9510
  }
8863
9511
 
8864
9512
  if (il == n_layer - 1 && inp_out_ids) {
@@ -8927,7 +9575,7 @@ struct llm_build_gpt2 : public llm_graph_context {
8927
9575
  // inp_pos - contains the positions
8928
9576
  ggml_tensor * inp_pos = build_inp_pos();
8929
9577
 
8930
- auto * inp_attn = build_attn_inp_kv_unified();
9578
+ auto * inp_attn = build_attn_inp_kv();
8931
9579
 
8932
9580
  pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
8933
9581
  cb(pos, "pos_embd", -1);
@@ -8952,21 +9600,21 @@ struct llm_build_gpt2 : public llm_graph_context {
8952
9600
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
8953
9601
  cb(cur, "bqkv", il);
8954
9602
 
8955
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
8956
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
8957
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
9603
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
9604
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
9605
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
8958
9606
 
8959
9607
  cb(Qcur, "Qcur", il);
8960
9608
  cb(Kcur, "Kcur", il);
8961
9609
  cb(Vcur, "Vcur", il);
8962
9610
 
8963
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8964
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8965
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
9611
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9612
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9613
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8966
9614
 
8967
9615
  cur = build_attn(inp_attn,
8968
9616
  model.layers[il].wo, model.layers[il].bo,
8969
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9617
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8970
9618
  }
8971
9619
 
8972
9620
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9037,7 +9685,7 @@ struct llm_build_codeshell : public llm_graph_context {
9037
9685
  // inp_pos - contains the positions
9038
9686
  ggml_tensor * inp_pos = build_inp_pos();
9039
9687
 
9040
- auto * inp_attn = build_attn_inp_kv_unified();
9688
+ auto * inp_attn = build_attn_inp_kv();
9041
9689
 
9042
9690
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9043
9691
 
@@ -9058,9 +9706,9 @@ struct llm_build_codeshell : public llm_graph_context {
9058
9706
 
9059
9707
  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
9060
9708
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
9061
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
9709
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
9062
9710
 
9063
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
9711
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
9064
9712
 
9065
9713
  Qcur = ggml_rope_ext(
9066
9714
  ctx0, Qcur, inp_pos, nullptr,
@@ -9080,7 +9728,7 @@ struct llm_build_codeshell : public llm_graph_context {
9080
9728
 
9081
9729
  cur = build_attn(inp_attn,
9082
9730
  model.layers[il].wo, model.layers[il].bo,
9083
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9731
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9084
9732
  }
9085
9733
 
9086
9734
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9150,7 +9798,7 @@ struct llm_build_orion : public llm_graph_context {
9150
9798
  // inp_pos - contains the positions
9151
9799
  ggml_tensor * inp_pos = build_inp_pos();
9152
9800
 
9153
- auto * inp_attn = build_attn_inp_kv_unified();
9801
+ auto * inp_attn = build_attn_inp_kv();
9154
9802
 
9155
9803
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9156
9804
 
@@ -9209,7 +9857,7 @@ struct llm_build_orion : public llm_graph_context {
9209
9857
 
9210
9858
  cur = build_attn(inp_attn,
9211
9859
  model.layers[il].wo, NULL,
9212
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9860
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9213
9861
  }
9214
9862
 
9215
9863
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9277,7 +9925,7 @@ struct llm_build_internlm2 : public llm_graph_context {
9277
9925
  // inp_pos - contains the positions
9278
9926
  ggml_tensor * inp_pos = build_inp_pos();
9279
9927
 
9280
- auto * inp_attn = build_attn_inp_kv_unified();
9928
+ auto * inp_attn = build_attn_inp_kv();
9281
9929
 
9282
9930
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9283
9931
 
@@ -9336,7 +9984,7 @@ struct llm_build_internlm2 : public llm_graph_context {
9336
9984
 
9337
9985
  cur = build_attn(inp_attn,
9338
9986
  model.layers[il].wo, model.layers[il].bo,
9339
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9987
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9340
9988
  }
9341
9989
 
9342
9990
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9413,7 +10061,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
9413
10061
  // inp_pos - contains the positions
9414
10062
  ggml_tensor * inp_pos = build_inp_pos();
9415
10063
 
9416
- auto * inp_attn = build_attn_inp_kv_unified();
10064
+ auto * inp_attn = build_attn_inp_kv();
9417
10065
 
9418
10066
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9419
10067
 
@@ -9524,7 +10172,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
9524
10172
 
9525
10173
  cur = build_attn(inp_attn,
9526
10174
  model.layers[il].wo, NULL,
9527
- q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
10175
+ q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
9528
10176
  }
9529
10177
 
9530
10178
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9608,7 +10256,7 @@ struct llm_build_gemma : public llm_graph_context {
9608
10256
  // inp_pos - contains the positions
9609
10257
  ggml_tensor * inp_pos = build_inp_pos();
9610
10258
 
9611
- auto * inp_attn = build_attn_inp_kv_unified();
10259
+ auto * inp_attn = build_attn_inp_kv();
9612
10260
 
9613
10261
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9614
10262
 
@@ -9654,7 +10302,7 @@ struct llm_build_gemma : public llm_graph_context {
9654
10302
 
9655
10303
  cur = build_attn(inp_attn,
9656
10304
  model.layers[il].wo, NULL,
9657
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
10305
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
9658
10306
  }
9659
10307
 
9660
10308
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9724,7 +10372,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
9724
10372
  // inp_pos - contains the positions
9725
10373
  ggml_tensor * inp_pos = build_inp_pos();
9726
10374
 
9727
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
10375
+ auto * inp_attn = build_attn_inp_kv_iswa();
9728
10376
 
9729
10377
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9730
10378
 
@@ -9769,7 +10417,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
9769
10417
 
9770
10418
  cur = build_attn(inp_attn,
9771
10419
  model.layers[il].wo, NULL,
9772
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
10420
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
9773
10421
  }
9774
10422
 
9775
10423
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9858,7 +10506,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
9858
10506
  ggml_tensor * inp_pos = build_inp_pos();
9859
10507
 
9860
10508
  // TODO: is causal == true correct? might need some changes
9861
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
10509
+ auto * inp_attn = build_attn_inp_kv_iswa();
9862
10510
 
9863
10511
  ggml_tensor * inp_out_ids = build_inp_out_ids();
9864
10512
 
@@ -9911,7 +10559,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
9911
10559
 
9912
10560
  cur = build_attn(inp_attn,
9913
10561
  model.layers[il].wo, NULL,
9914
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
10562
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
9915
10563
  }
9916
10564
 
9917
10565
  if (il == n_layer - 1 && inp_out_ids) {
@@ -9983,7 +10631,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
9983
10631
  const int64_t n_embd_altup;
9984
10632
  const int64_t n_altup;
9985
10633
  const int i_altup_act;
9986
- const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
9987
10634
  const int n_layer_sparsity = 10; // number of layers using activation sparsity
9988
10635
  const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
9989
10636
 
@@ -10009,7 +10656,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
10009
10656
  ggml_tensor * inp_pos = build_inp_pos();
10010
10657
 
10011
10658
  // TODO: is causal == true correct? might need some changes
10012
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
10659
+ auto * inp_attn = build_attn_inp_kv_iswa();
10013
10660
 
10014
10661
  // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
10015
10662
  ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
@@ -10033,8 +10680,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
10033
10680
 
10034
10681
  for (int il = 0; il < n_layer; ++il) {
10035
10682
  // this block is made to be closely resemble Gemma3p5DecoderLayer on python code
10036
- const bool has_kv = (il < n_layer_kv);
10037
-
10038
10683
  const float freq_base_l = model.get_rope_freq_base (cparams, il);
10039
10684
  const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
10040
10685
 
@@ -10054,7 +10699,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
10054
10699
  ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
10055
10700
 
10056
10701
  // self-attention
10057
- if (has_kv) {
10702
+ if (hparams.has_kv(il)) {
10058
10703
  // compute Q and K and RoPE them
10059
10704
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
10060
10705
  cb(Qcur, "Qcur", il);
@@ -10092,9 +10737,9 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
10092
10737
 
10093
10738
  cur = build_attn(inp_attn,
10094
10739
  model.layers[il].wo, NULL,
10095
- Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
10740
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
10096
10741
  } else {
10097
- // no KV layers
10742
+ // reuse KV cache of earlier layers
10098
10743
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
10099
10744
  cb(Qcur, "Qcur", il);
10100
10745
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -10110,7 +10755,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
10110
10755
 
10111
10756
  cur = build_attn(inp_attn,
10112
10757
  model.layers[il].wo, NULL,
10113
- Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
10758
+ Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
10114
10759
  }
10115
10760
 
10116
10761
  cur = build_norm(cur,
@@ -10388,8 +11033,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
10388
11033
  ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
10389
11034
  all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
10390
11035
  cb(all_coefs, "all_coefs", il);
10391
- all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
10392
- all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
11036
+ all_coefs = ggml_transpose(ctx0, all_coefs); // [n_tokens, n_altup]
11037
+ all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
10393
11038
 
10394
11039
  innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
10395
11040
  ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
@@ -10416,7 +11061,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
10416
11061
  // inp_pos - contains the positions
10417
11062
  ggml_tensor * inp_pos = build_inp_pos();
10418
11063
 
10419
- auto * inp_attn = build_attn_inp_kv_unified();
11064
+ auto * inp_attn = build_attn_inp_kv();
10420
11065
 
10421
11066
  ggml_tensor * inp_out_ids = build_inp_out_ids();
10422
11067
 
@@ -10475,7 +11120,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
10475
11120
 
10476
11121
  cur = build_attn(inp_attn,
10477
11122
  model.layers[il].wo, model.layers[il].bo,
10478
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11123
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10479
11124
  }
10480
11125
 
10481
11126
  if (il == n_layer - 1 && inp_out_ids) {
@@ -10902,7 +11547,9 @@ struct llm_build_jamba : public llm_graph_context_mamba {
10902
11547
  cb(Vcur, "Vcur", il);
10903
11548
 
10904
11549
  // No RoPE :)
10905
- cur = build_attn(inp_hybrid->get_attn(), model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
11550
+ cur = build_attn(inp_hybrid->get_attn(),
11551
+ model.layers[il].wo, NULL,
11552
+ Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
10906
11553
  }
10907
11554
 
10908
11555
  if (il == n_layer - 1 && inp_out_ids) {
@@ -10985,7 +11632,7 @@ struct llm_build_command_r : public llm_graph_context {
10985
11632
  // inp_pos - contains the positions
10986
11633
  ggml_tensor * inp_pos = build_inp_pos();
10987
11634
 
10988
- auto * inp_attn = build_attn_inp_kv_unified();
11635
+ auto * inp_attn = build_attn_inp_kv();
10989
11636
 
10990
11637
  ggml_tensor * inp_out_ids = build_inp_out_ids();
10991
11638
 
@@ -11060,7 +11707,7 @@ struct llm_build_command_r : public llm_graph_context {
11060
11707
 
11061
11708
  cur = build_attn(inp_attn,
11062
11709
  model.layers[il].wo, model.layers[il].bo,
11063
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11710
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11064
11711
  }
11065
11712
 
11066
11713
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11132,7 +11779,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
11132
11779
  // inp_pos - contains the positions
11133
11780
  ggml_tensor * inp_pos = build_inp_pos();
11134
11781
 
11135
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
11782
+ auto * inp_attn = build_attn_inp_kv_iswa();
11136
11783
 
11137
11784
  ggml_tensor * inp_out_ids = build_inp_out_ids();
11138
11785
 
@@ -11195,7 +11842,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
11195
11842
 
11196
11843
  cur = build_attn(inp_attn,
11197
11844
  model.layers[il].wo, model.layers[il].bo,
11198
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11845
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11199
11846
  }
11200
11847
 
11201
11848
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11267,7 +11914,7 @@ struct llm_build_olmo : public llm_graph_context {
11267
11914
  // inp_pos - contains the positions
11268
11915
  ggml_tensor * inp_pos = build_inp_pos();
11269
11916
 
11270
- auto * inp_attn = build_attn_inp_kv_unified();
11917
+ auto * inp_attn = build_attn_inp_kv();
11271
11918
 
11272
11919
  ggml_tensor * inp_out_ids = build_inp_out_ids();
11273
11920
 
@@ -11326,7 +11973,7 @@ struct llm_build_olmo : public llm_graph_context {
11326
11973
 
11327
11974
  cur = build_attn(inp_attn,
11328
11975
  model.layers[il].wo, nullptr,
11329
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11976
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11330
11977
  }
11331
11978
 
11332
11979
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11395,7 +12042,7 @@ struct llm_build_olmo2 : public llm_graph_context {
11395
12042
  // inp_pos - contains the positions
11396
12043
  ggml_tensor * inp_pos = build_inp_pos();
11397
12044
 
11398
- auto * inp_attn = build_attn_inp_kv_unified();
12045
+ auto * inp_attn = build_attn_inp_kv();
11399
12046
 
11400
12047
  ggml_tensor * inp_out_ids = build_inp_out_ids();
11401
12048
 
@@ -11446,7 +12093,7 @@ struct llm_build_olmo2 : public llm_graph_context {
11446
12093
 
11447
12094
  cur = build_attn(inp_attn,
11448
12095
  model.layers[il].wo, NULL,
11449
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12096
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11450
12097
  }
11451
12098
 
11452
12099
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11524,7 +12171,7 @@ struct llm_build_olmoe : public llm_graph_context {
11524
12171
  // inp_pos - contains the positions
11525
12172
  ggml_tensor * inp_pos = build_inp_pos();
11526
12173
 
11527
- auto * inp_attn = build_attn_inp_kv_unified();
12174
+ auto * inp_attn = build_attn_inp_kv();
11528
12175
 
11529
12176
  ggml_tensor * inp_out_ids = build_inp_out_ids();
11530
12177
 
@@ -11579,7 +12226,7 @@ struct llm_build_olmoe : public llm_graph_context {
11579
12226
 
11580
12227
  cur = build_attn(inp_attn,
11581
12228
  model.layers[il].wo, NULL,
11582
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12229
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11583
12230
  }
11584
12231
 
11585
12232
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11650,7 +12297,7 @@ struct llm_build_openelm : public llm_graph_context {
11650
12297
  // inp_pos - contains the positions
11651
12298
  ggml_tensor * inp_pos = build_inp_pos();
11652
12299
 
11653
- auto * inp_attn = build_attn_inp_kv_unified();
12300
+ auto * inp_attn = build_attn_inp_kv();
11654
12301
 
11655
12302
  ggml_tensor * inp_out_ids = build_inp_out_ids();
11656
12303
 
@@ -11712,7 +12359,7 @@ struct llm_build_openelm : public llm_graph_context {
11712
12359
 
11713
12360
  cur = build_attn(inp_attn,
11714
12361
  model.layers[il].wo, NULL,
11715
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12362
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11716
12363
  }
11717
12364
 
11718
12365
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11781,7 +12428,7 @@ struct llm_build_gptneox : public llm_graph_context {
11781
12428
  // inp_pos - contains the positions
11782
12429
  ggml_tensor * inp_pos = build_inp_pos();
11783
12430
 
11784
- auto * inp_attn = build_attn_inp_kv_unified();
12431
+ auto * inp_attn = build_attn_inp_kv();
11785
12432
 
11786
12433
  ggml_tensor * inp_out_ids = build_inp_out_ids();
11787
12434
 
@@ -11802,9 +12449,9 @@ struct llm_build_gptneox : public llm_graph_context {
11802
12449
 
11803
12450
  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
11804
12451
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
11805
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
12452
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
11806
12453
 
11807
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
12454
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
11808
12455
 
11809
12456
  Qcur = ggml_rope_ext(
11810
12457
  ctx0, Qcur, inp_pos, nullptr,
@@ -11824,7 +12471,7 @@ struct llm_build_gptneox : public llm_graph_context {
11824
12471
 
11825
12472
  cur = build_attn(inp_attn,
11826
12473
  model.layers[il].wo, model.layers[il].bo,
11827
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12474
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11828
12475
  }
11829
12476
 
11830
12477
  if (il == n_layer - 1 && inp_out_ids) {
@@ -11927,7 +12574,7 @@ struct llm_build_arctic : public llm_graph_context {
11927
12574
  // inp_pos - contains the positions
11928
12575
  ggml_tensor * inp_pos = build_inp_pos();
11929
12576
 
11930
- auto * inp_attn = build_attn_inp_kv_unified();
12577
+ auto * inp_attn = build_attn_inp_kv();
11931
12578
 
11932
12579
  ggml_tensor * inp_out_ids = build_inp_out_ids();
11933
12580
 
@@ -11974,7 +12621,7 @@ struct llm_build_arctic : public llm_graph_context {
11974
12621
 
11975
12622
  cur = build_attn(inp_attn,
11976
12623
  model.layers[il].wo, NULL,
11977
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12624
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11978
12625
  }
11979
12626
 
11980
12627
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12065,7 +12712,7 @@ struct llm_build_deepseek : public llm_graph_context {
12065
12712
  // inp_pos - contains the positions
12066
12713
  ggml_tensor * inp_pos = build_inp_pos();
12067
12714
 
12068
- auto * inp_attn = build_attn_inp_kv_unified();
12715
+ auto * inp_attn = build_attn_inp_kv();
12069
12716
 
12070
12717
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
12071
12718
 
@@ -12129,7 +12776,7 @@ struct llm_build_deepseek : public llm_graph_context {
12129
12776
 
12130
12777
  cur = build_attn(inp_attn,
12131
12778
  model.layers[il].wo, model.layers[il].bo,
12132
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
12779
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
12133
12780
  }
12134
12781
 
12135
12782
  if (il == n_layer - 1 && inp_out_ids) {
@@ -12242,7 +12889,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
12242
12889
  // inp_pos - contains the positions
12243
12890
  ggml_tensor * inp_pos = build_inp_pos();
12244
12891
 
12245
- auto * inp_attn = build_attn_inp_kv_unified();
12892
+ auto * inp_attn = build_attn_inp_kv();
12246
12893
 
12247
12894
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12248
12895
 
@@ -12357,7 +13004,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
12357
13004
  // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
12358
13005
  cur = build_attn(inp_attn,
12359
13006
  model.layers[il].wo, NULL,
12360
- Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
13007
+ Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
12361
13008
  } else {
12362
13009
  ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
12363
13010
  cb(kv, "kv", il);
@@ -12391,7 +13038,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
12391
13038
  // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
12392
13039
  cur = build_attn(inp_attn,
12393
13040
  model.layers[il].wo, NULL,
12394
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
13041
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
12395
13042
  }
12396
13043
  }
12397
13044
 
@@ -12489,7 +13136,7 @@ struct llm_build_bitnet : public llm_graph_context {
12489
13136
  // inp_pos - contains the positions
12490
13137
  ggml_tensor * inp_pos = build_inp_pos();
12491
13138
 
12492
- auto * inp_attn = build_attn_inp_kv_unified();
13139
+ auto * inp_attn = build_attn_inp_kv();
12493
13140
 
12494
13141
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12495
13142
 
@@ -12558,7 +13205,7 @@ struct llm_build_bitnet : public llm_graph_context {
12558
13205
 
12559
13206
  cur = build_attn(inp_attn,
12560
13207
  NULL, NULL,
12561
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13208
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12562
13209
 
12563
13210
  cur = build_norm(cur,
12564
13211
  model.layers[il].attn_sub_norm, NULL,
@@ -12681,7 +13328,7 @@ struct llm_build_t5_enc : public llm_graph_context {
12681
13328
 
12682
13329
  cur = build_attn(inp_attn,
12683
13330
  model.layers[il].wo_enc, nullptr,
12684
- Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
13331
+ Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
12685
13332
  cb(cur, "kqv_out", il);
12686
13333
  }
12687
13334
 
@@ -12753,7 +13400,7 @@ struct llm_build_t5_dec : public llm_graph_context {
12753
13400
 
12754
13401
  const int64_t n_outputs_enc = embd_enc->ne[1];
12755
13402
 
12756
- auto * inp_attn_self = build_attn_inp_kv_unified();
13403
+ auto * inp_attn_self = build_attn_inp_kv();
12757
13404
  auto * inp_attn_cross = build_attn_inp_cross();
12758
13405
 
12759
13406
  ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -12787,7 +13434,7 @@ struct llm_build_t5_dec : public llm_graph_context {
12787
13434
 
12788
13435
  cur = build_attn(inp_attn_self,
12789
13436
  model.layers[il].wo, model.layers[il].bo,
12790
- Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
13437
+ Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
12791
13438
  cb(cur, "kqv_out", il);
12792
13439
  }
12793
13440
 
@@ -12819,7 +13466,7 @@ struct llm_build_t5_dec : public llm_graph_context {
12819
13466
 
12820
13467
  cur = build_attn(inp_attn_cross,
12821
13468
  model.layers[il].wo_cross, nullptr,
12822
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
13469
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
12823
13470
  cb(cur, "kqv_out", il);
12824
13471
 
12825
13472
  //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -12918,7 +13565,7 @@ struct llm_build_jais : public llm_graph_context {
12918
13565
 
12919
13566
  inpL = build_inp_embd(model.tok_embd);
12920
13567
 
12921
- auto * inp_attn = build_attn_inp_kv_unified();
13568
+ auto * inp_attn = build_attn_inp_kv();
12922
13569
 
12923
13570
  ggml_tensor * inp_out_ids = build_inp_out_ids();
12924
13571
 
@@ -12937,21 +13584,21 @@ struct llm_build_jais : public llm_graph_context {
12937
13584
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
12938
13585
  cb(cur, "bqkv", il);
12939
13586
 
12940
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)));
12941
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)));
12942
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)));
13587
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd));
13588
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd));
13589
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));
12943
13590
 
12944
13591
  cb(Qcur, "Qcur", il);
12945
13592
  cb(Kcur, "Kcur", il);
12946
13593
  cb(Vcur, "Vcur", il);
12947
13594
 
12948
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12949
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
12950
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13595
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13596
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13597
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
12951
13598
 
12952
13599
  cur = build_attn(inp_attn,
12953
13600
  model.layers[il].wo, model.layers[il].bo,
12954
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
13601
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
12955
13602
  }
12956
13603
 
12957
13604
  if (il == n_layer - 1 && inp_out_ids) {
@@ -13016,7 +13663,7 @@ struct llm_build_chatglm : public llm_graph_context {
13016
13663
  // inp_pos - contains the positions
13017
13664
  ggml_tensor * inp_pos = build_inp_pos();
13018
13665
 
13019
- auto * inp_attn = build_attn_inp_kv_unified();
13666
+ auto * inp_attn = build_attn_inp_kv();
13020
13667
 
13021
13668
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13022
13669
 
@@ -13050,6 +13697,7 @@ struct llm_build_chatglm : public llm_graph_context {
13050
13697
  }
13051
13698
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13052
13699
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13700
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13053
13701
  } else {
13054
13702
  cur = build_lora_mm(model.layers[il].wqkv, cur);
13055
13703
  cb(cur, "wqkv", il);
@@ -13059,11 +13707,10 @@ struct llm_build_chatglm : public llm_graph_context {
13059
13707
  }
13060
13708
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
13061
13709
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
13062
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
13710
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
13711
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13063
13712
  }
13064
13713
 
13065
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13066
-
13067
13714
  //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
13068
13715
  Qcur = ggml_rope_ext(
13069
13716
  ctx0, Qcur, inp_pos, nullptr,
@@ -13083,7 +13730,7 @@ struct llm_build_chatglm : public llm_graph_context {
13083
13730
 
13084
13731
  cur = build_attn(inp_attn,
13085
13732
  model.layers[il].wo, NULL,
13086
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13733
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13087
13734
  }
13088
13735
 
13089
13736
  if (il == n_layer - 1 && inp_out_ids) {
@@ -13149,7 +13796,7 @@ struct llm_build_glm4 : public llm_graph_context {
13149
13796
  // inp_pos - contains the positions
13150
13797
  ggml_tensor * inp_pos = build_inp_pos();
13151
13798
 
13152
- auto * inp_attn = build_attn_inp_kv_unified();
13799
+ auto * inp_attn = build_attn_inp_kv();
13153
13800
 
13154
13801
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13155
13802
 
@@ -13184,6 +13831,7 @@ struct llm_build_glm4 : public llm_graph_context {
13184
13831
  }
13185
13832
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13186
13833
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13834
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13187
13835
  } else {
13188
13836
  cur = build_lora_mm(model.layers[il].wqkv, cur);
13189
13837
  cb(cur, "wqkv", il);
@@ -13193,11 +13841,10 @@ struct llm_build_glm4 : public llm_graph_context {
13193
13841
  }
13194
13842
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
13195
13843
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
13196
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
13844
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
13845
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13197
13846
  }
13198
13847
 
13199
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13200
-
13201
13848
  Qcur = ggml_rope_ext(
13202
13849
  ctx0, Qcur, inp_pos, nullptr,
13203
13850
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -13216,7 +13863,7 @@ struct llm_build_glm4 : public llm_graph_context {
13216
13863
 
13217
13864
  cur = build_attn(inp_attn,
13218
13865
  model.layers[il].wo, NULL,
13219
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13866
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13220
13867
  }
13221
13868
 
13222
13869
  if (il == n_layer - 1 && inp_out_ids) {
@@ -13285,12 +13932,11 @@ struct llm_build_glm4 : public llm_graph_context {
13285
13932
  }
13286
13933
  };
13287
13934
 
13288
- struct llm_build_nemotron : public llm_graph_context {
13289
- llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
13935
+ struct llm_build_glm4_moe : public llm_graph_context {
13936
+ llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
13290
13937
  const int64_t n_embd_head = hparams.n_embd_head_v;
13291
13938
 
13292
13939
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13293
- //GGML_ASSERT(n_embd_head == hparams.n_rot);
13294
13940
 
13295
13941
  ggml_tensor * cur;
13296
13942
  ggml_tensor * inpL;
@@ -13300,48 +13946,54 @@ struct llm_build_nemotron : public llm_graph_context {
13300
13946
  // inp_pos - contains the positions
13301
13947
  ggml_tensor * inp_pos = build_inp_pos();
13302
13948
 
13303
- auto * inp_attn = build_attn_inp_kv_unified();
13949
+ auto * inp_attn = build_attn_inp_kv();
13304
13950
 
13305
13951
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13306
13952
 
13307
- for (int il = 0; il < n_layer; ++il) {
13953
+ // Only process up to last layer (skip final NextN layer)
13954
+ // Final layer tensors are loaded but not processed in forward pass
13955
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
13956
+ for (int il = 0; il < n_transformer_layers; ++il) {
13308
13957
  ggml_tensor * inpSA = inpL;
13309
13958
 
13310
- // norm
13311
- cur = build_norm(inpL,
13312
- model.layers[il].attn_norm,
13313
- model.layers[il].attn_norm_b,
13314
- LLM_NORM, il);
13959
+ // Pre-attention norm
13960
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
13315
13961
  cb(cur, "attn_norm", il);
13316
13962
 
13317
13963
  // self-attention
13318
13964
  {
13319
- // compute Q and K and RoPE them
13320
13965
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
13321
- cb(Qcur, "Qcur", il);
13322
13966
  if (model.layers[il].bq) {
13323
13967
  Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
13324
- cb(Qcur, "Qcur", il);
13325
13968
  }
13969
+ cb(Qcur, "Qcur", il);
13326
13970
 
13327
13971
  ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
13328
- cb(Kcur, "Kcur", il);
13329
13972
  if (model.layers[il].bk) {
13330
13973
  Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
13331
- cb(Kcur, "Kcur", il);
13332
13974
  }
13975
+ cb(Kcur, "Kcur", il);
13333
13976
 
13334
13977
  ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
13335
- cb(Vcur, "Vcur", il);
13336
13978
  if (model.layers[il].bv) {
13337
13979
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
13338
- cb(Vcur, "Vcur", il);
13339
13980
  }
13981
+ cb(Vcur, "Vcur", il);
13340
13982
 
13341
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13983
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13342
13984
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13343
13985
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13344
13986
 
13987
+ // Apply Q/K norm if available (GLM-4.5 355B variant)
13988
+ if (model.layers[il].attn_q_norm) {
13989
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
13990
+ cb(Qcur, "Qcur_normed", il);
13991
+ }
13992
+ if (model.layers[il].attn_k_norm) {
13993
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
13994
+ cb(Kcur, "Kcur_normed", il);
13995
+ }
13996
+
13345
13997
  Qcur = ggml_rope_ext(
13346
13998
  ctx0, Qcur, inp_pos, nullptr,
13347
13999
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -13359,34 +14011,62 @@ struct llm_build_nemotron : public llm_graph_context {
13359
14011
  cb(Vcur, "Vcur", il);
13360
14012
 
13361
14013
  cur = build_attn(inp_attn,
13362
- model.layers[il].wo, model.layers[il].bo,
13363
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14014
+ model.layers[il].wo, NULL,
14015
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13364
14016
  }
13365
14017
 
13366
- if (il == n_layer - 1 && inp_out_ids) {
13367
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14018
+ if (il == n_transformer_layers - 1 && inp_out_ids) {
14019
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13368
14020
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13369
14021
  }
13370
14022
 
13371
14023
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13372
14024
  cb(ffn_inp, "ffn_inp", il);
13373
14025
 
13374
- // feed-forward network
13375
- cur = build_norm(ffn_inp,
13376
- model.layers[il].ffn_norm,
13377
- model.layers[il].ffn_norm_b,
13378
- LLM_NORM, il);
13379
- cb(cur, "ffn_norm", il);
14026
+ // Post-attention norm
14027
+ cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
14028
+ cb(cur, "post_attn_norm", il);
13380
14029
 
13381
- cur = build_ffn(cur,
13382
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
13383
- NULL, NULL, NULL,
13384
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
13385
- NULL,
13386
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
14030
+ // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense)
14031
+ if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
14032
+ // Dense FFN layer
14033
+ cur = build_ffn(cur,
14034
+ model.layers[il].ffn_up, NULL, NULL,
14035
+ model.layers[il].ffn_gate, NULL, NULL,
14036
+ model.layers[il].ffn_down, NULL, NULL,
14037
+ NULL,
14038
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14039
+ cb(cur, "ffn_out", il);
14040
+ } else {
14041
+ // Process routed experts using existing MoE infrastructure
14042
+ ggml_tensor * routed_out = build_moe_ffn(cur,
14043
+ model.layers[il].ffn_gate_inp,
14044
+ model.layers[il].ffn_up_exps,
14045
+ model.layers[il].ffn_gate_exps,
14046
+ model.layers[il].ffn_down_exps,
14047
+ model.layers[il].ffn_exp_probs_b,
14048
+ n_expert, n_expert_used,
14049
+ LLM_FFN_SILU, hparams.expert_weights_norm,
14050
+ true, hparams.expert_weights_scale,
14051
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
14052
+ il);
14053
+ cb(routed_out, "ffn_moe_out", il);
14054
+
14055
+ // Process shared expert on original input
14056
+ ggml_tensor * shared_out = build_ffn(cur,
14057
+ model.layers[il].ffn_up_shexp, NULL, NULL,
14058
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
14059
+ model.layers[il].ffn_down_shexp, NULL, NULL,
14060
+ NULL,
14061
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14062
+ cb(shared_out, "ffn_shexp_out", il);
14063
+
14064
+ // Final output: routed_output + shared_output
14065
+ cur = ggml_add(ctx0, routed_out, shared_out);
14066
+ cb(cur, "ffn_out", il);
14067
+ }
13387
14068
 
13388
14069
  cur = ggml_add(ctx0, cur, ffn_inp);
13389
- cb(cur, "ffn_out", il);
13390
14070
 
13391
14071
  cur = build_cvec(cur, il);
13392
14072
  cb(cur, "l_out", il);
@@ -13396,10 +14076,7 @@ struct llm_build_nemotron : public llm_graph_context {
13396
14076
  }
13397
14077
 
13398
14078
  cur = inpL;
13399
-
13400
- cur = build_norm(cur,
13401
- model.output_norm, model.output_norm_b,
13402
- LLM_NORM, -1);
14079
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
13403
14080
 
13404
14081
  cb(cur, "result_norm", -1);
13405
14082
  res->t_embd = cur;
@@ -13414,12 +14091,12 @@ struct llm_build_nemotron : public llm_graph_context {
13414
14091
  }
13415
14092
  };
13416
14093
 
13417
- struct llm_build_exaone : public llm_graph_context {
13418
- llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
14094
+ struct llm_build_nemotron : public llm_graph_context {
14095
+ llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
13419
14096
  const int64_t n_embd_head = hparams.n_embd_head_v;
13420
14097
 
13421
14098
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13422
- GGML_ASSERT(n_embd_head == hparams.n_rot);
14099
+ //GGML_ASSERT(n_embd_head == hparams.n_rot);
13423
14100
 
13424
14101
  ggml_tensor * cur;
13425
14102
  ggml_tensor * inpL;
@@ -13429,7 +14106,7 @@ struct llm_build_exaone : public llm_graph_context {
13429
14106
  // inp_pos - contains the positions
13430
14107
  ggml_tensor * inp_pos = build_inp_pos();
13431
14108
 
13432
- auto * inp_attn = build_attn_inp_kv_unified();
14109
+ auto * inp_attn = build_attn_inp_kv();
13433
14110
 
13434
14111
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13435
14112
 
@@ -13438,15 +14115,13 @@ struct llm_build_exaone : public llm_graph_context {
13438
14115
 
13439
14116
  // norm
13440
14117
  cur = build_norm(inpL,
13441
- model.layers[il].attn_norm, NULL,
13442
- LLM_NORM_RMS, il);
14118
+ model.layers[il].attn_norm,
14119
+ model.layers[il].attn_norm_b,
14120
+ LLM_NORM, il);
13443
14121
  cb(cur, "attn_norm", il);
13444
14122
 
13445
14123
  // self-attention
13446
14124
  {
13447
- // rope freq factors for llama3; may return nullptr for llama2 and other models
13448
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
13449
-
13450
14125
  // compute Q and K and RoPE them
13451
14126
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
13452
14127
  cb(Qcur, "Qcur", il);
@@ -13474,13 +14149,13 @@ struct llm_build_exaone : public llm_graph_context {
13474
14149
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13475
14150
 
13476
14151
  Qcur = ggml_rope_ext(
13477
- ctx0, Qcur, inp_pos, rope_factors,
14152
+ ctx0, Qcur, inp_pos, nullptr,
13478
14153
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13479
14154
  ext_factor, attn_factor, beta_fast, beta_slow
13480
14155
  );
13481
14156
 
13482
14157
  Kcur = ggml_rope_ext(
13483
- ctx0, Kcur, inp_pos, rope_factors,
14158
+ ctx0, Kcur, inp_pos, nullptr,
13484
14159
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13485
14160
  ext_factor, attn_factor, beta_fast, beta_slow
13486
14161
  );
@@ -13491,7 +14166,7 @@ struct llm_build_exaone : public llm_graph_context {
13491
14166
 
13492
14167
  cur = build_attn(inp_attn,
13493
14168
  model.layers[il].wo, model.layers[il].bo,
13494
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14169
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13495
14170
  }
13496
14171
 
13497
14172
  if (il == n_layer - 1 && inp_out_ids) {
@@ -13504,17 +14179,17 @@ struct llm_build_exaone : public llm_graph_context {
13504
14179
 
13505
14180
  // feed-forward network
13506
14181
  cur = build_norm(ffn_inp,
13507
- model.layers[il].ffn_norm, NULL,
13508
- LLM_NORM_RMS, il);
14182
+ model.layers[il].ffn_norm,
14183
+ model.layers[il].ffn_norm_b,
14184
+ LLM_NORM, il);
13509
14185
  cb(cur, "ffn_norm", il);
13510
14186
 
13511
14187
  cur = build_ffn(cur,
13512
- model.layers[il].ffn_up, NULL, NULL,
13513
- model.layers[il].ffn_gate, NULL, NULL,
13514
- model.layers[il].ffn_down, NULL, NULL,
14188
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
14189
+ NULL, NULL, NULL,
14190
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
13515
14191
  NULL,
13516
- LLM_FFN_SILU, LLM_FFN_PAR, il);
13517
- cb(cur, "ffn_out", il);
14192
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
13518
14193
 
13519
14194
  cur = ggml_add(ctx0, cur, ffn_inp);
13520
14195
  cb(cur, "ffn_out", il);
@@ -13529,8 +14204,8 @@ struct llm_build_exaone : public llm_graph_context {
13529
14204
  cur = inpL;
13530
14205
 
13531
14206
  cur = build_norm(cur,
13532
- model.output_norm, NULL,
13533
- LLM_NORM_RMS, -1);
14207
+ model.output_norm, model.output_norm_b,
14208
+ LLM_NORM, -1);
13534
14209
 
13535
14210
  cb(cur, "result_norm", -1);
13536
14211
  res->t_embd = cur;
@@ -13545,10 +14220,273 @@ struct llm_build_exaone : public llm_graph_context {
13545
14220
  }
13546
14221
  };
13547
14222
 
13548
- template <bool iswa>
13549
- struct llm_build_exaone4 : public llm_graph_context {
13550
- llm_build_exaone4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
13551
- const int64_t n_embd_head = hparams.n_embd_head_k;
14223
+ struct llm_build_nemotron_h : public llm_graph_context_mamba {
14224
+ llm_build_nemotron_h(
14225
+ const llama_model & model,
14226
+ const llm_graph_params & params) :
14227
+ llm_graph_context_mamba(params) {
14228
+
14229
+ const int64_t n_embd_head = hparams.n_embd_head_v;
14230
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
14231
+
14232
+ ggml_tensor * cur;
14233
+ ggml_tensor * inpL;
14234
+
14235
+ inpL = build_inp_embd(model.tok_embd);
14236
+
14237
+ auto * inp = build_inp_mem_hybrid();
14238
+
14239
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
14240
+
14241
+ for (int il = 0; il < n_layer; ++il) {
14242
+ struct ggml_tensor * inpSA = inpL;
14243
+
14244
+ // norm
14245
+ cur = build_norm(inpL,
14246
+ model.layers[il].attn_norm, NULL,
14247
+ LLM_NORM_RMS, il);
14248
+ cb(cur, "attn_norm", il);
14249
+
14250
+ if (hparams.is_recurrent(il)) {
14251
+ // ssm layer //
14252
+ cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
14253
+ } else if (hparams.n_ff(il) == 0) {
14254
+ // attention layer //
14255
+ cur = build_attention_layer(cur, inp->get_attn(), model, n_embd_head, il);
14256
+ } else {
14257
+ cur = build_ffn_layer(cur, model, il);
14258
+ }
14259
+
14260
+ if (il == n_layer - 1 && inp_out_ids) {
14261
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14262
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
14263
+ }
14264
+
14265
+ // add residual
14266
+ cur = ggml_add(ctx0, cur, inpSA);
14267
+ cb(cur, "block_out", il);
14268
+
14269
+ // input for next layer
14270
+ inpL = cur;
14271
+ }
14272
+
14273
+ cur = inpL;
14274
+
14275
+ cur = build_norm(cur,
14276
+ model.output_norm, NULL,
14277
+ LLM_NORM_RMS, -1);
14278
+
14279
+ cb(cur, "result_norm", -1);
14280
+ res->t_embd = cur;
14281
+
14282
+ // lm_head
14283
+ cur = build_lora_mm(model.output, cur);
14284
+ cb(cur, "result_output", -1);
14285
+ res->t_logits = cur;
14286
+
14287
+ ggml_build_forward_expand(gf, cur);
14288
+ }
14289
+
14290
+ ggml_tensor * build_attention_layer(
14291
+ ggml_tensor * cur,
14292
+ llm_graph_input_attn_kv * inp_attn,
14293
+ const llama_model & model,
14294
+ const int64_t n_embd_head,
14295
+ const int il) {
14296
+
14297
+ // compute Q and K and (optionally) RoPE them
14298
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
14299
+ cb(Qcur, "Qcur", il);
14300
+ if (model.layers[il].bq) {
14301
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
14302
+ cb(Qcur, "Qcur", il);
14303
+ }
14304
+
14305
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
14306
+ cb(Kcur, "Kcur", il);
14307
+ if (model.layers[il].bk) {
14308
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
14309
+ cb(Kcur, "Kcur", il);
14310
+ }
14311
+
14312
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
14313
+ cb(Vcur, "Vcur", il);
14314
+ if (model.layers[il].bv) {
14315
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
14316
+ cb(Vcur, "Vcur", il);
14317
+ }
14318
+
14319
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
14320
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
14321
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
14322
+
14323
+ cb(Qcur, "Qcur", il);
14324
+ cb(Kcur, "Kcur", il);
14325
+ cb(Vcur, "Vcur", il);
14326
+
14327
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
14328
+ cur = build_attn(inp_attn,
14329
+ model.layers[il].wo, model.layers[il].bo,
14330
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
14331
+ cb(cur, "attn_out", il);
14332
+ return cur;
14333
+ }
14334
+
14335
+ ggml_tensor * build_ffn_layer(
14336
+ ggml_tensor * cur,
14337
+ const llama_model & model,
14338
+ const int il) {
14339
+
14340
+ cur = build_ffn(cur,
14341
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
14342
+ NULL, NULL, NULL,
14343
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
14344
+ NULL,
14345
+ LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
14346
+ cb(cur, "ffn_out", il);
14347
+
14348
+ cur = build_cvec(cur, il);
14349
+ cb(cur, "l_out", il);
14350
+
14351
+ return cur;
14352
+ }
14353
+ };
14354
+
14355
+ struct llm_build_exaone : public llm_graph_context {
14356
+ llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
14357
+ const int64_t n_embd_head = hparams.n_embd_head_v;
14358
+
14359
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
14360
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
14361
+
14362
+ ggml_tensor * cur;
14363
+ ggml_tensor * inpL;
14364
+
14365
+ inpL = build_inp_embd(model.tok_embd);
14366
+
14367
+ // inp_pos - contains the positions
14368
+ ggml_tensor * inp_pos = build_inp_pos();
14369
+
14370
+ auto * inp_attn = build_attn_inp_kv();
14371
+
14372
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
14373
+
14374
+ for (int il = 0; il < n_layer; ++il) {
14375
+ ggml_tensor * inpSA = inpL;
14376
+
14377
+ // norm
14378
+ cur = build_norm(inpL,
14379
+ model.layers[il].attn_norm, NULL,
14380
+ LLM_NORM_RMS, il);
14381
+ cb(cur, "attn_norm", il);
14382
+
14383
+ // self-attention
14384
+ {
14385
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
14386
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
14387
+
14388
+ // compute Q and K and RoPE them
14389
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
14390
+ cb(Qcur, "Qcur", il);
14391
+ if (model.layers[il].bq) {
14392
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
14393
+ cb(Qcur, "Qcur", il);
14394
+ }
14395
+
14396
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
14397
+ cb(Kcur, "Kcur", il);
14398
+ if (model.layers[il].bk) {
14399
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
14400
+ cb(Kcur, "Kcur", il);
14401
+ }
14402
+
14403
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
14404
+ cb(Vcur, "Vcur", il);
14405
+ if (model.layers[il].bv) {
14406
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
14407
+ cb(Vcur, "Vcur", il);
14408
+ }
14409
+
14410
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
14411
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
14412
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
14413
+
14414
+ Qcur = ggml_rope_ext(
14415
+ ctx0, Qcur, inp_pos, rope_factors,
14416
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14417
+ ext_factor, attn_factor, beta_fast, beta_slow
14418
+ );
14419
+
14420
+ Kcur = ggml_rope_ext(
14421
+ ctx0, Kcur, inp_pos, rope_factors,
14422
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14423
+ ext_factor, attn_factor, beta_fast, beta_slow
14424
+ );
14425
+
14426
+ cb(Qcur, "Qcur", il);
14427
+ cb(Kcur, "Kcur", il);
14428
+ cb(Vcur, "Vcur", il);
14429
+
14430
+ cur = build_attn(inp_attn,
14431
+ model.layers[il].wo, model.layers[il].bo,
14432
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14433
+ }
14434
+
14435
+ if (il == n_layer - 1 && inp_out_ids) {
14436
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14437
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
14438
+ }
14439
+
14440
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
14441
+ cb(ffn_inp, "ffn_inp", il);
14442
+
14443
+ // feed-forward network
14444
+ cur = build_norm(ffn_inp,
14445
+ model.layers[il].ffn_norm, NULL,
14446
+ LLM_NORM_RMS, il);
14447
+ cb(cur, "ffn_norm", il);
14448
+
14449
+ cur = build_ffn(cur,
14450
+ model.layers[il].ffn_up, NULL, NULL,
14451
+ model.layers[il].ffn_gate, NULL, NULL,
14452
+ model.layers[il].ffn_down, NULL, NULL,
14453
+ NULL,
14454
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14455
+ cb(cur, "ffn_out", il);
14456
+
14457
+ cur = ggml_add(ctx0, cur, ffn_inp);
14458
+ cb(cur, "ffn_out", il);
14459
+
14460
+ cur = build_cvec(cur, il);
14461
+ cb(cur, "l_out", il);
14462
+
14463
+ // input for next layer
14464
+ inpL = cur;
14465
+ }
14466
+
14467
+ cur = inpL;
14468
+
14469
+ cur = build_norm(cur,
14470
+ model.output_norm, NULL,
14471
+ LLM_NORM_RMS, -1);
14472
+
14473
+ cb(cur, "result_norm", -1);
14474
+ res->t_embd = cur;
14475
+
14476
+ // lm_head
14477
+ cur = build_lora_mm(model.output, cur);
14478
+
14479
+ cb(cur, "result_output", -1);
14480
+ res->t_logits = cur;
14481
+
14482
+ ggml_build_forward_expand(gf, cur);
14483
+ }
14484
+ };
14485
+
14486
+ template <bool iswa>
14487
+ struct llm_build_exaone4 : public llm_graph_context {
14488
+ llm_build_exaone4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
14489
+ const int64_t n_embd_head = hparams.n_embd_head_k;
13552
14490
 
13553
14491
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
13554
14492
  GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -13561,13 +14499,13 @@ struct llm_build_exaone4 : public llm_graph_context {
13561
14499
  // inp_pos - contains the positions
13562
14500
  ggml_tensor * inp_pos = build_inp_pos();
13563
14501
 
13564
- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
14502
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
13565
14503
  inp_attn_type * inp_attn = nullptr;
13566
14504
 
13567
14505
  if constexpr (iswa) {
13568
- inp_attn = build_attn_inp_kv_unified_iswa();
14506
+ inp_attn = build_attn_inp_kv_iswa();
13569
14507
  } else {
13570
- inp_attn = build_attn_inp_kv_unified();
14508
+ inp_attn = build_attn_inp_kv();
13571
14509
  }
13572
14510
 
13573
14511
  ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -13622,7 +14560,7 @@ struct llm_build_exaone4 : public llm_graph_context {
13622
14560
 
13623
14561
  cur = build_attn(inp_attn,
13624
14562
  model.layers[il].wo, NULL,
13625
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14563
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13626
14564
  cb(cur, "attn_out", il);
13627
14565
  }
13628
14566
 
@@ -14450,7 +15388,7 @@ struct llm_build_granite : public llm_graph_context {
14450
15388
  inp_pos = build_inp_pos();
14451
15389
  }
14452
15390
 
14453
- auto * inp_attn = build_attn_inp_kv_unified();
15391
+ auto * inp_attn = build_attn_inp_kv();
14454
15392
 
14455
15393
  ggml_tensor * inp_out_ids = build_inp_out_ids();
14456
15394
 
@@ -14501,12 +15439,12 @@ struct llm_build_granite : public llm_graph_context {
14501
15439
  }
14502
15440
 
14503
15441
  ggml_tensor * build_attention_layer(
14504
- ggml_tensor * cur,
14505
- ggml_tensor * inp_pos,
14506
- llm_graph_input_attn_kv_unified * inp_attn,
14507
- const llama_model & model,
14508
- const int64_t n_embd_head,
14509
- const int il) {
15442
+ ggml_tensor * cur,
15443
+ ggml_tensor * inp_pos,
15444
+ llm_graph_input_attn_kv * inp_attn,
15445
+ const llama_model & model,
15446
+ const int64_t n_embd_head,
15447
+ const int il) {
14510
15448
 
14511
15449
  // compute Q and K and (optionally) RoPE them
14512
15450
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -14557,7 +15495,7 @@ struct llm_build_granite : public llm_graph_context {
14557
15495
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
14558
15496
  cur = build_attn(inp_attn,
14559
15497
  model.layers[il].wo, model.layers[il].bo,
14560
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15498
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
14561
15499
  cb(cur, "attn_out", il);
14562
15500
  return cur;
14563
15501
  }
@@ -14720,12 +15658,12 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
14720
15658
  }
14721
15659
 
14722
15660
  ggml_tensor * build_attention_layer(
14723
- ggml_tensor * cur,
14724
- ggml_tensor * inp_pos,
14725
- llm_graph_input_attn_kv_unified * inp_attn,
14726
- const llama_model & model,
14727
- const int64_t n_embd_head,
14728
- const int il) {
15661
+ ggml_tensor * cur,
15662
+ ggml_tensor * inp_pos,
15663
+ llm_graph_input_attn_kv * inp_attn,
15664
+ const llama_model & model,
15665
+ const int64_t n_embd_head,
15666
+ const int il) {
14729
15667
 
14730
15668
  // compute Q and K and (optionally) RoPE them
14731
15669
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -14776,7 +15714,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
14776
15714
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
14777
15715
  cur = build_attn(inp_attn,
14778
15716
  model.layers[il].wo, model.layers[il].bo,
14779
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15717
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
14780
15718
  cb(cur, "attn_out", il);
14781
15719
  return cur;
14782
15720
  }
@@ -14882,7 +15820,7 @@ struct llm_build_chameleon : public llm_graph_context {
14882
15820
  // inp_pos - contains the positions
14883
15821
  ggml_tensor * inp_pos = build_inp_pos();
14884
15822
 
14885
- auto * inp_attn = build_attn_inp_kv_unified();
15823
+ auto * inp_attn = build_attn_inp_kv();
14886
15824
 
14887
15825
  ggml_tensor * inp_out_ids = build_inp_out_ids();
14888
15826
 
@@ -14961,7 +15899,7 @@ struct llm_build_chameleon : public llm_graph_context {
14961
15899
 
14962
15900
  cur = build_attn(inp_attn,
14963
15901
  model.layers[il].wo, nullptr,
14964
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
15902
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14965
15903
  }
14966
15904
 
14967
15905
  if (il == n_layer - 1 && inp_out_ids) {
@@ -15213,7 +16151,7 @@ struct llm_build_plm : public llm_graph_context {
15213
16151
  // inp_pos - contains the positions
15214
16152
  ggml_tensor * inp_pos = build_inp_pos();
15215
16153
 
15216
- auto * inp_attn = build_attn_inp_kv_unified();
16154
+ auto * inp_attn = build_attn_inp_kv();
15217
16155
 
15218
16156
  ggml_tensor * inp_out_ids = build_inp_out_ids();
15219
16157
 
@@ -15317,7 +16255,7 @@ struct llm_build_plm : public llm_graph_context {
15317
16255
 
15318
16256
  cur = build_attn(inp_attn,
15319
16257
  model.layers[il].wo, NULL,
15320
- q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
16258
+ q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
15321
16259
  }
15322
16260
 
15323
16261
  if (il == n_layer - 1 && inp_out_ids) {
@@ -15378,7 +16316,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
15378
16316
  // inp_pos - contains the positions
15379
16317
  ggml_tensor * inp_pos = build_inp_pos();
15380
16318
 
15381
- auto * inp_attn = build_attn_inp_kv_unified();
16319
+ auto * inp_attn = build_attn_inp_kv();
15382
16320
 
15383
16321
  ggml_tensor * inp_out_ids = build_inp_out_ids();
15384
16322
 
@@ -15440,7 +16378,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
15440
16378
 
15441
16379
  cur = build_attn(inp_attn,
15442
16380
  model.layers[il].wo, model.layers[il].bo,
15443
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
16381
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
15444
16382
  }
15445
16383
 
15446
16384
  if (il == n_layer - 1 && inp_out_ids) {
@@ -15527,7 +16465,7 @@ struct llm_build_dots1 : public llm_graph_context {
15527
16465
  // inp_pos - contains the positions
15528
16466
  ggml_tensor * inp_pos = build_inp_pos();
15529
16467
 
15530
- auto * inp_attn = build_attn_inp_kv_unified();
16468
+ auto * inp_attn = build_attn_inp_kv();
15531
16469
 
15532
16470
  ggml_tensor * inp_out_ids = build_inp_out_ids();
15533
16471
 
@@ -15580,7 +16518,7 @@ struct llm_build_dots1 : public llm_graph_context {
15580
16518
 
15581
16519
  cur = build_attn(inp_attn,
15582
16520
  model.layers[il].wo, model.layers[il].bo,
15583
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16521
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
15584
16522
  }
15585
16523
 
15586
16524
  if (il == n_layer - 1 && inp_out_ids) {
@@ -15677,7 +16615,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
15677
16615
  // inp_pos - contains the positions
15678
16616
  ggml_tensor * inp_pos = build_inp_pos();
15679
16617
 
15680
- auto * inp_attn = build_attn_inp_kv_unified();
16618
+ auto * inp_attn = build_attn_inp_kv();
15681
16619
 
15682
16620
  for (int il = 0; il < n_layer; ++il) {
15683
16621
  ggml_tensor * inpSA = inpL;
@@ -15735,7 +16673,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
15735
16673
 
15736
16674
  cur = build_attn(inp_attn,
15737
16675
  model.layers[il].wo, NULL,
15738
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16676
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
15739
16677
  }
15740
16678
 
15741
16679
  if (il == n_layer - 1) {
@@ -15807,7 +16745,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
15807
16745
  // inp_pos - contains the positions
15808
16746
  ggml_tensor * inp_pos = build_inp_pos();
15809
16747
 
15810
- auto * inp_attn = build_attn_inp_kv_unified();
16748
+ auto * inp_attn = build_attn_inp_kv();
15811
16749
 
15812
16750
  ggml_tensor * inp_out_ids = build_inp_out_ids();
15813
16751
 
@@ -15868,7 +16806,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
15868
16806
 
15869
16807
  cur = build_attn(inp_attn,
15870
16808
  model.layers[il].wo, NULL,
15871
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16809
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
15872
16810
  cb(cur, "attn_out", il);
15873
16811
  }
15874
16812
 
@@ -16021,7 +16959,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba {
16021
16959
 
16022
16960
  ggml_tensor * attn_out = build_attn(inp->get_attn(),
16023
16961
  model.layers[il].wo, NULL,
16024
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
16962
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
16025
16963
  cb(attn_out, "attn_out", il);
16026
16964
 
16027
16965
  cur = build_norm(inpL,
@@ -16181,7 +17119,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
16181
17119
 
16182
17120
  private:
16183
17121
  ggml_tensor * build_plamo2_attn_layer(
16184
- llm_graph_input_attn_kv_unified * inp,
17122
+ llm_graph_input_attn_kv * inp,
16185
17123
  ggml_tensor * inp_pos,
16186
17124
  ggml_tensor * cur,
16187
17125
  const llama_model & model,
@@ -16205,13 +17143,13 @@ private:
16205
17143
 
16206
17144
  ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
16207
17145
  ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
16208
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv)));
17146
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv));
16209
17147
 
16210
17148
  cb(Qcur, "Qcur", il);
16211
17149
  cb(Kcur, "Kcur", il);
16212
17150
  cb(Vcur, "Vcur", il);
16213
17151
 
16214
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
17152
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
16215
17153
 
16216
17154
  Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
16217
17155
  cb(Qcur, "Qcur_normed", il);
@@ -16231,7 +17169,9 @@ private:
16231
17169
  ext_factor, attn_factor, beta_fast, beta_slow
16232
17170
  );
16233
17171
 
16234
- cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
17172
+ cur = build_attn(inp,
17173
+ model.layers[il].wo, NULL,
17174
+ Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
16235
17175
  }
16236
17176
 
16237
17177
  cb(cur, "attn_out", il);
@@ -16278,15 +17218,13 @@ private:
16278
17218
  cb(zx, "mamba_in_proj", il);
16279
17219
  // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
16280
17220
  zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
16281
- zx = ggml_cont(ctx0, zx);
16282
- zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
17221
+ zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
16283
17222
  cb(zx, "mamba_in_proj_out", il);
16284
17223
 
16285
17224
  // split into z and x
16286
17225
  // => {head_dim * n_heads, n_seq_tokens, n_seqs}
16287
17226
  ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
16288
- x = ggml_cont(ctx0, x);
16289
- x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
17227
+ x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
16290
17228
  // x = ggml_permute(ctx0, x, 0, 2, 1, 3);
16291
17229
  cb(x, "mamba_x_split", il);
16292
17230
 
@@ -16416,7 +17354,7 @@ struct llm_build_arcee : public llm_graph_context {
16416
17354
  // inp_pos - contains the positions
16417
17355
  ggml_tensor * inp_pos = build_inp_pos();
16418
17356
 
16419
- auto * inp_attn = build_attn_inp_kv_unified();
17357
+ auto * inp_attn = build_attn_inp_kv();
16420
17358
 
16421
17359
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
16422
17360
 
@@ -16480,7 +17418,7 @@ struct llm_build_arcee : public llm_graph_context {
16480
17418
 
16481
17419
  cur = build_attn(inp_attn,
16482
17420
  model.layers[il].wo, model.layers[il].bo,
16483
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
17421
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
16484
17422
  cb(cur, "attn_out", il);
16485
17423
  }
16486
17424
 
@@ -16551,7 +17489,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
16551
17489
  // inp_pos - contains the positions
16552
17490
  ggml_tensor * inp_pos = build_inp_pos();
16553
17491
 
16554
- auto * inp_attn = build_attn_inp_kv_unified();
17492
+ auto * inp_attn = build_attn_inp_kv();
16555
17493
 
16556
17494
  const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
16557
17495
 
@@ -16625,7 +17563,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
16625
17563
 
16626
17564
  cur = build_attn(inp_attn,
16627
17565
  model.layers[il].wo, model.layers[il].bo,
16628
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
17566
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
16629
17567
  cb(cur, "attn_out", il);
16630
17568
  }
16631
17569
 
@@ -16697,8 +17635,8 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
16697
17635
  }
16698
17636
  };
16699
17637
 
16700
- struct llm_build_smollm3 : public llm_graph_context {
16701
- llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
17638
+ struct llm_build_hunyuan_dense : public llm_graph_context {
17639
+ llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
16702
17640
  const int64_t n_embd_head = hparams.n_embd_head_v;
16703
17641
 
16704
17642
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -16712,25 +17650,25 @@ struct llm_build_smollm3 : public llm_graph_context {
16712
17650
  // inp_pos - contains the positions
16713
17651
  ggml_tensor * inp_pos = build_inp_pos();
16714
17652
 
16715
- auto * inp_attn = build_attn_inp_kv_unified();
17653
+ auto * inp_attn = build_attn_inp_kv();
16716
17654
 
16717
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
17655
+ const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
16718
17656
 
16719
17657
  ggml_tensor * inp_out_ids = build_inp_out_ids();
16720
17658
 
16721
17659
  for (int il = 0; il < n_layer; ++il) {
16722
17660
  ggml_tensor * inpSA = inpL;
16723
17661
 
16724
- const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
16725
-
16726
17662
  // norm
16727
17663
  cur = build_norm(inpL,
16728
17664
  model.layers[il].attn_norm, NULL,
16729
17665
  LLM_NORM_RMS, il);
16730
17666
  cb(cur, "attn_norm", il);
16731
-
16732
17667
  // self-attention
16733
17668
  {
17669
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
17670
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
17671
+
16734
17672
  // compute Q and K and RoPE them
16735
17673
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
16736
17674
  cb(Qcur, "Qcur", il);
@@ -16757,7 +17695,145 @@ struct llm_build_smollm3 : public llm_graph_context {
16757
17695
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
16758
17696
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
16759
17697
 
16760
- if (use_rope) {
17698
+ Qcur = ggml_rope_ext(
17699
+ ctx0, Qcur, inp_pos, rope_factors,
17700
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17701
+ ext_factor, attn_factor, beta_fast, beta_slow
17702
+ );
17703
+
17704
+ cb(Qcur, "Qcur", il);
17705
+ cb(Kcur, "Kcur", il);
17706
+ cb(Vcur, "Vcur", il);
17707
+
17708
+ Kcur = ggml_rope_ext(
17709
+ ctx0, Kcur, inp_pos, rope_factors,
17710
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17711
+ ext_factor, attn_factor, beta_fast, beta_slow
17712
+ );
17713
+
17714
+ Kcur = build_norm(Kcur,
17715
+ model.layers[il].attn_k_norm, nullptr,
17716
+ LLM_NORM_RMS, il);
17717
+ cb(Kcur, "Kcur_norm", il);
17718
+
17719
+ Qcur = build_norm(Qcur,
17720
+ model.layers[il].attn_q_norm, nullptr,
17721
+ LLM_NORM_RMS, il);
17722
+ cb(Qcur, "Qcur_norm", il);
17723
+
17724
+ cur = build_attn(inp_attn,
17725
+ model.layers[il].wo, model.layers[il].bo,
17726
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
17727
+ cb(cur, "attn_out", il);
17728
+ }
17729
+
17730
+ if (il == n_layer - 1 && inp_out_ids) {
17731
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
17732
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
17733
+ }
17734
+
17735
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
17736
+ cb(ffn_inp, "ffn_inp", il);
17737
+
17738
+ cur = build_norm(ffn_inp,
17739
+ model.layers[il].ffn_norm, NULL,
17740
+ LLM_NORM_RMS, il);
17741
+ cb(cur, "ffn_norm", il);
17742
+ // feed-forward network (non-MoE)
17743
+ ggml_tensor * cur_mlp = build_ffn(cur,
17744
+ model.layers[il].ffn_up, NULL, NULL,
17745
+ model.layers[il].ffn_gate, NULL, NULL,
17746
+ model.layers[il].ffn_down, NULL, NULL,
17747
+ NULL,
17748
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
17749
+ cb(cur_mlp, "ffn_out", il);
17750
+
17751
+ cur = ggml_add(ctx0, cur_mlp, ffn_inp);
17752
+
17753
+ cur = build_cvec(cur, il);
17754
+ cb(cur, "l_out", il);
17755
+
17756
+ // input for next layer
17757
+ inpL = cur;
17758
+ }
17759
+ cur = inpL;
17760
+
17761
+ cur = build_norm(cur,
17762
+ model.output_norm, NULL,
17763
+ LLM_NORM_RMS, -1);
17764
+
17765
+ cb(cur, "result_norm", -1);
17766
+ res->t_embd = cur;
17767
+ // lm_head
17768
+ cur = build_lora_mm(model.output, cur);
17769
+ cb(cur, "result_output", -1);
17770
+ res->t_logits = cur;
17771
+
17772
+ ggml_build_forward_expand(gf, cur);
17773
+ }
17774
+ };
17775
+
17776
+ struct llm_build_smollm3 : public llm_graph_context {
17777
+ llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
17778
+ const int64_t n_embd_head = hparams.n_embd_head_v;
17779
+
17780
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
17781
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
17782
+
17783
+ ggml_tensor * cur;
17784
+ ggml_tensor * inpL;
17785
+
17786
+ inpL = build_inp_embd(model.tok_embd);
17787
+
17788
+ // inp_pos - contains the positions
17789
+ ggml_tensor * inp_pos = build_inp_pos();
17790
+
17791
+ auto * inp_attn = build_attn_inp_kv();
17792
+
17793
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
17794
+
17795
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
17796
+
17797
+ for (int il = 0; il < n_layer; ++il) {
17798
+ ggml_tensor * inpSA = inpL;
17799
+
17800
+ const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
17801
+
17802
+ // norm
17803
+ cur = build_norm(inpL,
17804
+ model.layers[il].attn_norm, NULL,
17805
+ LLM_NORM_RMS, il);
17806
+ cb(cur, "attn_norm", il);
17807
+
17808
+ // self-attention
17809
+ {
17810
+ // compute Q and K and RoPE them
17811
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
17812
+ cb(Qcur, "Qcur", il);
17813
+ if (model.layers[il].bq) {
17814
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
17815
+ cb(Qcur, "Qcur", il);
17816
+ }
17817
+
17818
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
17819
+ cb(Kcur, "Kcur", il);
17820
+ if (model.layers[il].bk) {
17821
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
17822
+ cb(Kcur, "Kcur", il);
17823
+ }
17824
+
17825
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
17826
+ cb(Vcur, "Vcur", il);
17827
+ if (model.layers[il].bv) {
17828
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
17829
+ cb(Vcur, "Vcur", il);
17830
+ }
17831
+
17832
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
17833
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
17834
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
17835
+
17836
+ if (use_rope) {
16761
17837
  Qcur = ggml_rope_ext(
16762
17838
  ctx0, Qcur, inp_pos, nullptr,
16763
17839
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -16777,7 +17853,7 @@ struct llm_build_smollm3 : public llm_graph_context {
16777
17853
 
16778
17854
  cur = build_attn(inp_attn,
16779
17855
  model.layers[il].wo, model.layers[il].bo,
16780
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
17856
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
16781
17857
  cb(cur, "attn_out", il);
16782
17858
  }
16783
17859
 
@@ -16834,6 +17910,136 @@ struct llm_build_smollm3 : public llm_graph_context {
16834
17910
  }
16835
17911
  };
16836
17912
 
17913
+ struct llm_build_openai_moe_iswa : public llm_graph_context {
17914
+ llm_build_openai_moe_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
17915
+ ggml_tensor * cur;
17916
+ ggml_tensor * inpL;
17917
+
17918
+ inpL = build_inp_embd(model.tok_embd);
17919
+
17920
+ // inp_pos - contains the positions
17921
+ ggml_tensor * inp_pos = build_inp_pos();
17922
+
17923
+ auto * inp_attn = build_attn_inp_kv_iswa();
17924
+
17925
+ for (int il = 0; il < n_layer; ++il) {
17926
+ ggml_tensor * inpSA = inpL;
17927
+
17928
+ // norm
17929
+ cur = build_norm(inpL,
17930
+ model.layers[il].attn_norm, nullptr,
17931
+ LLM_NORM_RMS, il);
17932
+ cb(cur, "attn_norm", il);
17933
+
17934
+ // self-attention
17935
+ {
17936
+ // compute Q and K and RoPE them
17937
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
17938
+ cb(Qcur, "Qcur", il);
17939
+ if (model.layers[il].bq) {
17940
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
17941
+ cb(Qcur, "Qcur", il);
17942
+ }
17943
+
17944
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
17945
+ cb(Kcur, "Kcur", il);
17946
+ if (model.layers[il].bk) {
17947
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
17948
+ cb(Kcur, "Kcur", il);
17949
+ }
17950
+
17951
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
17952
+ cb(Vcur, "Vcur", il);
17953
+ if (model.layers[il].bv) {
17954
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
17955
+ cb(Vcur, "Vcur", il);
17956
+ }
17957
+
17958
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
17959
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
17960
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
17961
+
17962
+ Qcur = ggml_rope_ext(
17963
+ ctx0, Qcur, inp_pos, nullptr,
17964
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17965
+ ext_factor, attn_factor, beta_fast, beta_slow
17966
+ );
17967
+
17968
+ Kcur = ggml_rope_ext(
17969
+ ctx0, Kcur, inp_pos, nullptr,
17970
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
17971
+ ext_factor, attn_factor, beta_fast, beta_slow
17972
+ );
17973
+
17974
+ cb(Qcur, "Qcur", il);
17975
+ cb(Kcur, "Kcur", il);
17976
+ cb(Vcur, "Vcur", il);
17977
+
17978
+ cur = build_attn(inp_attn,
17979
+ model.layers[il].wo, model.layers[il].bo,
17980
+ Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
17981
+
17982
+ cb(cur, "attn_out", il);
17983
+ }
17984
+
17985
+ if (il == n_layer - 1) {
17986
+ // skip computing output for unused tokens
17987
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
17988
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
17989
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
17990
+ }
17991
+
17992
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
17993
+ cb(ffn_inp, "ffn_inp", il);
17994
+
17995
+ cur = ffn_inp;
17996
+ cur = build_norm(cur,
17997
+ model.layers[il].attn_post_norm, nullptr,
17998
+ LLM_NORM_RMS, il);
17999
+ cb(cur, "attn_post_norm", il);
18000
+
18001
+ // MoE branch
18002
+ cur = build_moe_ffn(cur,
18003
+ model.layers[il].ffn_gate_inp, model.layers[il].ffn_gate_inp_b,
18004
+ model.layers[il].ffn_up_exps, model.layers[il].ffn_up_exps_b,
18005
+ model.layers[il].ffn_gate_exps, model.layers[il].ffn_gate_exps_b,
18006
+ model.layers[il].ffn_down_exps, model.layers[il].ffn_down_exps_b,
18007
+ nullptr,
18008
+ n_expert, n_expert_used,
18009
+ LLM_FFN_SWIGLU_OAI_MOE, false,
18010
+ false, 0.0,
18011
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
18012
+ il);
18013
+ cb(cur, "ffn_moe_out", il);
18014
+
18015
+ cur = ggml_add(ctx0, cur, ffn_inp);
18016
+
18017
+ cur = build_cvec(cur, il);
18018
+ cb(cur, "l_out", il);
18019
+
18020
+ // input for next layer
18021
+ inpL = cur;
18022
+ }
18023
+
18024
+ cur = inpL;
18025
+
18026
+ cur = build_norm(cur,
18027
+ model.output_norm, NULL,
18028
+ LLM_NORM_RMS, -1);
18029
+
18030
+ cb(cur, "result_norm", -1);
18031
+ res->t_embd = cur;
18032
+
18033
+ // lm_head
18034
+ cur = build_lora_mm(model.output, cur);
18035
+
18036
+ cb(cur, "result_output", -1);
18037
+ res->t_logits = cur;
18038
+
18039
+ ggml_build_forward_expand(gf, cur);
18040
+ }
18041
+ };
18042
+
16837
18043
  struct llm_build_lfm2 : public llm_graph_context {
16838
18044
  const llama_model & model;
16839
18045
 
@@ -16868,8 +18074,7 @@ struct llm_build_lfm2 : public llm_graph_context {
16868
18074
  cb(cur, "model.embedding_norm", -1);
16869
18075
  res->t_embd = cur;
16870
18076
 
16871
- // lm_head is tied with embeddings
16872
- cur = build_lora_mm(model.tok_embd, cur);
18077
+ cur = build_lora_mm(model.output, cur);
16873
18078
  cb(cur, "lm_head", -1);
16874
18079
 
16875
18080
  res->t_logits = cur;
@@ -16896,10 +18101,10 @@ struct llm_build_lfm2 : public llm_graph_context {
16896
18101
  return cur;
16897
18102
  }
16898
18103
 
16899
- ggml_tensor * build_attn_block(ggml_tensor * cur,
16900
- ggml_tensor * inp_pos,
16901
- llm_graph_input_attn_kv_unified * inp_attn,
16902
- int il) const {
18104
+ ggml_tensor * build_attn_block(ggml_tensor * cur,
18105
+ ggml_tensor * inp_pos,
18106
+ llm_graph_input_attn_kv * inp_attn,
18107
+ int il) const {
16903
18108
  GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
16904
18109
  auto const n_embd_head = hparams.n_embd_head_v;
16905
18110
  auto const n_head_kv = hparams.n_head_kv(il);
@@ -16934,7 +18139,7 @@ struct llm_build_lfm2 : public llm_graph_context {
16934
18139
  );
16935
18140
 
16936
18141
  cur = build_attn(inp_attn, model.layers[il].wo, NULL,
16937
- q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
18142
+ q, k, v, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16938
18143
 
16939
18144
  cb(cur, "model.layers.{}.self_attn.out_proj", il);
16940
18145
 
@@ -17011,6 +18216,258 @@ struct llm_build_lfm2 : public llm_graph_context {
17011
18216
  }
17012
18217
  };
17013
18218
 
18219
+ struct llm_build_seed_oss : public llm_graph_context {
18220
+ llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
18221
+ const int64_t n_embd_head = hparams.n_embd_head_v;
18222
+
18223
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
18224
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
18225
+
18226
+ ggml_tensor * cur;
18227
+ ggml_tensor * inpL;
18228
+
18229
+ inpL = build_inp_embd(model.tok_embd);
18230
+
18231
+ // inp_pos - contains the positions
18232
+ ggml_tensor * inp_pos = build_inp_pos();
18233
+
18234
+ auto * inp_attn = build_attn_inp_kv();
18235
+
18236
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
18237
+
18238
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
18239
+
18240
+ for (int il = 0; il < n_layer; ++il) {
18241
+ ggml_tensor * inpSA = inpL;
18242
+
18243
+ // norm
18244
+ cur = build_norm(inpL,
18245
+ model.layers[il].attn_norm, NULL,
18246
+ LLM_NORM_RMS, il);
18247
+ cb(cur, "attn_norm", il);
18248
+
18249
+ // self-attention
18250
+ {
18251
+ // compute Q and K and RoPE them
18252
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
18253
+ cb(Qcur, "Qcur", il);
18254
+ if (model.layers[il].bq) {
18255
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
18256
+ cb(Qcur, "Qcur", il);
18257
+ }
18258
+
18259
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
18260
+ cb(Kcur, "Kcur", il);
18261
+ if (model.layers[il].bk) {
18262
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
18263
+ cb(Kcur, "Kcur", il);
18264
+ }
18265
+
18266
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
18267
+ cb(Vcur, "Vcur", il);
18268
+ if (model.layers[il].bv) {
18269
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
18270
+ cb(Vcur, "Vcur", il);
18271
+ }
18272
+
18273
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
18274
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
18275
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
18276
+
18277
+ Qcur = ggml_rope_ext(
18278
+ ctx0, Qcur, inp_pos, nullptr,
18279
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
18280
+ ext_factor, attn_factor, beta_fast, beta_slow
18281
+ );
18282
+
18283
+ Kcur = ggml_rope_ext(
18284
+ ctx0, Kcur, inp_pos, nullptr,
18285
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
18286
+ ext_factor, attn_factor, beta_fast, beta_slow
18287
+ );
18288
+
18289
+ cb(Qcur, "Qcur", il);
18290
+ cb(Kcur, "Kcur", il);
18291
+ cb(Vcur, "Vcur", il);
18292
+
18293
+ cur = build_attn(inp_attn,
18294
+ model.layers[il].wo, model.layers[il].bo,
18295
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
18296
+ cb(cur, "attn_out", il);
18297
+ }
18298
+
18299
+ if (il == n_layer - 1 && inp_out_ids) {
18300
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
18301
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
18302
+ }
18303
+
18304
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
18305
+ cb(ffn_inp, "ffn_inp", il);
18306
+
18307
+ // feed-forward network
18308
+ cur = build_norm(ffn_inp,
18309
+ model.layers[il].attn_post_norm, NULL,
18310
+ LLM_NORM_RMS, il);
18311
+ cb(cur, "attn_post_norm", il);
18312
+
18313
+ cur = build_ffn(cur,
18314
+ model.layers[il].ffn_up, NULL, NULL,
18315
+ model.layers[il].ffn_gate, NULL, NULL,
18316
+ model.layers[il].ffn_down, NULL, NULL,
18317
+ NULL,
18318
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
18319
+ cb(cur, "ffn_out", il);
18320
+
18321
+ cur = ggml_add(ctx0, cur, ffn_inp);
18322
+ cb(cur, "ffn_out", il);
18323
+
18324
+ cur = build_cvec(cur, il);
18325
+ cb(cur, "l_out", il);
18326
+
18327
+ // input for next layer
18328
+ inpL = cur;
18329
+ }
18330
+
18331
+ cur = inpL;
18332
+
18333
+ cur = build_norm(cur,
18334
+ model.output_norm, NULL,
18335
+ LLM_NORM_RMS, -1);
18336
+
18337
+ cb(cur, "result_norm", -1);
18338
+ res->t_embd = cur;
18339
+
18340
+ // lm_head
18341
+ cur = build_lora_mm(model.output, cur);
18342
+
18343
+ cb(cur, "result_output", -1);
18344
+ res->t_logits = cur;
18345
+
18346
+ ggml_build_forward_expand(gf, cur);
18347
+ }
18348
+ };
18349
+
18350
+ template <bool iswa>
18351
+ struct llm_build_smallthinker : public llm_graph_context{
18352
+ llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
18353
+ const int64_t n_embd_head = hparams.n_embd_head_v;
18354
+
18355
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
18356
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
18357
+
18358
+ ggml_tensor * cur;
18359
+ ggml_tensor * inpL;
18360
+
18361
+ inpL = build_inp_embd(model.tok_embd);
18362
+
18363
+ // inp_pos - contains the positions
18364
+ ggml_tensor * inp_pos = build_inp_pos();
18365
+
18366
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
18367
+ inp_attn_type * inp_attn = nullptr;
18368
+
18369
+ if constexpr (iswa) {
18370
+ inp_attn = build_attn_inp_kv_iswa();
18371
+ } else {
18372
+ inp_attn = build_attn_inp_kv();
18373
+ }
18374
+
18375
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
18376
+
18377
+ for (int il = 0; il < n_layer; ++il) {
18378
+ ggml_tensor * inpSA = inpL;
18379
+ ggml_tensor * probs = nullptr;
18380
+
18381
+ probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
18382
+ cb(probs, "ffn_moe_logits", il);
18383
+
18384
+ // norm
18385
+ cur = build_norm(inpL,model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
18386
+ cb(cur, "attn_norm", il);
18387
+
18388
+ // self_attention
18389
+ {
18390
+ // compute Q and K and RoPE them
18391
+ struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
18392
+ cb(Qcur, "Qcur", il);
18393
+
18394
+ struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
18395
+ cb(Kcur, "Kcur", il);
18396
+
18397
+ struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
18398
+ cb(Vcur, "Vcur", il);
18399
+
18400
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
18401
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
18402
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
18403
+
18404
+ if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
18405
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
18406
+ ext_factor, attn_factor, beta_fast, beta_slow);
18407
+
18408
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
18409
+ ext_factor, attn_factor, beta_fast, beta_slow);
18410
+ }
18411
+
18412
+ cb(Qcur, "Qcur", il);
18413
+ cb(Kcur, "Kcur", il);
18414
+
18415
+ cur = build_attn(inp_attn,
18416
+ model.layers[il].wo, model.layers[il].bo,
18417
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
18418
+ }
18419
+
18420
+ if (il == n_layer - 1 && inp_out_ids) {
18421
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
18422
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
18423
+ probs = ggml_get_rows(ctx0, probs, inp_out_ids);
18424
+ }
18425
+
18426
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
18427
+ cb(ffn_inp, "ffn_inp", il);
18428
+
18429
+ // MoE branch
18430
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
18431
+ cb(cur, "ffn_norm", il);
18432
+
18433
+ ggml_tensor * ffn_out =
18434
+ build_moe_ffn(cur,
18435
+ nullptr,
18436
+ model.layers[il].ffn_up_exps,
18437
+ model.layers[il].ffn_gate_exps,
18438
+ model.layers[il].ffn_down_exps,
18439
+ nullptr,
18440
+ n_expert, n_expert_used,
18441
+ LLM_FFN_RELU, true,
18442
+ false, 0.0,
18443
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
18444
+ il, probs);
18445
+
18446
+ cb(ffn_out, "ffn_out", il);
18447
+ cur = ffn_out;
18448
+
18449
+ cur = ggml_add(ctx0, cur, ffn_inp);
18450
+ cur = build_cvec(cur, il);
18451
+ cb(cur, "l_out", il);
18452
+
18453
+ // input for next layer
18454
+ inpL = cur;
18455
+ }
18456
+
18457
+ cur = inpL;
18458
+
18459
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
18460
+ cb(cur, "result_norm", -1);
18461
+
18462
+ // lm_head
18463
+ cur = build_lora_mm(model.output, cur);
18464
+ cb(cur, "result_output", -1);
18465
+ res->t_logits = cur;
18466
+
18467
+ ggml_build_forward_expand(gf, cur);
18468
+ }
18469
+ };
18470
+
17014
18471
  llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
17015
18472
  llama_memory_i * res;
17016
18473
 
@@ -17019,11 +18476,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
17019
18476
  // switch statement
17020
18477
  case LLM_ARCH_BERT:
17021
18478
  case LLM_ARCH_JINA_BERT_V2:
18479
+ case LLM_ARCH_JINA_BERT_V3:
17022
18480
  case LLM_ARCH_NOMIC_BERT:
17023
18481
  case LLM_ARCH_NOMIC_BERT_MOE:
17024
18482
  case LLM_ARCH_NEO_BERT:
17025
18483
  case LLM_ARCH_WAVTOKENIZER_DEC:
17026
18484
  case LLM_ARCH_DREAM:
18485
+ case LLM_ARCH_LLADA:
17027
18486
  {
17028
18487
  res = nullptr;
17029
18488
  } break;
@@ -17034,14 +18493,31 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
17034
18493
  if (llm_arch_is_recurrent(arch)) {
17035
18494
  res = new llama_memory_recurrent(
17036
18495
  *this,
17037
- nullptr,
17038
18496
  GGML_TYPE_F32,
17039
18497
  GGML_TYPE_F32,
17040
18498
  cparams.offload_kqv,
17041
18499
  std::max((uint32_t) 1, cparams.n_seq_max),
17042
- cparams.n_seq_max);
18500
+ cparams.n_seq_max,
18501
+ nullptr);
17043
18502
  } else if (llm_arch_is_hybrid(arch)) {
17044
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
18503
+
18504
+ // The main difference between hybrid architectures is the
18505
+ // layer filters, so pick the right one here
18506
+ llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
18507
+ llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
18508
+ if (arch == LLM_ARCH_FALCON_H1) {
18509
+ filter_attn = [&](int32_t) { return true; };
18510
+ filter_recr = [&](int32_t) { return true; };
18511
+ } else if (arch == LLM_ARCH_NEMOTRON_H) {
18512
+ filter_attn = [&](int32_t il) {
18513
+ return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
18514
+ };
18515
+ filter_recr = [&](int32_t il) {
18516
+ return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
18517
+ };
18518
+ }
18519
+
18520
+ const auto padding = llama_kv_cache::get_padding(cparams);
17045
18521
 
17046
18522
  cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
17047
18523
 
@@ -17059,10 +18535,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
17059
18535
  /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
17060
18536
  /* n_seq_max */ cparams.n_seq_max,
17061
18537
  /* offload */ cparams.offload_kqv,
17062
- /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
17063
- /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
18538
+ /* unified */ cparams.kv_unified,
18539
+ /* filter_attn */ std::move(filter_attn),
18540
+ /* filter_recr */ std::move(filter_recr));
17064
18541
  } else {
17065
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
18542
+ const auto padding = llama_kv_cache::get_padding(cparams);
17066
18543
 
17067
18544
  uint32_t n_ctx_per_stream = cparams.n_ctx;
17068
18545
 
@@ -17079,10 +18556,22 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
17079
18556
 
17080
18557
  LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
17081
18558
 
18559
+ llama_memory_i::layer_reuse_cb reuse = nullptr;
18560
+
18561
+ if (arch == LLM_ARCH_GEMMA3N) {
18562
+ reuse = [&](int32_t il) {
18563
+ if (il >= (int32_t) hparams.n_layer_kv_from_start) {
18564
+ return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
18565
+ }
18566
+
18567
+ return -1;
18568
+ };
18569
+ }
18570
+
17082
18571
  if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
17083
18572
  GGML_ASSERT(hparams.is_swa_any());
17084
18573
 
17085
- res = new llama_kv_cache_unified_iswa(
18574
+ res = new llama_kv_cache_iswa(
17086
18575
  *this,
17087
18576
  params.type_k,
17088
18577
  params.type_v,
@@ -17093,13 +18582,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
17093
18582
  n_ctx_per_stream,
17094
18583
  cparams.n_seq_max,
17095
18584
  cparams.n_ubatch,
17096
- padding);
18585
+ padding,
18586
+ nullptr,
18587
+ reuse);
17097
18588
  } else {
17098
18589
  GGML_ASSERT(!hparams.is_swa_any());
17099
18590
 
17100
- res = new llama_kv_cache_unified(
18591
+ res = new llama_kv_cache(
17101
18592
  *this,
17102
- nullptr,
17103
18593
  params.type_k,
17104
18594
  params.type_v,
17105
18595
  !cparams.flash_attn,
@@ -17109,7 +18599,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
17109
18599
  cparams.n_seq_max,
17110
18600
  padding,
17111
18601
  hparams.n_swa,
17112
- hparams.swa_type);
18602
+ hparams.swa_type,
18603
+ nullptr,
18604
+ nullptr);
17113
18605
  }
17114
18606
  }
17115
18607
  }
@@ -17156,6 +18648,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
17156
18648
  } break;
17157
18649
  case LLM_ARCH_BERT:
17158
18650
  case LLM_ARCH_JINA_BERT_V2:
18651
+ case LLM_ARCH_JINA_BERT_V3:
17159
18652
  case LLM_ARCH_NOMIC_BERT:
17160
18653
  case LLM_ARCH_NOMIC_BERT_MOE:
17161
18654
  {
@@ -17190,6 +18683,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
17190
18683
  llm = std::make_unique<llm_build_dream>(*this, params);
17191
18684
  }
17192
18685
  break;
18686
+ case LLM_ARCH_LLADA:
18687
+ {
18688
+ llm = std::make_unique<llm_build_llada>(*this, params);
18689
+ }
18690
+ break;
17193
18691
  case LLM_ARCH_QWEN2VL:
17194
18692
  {
17195
18693
  llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -17332,6 +18830,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
17332
18830
  {
17333
18831
  llm = std::make_unique<llm_build_glm4>(*this, params);
17334
18832
  } break;
18833
+ case LLM_ARCH_GLM4_MOE:
18834
+ {
18835
+ llm = std::make_unique<llm_build_glm4_moe>(*this, params);
18836
+ } break;
17335
18837
  case LLM_ARCH_BITNET:
17336
18838
  {
17337
18839
  llm = std::make_unique<llm_build_bitnet>(*this, params);
@@ -17363,6 +18865,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
17363
18865
  {
17364
18866
  llm = std::make_unique<llm_build_nemotron>(*this, params);
17365
18867
  } break;
18868
+ case LLM_ARCH_NEMOTRON_H:
18869
+ {
18870
+ llm = std::make_unique<llm_build_nemotron_h>(*this, params);
18871
+ } break;
17366
18872
  case LLM_ARCH_EXAONE:
17367
18873
  {
17368
18874
  llm = std::make_unique<llm_build_exaone>(*this, params);
@@ -17417,6 +18923,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
17417
18923
  {
17418
18924
  llm = std::make_unique<llm_build_bailingmoe>(*this, params);
17419
18925
  } break;
18926
+ case LLM_ARCH_SEED_OSS:
18927
+ {
18928
+ llm = std::make_unique<llm_build_seed_oss>(*this, params);
18929
+ } break;
17420
18930
  case LLM_ARCH_DOTS1:
17421
18931
  {
17422
18932
  llm = std::make_unique<llm_build_dots1>(*this, params);
@@ -17437,10 +18947,18 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
17437
18947
  {
17438
18948
  llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
17439
18949
  } break;
18950
+ case LLM_ARCH_HUNYUAN_DENSE:
18951
+ {
18952
+ llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
18953
+ } break;
17440
18954
  case LLM_ARCH_SMOLLM3:
17441
18955
  {
17442
18956
  llm = std::make_unique<llm_build_smollm3>(*this, params);
17443
18957
  } break;
18958
+ case LLM_ARCH_OPENAI_MOE:
18959
+ {
18960
+ llm = std::make_unique<llm_build_openai_moe_iswa>(*this, params);
18961
+ } break;
17444
18962
  case LLM_ARCH_FALCON_H1:
17445
18963
  {
17446
18964
  llm = std::make_unique<llm_build_falcon_h1>(*this, params);
@@ -17449,6 +18967,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
17449
18967
  {
17450
18968
  llm = std::make_unique<llm_build_lfm2>(*this, params);
17451
18969
  } break;
18970
+ case LLM_ARCH_SMALLTHINKER:
18971
+ {
18972
+ if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
18973
+ llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
18974
+ } else {
18975
+ llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
18976
+ }
18977
+ } break;
17452
18978
  default:
17453
18979
  GGML_ABORT("fatal error");
17454
18980
  }
@@ -17459,6 +18985,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
17459
18985
  return llm->res->get_gf();
17460
18986
  }
17461
18987
 
18988
+
17462
18989
  //
17463
18990
  // interface implementation
17464
18991
  //
@@ -17478,6 +19005,7 @@ llama_model_params llama_model_default_params() {
17478
19005
  /*.use_mmap =*/ true,
17479
19006
  /*.use_mlock =*/ false,
17480
19007
  /*.check_tensors =*/ false,
19008
+ /*.use_extra_bufts =*/ true,
17481
19009
  };
17482
19010
 
17483
19011
  #ifdef GGML_USE_METAL
@@ -17576,10 +19104,12 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
17576
19104
  case LLM_ARCH_RWKV7:
17577
19105
  case LLM_ARCH_ARWKV7:
17578
19106
  case LLM_ARCH_WAVTOKENIZER_DEC:
19107
+ case LLM_ARCH_NEMOTRON_H:
17579
19108
  return LLAMA_ROPE_TYPE_NONE;
17580
19109
 
17581
19110
  // use what we call a normal RoPE, operating on pairs of consecutive head values
17582
19111
  case LLM_ARCH_LLAMA:
19112
+ case LLM_ARCH_LLADA:
17583
19113
  case LLM_ARCH_LLAMA4:
17584
19114
  case LLM_ARCH_DECI:
17585
19115
  case LLM_ARCH_BAICHUAN:
@@ -17614,6 +19144,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
17614
19144
  case LLM_ARCH_GROK:
17615
19145
  case LLM_ARCH_DBRX:
17616
19146
  case LLM_ARCH_BERT:
19147
+ case LLM_ARCH_JINA_BERT_V3:
17617
19148
  case LLM_ARCH_NOMIC_BERT:
17618
19149
  case LLM_ARCH_NOMIC_BERT_MOE:
17619
19150
  case LLM_ARCH_STABLELM:
@@ -17646,7 +19177,12 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
17646
19177
  case LLM_ARCH_MINICPM3:
17647
19178
  case LLM_ARCH_DOTS1:
17648
19179
  case LLM_ARCH_HUNYUAN_MOE:
19180
+ case LLM_ARCH_OPENAI_MOE:
19181
+ case LLM_ARCH_HUNYUAN_DENSE:
17649
19182
  case LLM_ARCH_LFM2:
19183
+ case LLM_ARCH_SMALLTHINKER:
19184
+ case LLM_ARCH_GLM4_MOE:
19185
+ case LLM_ARCH_SEED_OSS:
17650
19186
  return LLAMA_ROPE_TYPE_NEOX;
17651
19187
 
17652
19188
  case LLM_ARCH_QWEN2VL:
@@ -17757,6 +19293,10 @@ bool llama_model_is_recurrent(const llama_model * model) {
17757
19293
  return llm_arch_is_recurrent(model->arch);
17758
19294
  }
17759
19295
 
19296
+ bool llama_model_is_diffusion(const llama_model * model) {
19297
+ return llm_arch_is_diffusion(model->arch);
19298
+ }
19299
+
17760
19300
  const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {
17761
19301
  return model->tensors_by_name;
17762
19302
  }