@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
@@ -17,6 +17,7 @@
  #include <cmath>
  #include <functional>
  #include <map>
+ #include <regex>
  #include <sstream>
  #include <stdexcept>

@@ -39,14 +40,18 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_335M: return "335M";
  case LLM_TYPE_410M: return "410M";
  case LLM_TYPE_450M: return "450M";
+ case LLM_TYPE_475M: return "475M";
  case LLM_TYPE_770M: return "770M";
  case LLM_TYPE_780M: return "780M";
  case LLM_TYPE_0_5B: return "0.5B";
+ case LLM_TYPE_0_6B: return "0.6B";
  case LLM_TYPE_1B: return "1B";
  case LLM_TYPE_1_3B: return "1.3B";
  case LLM_TYPE_1_4B: return "1.4B";
  case LLM_TYPE_1_5B: return "1.5B";
  case LLM_TYPE_1_6B: return "1.6B";
+ case LLM_TYPE_1_7B: return "1.7B";
+ case LLM_TYPE_1_8B: return "1.8B";
  case LLM_TYPE_2B: return "2B";
  case LLM_TYPE_2_8B: return "2.8B";
  case LLM_TYPE_2_9B: return "2.9B";
@@ -64,6 +69,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_15B: return "15B";
  case LLM_TYPE_16B: return "16B";
  case LLM_TYPE_20B: return "20B";
+ case LLM_TYPE_27B: return "27B";
  case LLM_TYPE_30B: return "30B";
  case LLM_TYPE_32B: return "32B";
  case LLM_TYPE_34B: return "34B";
@@ -72,7 +78,9 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_65B: return "65B";
  case LLM_TYPE_70B: return "70B";
  case LLM_TYPE_236B: return "236B";
+ case LLM_TYPE_290B: return "290B";
  case LLM_TYPE_314B: return "314B";
+ case LLM_TYPE_405B: return "405B";
  case LLM_TYPE_671B: return "671B";
  case LLM_TYPE_SMALL: return "0.1B";
  case LLM_TYPE_MEDIUM: return "0.4B";
@@ -86,7 +94,10 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_16x3_8B: return "16x3.8B";
  case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
  case LLM_TYPE_57B_A14B: return "57B.A14B";
- case LLM_TYPE_27B: return "27B";
+ case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
+ case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
+ case LLM_TYPE_30B_A3B: return "30B.A3B";
+ case LLM_TYPE_235B_A22B: return "235B.A22B";
  default: return "?B";
  }
  }
@@ -106,6 +117,10 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
  { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
  };

+ std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
+ return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
+ }
+
  static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
  for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
  if (kv.second == name) {
@@ -255,7 +270,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
  return nullptr;
  }

- // CPU: ACCEL -> CPU extra -> GPU host -> CPU
+ // CPU: ACCEL -> GPU host -> CPU extra -> CPU
  static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
  buft_list_t buft_list;

@@ -271,32 +286,6 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
  }
  }

- bool has_gpu_device = false;
- for (auto * dev : devices) {
- if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
- has_gpu_device = true;
- break;
- }
- }
-
- // add extra buffer types, only if no GPU device is present
- // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
- if (!has_gpu_device) {
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
- auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
- ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
- if (ggml_backend_dev_get_extra_bufts_fn) {
- ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
- while (extra_bufts && *extra_bufts) {
- buft_list.emplace_back(cpu_dev, *extra_bufts);
- ++extra_bufts;
- }
- }
- } else {
- LLAMA_LOG_WARN("%s: disabling extra buffer types (i.e. repacking) since a GPU device is available\n", __func__);
- }
-
  // add a host buffer type
  // storing the tensors in a host buffer is useful when the processing of large batches
  // is offloaded to a GPU device, since it reduces the time spent on data transfers
@@ -311,6 +300,24 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
  }
  }

+ // add extra buffer types, only if no GPU device is present
+ // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (cpu_dev == nullptr) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
+
+ auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+ ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+ if (ggml_backend_dev_get_extra_bufts_fn) {
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+ while (extra_bufts && *extra_bufts) {
+ buft_list.emplace_back(cpu_dev, *extra_bufts);
+ ++extra_bufts;
+ }
+ }
+
  // add the CPU buffer type
  for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
  ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -388,9 +395,12 @@ struct llama_model::impl {
  layer_dev dev_input = {};
  layer_dev dev_output = {};
  std::vector<layer_dev> dev_layer;
+
+ bool has_tensor_overrides;
  };

  llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
+ pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
  }

  llama_model::~llama_model() {}
@@ -556,12 +566,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  }
  } break;
+ case LLM_ARCH_LLAMA4:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
+ hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
+ hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
+ hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
+
+ switch (hparams.n_expert) {
+ case 16: type = LLM_TYPE_17B_16E; break;
+ case 128: type = LLM_TYPE_17B_128E; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+
+ if (type == LLM_TYPE_17B_128E) {
+ hparams.use_kq_norm = false;
+ }
+ } break;
  case LLM_ARCH_DECI:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  switch (hparams.n_layer) {
  case 32: type = LLM_TYPE_7B; break;
  case 80: type = LLM_TYPE_70B; break;
+ case 162: type = LLM_TYPE_405B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
@@ -680,13 +710,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  } break;
  case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);

  if (hparams.n_layer == 12 && hparams.n_embd == 768) {
- type = LLM_TYPE_137M;
+ if (arch == LLM_ARCH_NOMIC_BERT) {
+ type = LLM_TYPE_137M;
+ } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
+ type = LLM_TYPE_475M;
+ }
  }
  } break;
  case LLM_ARCH_BLOOM:
@@ -747,6 +783,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  // fall through
  case LLM_ARCH_QWEN2:
  {
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  switch (hparams.n_layer) {
  case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
@@ -772,6 +809,28 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_QWEN3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+ case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+ case 40: type = LLM_TYPE_14B; break;
+ case 64: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_QWEN3MOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_30B_A3B; break;
+ case 94: type = LLM_TYPE_235B_A22B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_PHI2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1125,6 +1184,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  }
  ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
@@ -1144,6 +1205,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_PLM:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_1_8B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_CHATGLM:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1165,6 +1235,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GLM4:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 40: type = LLM_TYPE_9B; break;
+ case 61: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_BITNET:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1310,6 +1389,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  // Add additional layer/vocab/etc checks here for other model sizes
  default: type = LLM_TYPE_UNKNOWN;
  }
+
+ // For Granite MoE Shared
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  } break;
  case LLM_ARCH_CHAMELEON:
  {
@@ -1330,6 +1412,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  } break;
+ case LLM_ARCH_BAILINGMOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+
+ switch (hparams.n_layer) {
+ case 28: type = LLM_TYPE_16B; break;
+ case 88: type = LLM_TYPE_290B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  default: throw std::runtime_error("unsupported model architecture");
  }

@@ -1398,6 +1495,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }

  ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (cpu_dev == nullptr) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
  const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
  const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
  auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
@@ -1557,15 +1657,38 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
  }

- ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list);
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ // check overrides
+ if (ml.tensor_buft_overrides) {
+ std::string tensor_name = tn.str();
+ for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
+ std::regex pattern(overrides->pattern);
+ if (std::regex_search(tensor_name, pattern)) {
+ buft = overrides->buft;
+ LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
+ tensor_name.c_str(),
+ ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
+ ggml_backend_buft_name(buft));
+ break;
+ }
+ }
+ }
+
  if (!buft) {
- throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+ buft = select_weight_buft(hparams, t_meta, op, *buft_list);
+ if (!buft) {
+ throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+ }
  }

  // avoid using a host buffer when using mmap
  auto * buft_dev = ggml_backend_buft_get_device(buft);
  if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
  auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (!cpu_dev) {
+ throw std::runtime_error("no CPU backend found");
+ }
  buft = ggml_backend_dev_buffer_type(cpu_dev);
  }

@@ -1652,6 +1775,63 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+ // For Granite MoE Shared
+ if (hparams.n_ff_shexp > 0) {
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+ }
+ }
+ }
+ } break;
+ case LLM_ARCH_LLAMA4:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
+ for (int i = 0; i < n_layer; ++i) {
+ bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
+
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+ if (is_moe_layer) {
+ int n_ff_exp = hparams.n_ff_exp;
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert
+ const int64_t n_ff_shexp = n_ff_exp;
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+ } else {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  }
  }
  } break;
@@ -1697,7 +1877,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ if (n_ff > 0) {
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ }

  if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
@@ -1707,9 +1889,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  }

- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ if (n_ff > 0) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }

  // optional MLP bias
  layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
@@ -1924,6 +2108,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_BERT:
  case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
@@ -1957,20 +2142,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  }

+ if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+ }
+
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

  layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);

- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-
- if (arch == LLM_ARCH_BERT) {
+ if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
  layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  } else {
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+
+ if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+ } else {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ }
  }

  layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
@@ -2254,6 +2450,77 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  }
  } break;
+ case LLM_ARCH_QWEN3:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_QWEN3MOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
+ }
+
+ // MoE branch
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ }
+ } break;
  case LLM_ARCH_PHI2:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3003,8 +3270,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  {
  const bool is_lite = (hparams.n_layer == 27);

+ const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
+
+ // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+ const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
+ const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
+
  const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;

  const int64_t q_lora_rank = hparams.n_lora_q;
  const int64_t kv_lora_rank = hparams.n_lora_kv;
@@ -3030,14 +3303,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  if (!is_lite) {
  layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
- layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
+ layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
  } else {
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
  }

- layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
+
+ // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
+ if (is_mla) {
+ layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
+ layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
+ } else {
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);

  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -3068,6 +3349,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  }
  } break;
+ case LLM_ARCH_PLM:
+ {
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
+ layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
  case LLM_ARCH_BITNET:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3227,7 +3537,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  // output
  output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }

  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];
@@ -3254,21 +3568,60 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3254
3568
  layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3255
3569
  }
3256
3570
  } break;
3257
- case LLM_ARCH_NEMOTRON:
3571
+ case LLM_ARCH_GLM4:
3258
3572
  {
3259
3573
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3260
3574
 
3261
3575
  // output
3262
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3263
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
3264
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3576
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3577
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3578
+ // if output is NULL, init from the input tok embed
3579
+ if (output == NULL) {
3580
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3581
+ }
3265
3582
 
3266
3583
  for (int i = 0; i < n_layer; ++i) {
3267
3584
  auto & layer = layers[i];
3268
3585
 
3269
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3270
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3271
-
3586
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3587
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3588
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
3589
+
3590
+ if (layer.wqkv == nullptr) {
3591
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3592
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
3593
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
3594
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3595
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3596
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
3597
+ }
3598
+
3599
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3600
+
3601
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
3602
+
3603
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3604
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3605
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
3606
+
3607
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
3608
+ }
3609
+ } break;
3610
+ case LLM_ARCH_NEMOTRON:
3611
+ {
3612
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3613
+
3614
+ // output
3615
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3616
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
3617
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
3618
+
3619
+ for (int i = 0; i < n_layer; ++i) {
3620
+ auto & layer = layers[i];
3621
+
3622
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3623
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
3624
+
3272
3625
  layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3273
3626
  layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3274
3627
  layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
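A note on the GLM4 loader added above: the fused attn_qkv tensor and the output head are created with TENSOR_NOT_REQUIRED, so the loader falls back to split Q/K/V projections when no fused tensor is present, and reuses the token embedding (TENSOR_DUPLICATED) when the model ships without a separate lm_head. The following standalone sketch illustrates that fallback pattern; the Tensor struct, try_load helper and tensor names are simplified stand-ins, not the real llama.cpp API.

    #include <cstdio>
    #include <map>
    #include <string>

    struct Tensor { std::string name; };

    // hypothetical loader: returns nullptr when a tensor is absent from the file
    static Tensor * try_load(std::map<std::string, Tensor> & file, const std::string & name) {
        auto it = file.find(name);
        return it == file.end() ? nullptr : &it->second;
    }

    int main() {
        std::map<std::string, Tensor> file = {
            {"token_embd.weight",   {"token_embd.weight"}},
            {"blk.0.attn_q.weight", {"blk.0.attn_q.weight"}},
            {"blk.0.attn_k.weight", {"blk.0.attn_k.weight"}},
            {"blk.0.attn_v.weight", {"blk.0.attn_v.weight"}},
        };

        // optional fused QKV -> fall back to split projections when missing
        Tensor * wqkv = try_load(file, "blk.0.attn_qkv.weight");
        if (wqkv == nullptr) {
            Tensor * wq = try_load(file, "blk.0.attn_q.weight");
            Tensor * wk = try_load(file, "blk.0.attn_k.weight");
            Tensor * wv = try_load(file, "blk.0.attn_v.weight");
            std::printf("split attention: %s %s %s\n",
                        wq->name.c_str(), wk->name.c_str(), wv->name.c_str());
        }

        // optional output head -> tie it to the token embedding when missing
        Tensor * output = try_load(file, "output.weight");
        if (output == nullptr) {
            output = try_load(file, "token_embd.weight");
        }
        std::printf("lm_head uses: %s\n", output->name.c_str());
        return 0;
    }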
@@ -3712,6 +4065,46 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3712
4065
  output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
3713
4066
  output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
3714
4067
  } break;
4068
+ case LLM_ARCH_BAILINGMOE:
4069
+ {
4070
+ const int64_t n_ff_exp = hparams.n_ff_exp;
4071
+ const int64_t n_expert_shared = hparams.n_expert_shared;
4072
+
4073
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4074
+
4075
+ // output
4076
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4077
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4078
+
4079
+ for (int i = 0; i < n_layer; ++i) {
4080
+ auto & layer = layers[i];
4081
+
4082
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4083
+
4084
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
4085
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
4086
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
4087
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
4088
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4089
+
4090
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4091
+
4092
+ if (n_expert == 0) {
4093
+ throw std::runtime_error("n_expert must be > 0");
4094
+ }
4095
+ if (n_expert_used == 0) {
4096
+ throw std::runtime_error("n_expert_used must be > 0");
4097
+ }
4098
+
4099
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4100
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
4101
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4102
+
4103
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4104
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
4105
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4106
+ }
4107
+ } break;
3715
4108
  default:
3716
4109
  throw std::runtime_error("unknown architecture");
3717
4110
  }
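For orientation, the BailingMoE case above packs every routed expert's gate/up/down weights into 3-D tensors of shape {n_embd, n_ff_exp, n_expert} (down transposed), while the shared experts are fused into single matrices of width n_ff_exp * n_expert_shared. A rough standalone sketch of the per-layer FFN weight counts implied by those shapes; the numeric hyperparameters are illustrative only, not taken from any particular checkpoint.

    #include <cstdint>
    #include <cstdio>

    int main() {
        // illustrative hyperparameters only
        const int64_t n_embd          = 2048;
        const int64_t n_ff_exp        = 1408;
        const int64_t n_expert        = 64;
        const int64_t n_expert_shared = 2;

        // routed experts: gate + up are {n_embd, n_ff_exp, n_expert}, down is {n_ff_exp, n_embd, n_expert}
        const int64_t routed = 3 * n_embd * n_ff_exp * n_expert;

        // shared experts: gate/up/down fused across n_expert_shared experts
        const int64_t shared = 3 * n_embd * (n_ff_exp * n_expert_shared);

        // router (ffn_gate_inp): {n_embd, n_expert}
        const int64_t router = n_embd * n_expert;

        std::printf("per-layer FFN weights: routed=%lld shared=%lld router=%lld\n",
                    (long long) routed, (long long) shared, (long long) router);
        return 0;
    }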
@@ -3753,6 +4146,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3753
4146
  if (!dev) {
3754
4147
  // FIXME: workaround for CPU backend buft having a NULL device
3755
4148
  dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
4149
+ if (!dev) {
4150
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
4151
+ }
3756
4152
  }
3757
4153
  ggml_backend_dev_props props;
3758
4154
  ggml_backend_dev_get_props(dev, &props);
@@ -3882,7 +4278,7 @@ uint64_t llama_model::n_elements() const {
3882
4278
  }
3883
4279
 
3884
4280
  void llama_model::print_info() const {
3885
- const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
4281
+ const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
3886
4282
 
3887
4283
  auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
3888
4284
  bool is_var = false;
@@ -3943,7 +4339,7 @@ void llama_model::print_info() const {
3943
4339
  LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
3944
4340
  LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
3945
4341
  LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
3946
- LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
4342
+ LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
3947
4343
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
3948
4344
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
3949
4345
  LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
@@ -3980,6 +4376,8 @@ void llama_model::print_info() const {
3980
4376
  LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
3981
4377
  LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
3982
4378
  LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
4379
+ LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
4380
+ LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
3983
4381
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
3984
4382
  LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
3985
4383
  LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
@@ -3993,10 +4391,25 @@ void llama_model::print_info() const {
3993
4391
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
3994
4392
  }
3995
4393
 
3996
- if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
4394
+ if (arch == LLM_ARCH_QWEN3MOE) {
4395
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
4396
+ }
4397
+
4398
+ if (arch == LLM_ARCH_MINICPM ||
4399
+ arch == LLM_ARCH_GRANITE ||
4400
+ arch == LLM_ARCH_GRANITE_MOE) {
3997
4401
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
3998
4402
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
3999
4403
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
4404
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
4405
+ }
4406
+
4407
+ if (arch == LLM_ARCH_BAILINGMOE) {
4408
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
4409
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
4410
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
4411
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
4412
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
4000
4413
  }
4001
4414
 
4002
4415
  vocab.print_info();
@@ -4060,6 +4473,10 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
4060
4473
  });
4061
4474
  }
4062
4475
 
4476
+ bool llama_model::has_tensor_overrides() const {
4477
+ return pimpl->has_tensor_overrides;
4478
+ }
4479
+
4063
4480
  const ggml_tensor * llama_model::get_tensor(const char * name) const {
4064
4481
  auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
4065
4482
  [name](const std::pair<std::string, ggml_tensor *> & it) {
@@ -4072,6 +4489,19 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
4072
4489
  return it->second;
4073
4490
  }
4074
4491
 
4492
+ ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
4493
+ // choose long/short freq factors based on the context size
4494
+ if (layers[il].rope_freqs != nullptr) {
4495
+ return layers[il].rope_freqs;
4496
+ }
4497
+
4498
+ if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
4499
+ return layers[il].rope_long;
4500
+ }
4501
+
4502
+ return layers[il].rope_short;
4503
+ }
4504
+
4075
4505
  struct llm_build_llama : public llm_graph_context {
4076
4506
  llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
4077
4507
  const int64_t n_embd_head = hparams.n_embd_head_v;
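Several graph builders in this diff switch from the KV-cache callback to the new llama_model::get_rope_factors shown above: a per-layer rope_freqs tensor wins when present, otherwise the long or short YaRN frequency factors are chosen by comparing the per-sequence context against n_ctx_orig_yarn. A minimal standalone sketch of that selection, using simplified stand-in types rather than ggml tensors.

    #include <cstdint>
    #include <cstdio>

    // simplified stand-ins for the per-layer rope factor tensors
    struct Layer {
        const char * rope_freqs = nullptr; // generic per-layer factors (e.g. llama3)
        const char * rope_long  = "rope_factors_long";
        const char * rope_short = "rope_factors_short";
    };

    static const char * get_rope_factors(const Layer & layer, uint32_t n_ctx_per_seq, uint32_t n_ctx_orig_yarn) {
        if (layer.rope_freqs != nullptr) {
            return layer.rope_freqs;   // explicit per-layer factors win
        }
        if (n_ctx_per_seq > n_ctx_orig_yarn) {
            return layer.rope_long;    // extended context: use the long factors
        }
        return layer.rope_short;       // otherwise the short factors
    }

    int main() {
        Layer layer;
        std::printf("%s\n", get_rope_factors(layer, 131072, 4096)); // -> long factors
        std::printf("%s\n", get_rope_factors(layer,   2048, 4096)); // -> short factors
        return 0;
    }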
@@ -4087,12 +4517,22 @@ struct llm_build_llama : public llm_graph_context {
4087
4517
  // inp_pos - contains the positions
4088
4518
  ggml_tensor * inp_pos = build_inp_pos();
4089
4519
 
4520
+ // temperature tuning
4521
+ ggml_tensor * inp_attn_scale = nullptr;
4522
+ if (arch == LLM_ARCH_LLAMA4) {
4523
+ inp_attn_scale = build_inp_attn_scale();
4524
+ }
4525
+
4090
4526
  auto * inp_attn = build_attn_inp_kv_unified();
4091
4527
 
4092
4528
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
4093
4529
  for (int il = 0; il < n_layer; ++il) {
4094
4530
  ggml_tensor * inpSA = inpL;
4095
4531
 
4532
+ bool use_rope = arch == LLM_ARCH_LLAMA4
4533
+ ? (il + 1) % hparams.n_no_rope_layer_step != 0
4534
+ : true;
4535
+
4096
4536
  // norm
4097
4537
  cur = build_norm(inpL,
4098
4538
  model.layers[il].attn_norm, NULL,
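In the Llama4 branch of llm_build_llama above, RoPE is skipped on every n_no_rope_layer_step-th layer; on those NoPE layers the query is instead multiplied by the per-position attention scale built by build_inp_attn_scale. A tiny sketch of the layer-selection rule; the step value of 4 is assumed for illustration and in practice comes from the model hparams.

    #include <cstdio>

    int main() {
        const int n_layer              = 12;
        const int n_no_rope_layer_step = 4;  // assumed for illustration

        for (int il = 0; il < n_layer; ++il) {
            const bool use_rope = (il + 1) % n_no_rope_layer_step != 0;
            std::printf("layer %2d: %s\n", il, use_rope ? "RoPE" : "NoPE + attn scale");
        }
        return 0;
    }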
@@ -4102,7 +4542,7 @@ struct llm_build_llama : public llm_graph_context {
4102
4542
  // self-attention
4103
4543
  {
4104
4544
  // rope freq factors for llama3; may return nullptr for llama2 and other models
4105
- ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
4545
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
4106
4546
 
4107
4547
  // compute Q and K and RoPE them
4108
4548
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4130,25 +4570,38 @@ struct llm_build_llama : public llm_graph_context {
4130
4570
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
4131
4571
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
4132
4572
 
4133
- Qcur = ggml_rope_ext(
4134
- ctx0, Qcur, inp_pos, rope_factors,
4135
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4136
- ext_factor, attn_factor, beta_fast, beta_slow
4137
- );
4573
+ if (use_rope) {
4574
+ Qcur = ggml_rope_ext(
4575
+ ctx0, Qcur, inp_pos, rope_factors,
4576
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4577
+ ext_factor, attn_factor, beta_fast, beta_slow
4578
+ );
4138
4579
 
4139
- Kcur = ggml_rope_ext(
4140
- ctx0, Kcur, inp_pos, rope_factors,
4141
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4142
- ext_factor, attn_factor, beta_fast, beta_slow
4143
- );
4580
+ Kcur = ggml_rope_ext(
4581
+ ctx0, Kcur, inp_pos, rope_factors,
4582
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4583
+ ext_factor, attn_factor, beta_fast, beta_slow
4584
+ );
4585
+ } else if (inp_attn_scale) {
4586
+ Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
4587
+ }
4144
4588
 
4145
4589
  cb(Qcur, "Qcur", il);
4146
4590
  cb(Kcur, "Kcur", il);
4147
4591
  cb(Vcur, "Vcur", il);
4148
4592
 
4593
+ if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
4594
+ // Llama4TextL2Norm
4595
+ Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
4596
+ Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
4597
+ cb(Qcur, "Qcur_normed", il);
4598
+ cb(Kcur, "Kcur_normed", il);
4599
+ }
4600
+
4149
4601
  cur = build_attn(inp_attn, gf,
4150
4602
  model.layers[il].wo, model.layers[il].bo,
4151
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
4603
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
4604
+ cb(cur, "attn_out", il);
4152
4605
  }
4153
4606
 
4154
4607
  if (il == n_layer - 1) {
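When use_kq_norm is set, the Llama4 path above RMS-normalizes Q and K per head before attention (the Llama4TextL2Norm comment); the Qwen3 builders later in this diff apply the same idea with learned attn_q_norm/attn_k_norm weights. A minimal sketch of the underlying normalization on a single head vector, with a hard-coded epsilon standing in for f_norm_rms_eps.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // x <- x / sqrt(mean(x^2) + eps); no learned weight, matching a plain L2/RMS norm
    static void rms_norm(std::vector<float> & x, float eps = 1e-6f) {
        double sum_sq = 0.0;
        for (float v : x) sum_sq += (double) v * v;
        const float scale = 1.0f / std::sqrt((float) (sum_sq / x.size()) + eps);
        for (float & v : x) v *= scale;
    }

    int main() {
        std::vector<float> q = {0.5f, -1.0f, 2.0f, 0.25f}; // one head of Qcur, illustrative values
        rms_norm(q);
        for (float v : q) std::printf("%.4f ", v);
        std::printf("\n");
        return 0;
    }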
@@ -4158,15 +4611,10 @@ struct llm_build_llama : public llm_graph_context {
4158
4611
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
4159
4612
  }
4160
4613
 
4161
- // For Granite architecture
4162
- if (hparams.f_residual_scale) {
4163
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
4164
- }
4165
-
4166
4614
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
4167
4615
  cb(ffn_inp, "ffn_inp", il);
4168
4616
 
4169
- // feed-forward network
4617
+ // feed-forward network (non-MoE)
4170
4618
  if (model.layers[il].ffn_gate_inp == nullptr) {
4171
4619
 
4172
4620
  cur = build_norm(ffn_inp,
@@ -4181,6 +4629,38 @@ struct llm_build_llama : public llm_graph_context {
4181
4629
  NULL,
4182
4630
  LLM_FFN_SILU, LLM_FFN_PAR, il);
4183
4631
  cb(cur, "ffn_out", il);
4632
+
4633
+ } else if (arch == LLM_ARCH_LLAMA4) {
4634
+ // llama4 MoE
4635
+ ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
4636
+ model.layers[il].ffn_norm, NULL,
4637
+ LLM_NORM_RMS, il);
4638
+ cb(cur, "ffn_norm", il);
4639
+
4640
+ ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
4641
+ model.layers[il].ffn_gate_inp,
4642
+ model.layers[il].ffn_up_exps,
4643
+ model.layers[il].ffn_gate_exps,
4644
+ model.layers[il].ffn_down_exps,
4645
+ nullptr,
4646
+ n_expert, n_expert_used,
4647
+ LLM_FFN_SILU, false,
4648
+ false, 0.0,
4649
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
4650
+ il);
4651
+
4652
+ // Shared experts
4653
+ ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
4654
+ model.layers[il].ffn_up_shexp, NULL, NULL,
4655
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
4656
+ model.layers[il].ffn_down_shexp, NULL, NULL,
4657
+ NULL,
4658
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
4659
+ cb(shexp_out, "ffn_moe_shexp", il);
4660
+
4661
+ cur = ggml_add(ctx0, moe_out, shexp_out);
4662
+ cb(cur, "ffn_moe_out_merged", il);
4663
+
4184
4664
  } else {
4185
4665
  // MoE branch
4186
4666
  cur = build_norm(ffn_inp,
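The Llama4 MoE branch above normalizes the FFN input once, routes it through sigmoid-gated experts, runs the same normalized input through an always-on shared expert, and sums the two results (ffn_moe_out_merged). A toy single-token sketch of that merge; expert_ffn is a hypothetical placeholder for the real SwiGLU expert, and top-1 routing is used here purely for brevity.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // hypothetical stand-in for one expert's FFN acting on a token vector
    static std::vector<float> expert_ffn(const std::vector<float> & x, float w) {
        std::vector<float> y(x.size());
        for (size_t i = 0; i < x.size(); ++i) y[i] = w * x[i]; // placeholder compute
        return y;
    }

    int main() {
        const std::vector<float> x = {1.0f, 2.0f, 3.0f};

        // sigmoid-gated routed expert (top-1 here; the real graph routes top-k)
        const float router_logit = 0.7f;
        const float gate = 1.0f / (1.0f + std::exp(-router_logit));
        std::vector<float> moe_out = expert_ffn(x, 0.5f);
        for (float & v : moe_out) v *= gate;

        // always-on shared expert, then the two paths are summed
        const std::vector<float> shexp_out = expert_ffn(x, 0.25f);
        for (size_t i = 0; i < x.size(); ++i) {
            std::printf("%.4f ", moe_out[i] + shexp_out[i]);
        }
        std::printf("\n");
        return 0;
    }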
@@ -4202,11 +4682,6 @@ struct llm_build_llama : public llm_graph_context {
4202
4682
  cb(cur, "ffn_moe_out", il);
4203
4683
  }
4204
4684
 
4205
- // For Granite architecture
4206
- if (hparams.f_residual_scale) {
4207
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
4208
- }
4209
-
4210
4685
  cur = ggml_add(ctx0, cur, ffn_inp);
4211
4686
  cb(cur, "ffn_out", il);
4212
4687
 
@@ -4229,11 +4704,6 @@ struct llm_build_llama : public llm_graph_context {
4229
4704
  // lm_head
4230
4705
  cur = build_lora_mm(model.output, cur);
4231
4706
 
4232
- // For Granite architecture
4233
- if (hparams.f_logit_scale) {
4234
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
4235
- }
4236
-
4237
4707
  cb(cur, "result_output", -1);
4238
4708
  res->t_logits = cur;
4239
4709
 
@@ -4263,6 +4733,7 @@ struct llm_build_deci : public llm_graph_context {
4263
4733
  ggml_tensor * inpSA = inpL;
4264
4734
  const int64_t n_head_kv = hparams.n_head_kv(il);
4265
4735
  const int64_t n_head = hparams.n_head(il);
4736
+ const int64_t n_ff = hparams.n_ff(il);
4266
4737
 
4267
4738
  if (n_head == 0) {
4268
4739
  // attention-free layer of Llama-3_1-Nemotron-51B
@@ -4282,7 +4753,7 @@ struct llm_build_deci : public llm_graph_context {
4282
4753
  } else if (n_head > 0) {
4283
4754
  // self-attention
4284
4755
  // rope freq factors for llama3; may return nullptr for llama2 and other models
4285
- ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
4756
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
4286
4757
 
4287
4758
  // compute Q and K and RoPE them
4288
4759
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4328,7 +4799,7 @@ struct llm_build_deci : public llm_graph_context {
4328
4799
 
4329
4800
  cur = build_attn(inp_attn, gf,
4330
4801
  model.layers[il].wo, model.layers[il].bo,
4331
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
4802
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
4332
4803
  }
4333
4804
 
4334
4805
  if (il == n_layer - 1) {
@@ -4338,9 +4809,9 @@ struct llm_build_deci : public llm_graph_context {
4338
4809
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
4339
4810
  }
4340
4811
 
4341
- // For Granite architecture
4342
- if (hparams.f_residual_scale) {
4343
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
4812
+ // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
4813
+ if (n_ff == 0) {
4814
+ continue;
4344
4815
  }
4345
4816
 
4346
4817
  // modified to support attention-free layer of Llama-3_1-Nemotron-51B
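The llm_build_deci changes above extend the heterogeneous-layer handling: a layer with n_head == 0 is attention-free (Llama-3_1-Nemotron-51B), and a layer whose per-layer n_ff is 0 now skips the feed-forward block entirely via continue (Llama-3_1-Nemotron-Ultra-253B). A small sketch of that per-layer dispatch, with an assumed layer table used only for illustration.

    #include <cstdio>

    int main() {
        // assumed per-layer hyperparameters for illustration only
        const int n_head[4] = {  32,    0,   32,   32};
        const int n_ff  [4] = {8192, 8192,    0, 8192};

        for (int il = 0; il < 4; ++il) {
            if (n_head[il] > 0) std::printf("layer %d: attention\n", il);
            else                std::printf("layer %d: attention-free\n", il);

            if (n_ff[il] == 0) {
                std::printf("layer %d: FFN-free, residual passes through\n", il);
                continue;
            }
            std::printf("layer %d: FFN\n", il);
        }
        return 0;
    }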
@@ -4366,11 +4837,6 @@ struct llm_build_deci : public llm_graph_context {
4366
4837
  cb(cur, "ffn_out", il);
4367
4838
  }
4368
4839
 
4369
- // For Granite architecture
4370
- if (hparams.f_residual_scale) {
4371
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
4372
- }
4373
-
4374
4840
  cur = ggml_add(ctx0, cur, ffn_inp);
4375
4841
  cb(cur, "ffn_out", il);
4376
4842
 
@@ -4393,11 +4859,6 @@ struct llm_build_deci : public llm_graph_context {
4393
4859
  // lm_head
4394
4860
  cur = build_lora_mm(model.output, cur);
4395
4861
 
4396
- // For Granite architecture
4397
- if (hparams.f_logit_scale) {
4398
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
4399
- }
4400
-
4401
4862
  cb(cur, "result_output", -1);
4402
4863
  res->t_logits = cur;
4403
4864
 
@@ -4470,7 +4931,7 @@ struct llm_build_baichuan : public llm_graph_context {
4470
4931
 
4471
4932
  cur = build_attn(inp_attn, gf,
4472
4933
  model.layers[il].wo, NULL,
4473
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
4934
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
4474
4935
  }
4475
4936
 
4476
4937
  if (il == n_layer - 1) {
@@ -4585,7 +5046,7 @@ struct llm_build_xverse : public llm_graph_context {
4585
5046
 
4586
5047
  cur = build_attn(inp_attn, gf,
4587
5048
  model.layers[il].wo, NULL,
4588
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5049
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
4589
5050
  }
4590
5051
 
4591
5052
  if (il == n_layer - 1) {
@@ -4710,7 +5171,7 @@ struct llm_build_falcon : public llm_graph_context {
4710
5171
 
4711
5172
  cur = build_attn(inp_attn, gf,
4712
5173
  model.layers[il].wo, NULL,
4713
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5174
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
4714
5175
  }
4715
5176
 
4716
5177
  if (il == n_layer - 1) {
@@ -4840,7 +5301,7 @@ struct llm_build_grok : public llm_graph_context {
4840
5301
 
4841
5302
  cur = build_attn(inp_attn, gf,
4842
5303
  model.layers[il].wo, model.layers[il].bo,
4843
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
5304
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
4844
5305
  }
4845
5306
 
4846
5307
  if (il == n_layer - 1) {
@@ -4991,7 +5452,7 @@ struct llm_build_dbrx : public llm_graph_context {
4991
5452
 
4992
5453
  cur = build_attn(inp_attn, gf,
4993
5454
  model.layers[il].wo, NULL,
4994
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5455
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
4995
5456
  }
4996
5457
 
4997
5458
  if (il == n_layer - 1) {
@@ -5105,7 +5566,7 @@ struct llm_build_starcoder : public llm_graph_context {
5105
5566
 
5106
5567
  cur = build_attn(inp_attn, gf,
5107
5568
  model.layers[il].wo, model.layers[il].bo,
5108
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5569
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5109
5570
  }
5110
5571
 
5111
5572
  if (il == n_layer - 1) {
@@ -5204,7 +5665,7 @@ struct llm_build_refact : public llm_graph_context {
5204
5665
 
5205
5666
  cur = build_attn(inp_attn, gf,
5206
5667
  model.layers[il].wo, NULL,
5207
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5668
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5208
5669
  }
5209
5670
 
5210
5671
  if (il == n_layer - 1) {
@@ -5331,6 +5792,11 @@ struct llm_build_bert : public llm_graph_context {
5331
5792
  cur = build_lora_mm(model.layers[il].wqkv, cur);
5332
5793
  cb(cur, "wqkv", il);
5333
5794
 
5795
+ if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
5796
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5797
+ cb(cur, "bqkv", il);
5798
+ }
5799
+
5334
5800
  Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5335
5801
  Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5336
5802
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
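For NOMIC_BERT_MOE the BERT path above now adds the QKV bias and then, as before, slices the fused projection into Q, K and V with 2-D views at row offsets 0, n_embd and n_embd + n_embd_gqa. A standalone sketch of that slicing on a plain float buffer; the sizes are deliberately tiny and purely illustrative.

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_embd     = 4;  // illustrative sizes only
        const int n_embd_gqa = 2;  // K and V rows are smaller under GQA
        const int row        = n_embd + 2 * n_embd_gqa;

        // one token's fused QKV row: [ q0..q3 | k0..k1 | v0..v1 ]
        std::vector<float> qkv(row);
        for (int i = 0; i < row; ++i) qkv[i] = (float) i;

        const float * q = qkv.data();                        // offset 0
        const float * k = qkv.data() + n_embd;               // offset n_embd
        const float * v = qkv.data() + n_embd + n_embd_gqa;  // offset n_embd + n_embd_gqa

        std::printf("q[0]=%.0f k[0]=%.0f v[0]=%.0f\n", q[0], k[0], v[0]);
        return 0;
    }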
@@ -5358,7 +5824,7 @@ struct llm_build_bert : public llm_graph_context {
5358
5824
 
5359
5825
  cur = build_attn(inp_attn, gf,
5360
5826
  model.layers[il].wo, model.layers[il].bo,
5361
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5827
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5362
5828
  cb(cur, "kqv_out", il);
5363
5829
 
5364
5830
  if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
@@ -5383,13 +5849,29 @@ struct llm_build_bert : public llm_graph_context {
5383
5849
  cb(ffn_inp, "ffn_inp", il);
5384
5850
 
5385
5851
  // feed-forward network
5386
- if (model.arch == LLM_ARCH_BERT) {
5852
+ if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
5853
+ // MoE branch
5854
+ cur = build_moe_ffn(cur,
5855
+ model.layers[il].ffn_gate_inp,
5856
+ model.layers[il].ffn_up_exps,
5857
+ nullptr,
5858
+ model.layers[il].ffn_down_exps,
5859
+ nullptr,
5860
+ hparams.n_expert,
5861
+ hparams.n_expert_used,
5862
+ LLM_FFN_GELU,
5863
+ false, false,
5864
+ 0.0f,
5865
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
5866
+ cb(cur, "ffn_moe_out", il);
5867
+ } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
5387
5868
  cur = build_ffn(cur,
5388
5869
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
5389
5870
  NULL, NULL, NULL,
5390
5871
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
5391
5872
  NULL,
5392
5873
  LLM_FFN_GELU, LLM_FFN_SEQ, il);
5874
+ cb(cur, "ffn_out", il);
5393
5875
  } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
5394
5876
  cur = build_ffn(cur,
5395
5877
  model.layers[il].ffn_up, NULL, NULL,
@@ -5397,6 +5879,7 @@ struct llm_build_bert : public llm_graph_context {
5397
5879
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
5398
5880
  NULL,
5399
5881
  LLM_FFN_GELU, LLM_FFN_PAR, il);
5882
+ cb(cur, "ffn_out", il);
5400
5883
  } else {
5401
5884
  cur = build_ffn(cur,
5402
5885
  model.layers[il].ffn_up, NULL, NULL,
@@ -5404,8 +5887,8 @@ struct llm_build_bert : public llm_graph_context {
5404
5887
  model.layers[il].ffn_down, NULL, NULL,
5405
5888
  NULL,
5406
5889
  LLM_FFN_SILU, LLM_FFN_PAR, il);
5890
+ cb(cur, "ffn_out", il);
5407
5891
  }
5408
- cb(cur, "ffn_out", il);
5409
5892
 
5410
5893
  // attentions bypass the intermediate layer
5411
5894
  cur = ggml_add(ctx0, cur, ffn_inp);
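The feed-forward dispatch above first checks moe_every_n_layers: when it is non-zero, layers with il % moe_every_n_layers == 1 take the GELU MoE branch and all remaining layers fall through to the dense BERT/Jina/SiLU variants. A short sketch of which layers that condition selects, assuming a stride of 2 for illustration.

    #include <cstdio>

    int main() {
        const int n_layer            = 8;
        const int moe_every_n_layers = 2;  // assumed; 0 disables the MoE branch

        for (int il = 0; il < n_layer; ++il) {
            const bool is_moe = moe_every_n_layers > 0 && il % moe_every_n_layers == 1;
            std::printf("layer %d: %s\n", il, is_moe ? "MoE FFN" : "dense FFN");
        }
        return 0;
    }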
@@ -5475,7 +5958,7 @@ struct llm_build_bloom : public llm_graph_context {
5475
5958
 
5476
5959
  cur = build_attn(inp_attn, gf,
5477
5960
  model.layers[il].wo, model.layers[il].bo,
5478
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5961
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5479
5962
  }
5480
5963
 
5481
5964
  if (il == n_layer - 1) {
@@ -5616,7 +6099,7 @@ struct llm_build_mpt : public llm_graph_context {
5616
6099
 
5617
6100
  cur = build_attn(inp_attn, gf,
5618
6101
  model.layers[il].wo, model.layers[il].bo,
5619
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6102
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5620
6103
  }
5621
6104
 
5622
6105
  if (il == n_layer - 1) {
@@ -5762,7 +6245,7 @@ struct llm_build_stablelm : public llm_graph_context {
5762
6245
 
5763
6246
  cur = build_attn(inp_attn, gf,
5764
6247
  model.layers[il].wo, NULL,
5765
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6248
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5766
6249
  }
5767
6250
 
5768
6251
  if (il == n_layer - 1) {
@@ -5885,7 +6368,7 @@ struct llm_build_qwen : public llm_graph_context {
5885
6368
 
5886
6369
  cur = build_attn(inp_attn, gf,
5887
6370
  model.layers[il].wo, NULL,
5888
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6371
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5889
6372
  }
5890
6373
 
5891
6374
  if (il == n_layer - 1) {
@@ -6005,7 +6488,7 @@ struct llm_build_qwen2 : public llm_graph_context {
6005
6488
 
6006
6489
  cur = build_attn(inp_attn, gf,
6007
6490
  model.layers[il].wo, model.layers[il].bo,
6008
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6491
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6009
6492
  }
6010
6493
 
6011
6494
  if (il == n_layer - 1) {
@@ -6126,7 +6609,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
6126
6609
 
6127
6610
  cur = build_attn(inp_attn, gf,
6128
6611
  model.layers[il].wo, model.layers[il].bo,
6129
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6612
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6130
6613
  }
6131
6614
 
6132
6615
  if (il == n_layer - 1) {
@@ -6253,7 +6736,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
6253
6736
 
6254
6737
  cur = build_attn(inp_attn, gf,
6255
6738
  model.layers[il].wo, model.layers[il].bo,
6256
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6739
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6257
6740
  }
6258
6741
 
6259
6742
  if (il == n_layer - 1) {
@@ -6284,7 +6767,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
6284
6767
  false, 0.0,
6285
6768
  LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
6286
6769
  il);
6287
- cb(cur, "ffn_moe_out", il);
6770
+ cb(moe_out, "ffn_moe_out", il);
6288
6771
 
6289
6772
  // FFN shared expert
6290
6773
  {
@@ -6340,16 +6823,14 @@ struct llm_build_qwen2moe : public llm_graph_context {
6340
6823
  }
6341
6824
  };
6342
6825
 
6343
- struct llm_build_phi2 : public llm_graph_context {
6344
- llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6826
+ struct llm_build_qwen3 : public llm_graph_context {
6827
+ llm_build_qwen3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6345
6828
  const int64_t n_embd_head = hparams.n_embd_head_v;
6346
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6347
6829
 
6348
6830
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6831
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6349
6832
 
6350
6833
  ggml_tensor * cur;
6351
- ggml_tensor * attn_norm_output;
6352
- ggml_tensor * ffn_output;
6353
6834
  ggml_tensor * inpL;
6354
6835
 
6355
6836
  inpL = build_inp_embd(model.tok_embd);
@@ -6360,48 +6841,42 @@ struct llm_build_phi2 : public llm_graph_context {
6360
6841
  auto * inp_attn = build_attn_inp_kv_unified();
6361
6842
 
6362
6843
  for (int il = 0; il < n_layer; ++il) {
6363
- attn_norm_output = build_norm(inpL,
6364
- model.layers[il].attn_norm,
6365
- model.layers[il].attn_norm_b,
6366
- LLM_NORM, il);
6367
- cb(attn_norm_output, "attn_norm", il);
6844
+ ggml_tensor * inpSA = inpL;
6845
+
6846
+ // norm
6847
+ cur = build_norm(inpL,
6848
+ model.layers[il].attn_norm, NULL,
6849
+ LLM_NORM_RMS, il);
6850
+ cb(cur, "attn_norm", il);
6368
6851
 
6369
6852
  // self-attention
6370
6853
  {
6371
- ggml_tensor * Qcur = nullptr;
6372
- ggml_tensor * Kcur = nullptr;
6373
- ggml_tensor * Vcur = nullptr;
6374
-
6375
- if (model.layers[il].wqkv) {
6376
- cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
6377
- cb(cur, "wqkv", il);
6378
-
6379
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6380
- cb(cur, "bqkv", il);
6381
-
6382
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6383
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6384
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6385
- } else {
6386
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
6387
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
6388
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
6389
- }
6390
-
6854
+ // compute Q and K and RoPE them
6855
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
6391
6856
  cb(Qcur, "Qcur", il);
6857
+
6858
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
6392
6859
  cb(Kcur, "Kcur", il);
6860
+
6861
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
6393
6862
  cb(Vcur, "Vcur", il);
6394
6863
 
6395
6864
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6396
6865
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6397
6866
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6398
6867
 
6868
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
6869
+ cb(Qcur, "Qcur_normed", il);
6870
+
6399
6871
  Qcur = ggml_rope_ext(
6400
6872
  ctx0, Qcur, inp_pos, nullptr,
6401
6873
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6402
6874
  ext_factor, attn_factor, beta_fast, beta_slow
6403
6875
  );
6404
6876
 
6877
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
6878
+ cb(Kcur, "Kcur_normed", il);
6879
+
6405
6880
  Kcur = ggml_rope_ext(
6406
6881
  ctx0, Kcur, inp_pos, nullptr,
6407
6882
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -6412,36 +6887,36 @@ struct llm_build_phi2 : public llm_graph_context {
6412
6887
  cb(Kcur, "Kcur", il);
6413
6888
  cb(Vcur, "Vcur", il);
6414
6889
 
6415
- // with phi2, we scale the Q to avoid precision issues
6416
- // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
6417
- Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
6418
-
6419
6890
  cur = build_attn(inp_attn, gf,
6420
6891
  model.layers[il].wo, model.layers[il].bo,
6421
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
6892
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6422
6893
  }
6423
6894
 
6424
6895
  if (il == n_layer - 1) {
6425
6896
  // skip computing output for unused tokens
6426
6897
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6427
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6428
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6429
- attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
6898
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6899
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6430
6900
  }
6431
6901
 
6432
- // FF
6433
- {
6434
- ffn_output = build_ffn(attn_norm_output,
6435
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
6436
- NULL, NULL, NULL,
6437
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
6438
- NULL,
6439
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
6440
- cb(ffn_output, "ffn_out", il);
6441
- }
6902
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6903
+ cb(ffn_inp, "ffn_inp", il);
6442
6904
 
6443
- cur = ggml_add(ctx0, cur, ffn_output);
6444
- cur = ggml_add(ctx0, cur, inpL);
6905
+ // feed-forward network
6906
+ cur = build_norm(ffn_inp,
6907
+ model.layers[il].ffn_norm, NULL,
6908
+ LLM_NORM_RMS, il);
6909
+ cb(cur, "ffn_norm", il);
6910
+
6911
+ cur = build_ffn(cur,
6912
+ model.layers[il].ffn_up, NULL, NULL,
6913
+ model.layers[il].ffn_gate, NULL, NULL,
6914
+ model.layers[il].ffn_down, NULL, NULL,
6915
+ NULL,
6916
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
6917
+ cb(cur, "ffn_out", il);
6918
+
6919
+ cur = ggml_add(ctx0, cur, ffn_inp);
6445
6920
 
6446
6921
  cur = build_cvec(cur, il);
6447
6922
  cb(cur, "l_out", il);
@@ -6450,10 +6925,267 @@ struct llm_build_phi2 : public llm_graph_context {
6450
6925
  inpL = cur;
6451
6926
  }
6452
6927
 
6453
- cur = build_norm(inpL,
6454
- model.output_norm,
6455
- model.output_norm_b,
6456
- LLM_NORM, -1);
6928
+ cur = inpL;
6929
+
6930
+ cur = build_norm(cur,
6931
+ model.output_norm, NULL,
6932
+ LLM_NORM_RMS, -1);
6933
+
6934
+ cb(cur, "result_norm", -1);
6935
+ res->t_embd = cur;
6936
+
6937
+ // lm_head
6938
+ cur = build_lora_mm(model.output, cur);
6939
+
6940
+ cb(cur, "result_output", -1);
6941
+ res->t_logits = cur;
6942
+
6943
+ ggml_build_forward_expand(gf, cur);
6944
+ }
6945
+ };
6946
+
6947
+ struct llm_build_qwen3moe : public llm_graph_context {
6948
+ llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6949
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6950
+
6951
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6952
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6953
+
6954
+ ggml_tensor * cur;
6955
+ ggml_tensor * inpL;
6956
+
6957
+ inpL = build_inp_embd(model.tok_embd);
6958
+
6959
+ // inp_pos - contains the positions
6960
+ ggml_tensor * inp_pos = build_inp_pos();
6961
+
6962
+ auto * inp_attn = build_attn_inp_kv_unified();
6963
+
6964
+ for (int il = 0; il < n_layer; ++il) {
6965
+ ggml_tensor * inpSA = inpL;
6966
+
6967
+ // norm
6968
+ cur = build_norm(inpL,
6969
+ model.layers[il].attn_norm, NULL,
6970
+ LLM_NORM_RMS, il);
6971
+ cb(cur, "attn_norm", il);
6972
+
6973
+ // self_attention
6974
+ {
6975
+ // compute Q and K and RoPE them
6976
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
6977
+ cb(Qcur, "Qcur", il);
6978
+
6979
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
6980
+ cb(Kcur, "Kcur", il);
6981
+
6982
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
6983
+ cb(Vcur, "Vcur", il);
6984
+
6985
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6986
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6987
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6988
+
6989
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
6990
+ cb(Qcur, "Qcur_normed", il);
6991
+
6992
+ Qcur = ggml_rope_ext(
6993
+ ctx0, Qcur, inp_pos, nullptr,
6994
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6995
+ ext_factor, attn_factor, beta_fast, beta_slow
6996
+ );
6997
+
6998
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
6999
+ cb(Kcur, "Kcur_normed", il);
7000
+
7001
+ Kcur = ggml_rope_ext(
7002
+ ctx0, Kcur, inp_pos, nullptr,
7003
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7004
+ ext_factor, attn_factor, beta_fast, beta_slow
7005
+ );
7006
+
7007
+ cb(Qcur, "Qcur", il);
7008
+ cb(Kcur, "Kcur", il);
7009
+ cb(Vcur, "Vcur", il);
7010
+
7011
+ cur = build_attn(inp_attn, gf,
7012
+ model.layers[il].wo, model.layers[il].bo,
7013
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7014
+ }
7015
+
7016
+ if (il == n_layer - 1) {
7017
+ // skip computing output for unused tokens
7018
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7019
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7020
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7021
+ }
7022
+
7023
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7024
+ cb(ffn_inp, "ffn_inp", il);
7025
+
7026
+ // MoE branch
7027
+ cur = build_norm(ffn_inp,
7028
+ model.layers[il].ffn_norm, NULL,
7029
+ LLM_NORM_RMS, il);
7030
+ cb(cur, "ffn_norm", il);
7031
+
7032
+ ggml_tensor * moe_out =
7033
+ build_moe_ffn(cur,
7034
+ model.layers[il].ffn_gate_inp,
7035
+ model.layers[il].ffn_up_exps,
7036
+ model.layers[il].ffn_gate_exps,
7037
+ model.layers[il].ffn_down_exps,
7038
+ nullptr,
7039
+ n_expert, n_expert_used,
7040
+ LLM_FFN_SILU, true,
7041
+ false, 0.0,
7042
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
7043
+ il);
7044
+ cb(moe_out, "ffn_moe_out", il);
7045
+ cur = moe_out;
7046
+
7047
+ cur = ggml_add(ctx0, cur, ffn_inp);
7048
+
7049
+ cur = build_cvec(cur, il);
7050
+ cb(cur, "l_out", il);
7051
+
7052
+ // input for next layer
7053
+ inpL = cur;
7054
+ }
7055
+
7056
+ cur = inpL;
7057
+
7058
+ cur = build_norm(cur,
7059
+ model.output_norm, NULL,
7060
+ LLM_NORM_RMS, -1);
7061
+
7062
+ cb(cur, "result_norm", -1);
7063
+ res->t_embd = cur;
7064
+
7065
+ // lm_head
7066
+ cur = build_lora_mm(model.output, cur);
7067
+
7068
+ cb(cur, "result_output", -1);
7069
+ res->t_logits = cur;
7070
+
7071
+ ggml_build_forward_expand(gf, cur);
7072
+ }
7073
+ };
7074
+
7075
+ struct llm_build_phi2 : public llm_graph_context {
7076
+ llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7077
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7078
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
7079
+
7080
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7081
+
7082
+ ggml_tensor * cur;
7083
+ ggml_tensor * attn_norm_output;
7084
+ ggml_tensor * ffn_output;
7085
+ ggml_tensor * inpL;
7086
+
7087
+ inpL = build_inp_embd(model.tok_embd);
7088
+
7089
+ // inp_pos - contains the positions
7090
+ ggml_tensor * inp_pos = build_inp_pos();
7091
+
7092
+ auto * inp_attn = build_attn_inp_kv_unified();
7093
+
7094
+ for (int il = 0; il < n_layer; ++il) {
7095
+ attn_norm_output = build_norm(inpL,
7096
+ model.layers[il].attn_norm,
7097
+ model.layers[il].attn_norm_b,
7098
+ LLM_NORM, il);
7099
+ cb(attn_norm_output, "attn_norm", il);
7100
+
7101
+ // self-attention
7102
+ {
7103
+ ggml_tensor * Qcur = nullptr;
7104
+ ggml_tensor * Kcur = nullptr;
7105
+ ggml_tensor * Vcur = nullptr;
7106
+
7107
+ if (model.layers[il].wqkv) {
7108
+ cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
7109
+ cb(cur, "wqkv", il);
7110
+
7111
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7112
+ cb(cur, "bqkv", il);
7113
+
7114
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
7115
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
7116
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7117
+ } else {
7118
+ Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
7119
+ Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
7120
+ Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
7121
+ }
7122
+
7123
+ cb(Qcur, "Qcur", il);
7124
+ cb(Kcur, "Kcur", il);
7125
+ cb(Vcur, "Vcur", il);
7126
+
7127
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7128
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7129
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7130
+
7131
+ Qcur = ggml_rope_ext(
7132
+ ctx0, Qcur, inp_pos, nullptr,
7133
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7134
+ ext_factor, attn_factor, beta_fast, beta_slow
7135
+ );
7136
+
7137
+ Kcur = ggml_rope_ext(
7138
+ ctx0, Kcur, inp_pos, nullptr,
7139
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7140
+ ext_factor, attn_factor, beta_fast, beta_slow
7141
+ );
7142
+
7143
+ cb(Qcur, "Qcur", il);
7144
+ cb(Kcur, "Kcur", il);
7145
+ cb(Vcur, "Vcur", il);
7146
+
7147
+ // with phi2, we scale the Q to avoid precision issues
7148
+ // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
7149
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
7150
+
7151
+ cur = build_attn(inp_attn, gf,
7152
+ model.layers[il].wo, model.layers[il].bo,
7153
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7154
+ }
7155
+
7156
+ if (il == n_layer - 1) {
7157
+ // skip computing output for unused tokens
7158
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7159
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7160
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7161
+ attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
7162
+ }
7163
+
7164
+ // FF
7165
+ {
7166
+ ffn_output = build_ffn(attn_norm_output,
7167
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
7168
+ NULL, NULL, NULL,
7169
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
7170
+ NULL,
7171
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
7172
+ cb(ffn_output, "ffn_out", il);
7173
+ }
7174
+
7175
+ cur = ggml_add(ctx0, cur, ffn_output);
7176
+ cur = ggml_add(ctx0, cur, inpL);
7177
+
7178
+ cur = build_cvec(cur, il);
7179
+ cb(cur, "l_out", il);
7180
+
7181
+ // input for next layer
7182
+ inpL = cur;
7183
+ }
7184
+
7185
+ cur = build_norm(inpL,
7186
+ model.output_norm,
7187
+ model.output_norm_b,
7188
+ LLM_NORM, -1);
6457
7189
 
6458
7190
  cb(cur, "result_norm", -1);
6459
7191
  res->t_embd = cur;
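In llm_build_qwen3moe above, build_moe_ffn is invoked with softmax gating and, judging by the flag passed after LLM_FFN_SILU, renormalization of the selected expert weights: softmax over all experts, keep the top n_expert_used, rescale those probabilities to sum to 1, then mix the expert outputs. A standalone sketch of that routing step for one token, with made-up router logits.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        // router logits for one token over 8 experts (illustrative values)
        std::vector<float> logits = {0.1f, 2.0f, -1.0f, 0.7f, 1.5f, -0.3f, 0.0f, 0.9f};
        const int n_expert_used = 2;

        // softmax over all experts
        const float mx = *std::max_element(logits.begin(), logits.end());
        std::vector<float> p(logits.size());
        float sum = 0.0f;
        for (size_t i = 0; i < logits.size(); ++i) { p[i] = std::exp(logits[i] - mx); sum += p[i]; }
        for (float & v : p) v /= sum;

        // pick the top-k experts by probability
        std::vector<int> idx(p.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](int a, int b) { return p[a] > p[b]; });

        // renormalize the selected weights so they sum to 1 before mixing expert outputs
        float topk_sum = 0.0f;
        for (int i = 0; i < n_expert_used; ++i) topk_sum += p[idx[i]];
        for (int i = 0; i < n_expert_used; ++i) {
            std::printf("expert %d weight %.4f\n", idx[i], p[idx[i]] / topk_sum);
        }
        return 0;
    }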
@@ -6493,7 +7225,7 @@ struct llm_build_phi3 : public llm_graph_context {
6493
7225
  // self-attention
6494
7226
  {
6495
7227
  // rope freq factors for 128k context
6496
- ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
7228
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
6497
7229
 
6498
7230
  ggml_tensor* attn_norm_output = build_norm(inpL,
6499
7231
  model.layers[il].attn_norm,
@@ -6547,7 +7279,7 @@ struct llm_build_phi3 : public llm_graph_context {
6547
7279
 
6548
7280
  cur = build_attn(inp_attn, gf,
6549
7281
  model.layers[il].wo, model.layers[il].bo,
6550
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
7282
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
6551
7283
  }
6552
7284
 
6553
7285
  if (il == n_layer - 1) {
@@ -6682,7 +7414,7 @@ struct llm_build_plamo : public llm_graph_context {
6682
7414
 
6683
7415
  cur = build_attn(inp_attn, gf,
6684
7416
  model.layers[il].wo, NULL,
6685
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7417
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6686
7418
  }
6687
7419
  ggml_tensor * sa_out = cur;
6688
7420
 
@@ -6789,7 +7521,7 @@ struct llm_build_gpt2 : public llm_graph_context {
6789
7521
 
6790
7522
  cur = build_attn(inp_attn, gf,
6791
7523
  model.layers[il].wo, model.layers[il].bo,
6792
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7524
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6793
7525
  }
6794
7526
 
6795
7527
  if (il == n_layer - 1) {
@@ -6905,7 +7637,7 @@ struct llm_build_codeshell : public llm_graph_context {
6905
7637
 
6906
7638
  cur = build_attn(inp_attn, gf,
6907
7639
  model.layers[il].wo, model.layers[il].bo,
6908
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7640
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6909
7641
  }
6910
7642
 
6911
7643
  if (il == n_layer - 1) {
@@ -7034,7 +7766,7 @@ struct llm_build_orion : public llm_graph_context {
7034
7766
 
7035
7767
  cur = build_attn(inp_attn, gf,
7036
7768
  model.layers[il].wo, NULL,
7037
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7769
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7038
7770
  }
7039
7771
 
7040
7772
  if (il == n_layer - 1) {
@@ -7161,7 +7893,7 @@ struct llm_build_internlm2 : public llm_graph_context {
7161
7893
 
7162
7894
  cur = build_attn(inp_attn, gf,
7163
7895
  model.layers[il].wo, model.layers[il].bo,
7164
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7896
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7165
7897
  }
7166
7898
 
7167
7899
  if (il == n_layer - 1) {
@@ -7245,7 +7977,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
7245
7977
  for (int il = 0; il < n_layer; ++il) {
7246
7978
  ggml_tensor * inpSA = inpL;
7247
7979
 
7248
- ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
7980
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
7249
7981
 
7250
7982
  // norm
7251
7983
  cur = build_norm(inpL,
@@ -7358,7 +8090,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
7358
8090
 
7359
8091
  cur = build_attn(inp_attn, gf,
7360
8092
  model.layers[il].wo, NULL,
7361
- q_states, k_states, v_states, nullptr, kq_scale, il);
8093
+ q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
7362
8094
  }
7363
8095
 
7364
8096
  if (il == n_layer - 1) {
@@ -7488,7 +8220,7 @@ struct llm_build_gemma : public llm_graph_context {
7488
8220
 
7489
8221
  cur = build_attn(inp_attn, gf,
7490
8222
  model.layers[il].wo, NULL,
7491
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
8223
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7492
8224
  }
7493
8225
 
7494
8226
  if (il == n_layer - 1) {
@@ -7610,7 +8342,7 @@ struct llm_build_gemma2 : public llm_graph_context {
7610
8342
 
7611
8343
  cur = build_attn(inp_attn, gf,
7612
8344
  model.layers[il].wo, NULL,
7613
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
8345
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7614
8346
  }
7615
8347
 
7616
8348
  cur = build_norm(cur,
@@ -7751,7 +8483,7 @@ struct llm_build_gemma3 : public llm_graph_context {
7751
8483
 
7752
8484
  cur = build_attn(inp_attn, gf,
7753
8485
  model.layers[il].wo, NULL,
7754
- Qcur, Kcur, Vcur, nullptr, hparams.f_attention_scale, il);
8486
+ Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
7755
8487
  }
7756
8488
 
7757
8489
  cur = build_norm(cur,
@@ -7891,7 +8623,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
7891
8623
 
7892
8624
  cur = build_attn(inp_attn, gf,
7893
8625
  model.layers[il].wo, model.layers[il].bo,
7894
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8626
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7895
8627
  }
7896
8628
 
7897
8629
  if (il == n_layer - 1) {
@@ -8012,7 +8744,7 @@ struct llm_build_mamba : public llm_graph_context {
8012
8744
  ggml_tensor * state_mask,
8013
8745
  const llama_ubatch & ubatch,
8014
8746
  int il) const {
8015
- const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
8747
+ const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
8016
8748
 
8017
8749
  const auto kv_head = kv_self->head;
8018
8750
 
@@ -8226,7 +8958,7 @@ struct llm_build_command_r : public llm_graph_context {
8226
8958
 
8227
8959
  cur = build_attn(inp_attn, gf,
8228
8960
  model.layers[il].wo, model.layers[il].bo,
8229
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8961
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8230
8962
  }
8231
8963
 
8232
8964
  if (il == n_layer - 1) {
@@ -8313,7 +9045,7 @@ struct llm_build_cohere2 : public llm_graph_context {
8313
9045
  // self-attention
8314
9046
  {
8315
9047
  // rope freq factors for 128k context
8316
- ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
9048
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
8317
9049
 
8318
9050
  // compute Q and K and RoPE them
8319
9051
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -8361,7 +9093,7 @@ struct llm_build_cohere2 : public llm_graph_context {
8361
9093
 
8362
9094
  cur = build_attn(inp_attn, gf,
8363
9095
  model.layers[il].wo, model.layers[il].bo,
8364
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9096
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8365
9097
  }
8366
9098
 
8367
9099
  if (il == n_layer - 1) {
@@ -8492,7 +9224,7 @@ struct llm_build_olmo : public llm_graph_context {
8492
9224
 
8493
9225
  cur = build_attn(inp_attn, gf,
8494
9226
  model.layers[il].wo, nullptr,
8495
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9227
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8496
9228
  }
8497
9229
 
8498
9230
  if (il == n_layer - 1) {
@@ -8612,7 +9344,7 @@ struct llm_build_olmo2 : public llm_graph_context {
8612
9344
 
8613
9345
  cur = build_attn(inp_attn, gf,
8614
9346
  model.layers[il].wo, NULL,
8615
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9347
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8616
9348
  }
8617
9349
 
8618
9350
  cur = build_norm(cur,
@@ -8745,7 +9477,7 @@ struct llm_build_olmoe : public llm_graph_context {
8745
9477
 
8746
9478
  cur = build_attn(inp_attn, gf,
8747
9479
  model.layers[il].wo, NULL,
8748
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9480
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8749
9481
  }
8750
9482
 
8751
9483
  if (il == n_layer - 1) {
@@ -8878,7 +9610,7 @@ struct llm_build_openelm : public llm_graph_context {
8878
9610
 
8879
9611
  cur = build_attn(inp_attn, gf,
8880
9612
  model.layers[il].wo, NULL,
8881
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9613
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8882
9614
  }
8883
9615
 
8884
9616
  if (il == n_layer - 1) {
@@ -8992,7 +9724,7 @@ struct llm_build_gptneox : public llm_graph_context {
8992
9724
 
8993
9725
  cur = build_attn(inp_attn, gf,
8994
9726
  model.layers[il].wo, model.layers[il].bo,
8995
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9727
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8996
9728
  }
8997
9729
 
8998
9730
  if (il == n_layer - 1) {
@@ -9142,7 +9874,7 @@ struct llm_build_arctic : public llm_graph_context {
9142
9874
 
9143
9875
  cur = build_attn(inp_attn, gf,
9144
9876
  model.layers[il].wo, NULL,
9145
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9877
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9146
9878
  }
9147
9879
 
9148
9880
  if (il == n_layer - 1) {
@@ -9251,7 +9983,7 @@ struct llm_build_deepseek : public llm_graph_context {
9251
9983
  // self-attention
9252
9984
  {
9253
9985
  // rope freq factors for llama3; may return nullptr for llama2 and other models
9254
- ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
9986
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
9255
9987
 
9256
9988
  // compute Q and K and RoPE them
9257
9989
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9297,7 +10029,7 @@ struct llm_build_deepseek : public llm_graph_context {
9297
10029
 
9298
10030
  cur = build_attn(inp_attn, gf,
9299
10031
  model.layers[il].wo, model.layers[il].bo,
9300
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
10032
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
9301
10033
  }
9302
10034
 
9303
10035
  if (il == n_layer - 1) {
@@ -9387,15 +10119,22 @@ struct llm_build_deepseek2 : public llm_graph_context {
9387
10119
  llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
9388
10120
  bool is_lite = (hparams.n_layer == 27);
9389
10121
 
10122
+ const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
10123
+
10124
+ // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
10125
+ const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
10126
+ const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
10127
+
10128
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
10129
+ const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
10130
+
10131
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
10132
+
9390
10133
  // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
9391
10134
  // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
9392
10135
  const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
9393
- const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
9394
- const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
9395
-
9396
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
9397
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
9398
- const uint32_t kv_lora_rank = hparams.n_lora_kv;
10136
+ const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
10137
+ const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
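The DeepSeek2 builder above pre-scales attention for YaRN: mscale = attn_factor * (1 + rope_yarn_log_mul * ln(1/freq_scale)), kq_scale = mscale^2 / sqrt(n_embd_head_k), and the factor actually passed to ggml_rope_ext is recomputed as 1 / (1 + 0.1 * ln(1/freq_scale)). A worked numeric sketch of those formulas; the hyperparameter values below are assumed for illustration and in the real code come from the model hparams.

    #include <cmath>
    #include <cstdio>

    int main() {
        // assumed values for illustration only
        const float attn_factor       = 1.0f;
        const float rope_yarn_log_mul = 0.1f;
        const float freq_scale        = 0.025f;  // e.g. 40x context extension
        const float n_embd_head_k     = 192.0f;  // assumed MLA head size (qk_nope + qk_rope)

        const float mscale           = attn_factor * (1.0f + rope_yarn_log_mul * std::log(1.0f / freq_scale));
        const float kq_scale         = 1.0f * mscale * mscale / std::sqrt(n_embd_head_k);
        const float attn_factor_rope = 1.0f / (1.0f + 0.1f * std::log(1.0f / freq_scale));

        std::printf("mscale=%.4f kq_scale=%.6f rope attn_factor=%.4f\n",
                    mscale, kq_scale, attn_factor_rope);
        return 0;
    }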
9399
10138
 
9400
10139
  ggml_tensor * cur;
9401
10140
  ggml_tensor * inpL;
@@ -9421,16 +10160,14 @@ struct llm_build_deepseek2 : public llm_graph_context {
9421
10160
  {
9422
10161
  ggml_tensor * q = NULL;
9423
10162
  if (!is_lite) {
9424
- // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
9425
10163
  q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
9426
10164
  cb(q, "q", il);
9427
10165
 
9428
10166
  q = build_norm(q,
9429
- model.layers[il].attn_q_a_norm, NULL,
10167
+ model.layers[il].attn_q_a_norm, nullptr,
9430
10168
  LLM_NORM_RMS, il);
9431
10169
  cb(q, "q", il);
9432
10170
 
9433
- // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
9434
10171
  q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
9435
10172
  cb(q, "q", il);
9436
10173
  } else {
@@ -9438,96 +10175,125 @@ struct llm_build_deepseek2 : public llm_graph_context {
9438
10175
  cb(q, "q", il);
9439
10176
  }
9440
10177
 
9441
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
9442
- ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
9443
- ggml_row_size(q->type, hparams.n_embd_head_k),
9444
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
10178
+ // split into {n_embd_head_qk_nope, n_head, n_tokens}
10179
+ ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
10180
+ n_embd_head_qk_nope, n_head, n_tokens,
10181
+ ggml_row_size(q->type, n_embd_head_k),
10182
+ ggml_row_size(q->type, n_embd_head_k) * n_head,
9445
10183
  0);
9446
10184
  cb(q_nope, "q_nope", il);
9447
10185
 
9448
- // and {n_head * n_embd_head_qk_rope, n_tokens}
9449
- ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
9450
- ggml_row_size(q->type, hparams.n_embd_head_k),
9451
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
10186
+ // and {n_embd_head_qk_rope, n_head, n_tokens}
10187
+ ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
10188
+ n_embd_head_qk_rope, n_head, n_tokens,
10189
+ ggml_row_size(q->type, n_embd_head_k),
10190
+ ggml_row_size(q->type, n_embd_head_k) * n_head,
9452
10191
  ggml_row_size(q->type, n_embd_head_qk_nope));
9453
10192
  cb(q_pe, "q_pe", il);
9454
10193
 
9455
- // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
9456
- ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
9457
- cb(kv_pe_compresseed, "kv_pe_compresseed", il);
10194
+ ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
10195
+ cb(kv_cmpr_pe, "kv_cmpr_pe", il);
9458
10196
 
9459
10197
  // split into {kv_lora_rank, n_tokens}
9460
- ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
9461
- kv_pe_compresseed->nb[1],
10198
+ ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
10199
+ kv_lora_rank, n_tokens,
10200
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
9462
10201
  0);
9463
- cb(kv_compressed, "kv_compressed", il);
10202
+ cb(kv_cmpr, "kv_cmpr", il);
10203
+
10204
+ // and {n_embd_head_qk_rope, 1, n_tokens}
10205
+ ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
10206
+ n_embd_head_qk_rope, 1, n_tokens,
10207
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
10208
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
10209
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
10210
+ cb(k_pe, "k_pe", il);
9464
10211
 
9465
- // and {n_embd_head_qk_rope, n_tokens}
9466
- ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
9467
- kv_pe_compresseed->nb[1],
9468
- kv_pe_compresseed->nb[1],
9469
- ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
10212
+ q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
10213
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10214
+ ext_factor, attn_factor, beta_fast, beta_slow
10215
+ );
10216
+ cb(q_pe, "q_pe", il);
10217
+
10218
+ k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
10219
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10220
+ ext_factor, attn_factor, beta_fast, beta_slow
10221
+ );
9470
10222
  cb(k_pe, "k_pe", il);
9471
10223
 
9472
- // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
9473
- kv_compressed = ggml_cont(ctx0, kv_compressed);
9474
- kv_compressed = build_norm(kv_compressed,
9475
- model.layers[il].attn_kv_a_norm, NULL,
10224
+ kv_cmpr = build_norm(kv_cmpr,
10225
+ model.layers[il].attn_kv_a_norm, nullptr,
9476
10226
  LLM_NORM_RMS, il);
9477
- cb(kv_compressed, "kv_compressed", il);
10227
+ cb(kv_cmpr, "kv_cmpr", il);
9478
10228
 
9479
- // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
9480
- ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
9481
- cb(kv, "kv", il);
10229
+ if (is_mla) {
10230
+ // {n_embd_head_qk_nope, n_tokens, n_head}
10231
+ q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
10232
+ cb(q_nope, "q_nope_perm", il);
9482
10233
 
9483
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
9484
- ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
9485
- ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
9486
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
9487
- 0);
9488
- cb(k_nope, "k_nope", il);
10234
+ // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
10235
+ ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
10236
+ cb(q_nope_absorbed, "q_nope_absorbed", il);
9489
10237
 
9490
- // and {n_head * n_embd_head_v, n_tokens}
9491
- ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
9492
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
9493
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
9494
- ggml_row_size(kv->type, (n_embd_head_qk_nope)));
9495
- cb(v_states, "v_states", il);
10238
+ // {kv_lora_rank, n_head, n_tokens}
10239
+ q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
10240
+ cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
9496
10241
 
9497
- v_states = ggml_cont(ctx0, v_states);
9498
- cb(v_states, "v_states", il);
10242
+ // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
10243
+ // note: rope must go first for in-place context shifting in build_rope_shift()
10244
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
10245
+ cb(Qcur, "Qcur", il);
9499
10246
 
9500
- v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
9501
- ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
9502
- 0);
9503
- cb(v_states, "v_states", il);
10247
+ kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
10248
+ cb(kv_cmpr, "kv_cmpr_reshape", il);
9504
10249
 
9505
- q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
9506
- q_pe = ggml_rope_ext(
9507
- ctx0, q_pe, inp_pos, nullptr,
9508
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9509
- ext_factor, attn_factor_scaled, beta_fast, beta_slow
9510
- );
9511
- cb(q_pe, "q_pe", il);
10250
+ // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
10251
+ ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
10252
+ cb(Kcur, "Kcur", il);
9512
10253
 
9513
- // shared RoPE key
9514
- k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
9515
- k_pe = ggml_rope_ext(
9516
- ctx0, k_pe, inp_pos, nullptr,
9517
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9518
- ext_factor, attn_factor_scaled, beta_fast, beta_slow
9519
- );
9520
- cb(k_pe, "k_pe", il);
10254
+ // {kv_lora_rank, 1, n_tokens}
10255
+ ggml_tensor * Vcur = kv_cmpr;
10256
+ cb(Vcur, "Vcur", il);
9521
10257
 
9522
- ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
9523
- cb(q_states, "q_states", il);
10258
+ // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
10259
+ cur = build_attn(inp_attn, gf,
10260
+ model.layers[il].wo, NULL,
10261
+ Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
10262
+ } else {
10263
+ ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
10264
+ cb(kv, "kv", il);
10265
+
10266
+ // split into {n_embd_head_qk_nope, n_head, n_tokens}
10267
+ ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
10268
+ n_embd_head_qk_nope, n_head, n_tokens,
10269
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
10270
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
10271
+ 0);
10272
+ cb(k_nope, "k_nope_view", il);
9524
10273
 
9525
- ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
9526
- cb(k_states, "k_states", il);
10274
+ // and {n_embd_head_v, n_head, n_tokens}
10275
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
10276
+ n_embd_head_v, n_head, n_tokens,
10277
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
10278
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
10279
+ ggml_row_size(kv->type, n_embd_head_qk_nope));
10280
+ cb(Vcur, "Vcur_view", il);
9527
10281
 
9528
- cur = build_attn(inp_attn, gf,
9529
- model.layers[il].wo, NULL,
9530
- q_states, k_states, v_states, nullptr, kq_scale, il);
10282
+ Vcur = ggml_cont(ctx0, Vcur);
10283
+ cb(Vcur, "Vcur_cont", il);
10284
+
10285
+ // note: rope must go first for in-place context shifting in build_rope_shift()
10286
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
10287
+ cb(Qcur, "Qcur", il);
10288
+
10289
+ ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
10290
+ cb(Kcur, "Kcur", il);
10291
+
10292
+ // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
10293
+ cur = build_attn(inp_attn, gf,
10294
+ model.layers[il].wo, NULL,
10295
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
10296
+ }
9531
10297
  }
9532
10298
 
9533
10299
  if (il == n_layer - 1) {
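The reworked DeepSeek2 attention above carries two code paths: with is_mla the wk_b projection absorbs q_nope into the compressed latent space, so the cache effectively stores a single K/V "head" of width kv_lora_rank plus the shared RoPE slice (MQA-style), while the non-MLA branch expands the compressed cache through wkv_b back to full per-head K/V (MHA-style). A minimal, self-contained sketch of the resulting tensor shapes; all sizes below are hypothetical and chosen only to make the bookkeeping concrete:

    #include <cstdio>

    int main() {
        // hypothetical sizes, only to make the shape arithmetic concrete
        const int n_head = 16, n_tokens = 8, kv_lora_rank = 512;
        const int n_embd_head_qk_rope = 64, n_embd_head_qk_nope = 128, n_embd_head_v = 128;

        // is_mla: wk_b absorbs q_nope into the latent space; K and V keep a single
        // KV "head" of width kv_lora_rank (plus the shared RoPE part) -> MQA-style
        printf("MLA  Q:{%d,%d,%d}  K:{%d,1,%d}  V:{%d,1,%d}\n",
               n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens,
               n_embd_head_qk_rope + kv_lora_rank, n_tokens,
               kv_lora_rank, n_tokens);

        // !is_mla: wkv_b expands the compressed cache back to per-head K/V -> MHA-style
        printf("MHA  Q:{%d,%d,%d}  K:{%d,%d,%d}  V:{%d,%d,%d}\n",
               n_embd_head_qk_rope + n_embd_head_qk_nope, n_head, n_tokens,
               n_embd_head_qk_rope + n_embd_head_qk_nope, n_head, n_tokens,
               n_embd_head_v, n_head, n_tokens);
    }

The MQA-style shapes appear to be why build_attn now also receives model.layers[il].wv_b in the MLA branch: the latent-space attention output still has to be projected back to n_embd_head_v per head at the end of the attention computation.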
@@ -9693,7 +10459,7 @@ struct llm_build_bitnet : public llm_graph_context {
9693
10459
 
9694
10460
  cur = build_attn(inp_attn, gf,
9695
10461
  NULL, NULL,
9696
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10462
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9697
10463
 
9698
10464
  cur = build_norm(cur,
9699
10465
  model.layers[il].attn_sub_norm, NULL,
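This hunk, and the near-identical one-line changes in the t5, jais, chatglm, nemotron, exaone and chameleon builders below, each add one extra nullptr argument to build_attn. Together with the DeepSeek2 hunk above, which passes model.layers[il].wv_b in that slot, this suggests build_attn gained an optional per-layer tensor (called v_mla here by assumption) between the kq bias and the scale. A minimal sketch of that call-site migration, using placeholder types rather than the real ggml/llm_graph API:

    #include <cstdio>

    struct tensor { const char * name; };

    // assumed new parameter order: q, k, v, optional kq bias, optional v_mla, scale
    static void build_attn_sketch(const tensor * q, const tensor * k, const tensor * v,
                                  const tensor * kq_b, const tensor * v_mla, float kq_scale) {
        std::printf("attn q=%s k=%s v=%s kq_b=%s v_mla=%s scale=%.3f\n",
                    q->name, k->name, v->name,
                    kq_b  ? kq_b->name  : "null",
                    v_mla ? v_mla->name : "null", kq_scale);
    }

    int main() {
        tensor Q{"Qcur"}, K{"Kcur"}, V{"Vcur"}, Wv_b{"wv_b"};
        build_attn_sketch(&Q, &K, &V, nullptr, nullptr, 0.125f); // every caller in these hunks
        build_attn_sketch(&Q, &K, &V, nullptr, &Wv_b,   0.125f); // the DeepSeek2 MLA path
    }

Passing nullptr keeps the previous behaviour, so only the MLA path picks up new semantics.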
@@ -9816,7 +10582,7 @@ struct llm_build_t5_enc : public llm_graph_context {
9816
10582
 
9817
10583
  cur = build_attn(inp_attn, gf,
9818
10584
  model.layers[il].wo_enc, nullptr,
9819
- Qcur, Kcur, Vcur, kq_b, 1.0f, il);
10585
+ Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
9820
10586
  cb(cur, "kqv_out", il);
9821
10587
  }
9822
10588
 
@@ -9922,7 +10688,7 @@ struct llm_build_t5_dec : public llm_graph_context {
9922
10688
 
9923
10689
  cur = build_attn(inp_attn_self, gf,
9924
10690
  model.layers[il].wo, model.layers[il].bo,
9925
- Qcur, Kcur, Vcur, kq_b, 1.0f, il);
10691
+ Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
9926
10692
  cb(cur, "kqv_out", il);
9927
10693
  }
9928
10694
 
@@ -9954,7 +10720,7 @@ struct llm_build_t5_dec : public llm_graph_context {
9954
10720
 
9955
10721
  cur = build_attn(inp_attn_cross, gf,
9956
10722
  model.layers[il].wo_cross, nullptr,
9957
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
10723
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
9958
10724
  cb(cur, "kqv_out", il);
9959
10725
 
9960
10726
  //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -10087,7 +10853,7 @@ struct llm_build_jais : public llm_graph_context {
10087
10853
 
10088
10854
  cur = build_attn(inp_attn, gf,
10089
10855
  model.layers[il].wo, model.layers[il].bo,
10090
- Qcur, Kcur, Vcur, nullptr, 1.0f/float(n_embd_head), il);
10856
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
10091
10857
  }
10092
10858
 
10093
10859
  if (il == n_layer - 1) {
@@ -10219,7 +10985,7 @@ struct llm_build_chatglm : public llm_graph_context {
10219
10985
 
10220
10986
  cur = build_attn(inp_attn, gf,
10221
10987
  model.layers[il].wo, NULL,
10222
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10988
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10223
10989
  }
10224
10990
 
10225
10991
  if (il == n_layer - 1) {
@@ -10272,6 +11038,157 @@ struct llm_build_chatglm : public llm_graph_context {
10272
11038
  }
10273
11039
  };
10274
11040
 
11041
+ struct llm_build_glm4 : public llm_graph_context {
11042
+ llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
11043
+ const int64_t n_embd_head = hparams.n_embd_head_v;
11044
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
11045
+
11046
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
11047
+
11048
+ ggml_tensor * cur;
11049
+ ggml_tensor * inpL;
11050
+
11051
+ inpL = build_inp_embd(model.tok_embd);
11052
+
11053
+ // inp_pos - contains the positions
11054
+ ggml_tensor * inp_pos = build_inp_pos();
11055
+
11056
+ auto * inp_attn = build_attn_inp_kv_unified();
11057
+
11058
+ for (int il = 0; il < n_layer; ++il) {
11059
+ ggml_tensor * inpSA = inpL;
11060
+
11061
+ // Pre-attention norm
11062
+ cur = build_norm(inpL,
11063
+ model.layers[il].attn_norm,
11064
+ NULL,
11065
+ LLM_NORM_RMS, il);
11066
+ cb(cur, "attn_norm", il);
11067
+
11068
+ // self-attention
11069
+ {
11070
+ ggml_tensor * Qcur = nullptr;
11071
+ ggml_tensor * Kcur = nullptr;
11072
+ ggml_tensor * Vcur = nullptr;
11073
+
11074
+ if (model.layers[il].wqkv == nullptr) {
11075
+ Qcur = build_lora_mm(model.layers[il].wq, cur);
11076
+ if (model.layers[il].bq) {
11077
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
11078
+ }
11079
+ Kcur = build_lora_mm(model.layers[il].wk, cur);
11080
+ if (model.layers[il].bk) {
11081
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
11082
+ }
11083
+ Vcur = build_lora_mm(model.layers[il].wv, cur);
11084
+ if (model.layers[il].bv) {
11085
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
11086
+ }
11087
+ } else {
11088
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
11089
+ cb(cur, "wqkv", il);
11090
+ if (model.layers[il].bqkv) {
11091
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
11092
+ cb(cur, "bqkv", il);
11093
+ }
11094
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
11095
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
11096
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
11097
+ }
11098
+
11099
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
11100
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
11101
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
11102
+
11103
+ Qcur = ggml_rope_ext(
11104
+ ctx0, Qcur, inp_pos, nullptr,
11105
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11106
+ ext_factor, attn_factor, beta_fast, beta_slow
11107
+ );
11108
+
11109
+ Kcur = ggml_rope_ext(
11110
+ ctx0, Kcur, inp_pos, nullptr,
11111
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11112
+ ext_factor, attn_factor, beta_fast, beta_slow
11113
+ );
11114
+
11115
+ cb(Qcur, "Qcur", il);
11116
+ cb(Kcur, "Kcur", il);
11117
+ cb(Vcur, "Vcur", il);
11118
+
11119
+ cur = build_attn(inp_attn, gf,
11120
+ model.layers[il].wo, NULL,
11121
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11122
+ }
11123
+
11124
+ if (il == n_layer - 1) {
11125
+ // skip computing output for unused tokens
11126
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11127
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11128
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11129
+ }
11130
+
11131
+ // Post-attention norm (new!)
11132
+ cur = build_norm(cur,
11133
+ model.layers[il].attn_post_norm,
11134
+ NULL,
11135
+ LLM_NORM_RMS, il);
11136
+ cb(cur, "post_attn_norm", il);
11137
+
11138
+ // Add the input (residual connection after post-attention norm)
11139
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
11140
+ cb(ffn_inp, "ffn_inp", il);
11141
+
11142
+ // FF
11143
+ {
11144
+ // Pre-MLP norm
11145
+ cur = build_norm(ffn_inp,
11146
+ model.layers[il].ffn_norm,
11147
+ NULL,
11148
+ LLM_NORM_RMS, il);
11149
+ cb(cur, "ffn_norm", il);
11150
+
11151
+ // MLP
11152
+ cur = build_ffn(cur,
11153
+ model.layers[il].ffn_up, NULL, NULL,
11154
+ NULL, NULL, NULL,
11155
+ model.layers[il].ffn_down, NULL, NULL,
11156
+ NULL,
11157
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
11158
+ cb(cur, "ffn_out", il);
11159
+
11160
+ // Post-MLP norm
11161
+ cur = build_norm(cur,
11162
+ model.layers[il].ffn_post_norm,
11163
+ NULL,
11164
+ LLM_NORM_RMS, il);
11165
+ cb(cur, "post_mlp_norm", il);
11166
+ }
11167
+
11168
+ // Add residual connection after post-MLP norm
11169
+ inpL = ggml_add(ctx0, cur, ffn_inp);
11170
+ cb(inpL, "l_out", il);
11171
+ }
11172
+
11173
+ // Final norm
11174
+ cur = build_norm(inpL,
11175
+ model.output_norm,
11176
+ NULL,
11177
+ LLM_NORM_RMS, -1);
11178
+
11179
+ cb(cur, "result_norm", -1);
11180
+ res->t_embd = cur;
11181
+
11182
+ // Output projection
11183
+ cur = build_lora_mm(model.output, cur);
11184
+
11185
+ cb(cur, "result_output", -1);
11186
+ res->t_logits = cur;
11187
+
11188
+ ggml_build_forward_expand(gf, cur);
11189
+ }
11190
+ };
11191
+
10275
11192
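The new llm_build_glm4 above uses a double ("sandwich") normalization layout: RMS norms both before and after the attention block, and again before and after the feed-forward block (a gate-free SwiGLU, LLM_FFN_SWIGLU with LLM_FFN_SEQ), with the residual added only after each post-norm. A small sketch of the per-layer data flow, assuming generic callables in place of the real ggml graph ops:

    #include <functional>

    // h = post_attn_norm(attn(pre_attn_norm(x))) + x
    // y = post_ffn_norm(ffn(pre_ffn_norm(h)))   + h
    template <typename T>
    T glm4_layer_sketch(T x,
                        const std::function<T(T)> & pre_attn_norm, const std::function<T(T)> & attn,
                        const std::function<T(T)> & post_attn_norm,
                        const std::function<T(T)> & pre_ffn_norm,  const std::function<T(T)> & ffn,
                        const std::function<T(T)> & post_ffn_norm) {
        T h = post_attn_norm(attn(pre_attn_norm(x))) + x;
        return post_ffn_norm(ffn(pre_ffn_norm(h))) + h;
    }

    int main() {
        auto id = std::function<float(float)>([](float v) { return v; });
        return (int) glm4_layer_sketch<float>(0.0f, id, id, id, id, id, id);
    }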
  struct llm_build_nemotron : public llm_graph_context {
10276
11193
  llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
10277
11194
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -10345,7 +11262,7 @@ struct llm_build_nemotron : public llm_graph_context {
10345
11262
 
10346
11263
  cur = build_attn(inp_attn, gf,
10347
11264
  model.layers[il].wo, model.layers[il].bo,
10348
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11265
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10349
11266
  }
10350
11267
 
10351
11268
  if (il == n_layer - 1) {
@@ -10430,7 +11347,7 @@ struct llm_build_exaone : public llm_graph_context {
10430
11347
  // self-attention
10431
11348
  {
10432
11349
  // rope freq factors for llama3; may return nullptr for llama2 and other models
10433
- ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
11350
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
10434
11351
 
10435
11352
  // compute Q and K and RoPE them
10436
11353
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -10476,7 +11393,7 @@ struct llm_build_exaone : public llm_graph_context {
10476
11393
 
10477
11394
  cur = build_attn(inp_attn, gf,
10478
11395
  model.layers[il].wo, model.layers[il].bo,
10479
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11396
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10480
11397
  }
10481
11398
 
10482
11399
  if (il == n_layer - 1) {
@@ -10575,7 +11492,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
10575
11492
  ggml_tensor * state_mask,
10576
11493
  const llama_ubatch & ubatch,
10577
11494
  int il) const {
10578
- const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
11495
+ const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
10579
11496
 
10580
11497
  const auto n_tokens = ubatch.n_tokens;
10581
11498
  const auto n_seqs = ubatch.n_seqs;
@@ -10971,7 +11888,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
10971
11888
  ggml_tensor *& first_layer_value,
10972
11889
  const llama_ubatch & ubatch,
10973
11890
  int il) const {
10974
- const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
11891
+ const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
10975
11892
 
10976
11893
  const auto n_tokens = ubatch.n_tokens;
10977
11894
  const auto n_seqs = ubatch.n_seqs;
@@ -11280,14 +12197,15 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
11280
12197
  }
11281
12198
  };
11282
12199
 
11283
- // ref: https://github.com/facebookresearch/chameleon
11284
- // based on the original build_llama() function, changes:
11285
- // * qk-norm
11286
- // * swin-norm
11287
- // * removed bias
11288
- // * removed MoE
11289
- struct llm_build_chameleon : public llm_graph_context {
11290
- llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
12200
+
12201
+ struct llm_build_granite : public llm_graph_context {
12202
+ llm_build_granite(
12203
+ const llama_model & model,
12204
+ const llm_graph_params & params,
12205
+ ggml_cgraph * gf,
12206
+ const bool use_rope = true)
12207
+ : llm_graph_context(params) {
12208
+
11291
12209
  const int64_t n_embd_head = hparams.n_embd_head_v;
11292
12210
 
11293
12211
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11298,27 +12216,214 @@ struct llm_build_chameleon : public llm_graph_context {
11298
12216
 
11299
12217
  inpL = build_inp_embd(model.tok_embd);
11300
12218
 
11301
- // inp_pos - contains the positions
11302
- ggml_tensor * inp_pos = build_inp_pos();
12219
+ // inp_pos - built only if rope enabled
12220
+ ggml_tensor * inp_pos = nullptr;
12221
+ if (use_rope) {
12222
+ inp_pos = build_inp_pos();
12223
+ }
11303
12224
 
11304
12225
  auto * inp_attn = build_attn_inp_kv_unified();
11305
12226
 
12227
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
11306
12228
  for (int il = 0; il < n_layer; ++il) {
11307
12229
  ggml_tensor * inpSA = inpL;
11308
12230
 
11309
12231
  // norm
11310
- if (hparams.swin_norm) {
11311
- cur = inpL;
11312
- } else {
11313
- cur = build_norm(inpL,
11314
- model.layers[il].attn_norm, NULL,
11315
- LLM_NORM_RMS, il);
11316
- cb(cur, "attn_norm", il);
11317
- }
12232
+ cur = build_norm(inpL,
12233
+ model.layers[il].attn_norm, NULL,
12234
+ LLM_NORM_RMS, il);
12235
+ cb(cur, "attn_norm", il);
11318
12236
 
11319
12237
  // self-attention
11320
12238
  {
11321
- // compute Q and K and RoPE them
12239
+ // compute Q and K and (optionally) RoPE them
12240
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
12241
+ cb(Qcur, "Qcur", il);
12242
+ if (model.layers[il].bq) {
12243
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
12244
+ cb(Qcur, "Qcur", il);
12245
+ }
12246
+
12247
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
12248
+ cb(Kcur, "Kcur", il);
12249
+ if (model.layers[il].bk) {
12250
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
12251
+ cb(Kcur, "Kcur", il);
12252
+ }
12253
+
12254
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
12255
+ cb(Vcur, "Vcur", il);
12256
+ if (model.layers[il].bv) {
12257
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
12258
+ cb(Vcur, "Vcur", il);
12259
+ }
12260
+
12261
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12262
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
12263
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
12264
+
12265
+ if (use_rope) {
12266
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
12267
+ Qcur = ggml_rope_ext(
12268
+ ctx0, Qcur, inp_pos, rope_factors,
12269
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
12270
+ ext_factor, attn_factor, beta_fast, beta_slow
12271
+ );
12272
+
12273
+ Kcur = ggml_rope_ext(
12274
+ ctx0, Kcur, inp_pos, rope_factors,
12275
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
12276
+ ext_factor, attn_factor, beta_fast, beta_slow
12277
+ );
12278
+ }
12279
+
12280
+ cb(Qcur, "Qcur", il);
12281
+ cb(Kcur, "Kcur", il);
12282
+ cb(Vcur, "Vcur", il);
12283
+
12284
+ cur = build_attn(inp_attn, gf,
12285
+ model.layers[il].wo, model.layers[il].bo,
12286
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
12287
+ cb(cur, "attn_out", il);
12288
+ }
12289
+
12290
+ if (il == n_layer - 1) {
12291
+ // skip computing output for unused tokens
12292
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12293
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12294
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
12295
+ }
12296
+
12297
+ // For Granite architectures - scale residual
12298
+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
12299
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
12300
+ cb(ffn_inp, "ffn_inp", il);
12301
+
12302
+ // feed-forward network (non-MoE)
12303
+ if (model.layers[il].ffn_gate_inp == nullptr) {
12304
+
12305
+ cur = build_norm(ffn_inp,
12306
+ model.layers[il].ffn_norm, NULL,
12307
+ LLM_NORM_RMS, il);
12308
+ cb(cur, "ffn_norm", il);
12309
+
12310
+ cur = build_ffn(cur,
12311
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
12312
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
12313
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
12314
+ NULL,
12315
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
12316
+ cb(cur, "ffn_out", il);
12317
+
12318
+ } else {
12319
+ // MoE branch
12320
+ cur = build_norm(ffn_inp,
12321
+ model.layers[il].ffn_norm, NULL,
12322
+ LLM_NORM_RMS, il);
12323
+ cb(cur, "ffn_norm", il);
12324
+
12325
+ ggml_tensor * moe_out = build_moe_ffn(cur,
12326
+ model.layers[il].ffn_gate_inp,
12327
+ model.layers[il].ffn_up_exps,
12328
+ model.layers[il].ffn_gate_exps,
12329
+ model.layers[il].ffn_down_exps,
12330
+ nullptr,
12331
+ n_expert, n_expert_used,
12332
+ LLM_FFN_SILU, true,
12333
+ false, 0.0,
12334
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
12335
+ il);
12336
+ cb(moe_out, "ffn_moe_out", il);
12337
+
12338
+ // For Granite MoE Shared
12339
+ if (hparams.n_ff_shexp > 0) {
12340
+ ggml_tensor * ffn_shexp = build_ffn(cur,
12341
+ model.layers[il].ffn_up_shexp, NULL, NULL,
12342
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
12343
+ model.layers[il].ffn_down_shexp, NULL, NULL,
12344
+ NULL,
12345
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
12346
+ cb(ffn_shexp, "ffn_shexp", il);
12347
+
12348
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
12349
+ cb(cur, "ffn_out", il);
12350
+ } else {
12351
+ cur = moe_out;
12352
+ }
12353
+ }
12354
+
12355
+ // For Granite architectures - scale residual
12356
+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
12357
+ cur = ggml_add(ctx0, cur, ffn_inp);
12358
+ cb(cur, "ffn_out", il);
12359
+
12360
+ cur = build_cvec(cur, il);
12361
+ cb(cur, "l_out", il);
12362
+
12363
+ // input for next layer
12364
+ inpL = cur;
12365
+ }
12366
+
12367
+ cur = inpL;
12368
+
12369
+ cur = build_norm(cur,
12370
+ model.output_norm, NULL,
12371
+ LLM_NORM_RMS, -1);
12372
+
12373
+ cb(cur, "result_norm", -1);
12374
+ res->t_embd = cur;
12375
+
12376
+ // lm_head
12377
+ cur = build_lora_mm(model.output, cur);
12378
+
12379
+ // For Granite architectures - scale logits
12380
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
12381
+ cb(cur, "result_output", -1);
12382
+ res->t_logits = cur;
12383
+
12384
+ ggml_build_forward_expand(gf, cur);
12385
+ }
12386
+ };
12387
+
12388
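llm_build_granite, added above, is essentially the llama builder plus three scalar hyperparameters: f_attention_scale overrides the usual 1/sqrt(n_embd_head) KQ scale when non-zero, f_residual_scale multiplies the attention and FFN branches before each residual add, and the final logits are divided by f_logit_scale. RoPE can also be skipped entirely via the use_rope constructor flag, and the FFN switches between a dense SiLU MLP and a MoE path with an optional shared expert. A scalar sketch of the three scalings, with hypothetical values:

    #include <cmath>
    #include <cstdio>

    int main() {
        const float f_attention_scale = 0.0f;   // 0 -> fall back to 1/sqrt(head dim)
        const float f_residual_scale  = 0.22f;  // multiplies the attn/FFN branch before the residual add
        const float f_logit_scale     = 8.0f;   // logits are divided by this at the very end
        const int   n_embd_head       = 128;

        const float kq_scale = f_attention_scale == 0.0f ? 1.0f / std::sqrt((float) n_embd_head)
                                                         : f_attention_scale;

        float residual = 1.0f, branch_out = 0.5f, logit = 12.0f;
        float hidden       = residual + f_residual_scale * branch_out; // scale, then add inpSA/ffn_inp
        float scaled_logit = logit / f_logit_scale;                    // ggml_scale(cur, 1/f_logit_scale)

        std::printf("kq_scale=%.4f hidden=%.3f logit=%.3f\n", kq_scale, hidden, scaled_logit);
    }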
+ // ref: https://github.com/facebookresearch/chameleon
12389
+ // based on the original build_llama() function, changes:
12390
+ // * qk-norm
12391
+ // * swin-norm
12392
+ // * removed bias
12393
+ // * removed MoE
12394
+ struct llm_build_chameleon : public llm_graph_context {
12395
+ llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
12396
+ const int64_t n_embd_head = hparams.n_embd_head_v;
12397
+
12398
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
12399
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
12400
+
12401
+ ggml_tensor * cur;
12402
+ ggml_tensor * inpL;
12403
+
12404
+ inpL = build_inp_embd(model.tok_embd);
12405
+
12406
+ // inp_pos - contains the positions
12407
+ ggml_tensor * inp_pos = build_inp_pos();
12408
+
12409
+ auto * inp_attn = build_attn_inp_kv_unified();
12410
+
12411
+ for (int il = 0; il < n_layer; ++il) {
12412
+ ggml_tensor * inpSA = inpL;
12413
+
12414
+ // norm
12415
+ if (hparams.swin_norm) {
12416
+ cur = inpL;
12417
+ } else {
12418
+ cur = build_norm(inpL,
12419
+ model.layers[il].attn_norm, NULL,
12420
+ LLM_NORM_RMS, il);
12421
+ cb(cur, "attn_norm", il);
12422
+ }
12423
+
12424
+ // self-attention
12425
+ {
12426
+ // compute Q and K and RoPE them
11322
12427
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
11323
12428
  cb(Qcur, "Qcur", il);
11324
12429
 
@@ -11378,7 +12483,7 @@ struct llm_build_chameleon : public llm_graph_context {
11378
12483
 
11379
12484
  cur = build_attn(inp_attn, gf,
11380
12485
  model.layers[il].wo, nullptr,
11381
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12486
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11382
12487
 
11383
12488
  if (hparams.swin_norm) {
11384
12489
  cur = build_norm(cur,
@@ -11615,36 +12720,362 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {
11615
12720
  }
11616
12721
  };
11617
12722
 
11618
- llama_memory_i * llama_model::create_memory() const {
12723
+ struct llm_build_plm : public llm_graph_context {
12724
+ llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
12725
+ const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
12726
+
12727
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
12728
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
12729
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
12730
+
12731
+ ggml_tensor * cur;
12732
+ ggml_tensor * inpL;
12733
+
12734
+ // {n_embd, n_tokens}
12735
+ inpL = build_inp_embd(model.tok_embd);
12736
+
12737
+ // inp_pos - contains the positions
12738
+ ggml_tensor * inp_pos = build_inp_pos();
12739
+
12740
+ auto * inp_attn = build_attn_inp_kv_unified();
12741
+
12742
+ for (int il = 0; il < n_layer; ++il) {
12743
+ ggml_tensor * inpSA = inpL;
12744
+
12745
+ // norm
12746
+ cur = build_norm(inpL,
12747
+ model.layers[il].attn_norm, NULL,
12748
+ LLM_NORM_RMS, il);
12749
+ cb(cur, "attn_norm", il);
12750
+
12751
+ // self_attention
12752
+ {
12753
+ ggml_tensor * q = NULL;
12754
+ q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
12755
+ cb(q, "q", il);
12756
+
12757
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
12758
+ ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
12759
+ ggml_row_size(q->type, hparams.n_embd_head_k),
12760
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
12761
+ 0);
12762
+ cb(q_nope, "q_nope", il);
12763
+
12764
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
12765
+ ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
12766
+ ggml_row_size(q->type, hparams.n_embd_head_k),
12767
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
12768
+ ggml_row_size(q->type, n_embd_head_qk_nope));
12769
+ cb(q_pe, "q_pe", il);
12770
+
12771
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
12772
+ ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
12773
+ cb(kv_pe_compresseed, "kv_pe_compresseed", il);
12774
+
12775
+ // split into {kv_lora_rank, n_tokens}
12776
+ ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
12777
+ kv_pe_compresseed->nb[1],
12778
+ 0);
12779
+ cb(kv_compressed, "kv_compressed", il);
12780
+
12781
+ // and {n_embd_head_qk_rope, n_tokens}
12782
+ ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
12783
+ kv_pe_compresseed->nb[1],
12784
+ kv_pe_compresseed->nb[1],
12785
+ ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
12786
+ cb(k_pe, "k_pe", il);
12787
+
12788
+ kv_compressed = build_norm(kv_compressed,
12789
+ model.layers[il].attn_kv_a_norm, NULL,
12790
+ LLM_NORM_RMS, il);
12791
+ cb(kv_compressed, "kv_compressed", il);
12792
+
12793
+ // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
12794
+ ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
12795
+ cb(kv, "kv", il);
12796
+
12797
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
12798
+ ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
12799
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
12800
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
12801
+ 0);
12802
+ cb(k_nope, "k_nope", il);
12803
+
12804
+ // and {n_head * n_embd_head_v, n_tokens}
12805
+ ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
12806
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
12807
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
12808
+ ggml_row_size(kv->type, (n_embd_head_qk_nope)));
12809
+ cb(v_states, "v_states", il);
12810
+
12811
+ v_states = ggml_cont(ctx0, v_states);
12812
+ cb(v_states, "v_states", il);
12813
+
12814
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
12815
+ ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
12816
+ 0);
12817
+ cb(v_states, "v_states", il);
12818
+
12819
+ q_pe = ggml_rope_ext(
12820
+ ctx0, q_pe, inp_pos, nullptr,
12821
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
12822
+ ext_factor, attn_factor, beta_fast, beta_slow
12823
+ );
12824
+ cb(q_pe, "q_pe", il);
12825
+
12826
+ // shared RoPE key
12827
+ k_pe = ggml_rope_ext(
12828
+ ctx0, k_pe, inp_pos, nullptr,
12829
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
12830
+ ext_factor, attn_factor, beta_fast, beta_slow
12831
+ );
12832
+ cb(k_pe, "k_pe", il);
12833
+
12834
+ ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
12835
+ cb(q_states, "q_states", il);
12836
+
12837
+ ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
12838
+ cb(k_states, "k_states", il);
12839
+
12840
+ cur = build_attn(inp_attn, gf,
12841
+ model.layers[il].wo, NULL,
12842
+ q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
12843
+ }
12844
+
12845
+ if (il == n_layer - 1) {
12846
+ // skip computing output for unused tokens
12847
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12848
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12849
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
12850
+ }
12851
+
12852
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
12853
+ cb(ffn_inp, "ffn_inp", il);
12854
+
12855
+ cur = build_norm(ffn_inp,
12856
+ model.layers[il].ffn_norm, NULL,
12857
+ LLM_NORM_RMS, il);
12858
+ cb(cur, "ffn_norm", il);
12859
+
12860
+ cur = build_ffn(cur,
12861
+ model.layers[il].ffn_up, NULL, NULL,
12862
+ NULL, NULL, NULL,
12863
+ model.layers[il].ffn_down, NULL, NULL,
12864
+ NULL,
12865
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
12866
+ cb(cur, "ffn_out", il);
12867
+
12868
+ cur = ggml_add(ctx0, cur, ffn_inp);
12869
+
12870
+ cur = build_cvec(cur, il);
12871
+ cb(cur, "l_out", il);
12872
+
12873
+ // input for next layer
12874
+ inpL = cur;
12875
+ }
12876
+
12877
+ cur = inpL;
12878
+
12879
+ cur = build_norm(cur,
12880
+ model.output_norm, NULL,
12881
+ LLM_NORM_RMS, -1);
12882
+
12883
+ cb(cur, "result_norm", -1);
12884
+ res->t_embd = cur;
12885
+
12886
+ cur = build_lora_mm(model.output, cur);
12887
+
12888
+ cb(cur, "result_output", -1);
12889
+ res->t_logits = cur;
12890
+
12891
+ ggml_build_forward_expand(gf, cur);
12892
+ }
12893
+ };
12894
+
12895
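llm_build_plm, added above, reuses the DeepSeek2-style compressed-KV attention (wkv_a_mqa projection down to kv_lora_rank plus a shared RoPE key, expanded again through wkv_b) but pairs it with a plain two-matrix feed-forward that uses a squared-ReLU activation (LLM_FFN_RELU_SQR applied sequentially, no gate matrix). A tiny sketch of that activation:

    #include <algorithm>
    #include <cstdio>

    // the FFN is down( relu(up(x))^2 ); this shows only the element-wise part
    static float relu_sqr(float x) {
        const float r = std::max(0.0f, x);
        return r * r;
    }

    int main() {
        const float xs[] = {-1.5f, 0.0f, 0.7f, 2.0f};
        for (float x : xs) {
            std::printf("relu_sqr(%+.1f) = %.2f\n", x, relu_sqr(x));
        }
    }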
+ struct llm_build_bailingmoe : public llm_graph_context {
12896
+ llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
12897
+ ggml_tensor * cur;
12898
+ ggml_tensor * inpL;
12899
+
12900
+ inpL = build_inp_embd(model.tok_embd);
12901
+
12902
+ // inp_pos - contains the positions
12903
+ ggml_tensor * inp_pos = build_inp_pos();
12904
+
12905
+ auto * inp_attn = build_attn_inp_kv_unified();
12906
+
12907
+ for (int il = 0; il < n_layer; ++il) {
12908
+ ggml_tensor * inpSA = inpL;
12909
+
12910
+ // norm
12911
+ cur = build_norm(inpL,
12912
+ model.layers[il].attn_norm, NULL,
12913
+ LLM_NORM_RMS, il);
12914
+ cb(cur, "attn_norm", il);
12915
+
12916
+ // self-attention
12917
+ {
12918
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
12919
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
12920
+
12921
+ // compute Q and K and RoPE them
12922
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
12923
+ cb(Qcur, "Qcur", il);
12924
+ if (model.layers[il].bq) {
12925
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
12926
+ cb(Qcur, "Qcur", il);
12927
+ }
12928
+
12929
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
12930
+ cb(Kcur, "Kcur", il);
12931
+ if (model.layers[il].bk) {
12932
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
12933
+ cb(Kcur, "Kcur", il);
12934
+ }
12935
+
12936
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
12937
+ cb(Vcur, "Vcur", il);
12938
+ if (model.layers[il].bv) {
12939
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
12940
+ cb(Vcur, "Vcur", il);
12941
+ }
12942
+
12943
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
12944
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
12945
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
12946
+
12947
+ Qcur = ggml_rope_ext(
12948
+ ctx0, Qcur, inp_pos, rope_factors,
12949
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
12950
+ ext_factor, attn_factor, beta_fast, beta_slow
12951
+ );
12952
+
12953
+ Kcur = ggml_rope_ext(
12954
+ ctx0, Kcur, inp_pos, rope_factors,
12955
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
12956
+ ext_factor, attn_factor, beta_fast, beta_slow
12957
+ );
12958
+
12959
+ cb(Qcur, "Qcur", il);
12960
+ cb(Kcur, "Kcur", il);
12961
+ cb(Vcur, "Vcur", il);
12962
+
12963
+ cur = build_attn(inp_attn, gf,
12964
+ model.layers[il].wo, model.layers[il].bo,
12965
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
12966
+ }
12967
+
12968
+ if (il == n_layer - 1) {
12969
+ // skip computing output for unused tokens
12970
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12971
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12972
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
12973
+ }
12974
+
12975
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
12976
+ cb(ffn_inp, "ffn_inp", il);
12977
+
12978
+ cur = build_norm(ffn_inp,
12979
+ model.layers[il].ffn_norm, NULL,
12980
+ LLM_NORM_RMS, il);
12981
+ cb(cur, "ffn_norm", il);
12982
+
12983
+ ggml_tensor * moe_out =
12984
+ build_moe_ffn(cur,
12985
+ model.layers[il].ffn_gate_inp,
12986
+ model.layers[il].ffn_up_exps,
12987
+ model.layers[il].ffn_gate_exps,
12988
+ model.layers[il].ffn_down_exps,
12989
+ nullptr,
12990
+ n_expert, n_expert_used,
12991
+ LLM_FFN_SILU, hparams.expert_weights_norm,
12992
+ false, hparams.expert_weights_scale,
12993
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
12994
+ il);
12995
+ cb(moe_out, "ffn_moe_out", il);
12996
+
12997
+ // FFN shared expert
12998
+ {
12999
+ ggml_tensor * ffn_shexp = build_ffn(cur,
13000
+ model.layers[il].ffn_up_shexp, NULL, NULL,
13001
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
13002
+ model.layers[il].ffn_down_shexp, NULL, NULL,
13003
+ NULL,
13004
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
13005
+ cb(ffn_shexp, "ffn_shexp", il);
13006
+
13007
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
13008
+ cb(cur, "ffn_out", il);
13009
+ }
13010
+
13011
+ cur = ggml_add(ctx0, cur, ffn_inp);
13012
+
13013
+ cur = build_cvec(cur, il);
13014
+ cb(cur, "l_out", il);
13015
+
13016
+ // input for next layer
13017
+ inpL = cur;
13018
+ }
13019
+
13020
+ cur = inpL;
13021
+
13022
+ cur = build_norm(cur,
13023
+ model.output_norm, NULL,
13024
+ LLM_NORM_RMS, -1);
13025
+
13026
+ cb(cur, "result_norm", -1);
13027
+ res->t_embd = cur;
13028
+
13029
+ // lm_head
13030
+ cur = build_lora_mm(model.output, cur);
13031
+
13032
+ cb(cur, "result_output", -1);
13033
+ res->t_logits = cur;
13034
+
13035
+ ggml_build_forward_expand(gf, cur);
13036
+ }
13037
+ };
13038
+
13039
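llm_build_bailingmoe, added above, sizes its attention heads by n_rot (including the 1/sqrt(n_rot) KQ scale) and always combines a routed mixture-of-experts FFN (softmax gating, expert weights optionally normalized and scaled via hparams) with a shared-expert FFN whose output is simply added on top. A toy sketch of that combination step, with made-up weights standing in for build_moe_ffn and build_ffn:

    #include <cstdio>
    #include <vector>

    int main() {
        // pretend 2 experts were selected for this token, with softmax routing weights
        std::vector<float> expert_out    = {0.8f, -0.2f};
        std::vector<float> expert_weight = {0.7f,  0.3f};

        float moe_out = 0.0f;
        for (size_t i = 0; i < expert_out.size(); ++i) {
            moe_out += expert_weight[i] * expert_out[i];      // routed experts (build_moe_ffn)
        }

        const float shexp_out = 0.1f;                         // shared-expert FFN (build_ffn)
        std::printf("ffn_out = %.3f\n", moe_out + shexp_out); // cur = ggml_add(moe_out, ffn_shexp)
    }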
+ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
11619
13040
  llama_memory_i * res;
11620
13041
 
11621
13042
  switch (arch) {
13043
+ case LLM_ARCH_BERT:
13044
+ case LLM_ARCH_JINA_BERT_V2:
13045
+ case LLM_ARCH_NOMIC_BERT:
13046
+ case LLM_ARCH_NOMIC_BERT_MOE:
13047
+ {
13048
+ res = nullptr;
13049
+ } break;
11622
13050
  case LLM_ARCH_MAMBA:
11623
13051
  case LLM_ARCH_RWKV6:
11624
13052
  case LLM_ARCH_RWKV6QWEN2:
11625
13053
  case LLM_ARCH_RWKV7:
11626
13054
  case LLM_ARCH_ARWKV7:
11627
13055
  {
11628
- res = new llama_kv_cache_unified(hparams, {
11629
- /*.get_rope_factors =*/ nullptr
11630
- });
13056
+ res = new llama_kv_cache_recurrent(
13057
+ *this,
13058
+ GGML_TYPE_F32,
13059
+ GGML_TYPE_F32,
13060
+ cparams.offload_kqv,
13061
+ std::max((uint32_t) 1, cparams.n_seq_max));
11631
13062
  } break;
11632
13063
  default:
11633
13064
  {
11634
- res = new llama_kv_cache_unified(hparams, {
11635
- /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
11636
- // choose long/short freq factors based on the context size
11637
- if (layers[il].rope_freqs != nullptr) {
11638
- return layers[il].rope_freqs;
11639
- }
13065
+ const auto padding = llama_kv_cache_unified::get_padding(cparams);
11640
13066
 
11641
- if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
11642
- return layers[il].rope_long;
11643
- }
13067
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
11644
13068
 
11645
- return layers[il].rope_short;
11646
- }
11647
- });
13069
+ LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
13070
+
13071
+ res = new llama_kv_cache_unified(
13072
+ *this,
13073
+ params.type_k,
13074
+ params.type_v,
13075
+ !cparams.flash_attn,
13076
+ cparams.offload_kqv,
13077
+ cparams.n_ctx,
13078
+ padding);
11648
13079
  }
11649
13080
  }
11650
13081
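create_memory now receives the memory params and the (mutable) cparams and picks one of three strategies: embedding-only BERT-family architectures get no KV memory at all, recurrent architectures (Mamba and the RWKV variants) get the new llama_kv_cache_recurrent sized by n_seq_max, and every other architecture gets llama_kv_cache_unified with n_ctx rounded up to the cache's padding requirement. A sketch of that rounding step; the value 256 below is only an assumed stand-in for whatever llama_kv_cache_unified::get_padding(cparams) actually returns:

    #include <cstdint>
    #include <cstdio>

    // same rounding as GGML_PAD(x, padding)
    static uint32_t pad_to(uint32_t x, uint32_t padding) {
        return ((x + padding - 1) / padding) * padding;
    }

    int main() {
        const uint32_t padding = 256;  // assumed, see note above
        const uint32_t ctxs[]  = {4096u, 5000u};
        for (uint32_t n_ctx : ctxs) {
            std::printf("n_ctx %u -> %u (padded)\n", n_ctx, pad_to(n_ctx, padding));
        }
    }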
 
@@ -11659,9 +13090,8 @@ llm_graph_result_ptr llama_model::build_graph(
11659
13090
 
11660
13091
  switch (arch) {
11661
13092
  case LLM_ARCH_LLAMA:
13093
+ case LLM_ARCH_LLAMA4:
11662
13094
  case LLM_ARCH_MINICPM:
11663
- case LLM_ARCH_GRANITE:
11664
- case LLM_ARCH_GRANITE_MOE:
11665
13095
  {
11666
13096
  llm = std::make_unique<llm_build_llama>(*this, params, gf);
11667
13097
  } break;
@@ -11692,6 +13122,7 @@ llm_graph_result_ptr llama_model::build_graph(
11692
13122
  case LLM_ARCH_BERT:
11693
13123
  case LLM_ARCH_JINA_BERT_V2:
11694
13124
  case LLM_ARCH_NOMIC_BERT:
13125
+ case LLM_ARCH_NOMIC_BERT_MOE:
11695
13126
  {
11696
13127
  llm = std::make_unique<llm_build_bert>(*this, params, gf);
11697
13128
  } break;
@@ -11723,6 +13154,14 @@ llm_graph_result_ptr llama_model::build_graph(
11723
13154
  {
11724
13155
  llm = std::make_unique<llm_build_qwen2moe>(*this, params, gf);
11725
13156
  } break;
13157
+ case LLM_ARCH_QWEN3:
13158
+ {
13159
+ llm = std::make_unique<llm_build_qwen3>(*this, params, gf);
13160
+ } break;
13161
+ case LLM_ARCH_QWEN3MOE:
13162
+ {
13163
+ llm = std::make_unique<llm_build_qwen3moe>(*this, params, gf);
13164
+ } break;
11726
13165
  case LLM_ARCH_PHI2:
11727
13166
  {
11728
13167
  llm = std::make_unique<llm_build_phi2>(*this, params, gf);
@@ -11828,6 +13267,10 @@ llm_graph_result_ptr llama_model::build_graph(
11828
13267
  {
11829
13268
  llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
11830
13269
  } break;
13270
+ case LLM_ARCH_GLM4:
13271
+ {
13272
+ llm = std::make_unique<llm_build_glm4>(*this, params, gf);
13273
+ } break;
11831
13274
  case LLM_ARCH_BITNET:
11832
13275
  {
11833
13276
  llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
@@ -11846,10 +13289,11 @@ llm_graph_result_ptr llama_model::build_graph(
11846
13289
  GGML_ABORT("invalid graph type");
11847
13290
  };
11848
13291
  } break;
11849
- //case LLM_ARCH_T5ENCODER:
11850
- // {
11851
- // llm.build_t5_enc(gf);
11852
- // } break;
13292
+ case LLM_ARCH_T5ENCODER:
13293
+ {
13294
+ llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
13295
+ }
13296
+ break;
11853
13297
  case LLM_ARCH_JAIS:
11854
13298
  {
11855
13299
  llm = std::make_unique<llm_build_jais>(*this, params, gf);
@@ -11878,6 +13322,11 @@ llm_graph_result_ptr llama_model::build_graph(
11878
13322
  {
11879
13323
  llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
11880
13324
  } break;
13325
+ case LLM_ARCH_GRANITE:
13326
+ case LLM_ARCH_GRANITE_MOE:
13327
+ {
13328
+ llm = std::make_unique<llm_build_granite>(*this, params, gf);
13329
+ } break;
11881
13330
  case LLM_ARCH_CHAMELEON:
11882
13331
  {
11883
13332
  llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
@@ -11886,6 +13335,14 @@ llm_graph_result_ptr llama_model::build_graph(
11886
13335
  {
11887
13336
  llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
11888
13337
  } break;
13338
+ case LLM_ARCH_PLM:
13339
+ {
13340
+ llm = std::make_unique<llm_build_plm>(*this, params, gf);
13341
+ } break;
13342
+ case LLM_ARCH_BAILINGMOE:
13343
+ {
13344
+ llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
13345
+ } break;
11889
13346
  default:
11890
13347
  GGML_ABORT("fatal error");
11891
13348
  }
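The build_graph dispatch above now routes the newly added architectures (LLAMA4, QWEN3, QWEN3MOE, GLM4, GRANITE/GRANITE_MOE, PLM, BAILINGMOE) to their builders and un-comments LLM_ARCH_T5ENCODER, so encoder-only T5 checkpoints can be evaluated. A hedged usage sketch for the encoder path, assuming the standard llama.h API of this version and omitting model loading and error handling:

    #include "llama.h"

    // returns 0 on success, non-zero on failure
    static int encode_tokens(llama_context * ctx, llama_token * tokens, int32_t n_tokens) {
        llama_batch batch = llama_batch_get_one(tokens, n_tokens);
        return llama_encode(ctx, batch);
    }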
@@ -11903,6 +13360,7 @@ llm_graph_result_ptr llama_model::build_graph(
11903
13360
  llama_model_params llama_model_default_params() {
11904
13361
  llama_model_params result = {
11905
13362
  /*.devices =*/ nullptr,
13363
+ /*.tensor_buft_overrides =*/ nullptr,
11906
13364
  /*.n_gpu_layers =*/ 0,
11907
13365
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
11908
13366
  /*.main_gpu =*/ 0,
@@ -11998,11 +13456,10 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
11998
13456
 
11999
13457
  // use what we call a normal RoPE, operating on pairs of consecutive head values
12000
13458
  case LLM_ARCH_LLAMA:
13459
+ case LLM_ARCH_LLAMA4:
12001
13460
  case LLM_ARCH_DECI:
12002
13461
  case LLM_ARCH_BAICHUAN:
12003
13462
  case LLM_ARCH_STARCODER:
12004
- case LLM_ARCH_PLAMO:
12005
- case LLM_ARCH_ORION:
12006
13463
  case LLM_ARCH_INTERNLM2:
12007
13464
  case LLM_ARCH_MINICPM:
12008
13465
  case LLM_ARCH_XVERSE:
@@ -12012,10 +13469,13 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
12012
13469
  case LLM_ARCH_ARCTIC:
12013
13470
  case LLM_ARCH_DEEPSEEK:
12014
13471
  case LLM_ARCH_DEEPSEEK2:
13472
+ case LLM_ARCH_PLM:
12015
13473
  case LLM_ARCH_CHATGLM:
13474
+ case LLM_ARCH_GLM4:
12016
13475
  case LLM_ARCH_GRANITE:
12017
13476
  case LLM_ARCH_GRANITE_MOE:
12018
13477
  case LLM_ARCH_CHAMELEON:
13478
+ case LLM_ARCH_BAILINGMOE:
12019
13479
  return LLAMA_ROPE_TYPE_NORM;
12020
13480
 
12021
13481
  // the pairs of head values are offset by n_rot/2
@@ -12024,16 +13484,20 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
12024
13484
  case LLM_ARCH_DBRX:
12025
13485
  case LLM_ARCH_BERT:
12026
13486
  case LLM_ARCH_NOMIC_BERT:
13487
+ case LLM_ARCH_NOMIC_BERT_MOE:
12027
13488
  case LLM_ARCH_STABLELM:
12028
13489
  case LLM_ARCH_BITNET:
12029
13490
  case LLM_ARCH_QWEN:
12030
13491
  case LLM_ARCH_QWEN2:
12031
13492
  case LLM_ARCH_QWEN2MOE:
13493
+ case LLM_ARCH_QWEN3:
13494
+ case LLM_ARCH_QWEN3MOE:
12032
13495
  case LLM_ARCH_OLMO2:
12033
13496
  case LLM_ARCH_OLMOE:
12034
13497
  case LLM_ARCH_PHI2:
12035
13498
  case LLM_ARCH_PHI3:
12036
13499
  case LLM_ARCH_PHIMOE:
13500
+ case LLM_ARCH_PLAMO:
12037
13501
  case LLM_ARCH_GEMMA:
12038
13502
  case LLM_ARCH_GEMMA2:
12039
13503
  case LLM_ARCH_GEMMA3:
@@ -12041,6 +13505,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
12041
13505
  case LLM_ARCH_OPENELM:
12042
13506
  case LLM_ARCH_GPTNEOX:
12043
13507
  case LLM_ARCH_CODESHELL:
13508
+ case LLM_ARCH_ORION:
12044
13509
  case LLM_ARCH_NEMOTRON:
12045
13510
  case LLM_ARCH_EXAONE:
12046
13511
  case LLM_ARCH_MINICPM3:
@@ -12113,6 +13578,14 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
12113
13578
  : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
12114
13579
  const auto & it = model->gguf_kv.find(key);
12115
13580
  if (it == model->gguf_kv.end()) {
13581
+ // one-off fix for very popular models (so we are not flooded with issues)
13582
+ // do not extend this list unless absolutely necessary
13583
+ // Mistral-Small-2503 does not have built-in chat template
13584
+ llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
13585
+ if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
13586
+ return "mistral-v7-tekken";
13587
+ }
13588
+
12116
13589
  return nullptr;
12117
13590
  }
12118
13591
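The chat-template lookup above adds one narrow fallback: when the GGUF carries no tokenizer chat template but the vocab pre-type is TEKKEN and the model has 40 layers (i.e. Mistral-Small-2503), llama_model_chat_template() now returns the built-in template name "mistral-v7-tekken" instead of nullptr. A brief usage sketch, assuming only the llama.h declaration visible in the hunk header:

    #include <cstdio>
    #include "llama.h"

    // prints the template name the runtime will use, or a note if none is embedded
    static void print_chat_template(const llama_model * model) {
        const char * tmpl = llama_model_chat_template(model, /*name =*/ nullptr);
        std::printf("chat template: %s\n", tmpl ? tmpl : "(none embedded)");
    }

Callers that previously treated a nullptr return as "no template" keep working unchanged; only this one model family now yields a named built-in template.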