whispercpp 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (610)
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.md +99 -2
  4. data/ext/extconf.rb +1 -0
  5. data/ext/ruby_whisper.c +20 -4
  6. data/ext/ruby_whisper.h +30 -2
  7. data/ext/ruby_whisper_context.c +216 -124
  8. data/ext/ruby_whisper_context_params.c +163 -0
  9. data/ext/ruby_whisper_model.c +0 -1
  10. data/ext/ruby_whisper_params.c +0 -1
  11. data/ext/ruby_whisper_segment.c +0 -1
  12. data/ext/ruby_whisper_token.c +29 -9
  13. data/ext/ruby_whisper_transcribe.cpp +4 -1
  14. data/ext/ruby_whisper_vad_context.c +48 -1
  15. data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
  16. data/ext/ruby_whisper_vad_params.c +0 -1
  17. data/ext/ruby_whisper_vad_segment.c +0 -1
  18. data/ext/ruby_whisper_vad_segments.c +0 -1
  19. data/ext/sources/CMakeLists.txt +1 -1
  20. data/ext/sources/bindings/javascript/package.json +1 -1
  21. data/ext/sources/cmake/whisper-config.cmake.in +5 -40
  22. data/ext/sources/examples/bench/bench.cpp +23 -18
  23. data/ext/sources/examples/cli/cli.cpp +8 -0
  24. data/ext/sources/examples/common-ggml.cpp +2 -0
  25. data/ext/sources/examples/miniaudio.h +4507 -2131
  26. data/ext/sources/examples/server/server.cpp +18 -4
  27. data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
  28. data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
  29. data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
  30. data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
  31. data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
  32. data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
  33. data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
  34. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  35. data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
  36. data/ext/sources/examples/talk-llama/llama-context.h +27 -28
  37. data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
  38. data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
  39. data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
  40. data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
  41. data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
  42. data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
  43. data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
  44. data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
  45. data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
  47. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
  48. data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
  49. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
  50. data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
  51. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
  52. data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
  53. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
  54. data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
  55. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
  56. data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
  57. data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
  58. data/ext/sources/examples/talk-llama/llama-model.h +72 -19
  59. data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
  60. data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
  61. data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
  62. data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
  63. data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
  64. data/ext/sources/examples/talk-llama/llama.cpp +76 -22
  65. data/ext/sources/examples/talk-llama/llama.h +63 -30
  66. data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
  67. data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
  68. data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
  69. data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
  70. data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
  71. data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
  72. data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
  73. data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
  74. data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
  75. data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
  76. data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
  77. data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
  78. data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
  79. data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
  80. data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
  81. data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
  82. data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
  83. data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
  84. data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
  85. data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
  86. data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
  87. data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
  88. data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
  89. data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
  90. data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
  91. data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
  92. data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
  93. data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
  94. data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
  95. data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
  96. data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
  97. data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
  98. data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
  99. data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
  100. data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
  101. data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
  102. data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
  103. data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
  104. data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
  105. data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
  106. data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
  107. data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
  108. data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
  109. data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
  110. data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
  111. data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
  112. data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
  113. data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
  114. data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
  115. data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
  116. data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
  117. data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
  118. data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
  119. data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
  120. data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
  121. data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
  122. data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
  123. data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
  124. data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
  125. data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
  126. data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
  127. data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
  128. data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
  129. data/ext/sources/examples/talk-llama/models/models.h +181 -46
  130. data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
  131. data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
  132. data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
  133. data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
  134. data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
  135. data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
  136. data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
  137. data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
  138. data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
  139. data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
  140. data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
  141. data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
  142. data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
  143. data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
  144. data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
  145. data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
  146. data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
  147. data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
  148. data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
  149. data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
  150. data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
  151. data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
  152. data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
  153. data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
  154. data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
  155. data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
  156. data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
  157. data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
  158. data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
  159. data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
  160. data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
  161. data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
  162. data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
  163. data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
  164. data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
  165. data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
  166. data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
  167. data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
  168. data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
  169. data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
  170. data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
  171. data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
  172. data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
  173. data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
  174. data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
  175. data/ext/sources/ggml/CMakeLists.txt +9 -3
  176. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  177. data/ext/sources/ggml/include/ggml-cann.h +1 -1
  178. data/ext/sources/ggml/include/ggml-cpu.h +5 -0
  179. data/ext/sources/ggml/include/ggml-openvino.h +37 -0
  180. data/ext/sources/ggml/include/ggml-opt.h +1 -1
  181. data/ext/sources/ggml/include/ggml-rpc.h +6 -1
  182. data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
  183. data/ext/sources/ggml/include/ggml.h +56 -9
  184. data/ext/sources/ggml/src/CMakeLists.txt +3 -0
  185. data/ext/sources/ggml/src/ggml-alloc.c +4 -9
  186. data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
  187. data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
  188. data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
  189. data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
  190. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  191. data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
  192. data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
  193. data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
  194. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
  195. data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
  196. data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
  197. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
  198. data/ext/sources/ggml/src/ggml-common.h +11 -0
  199. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
  200. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
  201. data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
  202. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
  203. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
  204. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
  205. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
  206. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
  207. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
  208. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
  209. data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
  210. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
  211. data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
  212. data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
  213. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
  214. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
  215. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
  216. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
  217. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
  218. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
  219. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
  220. data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
  221. data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
  222. data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
  223. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
  224. data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
  225. data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  226. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
  227. data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
  228. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
  229. data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  230. data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
  231. data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
  232. data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
  233. data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
  234. data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
  235. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
  236. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
  237. data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
  238. data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
  239. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
  240. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
  241. data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
  242. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
  243. data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
  244. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
  245. data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
  246. data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
  247. data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
  248. data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
  249. data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
  250. data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
  251. data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
  252. data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
  253. data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
  254. data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
  255. data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
  256. data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
  257. data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
  258. data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
  259. data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
  260. data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
  261. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
  262. data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
  263. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
  264. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
  265. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
  266. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
  267. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
  268. data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
  269. data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
  270. data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
  271. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
  272. data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
  273. data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
  274. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
  275. data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
  276. data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
  277. data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
  278. data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
  279. data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
  280. data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
  281. data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  282. data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
  283. data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
  284. data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
  285. data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
  286. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
  287. data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
  288. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  289. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  290. data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  291. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
  292. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
  293. data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
  294. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
  295. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
  296. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  297. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
  298. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  299. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  300. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  301. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
  302. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
  303. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  304. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  305. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  306. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  307. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
  308. data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
  309. data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
  310. data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
  311. data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
  312. data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
  313. data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
  314. data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
  315. data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
  316. data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
  317. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  318. data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  319. data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
  320. data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  321. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
  322. data/ext/sources/ggml/src/ggml-impl.h +62 -0
  323. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
  324. data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
  325. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
  326. data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
  327. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
  328. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
  329. data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
  330. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
  331. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
  332. data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
  333. data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
  334. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
  335. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
  336. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
  337. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
  338. data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
  339. data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
  340. data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
  341. data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
  342. data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
  343. data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
  344. data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
  345. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  346. data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
  347. data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
  348. data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
  349. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  350. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  351. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  352. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  353. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  354. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  355. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  356. data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
  357. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  358. data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
  359. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
  360. data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
  361. data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
  362. data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  363. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
  364. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
  365. data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
  366. data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  367. data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
  368. data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
  369. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
  370. data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
  371. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
  372. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
  373. data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
  374. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
  375. data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
  376. data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
  377. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
  378. data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
  379. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
  380. data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
  381. data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
  382. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
  383. data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
  384. data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
  385. data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
  386. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
  387. data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
  388. data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
  389. data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
  390. data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
  391. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
  392. data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
  393. data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
  394. data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
  395. data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
  396. data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
  397. data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
  398. data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
  399. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
  400. data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
  401. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
  402. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
  403. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
  404. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
  405. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
  406. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
  407. data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
  408. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
  409. data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
  410. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
  411. data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
  412. data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
  413. data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
  414. data/ext/sources/ggml/src/ggml-quants.c +96 -5
  415. data/ext/sources/ggml/src/ggml-quants.h +3 -0
  416. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
  417. data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
  418. data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
  419. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
  420. data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
  421. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
  422. data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
  423. data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
  424. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
  425. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
  426. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  427. data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
  428. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
  429. data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
  430. data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
  431. data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
  432. data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
  433. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
  434. data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
  435. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
  436. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
  437. data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
  438. data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
  439. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
  440. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
  441. data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
  442. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
  443. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
  444. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
  445. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
  446. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
  447. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
  448. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
  449. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
  450. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
  451. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
  452. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
  453. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
  454. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
  455. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
  456. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
  457. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
  458. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
  459. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
  460. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
  461. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
  462. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
  463. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
  464. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
  465. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
  466. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
  467. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
  468. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
  469. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
  470. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
  471. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
  472. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
  473. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
  474. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
  475. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
  476. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
  477. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
  478. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
  479. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
  480. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
  481. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
  482. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
  483. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
  484. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
  485. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
  486. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
  487. data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
  488. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
  489. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
  490. data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  491. data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  492. data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  493. data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  494. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  495. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  496. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  497. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  498. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  499. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  500. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  501. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  502. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  503. data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  504. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  505. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  506. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  507. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  508. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  509. data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  510. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  511. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  512. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  513. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  514. data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  515. data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  516. data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  517. data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  518. data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  519. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  520. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  521. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  522. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  523. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  524. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  525. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  526. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  527. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  528. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  529. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  530. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  531. data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  532. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
  533. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
  534. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
  535. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
  536. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
  537. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
  538. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
  539. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
  540. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
  541. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
  542. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
  543. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
  544. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
  545. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
  546. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
  547. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  548. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
  549. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
  550. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
  551. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
  552. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
  553. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
  554. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
  555. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
  556. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
  557. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
  558. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
  559. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
  560. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
  561. data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
  562. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  563. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  564. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  565. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
  566. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
  567. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
  568. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
  569. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  570. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
  571. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
  572. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
  573. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
  574. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
  575. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
  576. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
  577. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
  578. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  579. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
  580. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
  581. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
  582. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  583. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  584. data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
  585. data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
  586. data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
  587. data/ext/sources/ggml/src/ggml.c +167 -33
  588. data/ext/sources/ggml/src/gguf.cpp +229 -44
  589. data/ext/sources/src/whisper.cpp +6 -28
  590. data/sig/whisper.rbs +43 -2
  591. data/test/test_context_params.rb +82 -0
  592. data/test/test_token.rb +11 -0
  593. data/test/test_vad_context.rb +58 -8
  594. data/test/test_whisper.rb +20 -0
  595. data/whispercpp.gemspec +1 -1
  596. metadata +240 -28
  597. data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
  598. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
  599. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
  600. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
  601. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
  602. data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
  603. data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
  604. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
  605. data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
  606. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
  607. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
  608. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
  609. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
  610. data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
@@ -1,5 +1,6 @@
1
1
  #include "llama-model.h"
2
2
 
3
+ #include "ggml.h"
3
4
  #include "llama-impl.h"
4
5
  #include "llama-mmap.h"
5
6
  #include "llama-cparams.h"
@@ -8,6 +9,7 @@
8
9
  #include "llama-kv-cache.h"
9
10
  #include "llama-kv-cache-iswa.h"
10
11
  #include "llama-memory-hybrid.h"
12
+ #include "llama-memory-hybrid-iswa.h"
11
13
  #include "llama-memory-recurrent.h"
12
14
 
13
15
  #include "ggml-cpp.h"
@@ -17,6 +19,7 @@
17
19
  #include <algorithm>
18
20
  #include <cassert>
19
21
  #include <cfloat>
22
+ #include <cstdint>
20
23
  #include <cstring>
21
24
  #include <cmath>
22
25
  #include <functional>
@@ -60,6 +63,7 @@ const char * llm_type_name(llm_type type) {
60
63
  case LLM_TYPE_0_3B: return "0.3B";
61
64
  case LLM_TYPE_0_5B: return "0.5B";
62
65
  case LLM_TYPE_0_6B: return "0.6B";
66
+ case LLM_TYPE_0_8B: return "0.8B";
63
67
  case LLM_TYPE_1B: return "1B";
64
68
  case LLM_TYPE_1_2B: return "1.2B";
65
69
  case LLM_TYPE_1_3B: return "1.3B";
@@ -122,17 +126,25 @@ const char * llm_type_name(llm_type type) {
122
126
  case LLM_TYPE_8B_A1B: return "8B.A1B";
123
127
  case LLM_TYPE_16B_A1B: return "16B.A1B";
124
128
  case LLM_TYPE_21B_A3B: return "21B.A3B";
129
+ case LLM_TYPE_24B_A2B: return "24B.A2B";
125
130
  case LLM_TYPE_30B_A3B: return "30B.A3B";
126
131
  case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
132
+ case LLM_TYPE_35B_A3B: return "35B.A3B";
133
+ case LLM_TYPE_48B_A3B: return "48B.A3B";
127
134
  case LLM_TYPE_80B_A3B: return "80B.A3B";
128
135
  case LLM_TYPE_100B_A6B: return "100B.A6B";
129
136
  case LLM_TYPE_102B_A12B: return "102B.A12B";
130
137
  case LLM_TYPE_106B_A12B: return "106B.A12B";
138
+ case LLM_TYPE_120B_A12B: return "120B.A12B";
139
+ case LLM_TYPE_122B_A10B: return "122B.A10B";
140
+ case LLM_TYPE_196B_A11B: return "196B.A11B";
131
141
  case LLM_TYPE_230B_A10B: return "230B.A10B";
132
142
  case LLM_TYPE_235B_A22B: return "235B.A22B";
133
143
  case LLM_TYPE_300B_A47B: return "300B.A47B";
134
144
  case LLM_TYPE_310B_A15B: return "310B.A15B";
135
145
  case LLM_TYPE_355B_A32B: return "355B.A32B";
146
+ case LLM_TYPE_397B_A17B: return "397B.A17B";
147
+ case LLM_TYPE_744B_A40B: return "744B.A40B";
136
148
  case LLM_TYPE_E2B: return "E2B";
137
149
  case LLM_TYPE_E4B: return "E4B";
138
150
  default: return "?B";
@@ -168,160 +180,6 @@ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::st
168
180
  return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
169
181
  }
170
182
 
171
- // checks if the weight tensor can be used with the specified buffer type and device
172
- static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
173
- GGML_ASSERT(w != nullptr);
174
-
175
- if (op == GGML_OP_NONE) {
176
- return true;
177
- }
178
-
179
- ggml_init_params params = {
180
- /*.mem_size =*/ ggml_tensor_overhead()*8,
181
- /*.mem_buffer =*/ NULL,
182
- /*.no_alloc =*/ true,
183
- };
184
- ggml_context_ptr ctx_ptr { ggml_init(params) };
185
- if (!ctx_ptr) {
186
- throw std::runtime_error(format("failed to create ggml context"));
187
- }
188
- ggml_context * ctx = ctx_ptr.get();
189
-
190
- ggml_tensor * op_tensor = nullptr;
191
-
192
- switch (op) {
193
- case GGML_OP_GET_ROWS:
194
- {
195
- ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
196
- op_tensor = ggml_get_rows(ctx, w, b);
197
- } break;
198
- case GGML_OP_MUL_MAT:
199
- {
200
- ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
201
- op_tensor = ggml_mul_mat(ctx, w, b);
202
- } break;
203
- case GGML_OP_MUL_MAT_ID:
204
- {
205
- int n_expert_used = hparams.n_expert_used;
206
- ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
207
- ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
208
- op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
209
- } break;
210
- case GGML_OP_ADD:
211
- {
212
- ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
213
- op_tensor = ggml_add(ctx, a, w);
214
- } break;
215
- case GGML_OP_ADD_ID:
216
- {
217
- int n_expert_used = hparams.n_expert_used;
218
- ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
219
- ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
220
- op_tensor = ggml_add_id(ctx, a, w, c);
221
- } break;
222
- case GGML_OP_MUL:
223
- {
224
- ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
225
- op_tensor = ggml_mul(ctx, a, w);
226
- } break;
227
- case GGML_OP_DIV:
228
- {
229
- ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
230
- op_tensor = ggml_div(ctx, a, w);
231
- } break;
232
- case GGML_OP_ROPE:
233
- {
234
- int n_embd_head = hparams.n_embd_head_v;
235
- int n_head = hparams.n_head();
236
- ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
237
- ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
238
- op_tensor = ggml_rope_ext(
239
- ctx, a, b, w,
240
- 0, 0, 0, 0, 0,
241
- 0, 0, 0, 0
242
- );
243
-
244
- } break;
245
- case GGML_OP_SSM_CONV:
246
- {
247
- const int64_t n_seq_tokens = 512;
248
- const int64_t n_seqs = 3;
249
- ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
250
- op_tensor = ggml_ssm_conv(ctx, conv_x, w);
251
- } break;
252
- case GGML_OP_SSM_SCAN:
253
- {
254
- // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
255
- const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
256
- const int64_t n_head = w->ne[1];
257
- const int64_t head_dim = hparams.ssm_d_inner / n_head;
258
- const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
259
- const int64_t n_seq_tokens = 512;
260
- const int64_t n_seqs = 3;
261
- ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
262
- ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
263
- ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
264
- ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
265
- ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
266
- ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
267
- op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
268
- } break;
269
- case GGML_OP_RWKV_WKV6:
270
- {
271
- // FIXME
272
- const int64_t S = 123;
273
- const int64_t H = 123;
274
- const int64_t n_tokens = 123;
275
- const int64_t n_seqs = 123;
276
- ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
277
- ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
278
- ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
279
- ggml_tensor * tf = w;
280
- ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
281
- ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
282
- op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
283
- } break;
284
- case GGML_OP_IM2COL:
285
- {
286
- const int n_embd_inp = hparams.n_embd_inp();
287
- ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
288
- op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
289
- } break;
290
- case GGML_OP_SCALE:
291
- {
292
- op_tensor = ggml_scale(ctx, w, 1.0f);
293
- } break;
294
- default:
295
- GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
296
- }
297
-
298
- // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
299
- GGML_ASSERT(w->buffer == nullptr);
300
- w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
301
- bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
302
- ggml_backend_buffer_free(w->buffer);
303
- w->buffer = nullptr;
304
-
305
- return op_supported;
306
- }
307
-
308
- // lists of buffer types used for each layer
309
- using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
310
-
311
- // find the first buffer type in the list that can use the tensor
312
- static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
313
- GGML_ASSERT(!buft_list.empty());
314
- for (const auto & cur : buft_list) {
315
- ggml_backend_dev_t cur_dev = cur.first;
316
- ggml_backend_buffer_type_t cur_buft = cur.second;
317
- if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
318
- return cur_buft;
319
- }
320
- }
321
-
322
- return nullptr;
323
- }
324
-
325
183
  // CPU: ACCEL -> GPU host -> CPU extra -> CPU
326
184
  static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
327
185
  buft_list_t buft_list;
@@ -446,7 +304,7 @@ struct llama_model::impl {
446
304
  llama_mlocks mlock_bufs;
447
305
  llama_mlocks mlock_mmaps;
448
306
 
449
- // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
307
+ // contexts where the model tensors metadata is stored as well as the corresponding buffers:
450
308
  std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
451
309
 
452
310
  buft_list_t cpu_buft_list;
@@ -468,7 +326,11 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
468
326
  pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
469
327
  }
470
328
 
471
- llama_model::~llama_model() = default;
329
+ llama_model::~llama_model() {
330
+ for (auto * lora : loras) {
331
+ delete lora;
332
+ }
333
+ }
472
334
 
473
335
  void llama_model::load_stats(llama_model_loader & ml) {
474
336
  pimpl->n_elements = ml.n_elements;
@@ -483,7 +345,7 @@ void llama_model::load_arch(llama_model_loader & ml) {
483
345
  }
484
346
 
485
347
  void llama_model::load_hparams(llama_model_loader & ml) {
486
- const gguf_context * ctx = ml.meta.get();
348
+ const gguf_context * ctx = ml.metadata;
487
349
 
488
350
  // get metadata as string
489
351
  for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -507,7 +369,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
507
369
 
508
370
  ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
509
371
  ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
510
- ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false);
372
+ ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false);
511
373
  ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
512
374
  ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
513
375
  ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -515,7 +377,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
515
377
  ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false);
516
378
 
517
379
  if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
518
- ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
380
+ ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd);
381
+ ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl);
519
382
 
520
383
  ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
521
384
  ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
@@ -554,6 +417,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
554
417
  std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
555
418
  std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
556
419
  std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
420
+ std::fill(hparams.swiglu_clamp_exp.begin(), hparams.swiglu_clamp_exp.end(), 0.0f);
421
+ std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f);
557
422
 
558
423
  ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
559
424
  ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
@@ -595,26 +460,37 @@ void llama_model::load_hparams(llama_model_loader & ml) {
595
460
  // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
596
461
  // gpt-j n_rot = rotary_dim
597
462
 
598
- hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
599
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
463
+ hparams.n_embd_head_k_full = hparams.n_embd / hparams.n_head();
464
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full, false);
600
465
 
601
- hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
602
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
466
+ hparams.n_embd_head_v_full = hparams.n_embd / hparams.n_head();
467
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full, false);
603
468
 
604
469
  // sanity check for n_rot (optional)
605
- hparams.n_rot = hparams.n_embd_head_k;
470
+ hparams.n_rot_full = hparams.n_embd_head_k_full;
606
471
 
607
- ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
472
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full, false);
608
473
 
609
474
  if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
610
- if (hparams.n_rot != hparams.n_embd_head_k) {
611
- throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
475
+ if (hparams.n_rot_full != hparams.n_embd_head_k_full) {
476
+ throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot_full, hparams.n_embd_head_k_full));
612
477
  }
613
478
  }
614
479
  } else {
615
- hparams.n_rot = 0;
616
- hparams.n_embd_head_k = 0;
617
- hparams.n_embd_head_v = 0;
480
+ hparams.n_rot_full = 0;
481
+ hparams.n_embd_head_k_full = 0;
482
+ hparams.n_embd_head_v_full = 0;
483
+ }
484
+
485
+ // head size and n_rot for SWA layers
486
+ {
487
+ hparams.n_embd_head_k_swa = hparams.n_embd_head_k_full;
488
+ hparams.n_embd_head_v_swa = hparams.n_embd_head_v_full;
489
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa, false);
490
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa, false);
491
+
492
+ hparams.n_rot_swa = hparams.n_rot_full;
493
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
618
494
  }
619
495
 
620
496
  // for differentiating model types
@@ -674,7 +550,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
674
550
  hparams.n_attn_temp_floor_scale = 8192;
675
551
  hparams.f_attn_temp_scale = 0.1f;
676
552
  hparams.f_attn_temp_offset = 1.0f;
677
- hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
553
+ uint32_t swa_period = 4; // pattern: 3 chunked - 1 full
554
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
555
+ hparams.set_swa_pattern(swa_period);
678
556
 
679
557
  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
680
558
  hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
@@ -711,7 +589,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
711
589
  case LLM_ARCH_AFMOE:
712
590
  {
713
591
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
714
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
592
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
715
593
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
716
594
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
717
595
  ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
@@ -723,7 +601,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
723
601
  // Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
724
602
  if (hparams.n_swa > 0) {
725
603
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
726
- hparams.set_swa_pattern(4);
604
+ uint32_t swa_period = 4;
605
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
606
+ hparams.set_swa_pattern(swa_period);
727
607
 
728
608
  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
729
609
  hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
@@ -868,7 +748,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
868
748
  case LLM_ARCH_BERT:
869
749
  {
870
750
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
871
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
751
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
872
752
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
873
753
 
874
754
  switch (hparams.n_layer) {
@@ -891,18 +771,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
891
771
  {
892
772
  const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
893
773
  if (found_swa && hparams.n_swa > 0) {
894
- uint32_t swa_period = 3;
895
774
  hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
896
-
897
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
775
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
776
+ uint32_t swa_period = 3;
898
777
  ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
899
- hparams.set_swa_pattern(swa_period);
778
+ hparams.set_swa_pattern(swa_period, true);
900
779
  } else {
901
780
  hparams.swa_type = LLAMA_SWA_TYPE_NONE;
902
781
  }
903
782
 
904
783
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
905
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
784
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
906
785
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
907
786
 
908
787
  switch (hparams.n_layer) {
@@ -918,7 +797,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
918
797
  case LLM_ARCH_JINA_BERT_V2:
919
798
  {
920
799
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
921
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
800
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
922
801
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
923
802
  hparams.f_max_alibi_bias = 8.0f;
924
803
 
@@ -931,7 +810,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
931
810
  case LLM_ARCH_JINA_BERT_V3:
932
811
  {
933
812
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
934
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
813
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
935
814
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
936
815
 
937
816
  switch (hparams.n_layer) {
@@ -944,8 +823,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
944
823
  case LLM_ARCH_NOMIC_BERT_MOE:
945
824
  {
946
825
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
947
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
948
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
826
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
827
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
949
828
  ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
950
829
 
951
830
  if (hparams.n_layer == 12 && hparams.n_embd == 768) {
@@ -959,13 +838,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
959
838
  case LLM_ARCH_NEO_BERT:
960
839
  {
961
840
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
962
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
963
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
841
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
842
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
964
843
 
965
844
  if (hparams.n_layer == 28) {
966
845
  type = LLM_TYPE_250M;
967
846
  }
968
847
  } break;
848
+ case LLM_ARCH_EUROBERT:
849
+ {
850
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
851
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
852
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
853
+
854
+ if (hparams.n_layer == 12) {
855
+ type = LLM_TYPE_SMALL; // 0.2B
856
+ }
857
+ } break;
969
858
  case LLM_ARCH_BLOOM:
970
859
  {
971
860
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -988,7 +877,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
988
877
  {
989
878
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
990
879
  ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
991
- ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
880
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
992
881
 
993
882
  switch (hparams.n_layer) {
994
883
  case 32: type = LLM_TYPE_7B; break;
@@ -1237,19 +1126,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1237
1126
  break;
1238
1127
  default: type = LLM_TYPE_UNKNOWN;
1239
1128
  }
1240
-
1241
- // Load attention parameters
1242
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
1243
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
1244
1129
  } break;
1245
1130
  case LLM_ARCH_PLAMO3:
1246
1131
  {
1247
1132
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1248
1133
  const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1249
1134
  if (found_swa && hparams.n_swa > 0) {
1250
- uint32_t swa_period = 8;
1251
1135
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1252
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
1136
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1137
+ uint32_t swa_period = 8;
1253
1138
  ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1254
1139
  hparams.set_swa_pattern(swa_period);
1255
1140
  } else {
@@ -1312,7 +1197,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1312
1197
  {
1313
1198
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1314
1199
  hparams.n_swa = 4096; // default value of gemma 2
1315
- hparams.set_swa_pattern(2);
1200
+ uint32_t swa_period = 2;
1201
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1202
+ hparams.set_swa_pattern(swa_period);
1316
1203
  hparams.attn_soft_cap = true;
1317
1204
  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
1318
1205
  hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
@@ -1333,14 +1220,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1333
1220
  // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
1334
1221
  hparams.f_attention_scale = type == LLM_TYPE_27B
1335
1222
  ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
1336
- : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
1223
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
1337
1224
  } break;
1338
1225
  case LLM_ARCH_GEMMA3:
1339
1226
  {
1340
1227
  const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1341
1228
  if (found_swa && hparams.n_swa > 0) {
1342
1229
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1343
- hparams.set_swa_pattern(6);
1230
+ uint32_t swa_period = 6;
1231
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1232
+ hparams.set_swa_pattern(swa_period);
1344
1233
 
1345
1234
  ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1346
1235
  } else {
@@ -1364,12 +1253,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1364
1253
  // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
1365
1254
  hparams.f_attention_scale = type == LLM_TYPE_27B
1366
1255
  ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
1367
- : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
1256
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
1368
1257
  } break;
1369
1258
  case LLM_ARCH_GEMMA3N:
1370
1259
  {
1260
+ uint32_t swa_period = 5;
1261
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1371
1262
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1372
- hparams.set_swa_pattern(5);
1263
+ hparams.set_swa_pattern(swa_period);
1373
1264
 
1374
1265
  hparams.n_layer_kv_from_start = 20;
1375
1266
  hparams.f_attention_scale = 1.0f;
@@ -1387,14 +1278,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1387
1278
  case LLM_ARCH_GEMMA_EMBEDDING:
1388
1279
  {
1389
1280
  hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
1390
- hparams.set_swa_pattern(6);
1281
+ uint32_t swa_period = 6;
1282
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1283
+ hparams.set_swa_pattern(swa_period);
1391
1284
 
1392
1285
  hparams.causal_attn = false; // embeddings do not use causal attention
1393
1286
 
1394
1287
  ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1395
1288
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1396
1289
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1397
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
1290
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
1398
1291
 
1399
1292
  //applied only if model converted with --sentence-transformers-dense-modules
1400
1293
  ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
@@ -1409,7 +1302,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1409
1302
  case 24: type = LLM_TYPE_0_3B; break;
1410
1303
  default: type = LLM_TYPE_UNKNOWN;
1411
1304
  }
1412
- hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
1305
+ hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
1413
1306
 
1414
1307
  } break;
1415
1308
  case LLM_ARCH_STARCODER2:
@@ -1501,7 +1394,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1501
1394
  }
1502
1395
 
1503
1396
  switch (hparams.n_layer) {
1504
- // TODO: Jamba layers are a bit heterogenous, so naming this is hard.
1397
+ // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
1505
1398
  case 12: // 900M 8x???M
1506
1399
  case 32: // 51B 16x?B
1507
1400
  default: type = LLM_TYPE_UNKNOWN;
@@ -1519,7 +1412,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1519
1412
  } break;
1520
1413
  case LLM_ARCH_COMMAND_R:
1521
1414
  {
1522
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
1415
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
1523
1416
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1524
1417
  switch (hparams.n_layer) {
1525
1418
  case 40: type = LLM_TYPE_35B; break;
@@ -1529,7 +1422,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1529
1422
  case LLM_ARCH_COHERE2:
1530
1423
  {
1531
1424
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1532
- hparams.set_swa_pattern(4);
1425
+ uint32_t swa_period = 4;
1426
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1427
+ hparams.set_swa_pattern(swa_period);
1533
1428
  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
1534
1429
  hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
1535
1430
 
@@ -1571,7 +1466,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1571
1466
  const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
1572
1467
  if (found_swa && hparams.n_swa > 0) {
1573
1468
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1574
- hparams.set_swa_pattern(4);
1469
+ uint32_t swa_period = 4;
1470
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1471
+ hparams.set_swa_pattern(swa_period);
1575
1472
 
1576
1473
  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
1577
1474
  hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
@@ -1678,10 +1575,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1678
1575
  case LLM_ARCH_DEEPSEEK:
1679
1576
  {
1680
1577
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1681
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
1578
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
1682
1579
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1683
1580
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1684
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
1581
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
1685
1582
 
1686
1583
  switch (hparams.n_ff_exp) {
1687
1584
  case 1408: type = LLM_TYPE_16B; break;
@@ -1691,16 +1588,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1691
1588
  } break;
1692
1589
  case LLM_ARCH_DEEPSEEK2:
1693
1590
  {
1694
- // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
1695
- bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
1591
+ // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
1592
+ const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
1593
+
1696
1594
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1697
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
1595
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
1698
1596
  if (!is_lite) {
1699
1597
  ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
1700
1598
  }
1701
1599
  ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
1702
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
1703
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
1600
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false);
1601
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
1704
1602
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1705
1603
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1706
1604
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
@@ -1709,7 +1607,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1709
1607
  if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1710
1608
  // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
1711
1609
  // that have no expert_gating_func model parameter set
1712
- hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
1610
+ if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) {
1611
+ // GLM 4.7 Lite
1612
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1613
+ } else {
1614
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
1615
+ }
1713
1616
  }
1714
1617
 
1715
1618
  if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
@@ -1726,6 +1629,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1726
1629
 
1727
1630
  switch (hparams.n_layer) {
1728
1631
  case 27: type = LLM_TYPE_16B; break;
1632
+ case 47: type = LLM_TYPE_30B_A3B; break;
1729
1633
  case 60: type = LLM_TYPE_236B; break;
1730
1634
  case 61: type = LLM_TYPE_671B; break;
1731
1635
  default: type = LLM_TYPE_UNKNOWN;
@@ -1765,7 +1669,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1765
1669
  {
1766
1670
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1767
1671
  ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
1672
+
1673
+ // NextN/MTP parameters (GLM-OCR)
1674
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1675
+
1676
+ // TODO: when MTP is implemented, this should probably be updated if needed
1677
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
1678
+
1768
1679
  switch (hparams.n_layer) {
1680
+ case 17: type = LLM_TYPE_1B; break; // GLM-OCR
1769
1681
  case 40: type = LLM_TYPE_9B; break;
1770
1682
  case 61: type = LLM_TYPE_32B; break;
1771
1683
  default: type = LLM_TYPE_UNKNOWN;
@@ -1782,7 +1694,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1782
1694
  ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
1783
1695
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1784
1696
  ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
1785
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
1697
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
1786
1698
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1787
1699
 
1788
1700
  // Expert gating function (GLM-4.5 uses sigmoid)
@@ -1804,6 +1716,50 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1804
1716
  default: type = LLM_TYPE_UNKNOWN;
1805
1717
  }
1806
1718
  } break;
1719
+ case LLM_ARCH_GLM_DSA:
1720
+ {
1721
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1722
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1723
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
1724
+
1725
+ // MoE parameters
1726
+ ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
1727
+ ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
1728
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1729
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
1730
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
1731
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1732
+
1733
+ // deepseek MLA parameters
1734
+ ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
1735
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
1736
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false);
1737
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
1738
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1739
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1740
+
1741
+ // DSA parameters
1742
+ ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
1743
+ ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
1744
+ ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k);
1745
+
1746
+ // Expert gating function (GLM-4.5 uses sigmoid)
1747
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1748
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
1749
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
1750
+ }
1751
+
1752
+ // NextN/MTP parameters
1753
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1754
+
1755
+ // TODO: when MTP is implemented, this should probably be updated if needed
1756
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
1757
+
1758
+ switch (hparams.n_layer) {
1759
+ case 79: type = LLM_TYPE_744B_A40B; break;
1760
+ default: type = LLM_TYPE_UNKNOWN;
1761
+ }
1762
+ } break;
1807
1763
  case LLM_ARCH_BITNET:
1808
1764
  {
1809
1765
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1857,7 +1813,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1857
1813
  case LLM_ARCH_JAIS:
1858
1814
  {
1859
1815
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1860
- ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
1816
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
1861
1817
 
1862
1818
  switch (hparams.n_layer) {
1863
1819
  case 24: type = LLM_TYPE_1_3B; break;
@@ -1866,6 +1822,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1866
1822
  default: type = LLM_TYPE_UNKNOWN;
1867
1823
  }
1868
1824
  } break;
1825
+ case LLM_ARCH_JAIS2:
1826
+ {
1827
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
1828
+
1829
+ switch (hparams.n_layer) {
1830
+ case 32: type = LLM_TYPE_8B; break;
1831
+ case 68: type = LLM_TYPE_70B; break;
1832
+ default: type = LLM_TYPE_UNKNOWN;
1833
+ }
1834
+ } break;
1869
1835
  case LLM_ARCH_NEMOTRON:
1870
1836
  {
1871
1837
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1896,10 +1862,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1896
1862
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
1897
1863
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1898
1864
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
1865
+ ml.get_key(LLM_KV_MOE_LATENT_SIZE, hparams.moe_latent_size, false);
1899
1866
 
1900
1867
  switch (hparams.n_layer) {
1901
1868
  case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
1902
1869
  case 56: type = LLM_TYPE_9B; break;
1870
+ case 88: type = LLM_TYPE_120B_A12B; break;
1903
1871
  default: type = LLM_TYPE_UNKNOWN;
1904
1872
  }
1905
1873
  } break;
@@ -1917,7 +1885,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1917
1885
  if (hparams.n_layer == 64) { // 32B
1918
1886
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1919
1887
  hparams.n_swa = 4096;
1920
- hparams.set_swa_pattern(4);
1888
+ uint32_t swa_period = 4;
1889
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1890
+ hparams.set_swa_pattern(swa_period);
1921
1891
 
1922
1892
  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
1923
1893
  hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
@@ -1933,6 +1903,36 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1933
1903
  default: type = LLM_TYPE_UNKNOWN;
1934
1904
  }
1935
1905
  } break;
1906
+ case LLM_ARCH_EXAONE_MOE:
1907
+ {
1908
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1909
+ hparams.n_swa = 128;
1910
+ uint32_t swa_period = 4;
1911
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
1912
+ hparams.set_swa_pattern(swa_period);
1913
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
1914
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
1915
+
1916
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
1917
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1918
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1919
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
1920
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1921
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
1922
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
1923
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
1924
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1925
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
1926
+
1927
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
1928
+
1929
+ switch (hparams.n_layer) {
1930
+ case 32: type = LLM_TYPE_30B_A3B; break;
1931
+ case 48:
1932
+ case 49: type = LLM_TYPE_235B_A22B; break;
1933
+ default: type = LLM_TYPE_UNKNOWN;
1934
+ }
1935
+ } break;
1936
1936
  case LLM_ARCH_RWKV6:
1937
1937
  case LLM_ARCH_RWKV6QWEN2:
1938
1938
  {
@@ -2006,9 +2006,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
2006
2006
  {
2007
2007
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2008
2008
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
2009
- ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
2010
- ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
2011
- ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
2009
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, false);
2010
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
2011
+ ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, false);
2012
2012
 
2013
2013
  // Granite uses rope_finetuned as a switch for rope, so default to true
2014
2014
  bool rope_finetuned = true;
@@ -2066,7 +2066,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
2066
2066
  {
2067
2067
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2068
2068
  hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
2069
- ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
2069
+ ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false);
2070
2070
 
2071
2071
  switch (hparams.n_layer) {
2072
2072
  case 32: type = LLM_TYPE_7B; break;
@@ -2079,15 +2079,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
2079
2079
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2080
2080
  ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
2081
2081
  ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
2082
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
2082
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
2083
2083
  } break;
2084
2084
  case LLM_ARCH_BAILINGMOE:
2085
2085
  {
2086
2086
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2087
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
2087
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
2088
2088
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2089
2089
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
2090
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
2090
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
2091
2091
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
2092
2092
 
2093
2093
  switch (hparams.n_layer) {
@@ -2099,11 +2099,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
2099
2099
  case LLM_ARCH_BAILINGMOE2:
2100
2100
  {
2101
2101
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2102
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
2102
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
2103
2103
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2104
- ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
2104
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2105
2105
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
2106
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
2106
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
2107
2107
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
2108
2108
  ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
2109
2109
  ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
@@ -2122,10 +2122,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
2122
2122
  case LLM_ARCH_DOTS1:
2123
2123
  {
2124
2124
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2125
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
2125
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
2126
2126
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2127
2127
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
2128
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
2128
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
2129
2129
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
2130
2130
  ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
2131
2131
  switch (hparams.n_layer) {
@@ -2135,13 +2135,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
2135
2135
  } break;
2136
2136
  case LLM_ARCH_ERNIE4_5:
2137
2137
  case LLM_ARCH_ERNIE4_5_MOE:
2138
+ case LLM_ARCH_PADDLEOCR:
2138
2139
  {
2140
+ // paddleocr need mrope_section
2141
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
2142
+
2139
2143
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2140
2144
  if (arch == LLM_ARCH_ERNIE4_5_MOE) {
2141
2145
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2142
2146
  ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2143
2147
  ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
2144
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
2148
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
2145
2149
  }
2146
2150
 
2147
2151
  switch (hparams.n_layer) {
@@ -2186,7 +2190,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
2186
2190
  {
2187
2191
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2188
2192
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2189
- ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
2193
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2190
2194
 
2191
2195
  switch (hparams.n_layer) {
2192
2196
  case 32: type = LLM_TYPE_A13B; break;
@@ -2222,7 +2226,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
2222
2226
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
2223
2227
 
2224
2228
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2225
- hparams.set_swa_pattern(2);
2229
+ uint32_t swa_period = 2;
2230
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
2231
+ hparams.set_swa_pattern(swa_period);
2226
2232
 
2227
2233
  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
2228
2234
  hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
@@ -2249,12 +2255,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
2249
2255
  case 10752: type = LLM_TYPE_2_6B; break;
2250
2256
  default: type = LLM_TYPE_UNKNOWN;
2251
2257
  }
2258
+ if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) {
2259
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2260
+ for (uint32_t il = 0; il < hparams.n_layer; ++il) {
2261
+ hparams.swa_layers[il] = !hparams.recurrent_layer_arr[il];
2262
+ }
2263
+ }
2252
2264
  } break;
2253
2265
  case LLM_ARCH_LFM2MOE:
2254
2266
  {
2255
2267
  ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
2256
2268
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2257
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
2269
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
2258
2270
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2259
2271
  ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
2260
2272
 
@@ -2262,16 +2274,22 @@ void llama_model::load_hparams(llama_model_loader & ml) {
2262
2274
  hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
2263
2275
  }
2264
2276
 
2265
- type = LLM_TYPE_8B_A1B;
2277
+ switch (hparams.n_layer) {
2278
+ case 24: type = LLM_TYPE_8B_A1B; break;
2279
+ case 40: type = LLM_TYPE_24B_A2B; break;
2280
+ default: type = LLM_TYPE_UNKNOWN;
2281
+ }
2266
2282
  } break;
2267
2283
  case LLM_ARCH_SMALLTHINKER:
2268
2284
  {
2269
2285
  const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
2270
2286
 
2271
2287
  if (found_swa && hparams.n_swa > 0) {
2272
- hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2273
- hparams.n_swa = 4096;
2274
- hparams.set_swa_pattern(4, true);
2288
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2289
+ hparams.n_swa = 4096;
2290
+ uint32_t swa_period = 4;
2291
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
2292
+ hparams.set_swa_pattern(swa_period, true);
2275
2293
 
2276
2294
  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
2277
2295
  hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
@@ -2294,7 +2312,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
2294
2312
  case LLM_ARCH_GROVEMOE:
2295
2313
  {
2296
2314
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2297
- ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
2315
+ ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp, false);
2298
2316
  ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
2299
2317
  ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
2300
2318
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -2359,8 +2377,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
2359
2377
  ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
2360
2378
 
2361
2379
  // Mark recurrent layers (linear attention layers)
2362
- for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2363
- hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); // TODO: extract the magic 4 from "full_attention_interval"
2380
+ {
2381
+ uint32_t full_attn_interval = 4;
2382
+ ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
2383
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2384
+ hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
2385
+ }
2364
2386
  }
2365
2387
 
2366
2388
  switch (hparams.n_layer) {
@@ -2368,6 +2390,65 @@ void llama_model::load_hparams(llama_model_loader & ml) {
2368
2390
  default: type = LLM_TYPE_UNKNOWN;
2369
2391
  }
2370
2392
  } break;
2393
+ case LLM_ARCH_QWEN35:
2394
+ {
2395
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2396
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
2397
+
2398
+ // Load linear attention (gated delta net) parameters
2399
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
2400
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
2401
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
2402
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2403
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
2404
+
2405
+ // Mark recurrent layers (linear attention layers)
2406
+ {
2407
+ uint32_t full_attn_interval = 4;
2408
+ ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
2409
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2410
+ hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
2411
+ }
2412
+ }
2413
+
2414
+ switch (hparams.n_layer) {
2415
+ case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break;
2416
+ case 32: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_9B; break;
2417
+ case 64: type = LLM_TYPE_27B; break;
2418
+ default: type = LLM_TYPE_UNKNOWN;
2419
+ }
2420
+ } break;
2421
+ case LLM_ARCH_QWEN35MOE:
2422
+ {
2423
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
2424
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2425
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2426
+
2427
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
2428
+
2429
+ // Load linear attention (gated delta net) parameters
2430
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
2431
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
2432
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
2433
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
2434
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
2435
+
2436
+ // Mark recurrent layers (linear attention layers)
2437
+ {
2438
+ uint32_t full_attn_interval = 4;
2439
+ ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
2440
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2441
+ hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
2442
+ }
2443
+ }
2444
+
2445
+ switch (hparams.n_layer) {
2446
+ case 40: type = LLM_TYPE_35B_A3B; break;
2447
+ case 48: type = LLM_TYPE_122B_A10B; break;
2448
+ case 60: type = LLM_TYPE_397B_A17B; break;
2449
+ default: type = LLM_TYPE_UNKNOWN;
2450
+ }
2451
+ } break;
2371
2452
  case LLM_ARCH_MISTRAL3:
2372
2453
  {
2373
2454
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -2402,7 +2483,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
2402
2483
 
2403
2484
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2404
2485
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
2405
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
2486
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
2406
2487
  ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
2407
2488
 
2408
2489
  switch (hparams.n_layer) {
@@ -2410,7 +2491,69 @@ void llama_model::load_hparams(llama_model_loader & ml) {
2410
2491
  default: type = LLM_TYPE_UNKNOWN;
2411
2492
  }
2412
2493
  } break;
2413
- default: throw std::runtime_error("unsupported model architecture");
2494
+ case LLM_ARCH_KIMI_LINEAR:
2495
+ {
2496
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2497
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl);
2498
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl);
2499
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
2500
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
2501
+ ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda);
2502
+
2503
+ // MLA qk_rope_head_dim (for reference)
2504
+ // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192
2505
+
2506
+ // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
2507
+ // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
2508
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
2509
+ hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent
2510
+ }
2511
+
2512
+ // MoE parameters - Kimi uses moe_intermediate_size = 1024
2513
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2514
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
2515
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
2516
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
2517
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
2518
+
2519
+ switch (hparams.n_layer) {
2520
+ case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
2521
+ default: type = LLM_TYPE_UNKNOWN;
2522
+ }
2523
+ } break;
2524
+ case LLM_ARCH_STEP35:
2525
+ {
2526
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
2527
+
2528
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
2529
+
2530
+ // full_attention layer only use half of the RoPE dimensions
2531
+ hparams.n_rot_full = hparams.n_rot_full / 2;
2532
+
2533
+ // MoE + SWA parameters
2534
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
2535
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
2536
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
2537
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
2538
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
2539
+
2540
+ // Step35 uses sigmoid gating by default (if not set in GGUF)
2541
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
2542
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
2543
+ }
2544
+
2545
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
2546
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
2547
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
2548
+ ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false);
2549
+ ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false);
2550
+
2551
+ switch (hparams.n_layer) {
2552
+ case 45: type = LLM_TYPE_196B_A11B; break;
2553
+ default: type = LLM_TYPE_UNKNOWN;
2554
+ }
2555
+ } break;
2556
+ default: throw std::runtime_error("unsupported model architecture: " + arch_name());
2414
2557
  }
2415
2558
 
2416
2559
  pimpl->n_bytes = ml.n_bytes;
@@ -2508,224 +2651,63 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2508
2651
  // there is very little benefit to offloading the input layer, so always keep it on the CPU
2509
2652
  pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
2510
2653
 
2511
- // assign the repeating layers to the devices according to the splits
2512
- pimpl->dev_layer.resize(n_layer);
2513
- for (int il = 0; il < n_layer; ++il) {
2514
- pimpl->dev_layer[il] = get_layer_buft_list(il);
2515
- }
2516
-
2517
- // assign the output layer
2518
- pimpl->dev_output = get_layer_buft_list(n_layer);
2519
-
2520
- // one ggml context per buffer type
2521
- int max_n_tensors = ml.n_tensors;
2522
- max_n_tensors += 1; // duplicated output tensor
2523
- max_n_tensors += n_layer*2; // duplicated rope freq tensors
2524
- const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
2525
-
2526
- // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
2527
- struct ggml_backend_buft_comparator {
2528
- bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
2529
- return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
2530
- }
2531
- };
2532
- std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
2533
-
2534
- auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
2535
- auto it = ctx_map.find(buft);
2536
- if (it == ctx_map.end()) {
2537
- ggml_init_params params = {
2538
- /*.mem_size =*/ ctx_size,
2539
- /*.mem_buffer =*/ NULL,
2540
- /*.no_alloc =*/ true,
2541
- };
2542
-
2543
- ggml_context * ctx = ggml_init(params);
2544
- if (!ctx) {
2545
- throw std::runtime_error(format("failed to create ggml context"));
2546
- }
2547
-
2548
- ctx_map.emplace(buft, ctx);
2549
-
2550
- return ctx;
2551
- }
2552
- return it->second.get();
2553
- };
2554
-
2555
- const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
2556
- const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
2557
- const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
2558
-
2559
- // create tensors for the weights
2560
- {
2561
- // note: cast to int64_t since we will use these for the tensor dimensions
2562
- const int64_t n_head = hparams.n_head();
2563
- const int64_t n_head_kv = hparams.n_head_kv();
2564
- const int64_t n_embd = hparams.n_embd;
2565
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
2566
- const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
2567
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
2568
- const int64_t n_embd_head_v = hparams.n_embd_head_v;
2569
- const int64_t n_ff = hparams.n_ff();
2570
- const int64_t n_embd_gqa = n_embd_v_gqa;
2571
- const int64_t n_vocab = vocab.n_tokens();
2572
- const int64_t n_token_types = vocab.n_token_types();
2573
- const int64_t n_rot = hparams.n_rot;
2574
- const int64_t n_expert = hparams.n_expert;
2575
- const int64_t n_expert_used = hparams.n_expert_used;
2576
- const int64_t n_ctx_train = hparams.n_ctx_train;
2577
-
2578
- if (n_expert > 0 && hparams.n_expert_used == 0) {
2579
- throw std::runtime_error("model has expert layers but no expert layers are used");
2580
- }
2581
-
2582
- int n_moved_tensors = 0;
2583
- ggml_tensor * first_moved_tensor = nullptr;
2584
- ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
2585
- ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
2586
-
2587
- auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
2588
- ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
2589
-
2590
- if (!t_meta) {
2591
- if (flags & TENSOR_NOT_REQUIRED) {
2592
- return nullptr;
2593
- }
2594
- throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
2595
- }
2596
-
2597
- // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
2598
- // the tensor is duplicated
2599
- // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
2600
- llm_tensor tn_tensor = tn.tensor;
2601
- if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
2602
- tn_tensor = LLM_TENSOR_OUTPUT;
2603
- }
2604
-
2605
- llm_tensor_info info;
2606
- try {
2607
- info = llm_tensor_info_for(tn_tensor);
2608
- } catch (const std::out_of_range & e) {
2609
- throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
2610
- }
2611
-
2612
- // skip unused tensors
2613
- if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
2614
- const size_t nbytes = ggml_nbytes(t_meta);
2615
- LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
2616
-
2617
- ml.size_data -= nbytes;
2618
- ml.n_created++;
2619
-
2620
- return nullptr;
2621
- }
2622
-
2623
- // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
2624
- ggml_op op;
2625
- bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
2626
- if (bias) {
2627
- if (info.op == GGML_OP_MUL_MAT_ID) {
2628
- op = GGML_OP_ADD_ID;
2629
- } else {
2630
- op = GGML_OP_ADD;
2631
- }
2632
- } else {
2633
- op = info.op;
2634
- }
2635
-
2636
- // sanity checks
2637
- if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
2638
- if (tn.bid != -1) {
2639
- GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
2640
- }
2641
- } else {
2642
- if (tn.bid == -1) {
2643
- GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
2644
- }
2645
- }
2646
-
2647
- // select the buffer type for this tensor
2648
- buft_list_t * buft_list;
2649
- switch (info.layer) {
2650
- case LLM_TENSOR_LAYER_INPUT:
2651
- buft_list = pimpl->dev_input.buft_list;
2652
- break;
2653
- case LLM_TENSOR_LAYER_OUTPUT:
2654
- buft_list = pimpl->dev_output.buft_list;
2655
- break;
2656
- case LLM_TENSOR_LAYER_REPEATING:
2657
- buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
2658
- break;
2659
- default:
2660
- GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
2661
- }
2662
-
2663
- ggml_backend_buffer_type_t buft = nullptr;
2664
-
2665
- // check overrides
2666
- if (ml.tensor_buft_overrides) {
2667
- std::string tensor_name = tn.str();
2668
- for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
2669
- std::regex pattern(overrides->pattern);
2670
- if (std::regex_search(tensor_name, pattern)) {
2671
- if (overrides->buft == ggml_backend_cpu_buffer_type()) {
2672
- // when overriding to a CPU buffer, consider the extra buffer types
2673
- buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
2674
- } else {
2675
- buft = overrides->buft;
2676
- }
2677
-
2678
- LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
2679
- tensor_name.c_str(),
2680
- ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
2681
- ggml_backend_buft_name(buft));
2682
- break;
2683
- }
2684
- }
2685
- }
2686
-
2687
- if (!buft) {
2688
- buft = select_weight_buft(hparams, t_meta, op, *buft_list);
2689
- if (!buft) {
2690
- throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
2691
- }
2692
- }
2654
+ // assign the repeating layers to the devices according to the splits
2655
+ pimpl->dev_layer.resize(n_layer);
2656
+ for (int il = 0; il < n_layer; ++il) {
2657
+ pimpl->dev_layer[il] = get_layer_buft_list(il);
2658
+ }
2693
2659
 
2694
- // avoid using a host buffer when using mmap
2695
- auto * buft_dev = ggml_backend_buft_get_device(buft);
2696
- if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
2697
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
2698
- if (!cpu_dev) {
2699
- throw std::runtime_error("no CPU backend found");
2700
- }
2701
- buft = ggml_backend_dev_buffer_type(cpu_dev);
2702
- }
2660
+ // assign the output layer
2661
+ pimpl->dev_output = get_layer_buft_list(n_layer);
2703
2662
 
2704
- if (buft != buft_list->front().second) {
2705
- n_moved_tensors++;
2706
- if (!first_moved_tensor) {
2707
- first_moved_tensor = t_meta;
2708
- first_moved_from_buft = buft_list->front().second;
2709
- first_moved_to_buft = buft;
2710
- }
2711
- }
2663
+ const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
2664
+ const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
2665
+ const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
2666
+ const auto TENSOR_SKIP_IF_VIRTUAL = llama_model_loader::TENSOR_SKIP_IF_VIRTUAL;
2712
2667
 
2713
- ggml_context * ctx = ctx_for_buft(buft);
2668
+ // create tensors for the weights
2669
+ {
2670
+ // note: cast to int64_t since we will use these for the tensor dimensions
2671
+ const int64_t n_head = hparams.n_head();
2672
+ const int64_t n_head_kv = hparams.n_head_kv();
2673
+ const int64_t n_embd = hparams.n_embd;
2674
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
2675
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
2676
+ const int64_t n_embd_head_k = hparams.n_embd_head_k();
2677
+ const int64_t n_embd_head_v = hparams.n_embd_head_v();
2678
+ const int64_t n_ff = hparams.n_ff();
2679
+ const int64_t n_embd_gqa = n_embd_v_gqa;
2680
+ const int64_t n_vocab = vocab.n_tokens();
2681
+ const int64_t n_token_types = vocab.n_token_types();
2682
+ const int64_t n_rot = hparams.n_rot();
2683
+ const int64_t n_expert = hparams.n_expert;
2684
+ const int64_t n_expert_used = hparams.n_expert_used;
2685
+ const int64_t n_ctx_train = hparams.n_ctx_train;
2714
2686
 
2715
- // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
2716
- if (flags & TENSOR_DUPLICATED) {
2717
- ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
2718
- if (t) {
2719
- return t;
2720
- }
2721
- }
2722
- return ml.create_tensor(ctx, tn, ne, flags);
2687
+ if (n_expert > 0 && hparams.n_expert_used == 0) {
2688
+ throw std::runtime_error("model has expert layers but no expert layers are used");
2689
+ }
2690
+
2691
+ auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
2692
+ const buft_list_t * buft_list_layer = tn.bid == -1 ? nullptr : pimpl->dev_layer.at(tn.bid).buft_list;
2693
+ return ml.create_tensor(
2694
+ hparams, &pimpl->cpu_buft_list, pimpl->dev_input.buft_list, pimpl->dev_output.buft_list, buft_list_layer,
2695
+ tn, ne, flags);
2723
2696
  };
2724
2697
 
2725
2698
  layers.resize(n_layer);
2726
2699
 
2727
2700
  // TODO: move to a separate function
2728
2701
  const auto tn = LLM_TN(arch);
2702
+
2703
+ // helper: try merged gate_up_exps first, fall back to separate gate and up
2704
+ auto create_tensor_gate_up_exps = [&](llama_layer & layer, int bid, int64_t n_embd_, int64_t n_ff_, int64_t n_expert_, int flags) {
2705
+ layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", bid), {n_embd_, n_ff_ * 2, n_expert_}, TENSOR_NOT_REQUIRED);
2706
+ if (layer.ffn_gate_up_exps == nullptr) {
2707
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags);
2708
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags);
2709
+ }
2710
+ };
2729
2711
  switch (arch) {
2730
2712
  case LLM_ARCH_LLAMA:
2731
2713
  case LLM_ARCH_REFACT:
@@ -2879,6 +2861,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2879
2861
  } break;
2880
2862
  case LLM_ARCH_LLAMA4:
2881
2863
  {
2864
+ if (n_expert == 0) {
2865
+ throw std::runtime_error(arch_name() + " model cannot have zero experts");
2866
+ }
2882
2867
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2883
2868
 
2884
2869
  // output
@@ -2891,7 +2876,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2891
2876
  }
2892
2877
 
2893
2878
  for (int i = 0; i < n_layer; ++i) {
2894
- bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
2879
+ const bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
2895
2880
 
2896
2881
  auto & layer = layers[i];
2897
2882
 
@@ -2907,7 +2892,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2907
2892
  layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
2908
2893
 
2909
2894
  if (is_moe_layer) {
2910
- int n_ff_exp = hparams.n_ff_exp;
2895
+ const int64_t n_ff_exp = hparams.n_ff_exp;
2911
2896
 
2912
2897
  layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
2913
2898
  layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
@@ -2994,8 +2979,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2994
2979
  } break;
2995
2980
  case LLM_ARCH_MINICPM3:
2996
2981
  {
2997
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
2998
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
2982
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
2983
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
2999
2984
 
3000
2985
  const int64_t q_lora_rank = hparams.n_lora_q;
3001
2986
  const int64_t kv_lora_rank = hparams.n_lora_kv;
@@ -3038,7 +3023,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3038
3023
  case LLM_ARCH_GROK:
3039
3024
  {
3040
3025
  if (n_expert == 0) {
3041
- throw std::runtime_error("Grok model cannot have zero experts");
3026
+ throw std::runtime_error(arch_name() + " model cannot have zero experts");
3042
3027
  }
3043
3028
 
3044
3029
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3210,6 +3195,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3210
3195
  case LLM_ARCH_NOMIC_BERT_MOE:
3211
3196
  case LLM_ARCH_JINA_BERT_V3:
3212
3197
  {
3198
+ if (n_token_types == 0) {
3199
+ throw std::runtime_error(arch_name() + " model needs to define token type count");
3200
+ }
3213
3201
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3214
3202
  type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
3215
3203
 
@@ -3294,9 +3282,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3294
3282
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3295
3283
  }
3296
3284
 
3297
- cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
3298
- cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3299
- cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3285
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3286
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
3287
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
3288
+ cls_norm = create_tensor(tn(LLM_TENSOR_CLS_NORM, "weight"), {n_embd}, TENSOR_NOT_REQUIRED);
3300
3289
 
3301
3290
  } break;
3302
3291
  case LLM_ARCH_NEO_BERT:
@@ -3325,6 +3314,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3325
3314
  layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3326
3315
  }
3327
3316
  } break;
3317
+ case LLM_ARCH_EUROBERT:
3318
+ {
3319
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3320
+
3321
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3322
+
3323
+ for (int i = 0; i < n_layer; ++i) {
3324
+ auto & layer = layers[i];
3325
+
3326
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3327
+
3328
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3329
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3330
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3331
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3332
+
3333
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3334
+
3335
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3336
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3337
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3338
+ }
3339
+ } break;
3328
3340
  case LLM_ARCH_JINA_BERT_V2:
3329
3341
  {
3330
3342
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
@@ -3452,8 +3464,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3452
3464
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3453
3465
  layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
3454
3466
 
3455
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3456
- layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3467
+ // FIXME test-llama-archs crashes if q_norm is created
3468
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
3469
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
3457
3470
 
3458
3471
  layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
3459
3472
  layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
@@ -3839,8 +3852,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3839
3852
  const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
3840
3853
 
3841
3854
  // attention parameters
3842
- const uint32_t qk_dim = hparams.n_embd_head_k;
3843
- const uint32_t v_dim = hparams.n_embd_head_v;
3855
+ const uint32_t qk_dim = hparams.n_embd_head_k();
3856
+ const uint32_t v_dim = hparams.n_embd_head_v();
3844
3857
 
3845
3858
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3846
3859
 
@@ -3900,8 +3913,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3900
3913
  } break;
3901
3914
  case LLM_ARCH_PLAMO3:
3902
3915
  {
3903
- const int64_t head_dim_q = hparams.n_embd_head_k;
3904
- const int64_t head_dim_v = hparams.n_embd_head_v;
3916
+ const int64_t head_dim_q = hparams.n_embd_head_k();
3917
+ const int64_t head_dim_v = hparams.n_embd_head_v();
3905
3918
 
3906
3919
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3907
3920
 
@@ -4648,7 +4661,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4648
4661
  } break;
4649
4662
  case LLM_ARCH_SEED_OSS:
4650
4663
  {
4651
- const uint32_t head_dim = hparams.n_embd_head_k;
4664
+ const uint32_t head_dim = hparams.n_embd_head_k();
4652
4665
  const int64_t n_qo_dim = n_head * head_dim;
4653
4666
  const int64_t n_kv_dim = n_head_kv * head_dim;
4654
4667
 
@@ -4871,17 +4884,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4871
4884
  } break;
4872
4885
  case LLM_ARCH_DEEPSEEK2:
4873
4886
  {
4874
- // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
4875
- const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
4876
-
4877
- const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
4887
+ const bool is_mla = hparams.is_mla();
4878
4888
 
4879
4889
  // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
4880
- const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
4881
- const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
4890
+ const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
4891
+ const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
4882
4892
 
4883
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
4893
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
4884
4894
  const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
4895
+ GGML_ASSERT(n_embd_head_qk_nope >= 1);
4885
4896
 
4886
4897
  const int64_t q_lora_rank = hparams.n_lora_q;
4887
4898
  const int64_t kv_lora_rank = hparams.n_lora_kv;
@@ -4903,13 +4914,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4903
4914
  auto & layer = layers[i];
4904
4915
 
4905
4916
  layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4906
- if (!is_lite) {
4917
+ if (q_lora_rank > 0) {
4907
4918
  layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
4908
4919
  }
4909
4920
 
4910
4921
  layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
4911
4922
 
4912
- if (!is_lite) {
4923
+ if (q_lora_rank > 0) {
4913
4924
  layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
4914
4925
  layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
4915
4926
  } else {
@@ -4946,9 +4957,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4946
4957
  }
4947
4958
 
4948
4959
  // MoE branch
4949
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4950
4960
  layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
4951
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4961
+ create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
4952
4962
 
4953
4963
  // Shared expert branch
4954
4964
  layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
@@ -4959,8 +4969,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4959
4969
  } break;
4960
4970
  case LLM_ARCH_PLM:
4961
4971
  {
4962
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
4963
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
4972
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
4973
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
4964
4974
  const int64_t kv_lora_rank = hparams.n_lora_kv;
4965
4975
 
4966
4976
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5000,23 +5010,23 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5000
5010
  layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
5001
5011
 
5002
5012
  layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
5003
- layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5013
+ layer.wq_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5004
5014
  layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
5005
- layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5015
+ layer.wk_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5006
5016
  layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
5007
- layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5017
+ layer.wv_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5008
5018
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5009
- layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5019
+ layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5010
5020
 
5011
5021
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5012
5022
  layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
5013
5023
 
5014
5024
  layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5015
- layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5025
+ layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5016
5026
  layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
5017
- layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5027
+ layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5018
5028
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5019
- layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5029
+ layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
5020
5030
  }
5021
5031
  } break;
5022
5032
  case LLM_ARCH_T5:
@@ -5074,7 +5084,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5074
5084
 
5075
5085
  layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
5076
5086
  // this tensor seems to be unused in HF transformers implementation
5077
- layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
5087
+ layer.attn_rel_b_cross = create_tensor(
5088
+ tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
5078
5089
 
5079
5090
  layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5080
5091
  layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
@@ -5152,6 +5163,45 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5152
5163
  layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
5153
5164
  }
5154
5165
  } break;
5166
+ case LLM_ARCH_JAIS2:
5167
+ {
5168
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5169
+
5170
+ // output
5171
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5172
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
5173
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5174
+ if (!output) {
5175
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5176
+ }
5177
+
5178
+ for (int i = 0; i < n_layer; ++i) {
5179
+ auto & layer = layers[i];
5180
+
5181
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5182
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
5183
+
5184
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5185
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5186
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5187
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
5188
+
5189
+ // attention biases - all have shape n_embd (output dimension of projections)
5190
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
5191
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd}, 0);
5192
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd}, 0);
5193
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
5194
+
5195
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5196
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
5197
+
5198
+ // Jais-2 uses simple MLP (no gate) with biases
5199
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5200
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
5201
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
5202
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
5203
+ }
5204
+ } break;
5155
5205
  case LLM_ARCH_CHATGLM:
5156
5206
  {
5157
5207
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5202,30 +5252,48 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5202
5252
  }
5203
5253
 
5204
5254
  for (int i = 0; i < n_layer; ++i) {
5255
+ int flags = 0;
5256
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5257
+ // skip all tensors in the NextN layers
5258
+ flags |= TENSOR_SKIP;
5259
+ }
5260
+
5205
5261
  auto & layer = layers[i];
5206
5262
 
5207
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
5208
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
5209
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
5263
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
5264
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
5265
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
5210
5266
 
5211
5267
  if (layer.wqkv == nullptr) {
5212
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
5213
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
5214
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5215
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
5216
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5217
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
5268
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, flags);
5269
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, flags);
5270
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, flags);
5271
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
5272
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
5273
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
5218
5274
  }
5219
5275
 
5220
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5276
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
5221
5277
 
5222
- layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5278
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, flags);
5223
5279
 
5224
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
5225
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5226
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
5280
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
5281
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
5282
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, flags);
5227
5283
 
5228
- layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
5284
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);
5285
+
5286
+ // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
5287
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5288
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
5289
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
5290
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
5291
+
5292
+ // Optional tensors
5293
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
5294
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
5295
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
5296
+ }
5229
5297
  }
5230
5298
  } break;
5231
5299
  case LLM_ARCH_GLM4_MOE:
@@ -5329,6 +5397,108 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5329
5397
  }
5330
5398
  }
5331
5399
  break;
5400
+ case LLM_ARCH_GLM_DSA:
5401
+ {
5402
+ const bool is_mla = hparams.is_mla();
5403
+ if (!is_mla) {
5404
+ throw std::runtime_error("GLM_DSA architecture requires MLA");
5405
+ }
5406
+
5407
+ // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
5408
+ const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
5409
+ const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
5410
+
5411
+ const int64_t n_embd_head_qk_rope = hparams.n_rot();
5412
+ const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
5413
+
5414
+ const int64_t q_lora_rank = hparams.n_lora_q;
5415
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
5416
+
5417
+ const int64_t n_ff_exp = hparams.n_ff_exp;
5418
+ const int64_t n_expert_shared = hparams.n_expert_shared;
5419
+
5420
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5421
+
5422
+ // output
5423
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5424
+ // try to load output.weight, if not found, use token_embd (tied embeddings)
5425
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
5426
+ if (!output) {
5427
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5428
+ }
5429
+
5430
+ for (int i = 0; i < n_layer; ++i) {
5431
+ int flags = 0;
5432
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5433
+ // skip all tensors in the NextN layers
5434
+ // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
5435
+ flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
5436
+ }
5437
+
5438
+ auto & layer = layers[i];
5439
+
5440
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
5441
+ layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, flags);
5442
+ layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, flags);
5443
+
5444
+ layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, flags);
5445
+ layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, flags);
5446
+
5447
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, flags);
5448
+
5449
+ // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
5450
+ layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, flags);
5451
+ layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, flags);
5452
+
5453
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, flags);
5454
+
5455
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
5456
+
5457
+ // DSA indexer
5458
+ layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags);
5459
+ layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags);
5460
+ layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags);
5461
+ layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags);
5462
+ layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
5463
+ if (i < (int) hparams.n_layer_dense_lead) {
5464
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
5465
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
5466
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
5467
+ } else {
5468
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
5469
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
5470
+
5471
+ if (n_expert == 0) {
5472
+ throw std::runtime_error("n_expert must be > 0");
5473
+ }
5474
+ if (n_expert_used == 0) {
5475
+ throw std::runtime_error("n_expert_used must be > 0");
5476
+ }
5477
+
5478
+ // MoE branch
5479
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
5480
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
5481
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
5482
+
5483
+ // Shared expert branch
5484
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
5485
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, flags);
5486
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
5487
+ }
5488
+
5489
+ // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
5490
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5491
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
5492
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
5493
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
5494
+
5495
+ // Optional tensors
5496
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
5497
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
5498
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
5499
+ }
5500
+ }
5501
+ } break;
5332
5502
  case LLM_ARCH_NEMOTRON:
5333
5503
  {
5334
5504
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5377,6 +5547,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5377
5547
  const int64_t n_ssm_head = hparams.ssm_dt_rank;
5378
5548
  const int64_t n_group = hparams.ssm_n_group;
5379
5549
  const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
5550
+ const int64_t moe_n_embd = hparams.moe_latent_size > 0 ? hparams.moe_latent_size : n_embd;
5380
5551
 
5381
5552
  // embeddings
5382
5553
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5436,8 +5607,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5436
5607
  layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
5437
5608
 
5438
5609
  // MoE branch
5439
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
5440
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
5610
+ layer.ffn_latent_down = create_tensor(tn(LLM_TENSOR_FFN_LATENT_DOWN, "weight", i), {n_embd, moe_n_embd}, TENSOR_NOT_REQUIRED);
5611
+ layer.ffn_latent_up = create_tensor(tn(LLM_TENSOR_FFN_LATENT_UP, "weight", i), {moe_n_embd, n_embd}, TENSOR_NOT_REQUIRED);
5612
+
5613
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, moe_n_embd, n_expert}, 0);
5614
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {moe_n_embd, n_ff_exp, n_expert}, 0);
5441
5615
 
5442
5616
  // Shared expert branch
5443
5617
  layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
@@ -5504,16 +5678,94 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5504
5678
  layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
5505
5679
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
5506
5680
 
5507
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5681
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
5682
+
5683
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5684
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5685
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5686
+
5687
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5688
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5689
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5690
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
5691
+ }
5692
+ } break;
5693
+ case LLM_ARCH_EXAONE_MOE:
5694
+ {
5695
+ const int64_t n_ff_exp = hparams.n_ff_exp;
5696
+ const int64_t n_expert = hparams.n_expert;
5697
+ const int64_t n_expert_used = hparams.n_expert_used;
5698
+ const int64_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : n_ff_exp;
5699
+ const int64_t head_dim = hparams.n_embd_head_k();
5700
+ const int64_t n_qo_dim = n_head * head_dim;
5701
+ const int64_t n_kv_dim = n_head_kv * head_dim;
5702
+
5703
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
5704
+
5705
+ // output
5706
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
5707
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
5708
+
5709
+ if (output == NULL) {
5710
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
5711
+ }
5712
+
5713
+ for (int i = 0; i < n_layer; ++i) {
5714
+ int flags = 0;
5715
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5716
+ // skip all tensors in the NextN layers
5717
+ flags |= TENSOR_SKIP;
5718
+ }
5719
+
5720
+ auto & layer = layers[i];
5721
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, flags);
5722
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, flags);
5723
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, flags);
5724
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, flags);
5725
+
5726
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0) | flags);
5727
+
5728
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
5729
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
5730
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
5731
+
5732
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
5733
+
5734
+ // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end
5735
+ if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
5736
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
5737
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
5738
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
5739
+ } else {
5740
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
5741
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
5742
+
5743
+ if (n_expert == 0) {
5744
+ throw std::runtime_error("n_expert must be > 0");
5745
+ }
5746
+ if (n_expert_used == 0) {
5747
+ throw std::runtime_error("n_expert_used must be > 0");
5748
+ }
5749
+
5750
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
5751
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
5752
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
5508
5753
 
5509
- layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
5510
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
5511
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
5754
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
5755
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
5756
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
5757
+ }
5512
5758
 
5513
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
5514
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
5515
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
5516
- layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
5759
+ // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
5760
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
5761
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
5762
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags);
5763
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags);
5764
+
5765
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
5766
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
5767
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
5768
+ }
5517
5769
  }
5518
5770
  } break;
5519
5771
  case LLM_ARCH_RWKV6:
@@ -5806,9 +6058,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5806
6058
  } break;
5807
6059
  case LLM_ARCH_WAVTOKENIZER_DEC:
5808
6060
  {
5809
- tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
6061
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd, n_vocab}, 0);
5810
6062
 
5811
- conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
6063
+ conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd, hparams.posnet.n_embd}, 0);
5812
6064
  conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
5813
6065
 
5814
6066
  // posnet
@@ -5904,8 +6156,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
5904
6156
  output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
5905
6157
  }
5906
6158
 
5907
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
5908
- output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
6159
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, hparams.n_embd_out()}, 0);
6160
+ output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {hparams.n_embd_out()}, 0);
5909
6161
  } break;
5910
6162
  case LLM_ARCH_BAILINGMOE:
5911
6163
  {
@@ -6161,6 +6413,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6161
6413
  } break;
6162
6414
  case LLM_ARCH_ERNIE4_5:
6163
6415
  case LLM_ARCH_ERNIE4_5_MOE:
6416
+ case LLM_ARCH_PADDLEOCR:
6164
6417
  {
6165
6418
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6166
6419
 
@@ -6303,6 +6556,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6303
6556
 
6304
6557
  for (int i = 0; i < n_layer; ++i) {
6305
6558
  auto & layer = layers[i];
6559
+ const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
6306
6560
 
6307
6561
  layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6308
6562
 
@@ -6321,9 +6575,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6321
6575
  layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
6322
6576
  layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
6323
6577
 
6324
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
6325
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
6326
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
6578
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
6579
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
6580
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
6327
6581
  }
6328
6582
  } break;
6329
6583
  case LLM_ARCH_HUNYUAN_DENSE:
@@ -6481,7 +6735,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6481
6735
  }
6482
6736
 
6483
6737
  // for LFM2-ColBert-350M
6484
- dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
6738
+ dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
6739
+ dense_2_out_layers_b = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "bias"), {hparams.n_embd_out() }, TENSOR_NOT_REQUIRED);
6485
6740
  } break;
6486
6741
  case LLM_ARCH_SMALLTHINKER:
6487
6742
  {
@@ -6637,6 +6892,141 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6637
6892
  layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
6638
6893
  }
6639
6894
  } break;
6895
+ case LLM_ARCH_KIMI_LINEAR:
6896
+ {
6897
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
6898
+
6899
+ // output
6900
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
6901
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
6902
+
6903
+ for (int i = 0; i < n_layer; ++i) {
6904
+ auto & layer = layers[i];
6905
+
6906
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
6907
+
6908
+ // Check for KDA specific tensors to determine layer type or if it's a mixed model
6909
+ // Assuming KDA layer if KDA tensors are present
6910
+
6911
+ // KDA uses head_dim = 128 (from linear_attn_config.head_dim)
6912
+ const int64_t n_embd_head_k_kda = hparams.n_embd_head_kda;
6913
+ const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda;
6914
+ const int64_t ssm_d_conv = hparams.ssm_d_conv;
6915
+
6916
+ if (hparams.is_recurrent(i)) {
6917
+ // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
6918
+ // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
6919
+ layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
6920
+ if (!layer.ssm_q_conv) {
6921
+ layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
6922
+ }
6923
+
6924
+ // KDA Layer - Conv1d weights may be 3D or 4D
6925
+ layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
6926
+ if (!layer.ssm_k_conv) {
6927
+ layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
6928
+ }
6929
+ layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
6930
+ if (!layer.ssm_v_conv) {
6931
+ layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head}, 0);
6932
+ }
6933
+
6934
+ // q, k, v projections
6935
+ // Python: q_proj, k_proj, v_proj
6936
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0);
6937
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0);
6938
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_v_kda * n_head}, 0);
6939
+
6940
+ // KDA specific projections
6941
+ // f_a_proj, f_b_proj
6942
+ layer.ssm_f_a = create_tensor(tn(LLM_TENSOR_SSM_F_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); // head_dim
6943
+ layer.ssm_f_b = create_tensor(tn(LLM_TENSOR_SSM_F_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0); // projection_size
6944
+
6945
+ // b_proj (beta mixing coefficient)
6946
+ layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), {n_embd, n_head}, 0);
6947
+
6948
+ // A_log - Shape in GGUF: [1, num_heads, 1, 1] (4D) or [1, num_heads] (2D after quantization) Note: -exp(A_log) is applied in convert_hf_to_gguf.py
6949
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head, 1, 1}, TENSOR_NOT_REQUIRED);
6950
+ if (!layer.ssm_a) {
6951
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
6952
+ }
6953
+
6954
+ // dt_bias - shape [n_embd_head_k_kda * n_head] = [4096]
6955
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_embd_head_k_kda * n_head}, 0);
6956
+
6957
+ // g_a_proj, g_b_proj (output gate)
6958
+ layer.ssm_g_a = create_tensor(tn(LLM_TENSOR_SSM_G_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0);
6959
+ layer.ssm_g_b = create_tensor(tn(LLM_TENSOR_SSM_G_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0);
6960
+
6961
+ // o_norm (reusing SSM_NORM)
6962
+ layer.ssm_o_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {n_embd_head_k_kda}, 0); // FusedRMSNormGated
6963
+
6964
+ // o_proj
6965
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v_kda * n_head, n_embd}, 0);
6966
+
6967
+ } else {
6968
+ // MLA Layer - use MLA-specific head dimensions
6969
+ const int64_t q_lora_rank = hparams.n_lora_q;
6970
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
6971
+ const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
6972
+ const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
6973
+
6974
+ layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED);
6975
+ layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
6976
+
6977
+ if (layer.attn_q_a_norm) {
6978
+ layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
6979
+ layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
6980
+ } else {
6981
+ // Kimi MLA without Q compression: wq = [n_embd, n_head * n_embd_head_k_mla]
6982
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
6983
+ }
6984
+
6985
+ // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
6986
+ // Note: hparams.n_rot may be 72 (from conversion) but actual is 64
6987
+ const int64_t qk_rope_head_dim = hparams.n_rot(); // From config: qk_rope_head_dim
6988
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
6989
+ // Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled)
6990
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i),
6991
+ {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
6992
+ if (!layer.wkv_b) { // MLA KV cache enabled
6993
+ layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_k_mla - qk_rope_head_dim, kv_lora_rank, n_head}, 0);
6994
+ layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
6995
+ }
6996
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
6997
+ }
6998
+
6999
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7000
+
7001
+ // MoE intermediate size (different from dense FFN)
7002
+ const int64_t n_ff_exp = hparams.n_ff_exp;
7003
+
7004
+ // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE
7005
+ // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE
7006
+ if (i < (int) hparams.n_layer_dense_lead) {
7007
+ // Dense FFN layer - use normal n_ff
7008
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
7009
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
7010
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
7011
+ } else {
7012
+ // MoE layer - use n_ff_exp (1024) instead of n_ff (9216)
7013
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
7014
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
7015
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
7016
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
7017
+
7018
+ // Shared experts use moe_intermediate_size * num_shared_experts
7019
+ // Kimi: shared_expert_intermediate_size = 1024 * 1 = 1024
7020
+ // Tensors are 2D: [n_embd, n_ff_shexp] or [n_ff_shexp, n_embd]
7021
+ const int64_t n_ff_shexp_actual = n_ff_exp * (hparams.n_expert_shared > 0 ? hparams.n_expert_shared : 1);
7022
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);
7023
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp_actual, n_embd}, TENSOR_NOT_REQUIRED);
7024
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);
7025
+
7026
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
7027
+ }
7028
+ }
7029
+ } break;
6640
7030
  case LLM_ARCH_COGVLM:
6641
7031
  {
6642
7032
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -6718,6 +7108,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6718
7108
  } break;
6719
7109
  case LLM_ARCH_QWEN3NEXT:
6720
7110
  {
7111
+ if (n_expert == 0) {
7112
+ throw std::runtime_error(arch_name() + " model cannot have zero experts");
7113
+ }
7114
+
6721
7115
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
6722
7116
 
6723
7117
  // output
@@ -6746,6 +7140,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6746
7140
 
6747
7141
  for (int i = 0; i < n_layer; ++i) {
6748
7142
  auto & layer = layers[i];
7143
+ const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
6749
7144
 
6750
7145
  layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
6751
7146
  layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
@@ -6776,15 +7171,138 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6776
7171
  }
6777
7172
 
6778
7173
  layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
6779
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
6780
7174
  layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
6781
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
7175
+ create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
7176
+
7177
+ // Shared experts
7178
+ layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
7179
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
7180
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
7181
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
7182
+ }
7183
+ } break;
7184
+ case LLM_ARCH_QWEN35MOE:
7185
+ {
7186
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
7187
+
7188
+ // output
7189
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
7190
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
7191
+
7192
+ // if output is NULL, init from the input tok embed
7193
+ if (output == NULL) {
7194
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
7195
+ }
7196
+
7197
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
7198
+
7199
+ // Calculate dimensions from hyperparameters
7200
+ const int64_t head_k_dim = hparams.ssm_d_state;
7201
+ const int64_t head_v_dim = hparams.ssm_d_state;
7202
+ const int64_t n_k_heads = hparams.ssm_n_group;
7203
+ const int64_t n_v_heads = hparams.ssm_dt_rank;
7204
+ const int64_t key_dim = head_k_dim * n_k_heads;
7205
+ const int64_t value_dim = head_v_dim * n_v_heads;
7206
+ const int64_t conv_dim = key_dim * 2 + value_dim;
7207
+
7208
+ for (int i = 0; i < n_layer; ++i) {
7209
+ auto & layer = layers[i];
7210
+
7211
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
7212
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
7213
+
7214
+ if (!hparams.is_recurrent(i)) {
7215
+ // Attention layers
7216
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
7217
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
7218
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
7219
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
7220
+
7221
+ // Q/K normalization for attention layers
7222
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
7223
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
7224
+ } else {
7225
+ // Linear attention (gated delta net) specific tensors
7226
+ // Create tensors with calculated dimensions
7227
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
7228
+ layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
7229
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
7230
+ layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
7231
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
7232
+ layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
7233
+ layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
7234
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
7235
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
7236
+ }
7237
+
7238
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
7239
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
7240
+ create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
6782
7241
 
6783
7242
  // Shared experts
7243
+ const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
7244
+
6784
7245
  layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
6785
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
6786
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
6787
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
7246
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
7247
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
7248
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
7249
+ }
7250
+ } break;
7251
+ case LLM_ARCH_QWEN35:
7252
+ {
7253
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
7254
+
7255
+ // output
7256
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
7257
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
7258
+
7259
+ // if output is NULL, init from the input tok embed
7260
+ if (output == NULL) {
7261
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
7262
+ }
7263
+
7264
+ // Calculate dimensions from hyperparameters
7265
+ const int64_t head_k_dim = hparams.ssm_d_state;
7266
+ const int64_t head_v_dim = hparams.ssm_d_state;
7267
+ const int64_t n_k_heads = hparams.ssm_n_group;
7268
+ const int64_t n_v_heads = hparams.ssm_dt_rank;
7269
+ const int64_t key_dim = head_k_dim * n_k_heads;
7270
+ const int64_t value_dim = head_v_dim * n_v_heads;
7271
+ const int64_t conv_dim = key_dim * 2 + value_dim;
7272
+
7273
+ for (int i = 0; i < n_layer; ++i) {
7274
+ auto & layer = layers[i];
7275
+
7276
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
7277
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
7278
+
7279
+ if (!hparams.is_recurrent(i)) {
7280
+ // Attention layers
7281
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
7282
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
7283
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
7284
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
7285
+
7286
+ // Q/K normalization for attention layers
7287
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
7288
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
7289
+ } else {
7290
+ // Linear attention (gated delta net) specific tensors
7291
+ // Create tensors with calculated dimensions
7292
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
7293
+ layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
7294
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
7295
+ layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
7296
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
7297
+ layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
7298
+ layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
7299
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
7300
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
7301
+ }
7302
+
7303
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
7304
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
7305
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
6788
7306
  }
6789
7307
  } break;
6790
7308
  case LLM_ARCH_MIMO2:
@@ -6825,6 +7343,72 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6825
7343
  layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
6826
7344
  }
6827
7345
  } break;
7346
+ case LLM_ARCH_STEP35:
7347
+ {
7348
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7349
+
7350
+ // output
7351
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
7352
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
7353
+
7354
+ // STEP35 supports per-layer partial RoPE dims; rope factors are stored as a single shared tensor
7355
+ // ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
7356
+ uint32_t n_rot_max = 0;
7357
+ for (int i = 0; i < n_layer; ++i) {
7358
+ n_rot_max = std::max(n_rot_max, hparams.n_rot(i));
7359
+ }
7360
+ if (n_rot_max == 0) {
7361
+ n_rot_max = n_rot;
7362
+ }
7363
+
7364
+ for (int i = 0; i < n_layer; ++i) {
7365
+ auto & layer = layers[i];
7366
+
7367
+ const uint32_t n_head_l = hparams.n_head(i);
7368
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
7369
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
7370
+
7371
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
7372
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
7373
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
7374
+
7375
+ // optional rope factors (llama3) / longrope tensors
7376
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
7377
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7378
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7379
+ } else {
7380
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
7381
+ }
7382
+
7383
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_l}, 0);
7384
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
7385
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
7386
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
7387
+
7388
+ // head-wise attention gate (Step35 self_attn.g_proj)
7389
+ layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED);
7390
+
7391
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7392
+
7393
+ // dense MLP (leading dense blocks)
7394
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
7395
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
7396
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
7397
+
7398
+ // MoE routed experts + selection bias (router_bias)
7399
+ const int64_t n_ff_exp = hparams.n_ff_exp;
7400
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
7401
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
7402
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
7403
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
7404
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
7405
+
7406
+ // shared expert MLP
7407
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
7408
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
7409
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED);
7410
+ }
7411
+ } break;
6828
7412
  case LLM_ARCH_MAINCODER:
6829
7413
  {
6830
7414
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -6860,10 +7444,72 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6860
7444
  throw std::runtime_error("unknown architecture");
6861
7445
  }
6862
7446
 
6863
- if (n_moved_tensors > 0) {
6864
- LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
6865
- __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
6866
- ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
7447
+ // generic pass: load optional per-tensor/per-expert ".scale" tensors (e.g. NVFP4 scale2)
7448
+ // this avoids having to add scale loading to every architecture
7449
+ for (int i = 0; i < n_layer; ++i) {
7450
+ auto & layer = layers[i];
7451
+
7452
+ // attention weight scales (per-tensor, shape {1})
7453
+ if (!layer.wq_s && layer.wq) {
7454
+ layer.wq_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7455
+ }
7456
+ if (!layer.wk_s && layer.wk) {
7457
+ layer.wk_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7458
+ }
7459
+ if (!layer.wv_s && layer.wv) {
7460
+ layer.wv_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7461
+ }
7462
+ if (!layer.wo_s && layer.wo) {
7463
+ layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7464
+ }
7465
+ if (!layer.wqkv_s && layer.wqkv) {
7466
+ layer.wqkv_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7467
+ }
7468
+ if (!layer.wqkv_gate_s && layer.wqkv_gate) {
7469
+ layer.wqkv_gate_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7470
+ }
7471
+
7472
+ // dense FFN weight scales (per-tensor, shape {1})
7473
+ if (!layer.ffn_gate_s && layer.ffn_gate) {
7474
+ layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7475
+ }
7476
+ if (!layer.ffn_down_s && layer.ffn_down) {
7477
+ layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7478
+ }
7479
+ if (!layer.ffn_up_s && layer.ffn_up) {
7480
+ layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7481
+ }
7482
+ if (!layer.ffn_gate_shexp_s && layer.ffn_gate_shexp) {
7483
+ layer.ffn_gate_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7484
+ }
7485
+ if (!layer.ffn_down_shexp_s && layer.ffn_down_shexp) {
7486
+ layer.ffn_down_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7487
+ }
7488
+ if (!layer.ffn_up_shexp_s && layer.ffn_up_shexp) {
7489
+ layer.ffn_up_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7490
+ }
7491
+
7492
+ // MoE expert weight scales (per-expert, shape {n_expert})
7493
+ if (!layer.ffn_gate_exps_s && layer.ffn_gate_exps) {
7494
+ layer.ffn_gate_exps_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
7495
+ }
7496
+ if (!layer.ffn_down_exps_s && layer.ffn_down_exps) {
7497
+ layer.ffn_down_exps_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
7498
+ }
7499
+ if (!layer.ffn_up_exps_s && layer.ffn_up_exps) {
7500
+ layer.ffn_up_exps_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
7501
+ }
7502
+
7503
+ // recurrent / linear-attention weight scales (per-tensor, shape {1})
7504
+ if (!layer.ssm_out_s && layer.ssm_out) {
7505
+ layer.ssm_out_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7506
+ }
7507
+ if (!layer.ssm_alpha_s && layer.ssm_alpha) {
7508
+ layer.ssm_alpha_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7509
+ }
7510
+ if (!layer.ssm_beta_s && layer.ssm_beta) {
7511
+ layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
7512
+ }
6867
7513
  }
6868
7514
  }
6869
7515
 
@@ -6874,13 +7520,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
6874
7520
 
6875
7521
  // create the backend buffers
6876
7522
  std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
6877
- ctx_buf_maps.reserve(ctx_map.size());
7523
+ ctx_buf_maps.reserve(ml.ctx_map.size());
6878
7524
 
6879
7525
  // Ensure we have enough capacity for the maximum backend buffer we will potentially create
6880
- const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
7526
+ const size_t n_max_backend_buffer = ml.ctx_map.size() * ml.files.size();
6881
7527
  pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
6882
7528
 
6883
- for (auto & [buft, ctx_ptr] : ctx_map) {
7529
+ for (auto & [buft, ctx_ptr] : ml.ctx_map) {
6884
7530
  ggml_context * ctx = ctx_ptr.get();
6885
7531
 
6886
7532
  // skip contexts without tensors
@@ -7101,59 +7747,62 @@ void llama_model::print_info() const {
7101
7747
  };
7102
7748
 
7103
7749
  // hparams
7104
- LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
7105
- LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
7106
- LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
7750
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
7751
+ LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
7752
+ LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
7107
7753
 
7108
7754
  if (!hparams.vocab_only) {
7109
- LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
7110
- LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
7111
- LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
7112
- LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
7113
- LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
7114
- LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
7115
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
7116
- LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
7117
- LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
7118
- LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
7119
- LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
7120
- LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
7121
- LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
7122
- LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
7123
- LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
7124
- LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
7125
- LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
7126
- LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
7127
- LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
7128
- LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
7129
- LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
7130
- LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
7131
- LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
7132
- LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
7133
- LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
7134
- LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
7135
- LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
7136
- LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
7137
- LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
7138
- LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
7139
- LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
7755
+ LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
7756
+ LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
7757
+ LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
7758
+ LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
7759
+ LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
7760
+ LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
7761
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full);
7762
+ LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
7763
+ LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
7764
+ LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k_full);
7765
+ LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v_full);
7766
+ LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
7767
+ LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
7768
+ LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
7769
+ LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
7770
+ LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
7771
+ LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
7772
+ LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
7773
+ LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
7774
+ LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
7775
+ LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
7776
+ LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
7777
+ LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
7778
+ LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
7779
+ LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
7780
+ LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
7781
+ LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
7782
+ LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
7783
+ LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
7784
+ LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
7785
+ LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
7140
7786
  if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
7141
- LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
7142
- LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
7787
+ LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
7788
+ LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
7789
+ LLAMA_LOG_INFO("%s: n_embd_head_k_swa = %u\n", __func__, hparams.n_embd_head_k_swa);
7790
+ LLAMA_LOG_INFO("%s: n_embd_head_v_swa = %u\n", __func__, hparams.n_embd_head_v_swa);
7791
+ LLAMA_LOG_INFO("%s: n_rot_swa = %u\n", __func__, hparams.n_rot_swa);
7143
7792
  }
7144
- LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
7145
- LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
7146
- LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
7793
+ LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
7794
+ LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
7795
+ LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
7147
7796
  // MRoPE (Multi-axis Rotary Position Embedding) sections
7148
7797
  if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
7149
- LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
7798
+ LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
7150
7799
  }
7151
7800
  if (!classifier_labels.empty()) {
7152
- LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
7801
+ LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
7153
7802
 
7154
7803
  size_t i = 0;
7155
7804
  for (auto label : classifier_labels) {
7156
- LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
7805
+ LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
7157
7806
  }
7158
7807
  }
7159
7808
  }
@@ -7165,57 +7814,59 @@ void llama_model::print_info() const {
7165
7814
  arch == LLM_ARCH_PLAMO2 ||
7166
7815
  arch == LLM_ARCH_GRANITE_HYBRID ||
7167
7816
  arch == LLM_ARCH_QWEN3NEXT ||
7817
+ arch == LLM_ARCH_QWEN35 ||
7818
+ arch == LLM_ARCH_QWEN35MOE ||
7168
7819
  arch == LLM_ARCH_NEMOTRON_H ||
7169
7820
  arch == LLM_ARCH_NEMOTRON_H_MOE) {
7170
- LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
7171
- LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
7172
- LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
7173
- LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
7174
- LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
7175
- LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
7821
+ LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
7822
+ LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
7823
+ LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
7824
+ LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
7825
+ LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
7826
+ LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
7176
7827
  }
7177
7828
 
7178
- LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
7829
+ LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
7179
7830
  if (pimpl->n_elements >= 1e12) {
7180
- LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
7831
+ LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
7181
7832
  } else if (pimpl->n_elements >= 1e9) {
7182
- LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
7833
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
7183
7834
  } else if (pimpl->n_elements >= 1e6) {
7184
- LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
7835
+ LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
7185
7836
  } else {
7186
- LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
7837
+ LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
7187
7838
  }
7188
7839
 
7189
7840
  // general kv
7190
- LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
7841
+ LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
7191
7842
 
7192
7843
  if (arch == LLM_ARCH_DEEPSEEK) {
7193
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7194
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7195
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7196
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7844
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7845
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7846
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7847
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7197
7848
  }
7198
7849
 
7199
- if (arch == LLM_ARCH_DEEPSEEK2) {
7200
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7201
- LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
7202
- LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
7203
- LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
7204
- LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
7205
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7206
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7207
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7208
- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7209
- LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7850
+ if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA) {
7851
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7852
+ LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
7853
+ LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
7854
+ LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla());
7855
+ LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla());
7856
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7857
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7858
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7859
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7860
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7210
7861
  }
7211
7862
 
7212
7863
  if (arch == LLM_ARCH_QWEN2MOE) {
7213
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7214
- LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7864
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7865
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7215
7866
  }
7216
7867
 
7217
7868
  if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
7218
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7869
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7219
7870
  }
7220
7871
 
7221
7872
  if (arch == LLM_ARCH_MINICPM ||
@@ -7223,41 +7874,41 @@ void llama_model::print_info() const {
7223
7874
  arch == LLM_ARCH_GRANITE_MOE ||
7224
7875
  arch == LLM_ARCH_GRANITE_HYBRID ||
7225
7876
  arch == LLM_ARCH_NEMOTRON_H_MOE) {
7226
- LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
7227
- LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
7228
- LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
7229
- LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7877
+ LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
7878
+ LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
7879
+ LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
7880
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7230
7881
  }
7231
7882
 
7232
7883
  if (arch == LLM_ARCH_BAILINGMOE) {
7233
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7234
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7235
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7236
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7237
- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7884
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7885
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7886
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7887
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7888
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7238
7889
  }
7239
7890
 
7240
7891
  if (arch == LLM_ARCH_BAILINGMOE2) {
7241
- LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7242
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7243
- LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7244
- LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7245
- LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7246
- LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7247
- LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7248
- LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
7892
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7893
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7894
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
7895
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7896
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7897
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
7898
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7899
+ LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
7249
7900
  }
7250
7901
 
7251
7902
  if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
7252
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7253
- LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7903
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7904
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
7254
7905
  }
7255
7906
 
7256
7907
  if (arch == LLM_ARCH_GROVEMOE) {
7257
- LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7258
- LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
7259
- LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
7260
- LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
7908
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7909
+ LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
7910
+ LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
7911
+ LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
7261
7912
  }
7262
7913
 
7263
7914
  vocab.print_info();
@@ -7372,6 +8023,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
7372
8023
  case LLM_ARCH_NOMIC_BERT:
7373
8024
  case LLM_ARCH_NOMIC_BERT_MOE:
7374
8025
  case LLM_ARCH_NEO_BERT:
8026
+ case LLM_ARCH_EUROBERT:
7375
8027
  case LLM_ARCH_WAVTOKENIZER_DEC:
7376
8028
  case LLM_ARCH_MODERN_BERT:
7377
8029
  case LLM_ARCH_GEMMA_EMBEDDING:
@@ -7396,7 +8048,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
7396
8048
  cparams.n_seq_max,
7397
8049
  nullptr);
7398
8050
  } else if (llm_arch_is_hybrid(arch)) {
7399
-
7400
8051
  // The main difference between hybrid architectures is the
7401
8052
  // layer filters, so pick the right one here
7402
8053
  llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
@@ -7413,23 +8064,44 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
7413
8064
  };
7414
8065
  }
7415
8066
 
7416
- res = new llama_memory_hybrid(
7417
- /* model */ *this,
7418
- /* attn_type_k */ params.type_k,
7419
- /* attn_type_v */ params.type_v,
7420
- /* attn_v_trans */ !cparams.flash_attn,
7421
- /* attn_kv_size */ cparams.n_ctx,
7422
- /* attn_n_pad */ 1,
7423
- /* attn_n_swa */ hparams.n_swa,
7424
- /* attn_swa_type */ hparams.swa_type,
7425
- /* recurrent_type_k */ GGML_TYPE_F32,
7426
- /* recurrent_type_v */ GGML_TYPE_F32,
7427
- /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
7428
- /* n_seq_max */ cparams.n_seq_max,
7429
- /* offload */ cparams.offload_kqv,
7430
- /* unified */ cparams.kv_unified,
7431
- /* filter_attn */ std::move(filter_attn),
7432
- /* filter_recr */ std::move(filter_recr));
8067
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
8068
+ // Use hybrid-iswa for hybrid models with SWA
8069
+ res = new llama_memory_hybrid_iswa(
8070
+ /* model */ *this,
8071
+ /* attn_type_k */ params.type_k,
8072
+ /* attn_type_v */ params.type_v,
8073
+ /* attn_v_trans */ !cparams.flash_attn,
8074
+ /* attn_swa_full */ params.swa_full,
8075
+ /* attn_kv_size */ cparams.n_ctx_seq,
8076
+ /* attn_n_ubatch */ cparams.n_ubatch,
8077
+ /* attn_n_pad */ 1,
8078
+ /* recurrent_type_r */ GGML_TYPE_F32,
8079
+ /* recurrent_type_s */ GGML_TYPE_F32,
8080
+ /* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max),
8081
+ /* n_seq_max */ cparams.n_seq_max,
8082
+ /* offload */ cparams.offload_kqv,
8083
+ /* unified */ cparams.kv_unified,
8084
+ /* filter_attn */ std::move(filter_attn),
8085
+ /* filter_recr */ std::move(filter_recr));
8086
+ } else {
8087
+ res = new llama_memory_hybrid(
8088
+ /* model */ *this,
8089
+ /* attn_type_k */ params.type_k,
8090
+ /* attn_type_v */ params.type_v,
8091
+ /* attn_v_trans */ !cparams.flash_attn,
8092
+ /* attn_kv_size */ cparams.n_ctx_seq,
8093
+ /* attn_n_pad */ 1,
8094
+ /* attn_n_swa */ hparams.n_swa,
8095
+ /* attn_swa_type */ hparams.swa_type,
8096
+ /* recurrent_type_k */ GGML_TYPE_F32,
8097
+ /* recurrent_type_v */ GGML_TYPE_F32,
8098
+ /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
8099
+ /* n_seq_max */ cparams.n_seq_max,
8100
+ /* offload */ cparams.offload_kqv,
8101
+ /* unified */ cparams.kv_unified,
8102
+ /* filter_attn */ std::move(filter_attn),
8103
+ /* filter_recr */ std::move(filter_recr));
8104
+ }
7433
8105
  } else {
7434
8106
  llama_memory_i::layer_reuse_cb reuse = nullptr;
7435
8107
 
@@ -7549,6 +8221,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
7549
8221
  {
7550
8222
  llm = std::make_unique<llm_build_neo_bert>(*this, params);
7551
8223
  } break;
8224
+ case LLM_ARCH_EUROBERT:
8225
+ {
8226
+ llm = std::make_unique<llm_build_eurobert>(*this, params);
8227
+ } break;
7552
8228
  case LLM_ARCH_BLOOM:
7553
8229
  {
7554
8230
  llm = std::make_unique<llm_build_bloom>(*this, params);
@@ -7748,6 +8424,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
7748
8424
  llm = std::make_unique<llm_build_deepseek>(*this, params);
7749
8425
  } break;
7750
8426
  case LLM_ARCH_DEEPSEEK2:
8427
+ case LLM_ARCH_GLM_DSA:
7751
8428
  {
7752
8429
  llm = std::make_unique<llm_build_deepseek2>(*this, params);
7753
8430
  } break;
@@ -7790,6 +8467,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
7790
8467
  {
7791
8468
  llm = std::make_unique<llm_build_jais>(*this, params);
7792
8469
  } break;
8470
+ case LLM_ARCH_JAIS2:
8471
+ {
8472
+ llm = std::make_unique<llm_build_jais2>(*this, params);
8473
+ } break;
7793
8474
  case LLM_ARCH_NEMOTRON:
7794
8475
  {
7795
8476
  llm = std::make_unique<llm_build_nemotron>(*this, params);
@@ -7811,6 +8492,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
7811
8492
  llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
7812
8493
  }
7813
8494
  } break;
8495
+ case LLM_ARCH_EXAONE_MOE:
8496
+ {
8497
+ llm = std::make_unique<llm_build_exaone_moe>(*this, params);
8498
+ } break;
7814
8499
  case LLM_ARCH_RWKV6:
7815
8500
  {
7816
8501
  llm = std::make_unique<llm_build_rwkv6>(*this, params);
@@ -7881,6 +8566,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
7881
8566
  {
7882
8567
  llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
7883
8568
  } break;
8569
+ case LLM_ARCH_PADDLEOCR:
8570
+ {
8571
+ llm = std::make_unique<llm_build_paddleocr>(*this, params);
8572
+ } break;
7884
8573
  case LLM_ARCH_HUNYUAN_MOE:
7885
8574
  {
7886
8575
  llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
@@ -7904,7 +8593,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
7904
8593
  case LLM_ARCH_LFM2:
7905
8594
  case LLM_ARCH_LFM2MOE:
7906
8595
  {
7907
- llm = std::make_unique<llm_build_lfm2>(*this, params);
8596
+ if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
8597
+ llm = std::make_unique<llm_build_lfm2<true>>(*this, params);
8598
+ } else {
8599
+ llm = std::make_unique<llm_build_lfm2<false>>(*this, params);
8600
+ }
7908
8601
  } break;
7909
8602
  case LLM_ARCH_SMALLTHINKER:
7910
8603
  {
@@ -7938,6 +8631,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
7938
8631
  {
7939
8632
  llm = std::make_unique<llm_build_qwen3next>(*this, params);
7940
8633
  } break;
8634
+ case LLM_ARCH_QWEN35:
8635
+ {
8636
+ llm = std::make_unique<llm_build_qwen35>(*this, params);
8637
+ } break;
8638
+ case LLM_ARCH_QWEN35MOE:
8639
+ {
8640
+ llm = std::make_unique<llm_build_qwen35moe>(*this, params);
8641
+ } break;
7941
8642
  case LLM_ARCH_MISTRAL3:
7942
8643
  {
7943
8644
  llm = std::make_unique<llm_build_mistral3>(*this, params);
@@ -7946,12 +8647,20 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
7946
8647
  {
7947
8648
  llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
7948
8649
  } break;
8650
+ case LLM_ARCH_KIMI_LINEAR:
8651
+ {
8652
+ llm = std::make_unique<llm_build_kimi_linear>(*this, params);
8653
+ } break;
8654
+ case LLM_ARCH_STEP35:
8655
+ {
8656
+ llm = std::make_unique<llm_build_step35_iswa>(*this, params);
8657
+ } break;
7949
8658
  default:
7950
8659
  GGML_ABORT("fatal error");
7951
8660
  }
7952
8661
 
7953
8662
  // add on pooling layer
7954
- llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
8663
+ llm->build_pooling(cls, cls_b, cls_out, cls_out_b, cls_norm);
7955
8664
 
7956
8665
  // add backend sampling layers (if any)
7957
8666
  llm->build_sampling();
@@ -7960,7 +8669,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
7960
8669
  // there will be two additional dense projection layers
7961
8670
  // dense linear projections are applied after pooling
7962
8671
  // TODO: move reranking logic here and generalize
7963
- llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
8672
+ llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers);
7964
8673
 
7965
8674
  llm->res->set_outputs();
7966
8675
 
@@ -7985,7 +8694,7 @@ llama_model_params llama_model_default_params() {
7985
8694
  /*.kv_overrides =*/ nullptr,
7986
8695
  /*.vocab_only =*/ false,
7987
8696
  /*.use_mmap =*/ true,
7988
- /*.use_direct_io =*/ true,
8697
+ /*.use_direct_io =*/ false,
7989
8698
  /*.use_mlock =*/ false,
7990
8699
  /*.check_tensors =*/ false,
7991
8700
  /*.use_extra_bufts =*/ true,
@@ -8021,7 +8730,7 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
8021
8730
  }
8022
8731
 
8023
8732
  int32_t llama_model_n_embd_out(const llama_model * model) {
8024
- return model->hparams.get_n_embd_out();
8733
+ return model->hparams.n_embd_out();
8025
8734
  }
8026
8735
 
8027
8736
  int32_t llama_model_n_layer(const llama_model * model) {
@@ -8095,6 +8804,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
8095
8804
  case LLM_ARCH_WAVTOKENIZER_DEC:
8096
8805
  case LLM_ARCH_NEMOTRON_H:
8097
8806
  case LLM_ARCH_NEMOTRON_H_MOE:
8807
+ case LLM_ARCH_KIMI_LINEAR:
8098
8808
  return LLAMA_ROPE_TYPE_NONE;
8099
8809
 
8100
8810
  // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -8128,6 +8838,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
8128
8838
  case LLM_ARCH_MISTRAL3:
8129
8839
  case LLM_ARCH_LLAMA_EMBED:
8130
8840
  case LLM_ARCH_MAINCODER:
8841
+ case LLM_ARCH_GLM_DSA:
8131
8842
  return LLAMA_ROPE_TYPE_NORM;
8132
8843
 
8133
8844
  // the pairs of head values are offset by n_rot/2
@@ -8140,6 +8851,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
8140
8851
  case LLM_ARCH_MODERN_BERT:
8141
8852
  case LLM_ARCH_NOMIC_BERT:
8142
8853
  case LLM_ARCH_NOMIC_BERT_MOE:
8854
+ case LLM_ARCH_EUROBERT:
8143
8855
  case LLM_ARCH_STABLELM:
8144
8856
  case LLM_ARCH_BITNET:
8145
8857
  case LLM_ARCH_QWEN:
@@ -8171,10 +8883,12 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
8171
8883
  case LLM_ARCH_NEMOTRON:
8172
8884
  case LLM_ARCH_EXAONE:
8173
8885
  case LLM_ARCH_EXAONE4:
8886
+ case LLM_ARCH_EXAONE_MOE:
8174
8887
  case LLM_ARCH_MINICPM3:
8175
8888
  case LLM_ARCH_BAILINGMOE2:
8176
8889
  case LLM_ARCH_DOTS1:
8177
8890
  case LLM_ARCH_HUNYUAN_MOE:
8891
+ case LLM_ARCH_JAIS2:
8178
8892
  case LLM_ARCH_OPENAI_MOE:
8179
8893
  case LLM_ARCH_HUNYUAN_DENSE:
8180
8894
  case LLM_ARCH_LFM2:
@@ -8189,12 +8903,16 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
8189
8903
  case LLM_ARCH_AFMOE:
8190
8904
  case LLM_ARCH_QWEN3NEXT:
8191
8905
  case LLM_ARCH_MIMO2:
8906
+ case LLM_ARCH_STEP35:
8192
8907
  return LLAMA_ROPE_TYPE_NEOX;
8193
8908
 
8194
8909
  case LLM_ARCH_QWEN2VL:
8910
+ case LLM_ARCH_PADDLEOCR:
8195
8911
  return LLAMA_ROPE_TYPE_MROPE;
8196
8912
  case LLM_ARCH_QWEN3VL:
8197
8913
  case LLM_ARCH_QWEN3VLMOE:
8914
+ case LLM_ARCH_QWEN35:
8915
+ case LLM_ARCH_QWEN35MOE:
8198
8916
  return LLAMA_ROPE_TYPE_IMROPE;
8199
8917
 
8200
8918
  case LLM_ARCH_GLM4: