@fugood/llama.node 0.3.15 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +243 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +14 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -8
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2413 -228
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1004 -13516
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +127 -33
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +29 -293
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +210 -286
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  136. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  138. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +692 -126
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +21 -10
  143. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  144. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  145. package/src/llama.cpp/include/llama.h +30 -11
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  147. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  149. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  150. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  151. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  152. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  153. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  154. package/src/llama.cpp/src/llama-arch.cpp +161 -17
  155. package/src/llama.cpp/src/llama-arch.h +16 -0
  156. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  157. package/src/llama.cpp/src/llama-chat.h +6 -2
  158. package/src/llama.cpp/src/llama-context.cpp +108 -92
  159. package/src/llama.cpp/src/llama-context.h +1 -2
  160. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  161. package/src/llama.cpp/src/llama-graph.h +26 -6
  162. package/src/llama.cpp/src/llama-hparams.h +13 -0
  163. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  164. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  165. package/src/llama.cpp/src/llama-memory.h +1 -1
  166. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  167. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  168. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  169. package/src/llama.cpp/src/llama-model.cpp +1544 -291
  170. package/src/llama.cpp/src/llama-model.h +13 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  172. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  173. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  174. package/src/llama.cpp/src/llama.cpp +1 -1
  175. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  176. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  177. package/src/llama.cpp/tests/test-backend-ops.cpp +139 -57
  178. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  179. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  180. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  181. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  182. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  183. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  184. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  185. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  186. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  188. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  189. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  190. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  191. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  192. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  193. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  203. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
@@ -17,6 +17,7 @@
  #include <cmath>
  #include <functional>
  #include <map>
+ #include <regex>
  #include <sstream>
  #include <stdexcept>
 
@@ -42,11 +43,14 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_770M: return "770M";
  case LLM_TYPE_780M: return "780M";
  case LLM_TYPE_0_5B: return "0.5B";
+ case LLM_TYPE_0_6B: return "0.6B";
  case LLM_TYPE_1B: return "1B";
  case LLM_TYPE_1_3B: return "1.3B";
  case LLM_TYPE_1_4B: return "1.4B";
  case LLM_TYPE_1_5B: return "1.5B";
  case LLM_TYPE_1_6B: return "1.6B";
+ case LLM_TYPE_1_7B: return "1.7B";
+ case LLM_TYPE_1_8B: return "1.8B";
  case LLM_TYPE_2B: return "2B";
  case LLM_TYPE_2_8B: return "2.8B";
  case LLM_TYPE_2_9B: return "2.9B";
@@ -64,6 +68,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_15B: return "15B";
  case LLM_TYPE_16B: return "16B";
  case LLM_TYPE_20B: return "20B";
+ case LLM_TYPE_27B: return "27B";
  case LLM_TYPE_30B: return "30B";
  case LLM_TYPE_32B: return "32B";
  case LLM_TYPE_34B: return "34B";
@@ -72,6 +77,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_65B: return "65B";
  case LLM_TYPE_70B: return "70B";
  case LLM_TYPE_236B: return "236B";
+ case LLM_TYPE_290B: return "290B";
  case LLM_TYPE_314B: return "314B";
  case LLM_TYPE_671B: return "671B";
  case LLM_TYPE_SMALL: return "0.1B";
@@ -86,7 +92,10 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_16x3_8B: return "16x3.8B";
  case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
  case LLM_TYPE_57B_A14B: return "57B.A14B";
- case LLM_TYPE_27B: return "27B";
+ case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
+ case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
+ case LLM_TYPE_30B_A3B: return "30B.A3B";
+ case LLM_TYPE_235B_A22B: return "235B.A22B";
  default: return "?B";
  }
  }
@@ -255,7 +264,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
  return nullptr;
  }

- // CPU: ACCEL -> CPU extra -> GPU host -> CPU
+ // CPU: ACCEL -> GPU host -> CPU extra -> CPU
  static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
  buft_list_t buft_list;
 
@@ -271,19 +280,6 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
  }
  }

- // add extra buffer types
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
- auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
- ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
- if (ggml_backend_dev_get_extra_bufts_fn) {
- ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
- while (extra_bufts && *extra_bufts) {
- buft_list.emplace_back(cpu_dev, *extra_bufts);
- ++extra_bufts;
- }
- }
-
  // add a host buffer type
  // storing the tensors in a host buffer is useful when the processing of large batches
  // is offloaded to a GPU device, since it reduces the time spent on data transfers
@@ -298,6 +294,20 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
  }
  }

+ // add extra buffer types, only if no GPU device is present
+ // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+ ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+ if (ggml_backend_dev_get_extra_bufts_fn) {
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+ while (extra_bufts && *extra_bufts) {
+ buft_list.emplace_back(cpu_dev, *extra_bufts);
+ ++extra_bufts;
+ }
+ }
+
  // add the CPU buffer type
  for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
  ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -375,9 +385,12 @@ struct llama_model::impl {
  layer_dev dev_input = {};
  layer_dev dev_output = {};
  std::vector<layer_dev> dev_layer;
+
+ bool has_tensor_overrides;
  };

  llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
+ pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
  }

  llama_model::~llama_model() {}
@@ -543,6 +556,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  }
  } break;
+ case LLM_ARCH_LLAMA4:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
+ hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
+ hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
+ hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
+
+ switch (hparams.n_expert) {
+ case 16: type = LLM_TYPE_17B_16E; break;
+ case 128: type = LLM_TYPE_17B_128E; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+
+ if (type == LLM_TYPE_17B_128E) {
+ hparams.use_kq_norm = false;
+ }
+ } break;
  case LLM_ARCH_DECI:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -667,10 +699,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  } break;
  case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);

  if (hparams.n_layer == 12 && hparams.n_embd == 768) {
  type = LLM_TYPE_137M;
@@ -759,6 +793,28 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_QWEN3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+ case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+ case 40: type = LLM_TYPE_14B; break;
+ case 64: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_QWEN3MOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_30B_A3B; break;
+ case 94: type = LLM_TYPE_235B_A22B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_PHI2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1112,6 +1168,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  }
  ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
@@ -1131,6 +1189,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_PLM:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_1_8B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_CHATGLM:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1152,6 +1219,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GLM4:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 40: type = LLM_TYPE_9B; break;
+ case 61: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_BITNET:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1317,6 +1393,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  } break;
+ case LLM_ARCH_BAILINGMOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+
+ switch (hparams.n_layer) {
+ case 28: type = LLM_TYPE_16B; break;
+ case 88: type = LLM_TYPE_290B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  default: throw std::runtime_error("unsupported model architecture");
  }
 
@@ -1544,9 +1635,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
  }

- ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list);
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ // check overrides
+ if (ml.tensor_buft_overrides) {
+ std::string tensor_name = tn.str();
+ for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
+ std::regex pattern(overrides->pattern);
+ if (std::regex_search(tensor_name, pattern)) {
+ LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
+ buft = overrides->buft;
+ break;
+ }
+ }
+ }
+
  if (!buft) {
- throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+ buft = select_weight_buft(hparams, t_meta, op, *buft_list);
+ if (!buft) {
+ throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+ }
  }

  // avoid using a host buffer when using mmap
@@ -1642,6 +1750,56 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  }
  } break;
+ case LLM_ARCH_LLAMA4:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
+ for (int i = 0; i < n_layer; ++i) {
+ bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
+
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+ if (is_moe_layer) {
+ int n_ff_exp = hparams.n_ff_exp;
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert
+ const int64_t n_ff_shexp = n_ff_exp;
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+ } else {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ }
+ } break;
  case LLM_ARCH_DECI:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -1911,6 +2069,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_BERT:
  case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
@@ -1944,20 +2103,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  }

+ if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+ }
+
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

  layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);

- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-
- if (arch == LLM_ARCH_BERT) {
+ if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
  layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  } else {
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+
+ if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+ } else {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ }
  }

  layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
@@ -2210,9 +2380,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

  // optional bias tensors
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);

  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
@@ -2241,6 +2411,77 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  }
  } break;
+ case LLM_ARCH_QWEN3:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_QWEN3MOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
+ }
+
+ // MoE branch
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ }
+ } break;
  case LLM_ARCH_PHI2:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -2329,7 +2570,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);

- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
  if (layer.wqkv == nullptr) {
  layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
@@ -2558,7 +2799,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  // output
  output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }

  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];
@@ -2985,8 +3231,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  {
  const bool is_lite = (hparams.n_layer == 27);

+ const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
+
+ // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+ const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
+ const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
+
  const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;

  const int64_t q_lora_rank = hparams.n_lora_q;
  const int64_t kv_lora_rank = hparams.n_lora_kv;
@@ -3012,14 +3264,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  if (!is_lite) {
  layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
- layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
+ layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
  } else {
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
  }

- layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
+
+ // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
+ if (is_mla) {
+ layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
+ layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
+ } else {
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);

  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
@@ -3050,6 +3310,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  }
  } break;
+ case LLM_ARCH_PLM:
+ {
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
+ layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
  case LLM_ARCH_BITNET:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3215,16 +3504,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  auto & layer = layers[i];

  layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);

  if (layer.wqkv == nullptr) {
  layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  }

  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
@@ -3236,23 +3525,62 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  }
  } break;
- case LLM_ARCH_NEMOTRON:
+ case LLM_ARCH_GLM4:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

  // output
- output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }

  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];

- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);

- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ if (layer.wqkv == nullptr) {
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
+
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_NEMOTRON:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
@@ -3335,12 +3663,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);

  layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
- layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
  GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));

  layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
@@ -3370,7 +3698,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

  output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

  const int time_mix_extra_dim = hparams.time_mix_extra_dim;
@@ -3396,7 +3724,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);

- layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
  layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
@@ -3405,9 +3733,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  // optional bias tensors
- layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);

  layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
 
@@ -3528,8 +3856,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
  }

- layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);

  try {
  layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
@@ -3546,8 +3874,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);

- layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);

  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -3694,6 +4022,46 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3694
4022
  output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
3695
4023
  output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
3696
4024
  } break;
4025
+ case LLM_ARCH_BAILINGMOE:
4026
+ {
4027
+ const int64_t n_ff_exp = hparams.n_ff_exp;
4028
+ const int64_t n_expert_shared = hparams.n_expert_shared;
4029
+
4030
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4031
+
4032
+ // output
4033
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4034
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4035
+
4036
+ for (int i = 0; i < n_layer; ++i) {
4037
+ auto & layer = layers[i];
4038
+
4039
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4040
+
4041
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
4042
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
4043
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
4044
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
4045
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4046
+
4047
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4048
+
4049
+ if (n_expert == 0) {
4050
+ throw std::runtime_error("n_expert must be > 0");
4051
+ }
4052
+ if (n_expert_used == 0) {
4053
+ throw std::runtime_error("n_expert_used must be > 0");
4054
+ }
4055
+
4056
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4057
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
4058
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4059
+
4060
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4061
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
4062
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4063
+ }
4064
+ } break;
3697
4065
  default:
3698
4066
  throw std::runtime_error("unknown architecture");
3699
4067
  }
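For orientation: the new LLM_ARCH_BAILINGMOE branch above creates the routed experts as fused 3-D tensors of shape {n_embd, n_ff_exp, n_expert} and folds the shared experts into ordinary 2-D FFN weights of width n_ff_exp * n_expert_shared. A minimal standalone sketch of those shape computations, with illustrative hyperparameter values (the real ones come from the GGUF metadata):

#include <cstdint>
#include <cstdio>

int main() {
    // illustrative values only; the real ones are read from the model's GGUF header
    const int64_t n_embd          = 2048;
    const int64_t n_ff_exp        = 1408; // per-expert FFN width
    const int64_t n_expert        = 64;
    const int64_t n_expert_shared = 2;

    // routed experts: one fused 3-D tensor per projection
    printf("ffn_gate_exps: {%lld, %lld, %lld}\n", (long long) n_embd,   (long long) n_ff_exp, (long long) n_expert);
    printf("ffn_down_exps: {%lld, %lld, %lld}\n", (long long) n_ff_exp, (long long) n_embd,   (long long) n_expert);

    // shared experts: a single dense FFN whose width scales with n_expert_shared
    const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
    printf("ffn_gate_shexp: {%lld, %lld}\n", (long long) n_embd,     (long long) n_ff_shexp);
    printf("ffn_down_shexp: {%lld, %lld}\n", (long long) n_ff_shexp, (long long) n_embd);
    return 0;
}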
@@ -3962,6 +4330,8 @@ void llama_model::print_info() const {
3962
4330
  LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
3963
4331
  LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
3964
4332
  LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
4333
+ LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
4334
+ LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
3965
4335
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
3966
4336
  LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
3967
4337
  LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
@@ -3975,12 +4345,24 @@ void llama_model::print_info() const {
3975
4345
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
3976
4346
  }
3977
4347
 
4348
+ if (arch == LLM_ARCH_QWEN3MOE) {
4349
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
4350
+ }
4351
+
3978
4352
  if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
3979
4353
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
3980
4354
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
3981
4355
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
3982
4356
  }
3983
4357
 
4358
+ if (arch == LLM_ARCH_BAILINGMOE) {
4359
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
4360
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
4361
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
4362
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
4363
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
4364
+ }
4365
+
3984
4366
  vocab.print_info();
3985
4367
  }
3986
4368
 
@@ -4042,6 +4424,10 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
4042
4424
  });
4043
4425
  }
4044
4426
 
4427
+ bool llama_model::has_tensor_overrides() const {
4428
+ return pimpl->has_tensor_overrides;
4429
+ }
4430
+
4045
4431
  const ggml_tensor * llama_model::get_tensor(const char * name) const {
4046
4432
  auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
4047
4433
  [name](const std::pair<std::string, ggml_tensor *> & it) {
@@ -4069,12 +4455,22 @@ struct llm_build_llama : public llm_graph_context {
4069
4455
  // inp_pos - contains the positions
4070
4456
  ggml_tensor * inp_pos = build_inp_pos();
4071
4457
 
4458
+ // temperature tuning
4459
+ ggml_tensor * inp_attn_scale = nullptr;
4460
+ if (arch == LLM_ARCH_LLAMA4) {
4461
+ inp_attn_scale = build_inp_attn_scale();
4462
+ }
4463
+
4072
4464
  auto * inp_attn = build_attn_inp_kv_unified();
4073
4465
 
4074
4466
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
4075
4467
  for (int il = 0; il < n_layer; ++il) {
4076
4468
  ggml_tensor * inpSA = inpL;
4077
4469
 
4470
+ bool use_rope = arch == LLM_ARCH_LLAMA4
4471
+ ? (il + 1) % hparams.n_no_rope_layer_step != 0
4472
+ : true;
4473
+
4078
4474
  // norm
4079
4475
  cur = build_norm(inpL,
4080
4476
  model.layers[il].attn_norm, NULL,
@@ -4112,25 +4508,38 @@ struct llm_build_llama : public llm_graph_context {
4112
4508
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
4113
4509
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
4114
4510
 
4115
- Qcur = ggml_rope_ext(
4116
- ctx0, Qcur, inp_pos, rope_factors,
4117
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4118
- ext_factor, attn_factor, beta_fast, beta_slow
4119
- );
4511
+ if (use_rope) {
4512
+ Qcur = ggml_rope_ext(
4513
+ ctx0, Qcur, inp_pos, rope_factors,
4514
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4515
+ ext_factor, attn_factor, beta_fast, beta_slow
4516
+ );
4120
4517
 
4121
- Kcur = ggml_rope_ext(
4122
- ctx0, Kcur, inp_pos, rope_factors,
4123
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4124
- ext_factor, attn_factor, beta_fast, beta_slow
4125
- );
4518
+ Kcur = ggml_rope_ext(
4519
+ ctx0, Kcur, inp_pos, rope_factors,
4520
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4521
+ ext_factor, attn_factor, beta_fast, beta_slow
4522
+ );
4523
+ } else if (inp_attn_scale) {
4524
+ Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
4525
+ }
4126
4526
 
4127
4527
  cb(Qcur, "Qcur", il);
4128
4528
  cb(Kcur, "Kcur", il);
4129
4529
  cb(Vcur, "Vcur", il);
4130
4530
 
4531
+ if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
4532
+ // Llama4TextL2Norm
4533
+ Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
4534
+ Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
4535
+ cb(Qcur, "Qcur_normed", il);
4536
+ cb(Kcur, "Kcur_normed", il);
4537
+ }
4538
+
4131
4539
  cur = build_attn(inp_attn, gf,
4132
4540
  model.layers[il].wo, model.layers[il].bo,
4133
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
4541
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
4542
+ cb(cur, "attn_out", il);
4134
4543
  }
4135
4544
 
4136
4545
  if (il == n_layer - 1) {
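The use_rope flag introduced a hunk earlier makes LLM_ARCH_LLAMA4 skip RoPE on every n_no_rope_layer_step-th layer and instead apply the per-token temperature-tuning scale (inp_attn_scale) to Q. A small standalone illustration of which layers that rule selects, assuming a step of 4 (the actual value is a model hyperparameter):

#include <cstdio>

int main() {
    const int n_layer              = 12; // illustrative
    const int n_no_rope_layer_step = 4;  // assumed; read from hparams in practice

    for (int il = 0; il < n_layer; ++il) {
        const bool use_rope = (il + 1) % n_no_rope_layer_step != 0;
        printf("layer %2d: %s\n", il, use_rope ? "RoPE" : "no RoPE (temperature-tuned attn scale)");
    }
    return 0;
}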
@@ -4148,7 +4557,7 @@ struct llm_build_llama : public llm_graph_context {
4148
4557
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
4149
4558
  cb(ffn_inp, "ffn_inp", il);
4150
4559
 
4151
- // feed-forward network
4560
+ // feed-forward network (non-MoE)
4152
4561
  if (model.layers[il].ffn_gate_inp == nullptr) {
4153
4562
 
4154
4563
  cur = build_norm(ffn_inp,
@@ -4163,6 +4572,38 @@ struct llm_build_llama : public llm_graph_context {
4163
4572
  NULL,
4164
4573
  LLM_FFN_SILU, LLM_FFN_PAR, il);
4165
4574
  cb(cur, "ffn_out", il);
4575
+
4576
+ } else if (arch == LLM_ARCH_LLAMA4) {
4577
+ // llama4 MoE
4578
+ ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
4579
+ model.layers[il].ffn_norm, NULL,
4580
+ LLM_NORM_RMS, il);
4581
+ cb(cur, "ffn_norm", il);
4582
+
4583
+ ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
4584
+ model.layers[il].ffn_gate_inp,
4585
+ model.layers[il].ffn_up_exps,
4586
+ model.layers[il].ffn_gate_exps,
4587
+ model.layers[il].ffn_down_exps,
4588
+ nullptr,
4589
+ n_expert, n_expert_used,
4590
+ LLM_FFN_SILU, false,
4591
+ false, 0.0,
4592
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
4593
+ il);
4594
+
4595
+ // Shared experts
4596
+ ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
4597
+ model.layers[il].ffn_up_shexp, NULL, NULL,
4598
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
4599
+ model.layers[il].ffn_down_shexp, NULL, NULL,
4600
+ NULL,
4601
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
4602
+ cb(shexp_out, "ffn_moe_shexp", il);
4603
+
4604
+ cur = ggml_add(ctx0, moe_out, shexp_out);
4605
+ cb(cur, "ffn_moe_out_merged", il);
4606
+
4166
4607
  } else {
4167
4608
  // MoE branch
4168
4609
  cur = build_norm(ffn_inp,
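In the Llama 4 MoE branch above, the router uses sigmoid gating (LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID), the shared-expert FFN runs on the same normalized input, and the two results are summed with a plain ggml_add. A toy, self-contained sketch of that combine step on flat vectors, with made-up sizes and values (not the real kernel):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 4; // toy size

    // pretend per-token outputs of the routed-experts path and the shared-expert FFN
    std::vector<float> moe_out   = {0.10f, -0.20f, 0.05f, 0.40f};
    std::vector<float> shexp_out = {0.02f,  0.03f, -0.01f, 0.00f};

    // sigmoid gating happens inside the router; shown here only for flavor
    const float router_logit = 1.5f;
    const float gate = 1.0f / (1.0f + std::exp(-router_logit));
    printf("example sigmoid gate weight: %.3f\n", gate);

    // the merge itself is an element-wise add (ffn_moe_out_merged in the graph)
    for (int i = 0; i < n_embd; ++i) {
        printf("merged[%d] = %.3f\n", i, moe_out[i] + shexp_out[i]);
    }
    return 0;
}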
@@ -4310,7 +4751,7 @@ struct llm_build_deci : public llm_graph_context {
4310
4751
 
4311
4752
  cur = build_attn(inp_attn, gf,
4312
4753
  model.layers[il].wo, model.layers[il].bo,
4313
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
4754
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
4314
4755
  }
4315
4756
 
4316
4757
  if (il == n_layer - 1) {
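Most hunks from here on are the same mechanical change: every build_attn call gains one extra tensor argument between the optional KQ bias and kq_scale. It is nullptr everywhere except the DeepSeek2 MLA path further down, where model.layers[il].wv_b is passed so V can be "decompressed" inside the attention. A hypothetical mock, only to show where the new slot sits in the call; the parameter names (kq_b, v_mla) are guesses, and the leading inp_attn/gf and wo/bo arguments are omitted:

#include <cstdio>

struct ggml_tensor_mock {};

// assumed argument order, inferred from the call sites in this diff
void build_attn_mock(ggml_tensor_mock * q, ggml_tensor_mock * k, ggml_tensor_mock * v,
                     ggml_tensor_mock * kq_b,
                     ggml_tensor_mock * v_mla, // new slot: nullptr, or wv_b in the MLA path
                     float kq_scale, int il) {
    (void) q; (void) k; (void) v; (void) kq_b;
    printf("layer %d: kq_scale=%.4f, v_mla=%s\n", il, kq_scale, v_mla ? "wv_b (MLA)" : "nullptr");
}

int main() {
    ggml_tensor_mock q, k, v, wv_b;
    build_attn_mock(&q, &k, &v, /*kq_b=*/nullptr, /*v_mla=*/nullptr, 0.125f, 0);
    build_attn_mock(&q, &k, &v, /*kq_b=*/nullptr, /*v_mla=*/&wv_b,   0.125f, 1);
    return 0;
}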
@@ -4452,7 +4893,7 @@ struct llm_build_baichuan : public llm_graph_context {
4452
4893
 
4453
4894
  cur = build_attn(inp_attn, gf,
4454
4895
  model.layers[il].wo, NULL,
4455
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
4896
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
4456
4897
  }
4457
4898
 
4458
4899
  if (il == n_layer - 1) {
@@ -4567,7 +5008,7 @@ struct llm_build_xverse : public llm_graph_context {
4567
5008
 
4568
5009
  cur = build_attn(inp_attn, gf,
4569
5010
  model.layers[il].wo, NULL,
4570
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5011
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
4571
5012
  }
4572
5013
 
4573
5014
  if (il == n_layer - 1) {
@@ -4692,7 +5133,7 @@ struct llm_build_falcon : public llm_graph_context {
4692
5133
 
4693
5134
  cur = build_attn(inp_attn, gf,
4694
5135
  model.layers[il].wo, NULL,
4695
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5136
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
4696
5137
  }
4697
5138
 
4698
5139
  if (il == n_layer - 1) {
@@ -4822,7 +5263,7 @@ struct llm_build_grok : public llm_graph_context {
4822
5263
 
4823
5264
  cur = build_attn(inp_attn, gf,
4824
5265
  model.layers[il].wo, model.layers[il].bo,
4825
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
5266
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
4826
5267
  }
4827
5268
 
4828
5269
  if (il == n_layer - 1) {
@@ -4973,7 +5414,7 @@ struct llm_build_dbrx : public llm_graph_context {
4973
5414
 
4974
5415
  cur = build_attn(inp_attn, gf,
4975
5416
  model.layers[il].wo, NULL,
4976
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5417
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
4977
5418
  }
4978
5419
 
4979
5420
  if (il == n_layer - 1) {
@@ -5087,7 +5528,7 @@ struct llm_build_starcoder : public llm_graph_context {
5087
5528
 
5088
5529
  cur = build_attn(inp_attn, gf,
5089
5530
  model.layers[il].wo, model.layers[il].bo,
5090
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5531
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5091
5532
  }
5092
5533
 
5093
5534
  if (il == n_layer - 1) {
@@ -5186,7 +5627,7 @@ struct llm_build_refact : public llm_graph_context {
5186
5627
 
5187
5628
  cur = build_attn(inp_attn, gf,
5188
5629
  model.layers[il].wo, NULL,
5189
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5630
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5190
5631
  }
5191
5632
 
5192
5633
  if (il == n_layer - 1) {
@@ -5313,6 +5754,11 @@ struct llm_build_bert : public llm_graph_context {
5313
5754
  cur = build_lora_mm(model.layers[il].wqkv, cur);
5314
5755
  cb(cur, "wqkv", il);
5315
5756
 
5757
+ if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
5758
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5759
+ cb(cur, "bqkv", il);
5760
+ }
5761
+
5316
5762
  Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5317
5763
  Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5318
5764
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
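The NOMIC_BERT_MOE addition above only adds the fused QKV bias before the existing split; the split itself takes three row views of the fused projection at float offsets 0, n_embd and n_embd + n_embd_gqa. A plain-array sketch of that offset arithmetic (toy sizes, no ggml):

#include <cstdio>
#include <vector>

int main() {
    // toy dimensions; in the graph these are n_embd and n_embd_v_gqa()
    const int n_embd     = 8;
    const int n_embd_gqa = 4;
    const int n_qkv      = n_embd + 2 * n_embd_gqa;

    // one token's fused QKV row, filled with recognizable values
    std::vector<float> qkv(n_qkv);
    for (int i = 0; i < n_qkv; ++i) qkv[i] = (float) i;

    const float * Q = qkv.data();                       // offset 0,                  length n_embd
    const float * K = qkv.data() + n_embd;              // offset n_embd,             length n_embd_gqa
    const float * V = qkv.data() + n_embd + n_embd_gqa; // offset n_embd + n_embd_gqa, length n_embd_gqa

    printf("Q starts at %.0f, K starts at %.0f, V starts at %.0f\n", Q[0], K[0], V[0]);
    return 0;
}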
@@ -5340,7 +5786,7 @@ struct llm_build_bert : public llm_graph_context {
5340
5786
 
5341
5787
  cur = build_attn(inp_attn, gf,
5342
5788
  model.layers[il].wo, model.layers[il].bo,
5343
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5789
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5344
5790
  cb(cur, "kqv_out", il);
5345
5791
 
5346
5792
  if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
@@ -5365,13 +5811,29 @@ struct llm_build_bert : public llm_graph_context {
5365
5811
  cb(ffn_inp, "ffn_inp", il);
5366
5812
 
5367
5813
  // feed-forward network
5368
- if (model.arch == LLM_ARCH_BERT) {
5814
+ if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
5815
+ // MoE branch
5816
+ cur = build_moe_ffn(cur,
5817
+ model.layers[il].ffn_gate_inp,
5818
+ model.layers[il].ffn_up_exps,
5819
+ nullptr,
5820
+ model.layers[il].ffn_down_exps,
5821
+ nullptr,
5822
+ hparams.n_expert,
5823
+ hparams.n_expert_used,
5824
+ LLM_FFN_GELU,
5825
+ false, false,
5826
+ 0.0f,
5827
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
5828
+ cb(cur, "ffn_moe_out", il);
5829
+ } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
5369
5830
  cur = build_ffn(cur,
5370
5831
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
5371
5832
  NULL, NULL, NULL,
5372
5833
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
5373
5834
  NULL,
5374
5835
  LLM_FFN_GELU, LLM_FFN_SEQ, il);
5836
+ cb(cur, "ffn_out", il);
5375
5837
  } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
5376
5838
  cur = build_ffn(cur,
5377
5839
  model.layers[il].ffn_up, NULL, NULL,
@@ -5379,6 +5841,7 @@ struct llm_build_bert : public llm_graph_context {
5379
5841
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
5380
5842
  NULL,
5381
5843
  LLM_FFN_GELU, LLM_FFN_PAR, il);
5844
+ cb(cur, "ffn_out", il);
5382
5845
  } else {
5383
5846
  cur = build_ffn(cur,
5384
5847
  model.layers[il].ffn_up, NULL, NULL,
@@ -5386,8 +5849,8 @@ struct llm_build_bert : public llm_graph_context {
5386
5849
  model.layers[il].ffn_down, NULL, NULL,
5387
5850
  NULL,
5388
5851
  LLM_FFN_SILU, LLM_FFN_PAR, il);
5852
+ cb(cur, "ffn_out", il);
5389
5853
  }
5390
- cb(cur, "ffn_out", il);
5391
5854
 
5392
5855
  // attentions bypass the intermediate layer
5393
5856
  cur = ggml_add(ctx0, cur, ffn_inp);
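The reworked FFN dispatch above sends a layer to the MoE branch when hparams.moe_every_n_layers > 0 and il % moe_every_n_layers == 1, and otherwise falls back to the dense BERT/NOMIC FFN. A standalone illustration of which layers that predicate picks, assuming moe_every_n_layers = 2 (the value is model metadata, not a constant):

#include <cstdio>

int main() {
    const int n_layer            = 12; // illustrative
    const int moe_every_n_layers = 2;  // assumed; 0 disables the MoE branch entirely

    for (int il = 0; il < n_layer; ++il) {
        const bool is_moe = moe_every_n_layers > 0 && il % moe_every_n_layers == 1;
        printf("layer %2d: %s\n", il, is_moe ? "MoE FFN" : "dense FFN");
    }
    return 0;
}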
@@ -5457,7 +5920,7 @@ struct llm_build_bloom : public llm_graph_context {
5457
5920
 
5458
5921
  cur = build_attn(inp_attn, gf,
5459
5922
  model.layers[il].wo, model.layers[il].bo,
5460
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5923
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5461
5924
  }
5462
5925
 
5463
5926
  if (il == n_layer - 1) {
@@ -5598,7 +6061,7 @@ struct llm_build_mpt : public llm_graph_context {
5598
6061
 
5599
6062
  cur = build_attn(inp_attn, gf,
5600
6063
  model.layers[il].wo, model.layers[il].bo,
5601
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6064
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5602
6065
  }
5603
6066
 
5604
6067
  if (il == n_layer - 1) {
@@ -5744,7 +6207,7 @@ struct llm_build_stablelm : public llm_graph_context {
5744
6207
 
5745
6208
  cur = build_attn(inp_attn, gf,
5746
6209
  model.layers[il].wo, NULL,
5747
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6210
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5748
6211
  }
5749
6212
 
5750
6213
  if (il == n_layer - 1) {
@@ -5867,7 +6330,7 @@ struct llm_build_qwen : public llm_graph_context {
5867
6330
 
5868
6331
  cur = build_attn(inp_attn, gf,
5869
6332
  model.layers[il].wo, NULL,
5870
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6333
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5871
6334
  }
5872
6335
 
5873
6336
  if (il == n_layer - 1) {
@@ -5987,7 +6450,7 @@ struct llm_build_qwen2 : public llm_graph_context {
5987
6450
 
5988
6451
  cur = build_attn(inp_attn, gf,
5989
6452
  model.layers[il].wo, model.layers[il].bo,
5990
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6453
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5991
6454
  }
5992
6455
 
5993
6456
  if (il == n_layer - 1) {
@@ -6108,7 +6571,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
6108
6571
 
6109
6572
  cur = build_attn(inp_attn, gf,
6110
6573
  model.layers[il].wo, model.layers[il].bo,
6111
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6574
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6112
6575
  }
6113
6576
 
6114
6577
  if (il == n_layer - 1) {
@@ -6193,16 +6656,25 @@ struct llm_build_qwen2moe : public llm_graph_context {
6193
6656
  {
6194
6657
  // compute Q and K and RoPE them
6195
6658
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
6196
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6197
6659
  cb(Qcur, "Qcur", il);
6660
+ if (model.layers[il].bq) {
6661
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
6662
+ cb(Qcur, "Qcur", il);
6663
+ }
6198
6664
 
6199
6665
  ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
6200
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6201
6666
  cb(Kcur, "Kcur", il);
6667
+ if (model.layers[il].bk) {
6668
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
6669
+ cb(Kcur, "Kcur", il);
6670
+ }
6202
6671
 
6203
6672
  ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
6204
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6205
6673
  cb(Vcur, "Vcur", il);
6674
+ if (model.layers[il].bv) {
6675
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
6676
+ cb(Vcur, "Vcur", il);
6677
+ }
6206
6678
 
6207
6679
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6208
6680
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
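The Qwen2-MoE change above only wraps the Q/K/V bias adds in null checks, so the graph also builds when a checkpoint ships without those bias tensors. The pattern in miniature, with a plain pointer standing in for the optional tensor:

#include <cstdio>

static float add_bias(float x, const float * bias /* may be null */) {
    // mirror of: if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); }
    return bias ? x + *bias : x;
}

int main() {
    const float b = 0.5f;
    printf("with bias:    %.2f\n", add_bias(1.0f, &b));
    printf("without bias: %.2f\n", add_bias(1.0f, nullptr));
    return 0;
}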
@@ -6226,7 +6698,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
6226
6698
 
6227
6699
  cur = build_attn(inp_attn, gf,
6228
6700
  model.layers[il].wo, model.layers[il].bo,
6229
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6701
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6230
6702
  }
6231
6703
 
6232
6704
  if (il == n_layer - 1) {
@@ -6257,7 +6729,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
6257
6729
  false, 0.0,
6258
6730
  LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
6259
6731
  il);
6260
- cb(cur, "ffn_moe_out", il);
6732
+ cb(moe_out, "ffn_moe_out", il);
6261
6733
 
6262
6734
  // FFN shared expert
6263
6735
  {
@@ -6313,16 +6785,14 @@ struct llm_build_qwen2moe : public llm_graph_context {
6313
6785
  }
6314
6786
  };
6315
6787
 
6316
- struct llm_build_phi2 : public llm_graph_context {
6317
- llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6788
+ struct llm_build_qwen3 : public llm_graph_context {
6789
+ llm_build_qwen3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6318
6790
  const int64_t n_embd_head = hparams.n_embd_head_v;
6319
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6320
6791
 
6321
6792
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6793
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6322
6794
 
6323
6795
  ggml_tensor * cur;
6324
- ggml_tensor * attn_norm_output;
6325
- ggml_tensor * ffn_output;
6326
6796
  ggml_tensor * inpL;
6327
6797
 
6328
6798
  inpL = build_inp_embd(model.tok_embd);
@@ -6333,48 +6803,42 @@ struct llm_build_phi2 : public llm_graph_context {
6333
6803
  auto * inp_attn = build_attn_inp_kv_unified();
6334
6804
 
6335
6805
  for (int il = 0; il < n_layer; ++il) {
6336
- attn_norm_output = build_norm(inpL,
6337
- model.layers[il].attn_norm,
6338
- model.layers[il].attn_norm_b,
6339
- LLM_NORM, il);
6340
- cb(attn_norm_output, "attn_norm", il);
6806
+ ggml_tensor * inpSA = inpL;
6807
+
6808
+ // norm
6809
+ cur = build_norm(inpL,
6810
+ model.layers[il].attn_norm, NULL,
6811
+ LLM_NORM_RMS, il);
6812
+ cb(cur, "attn_norm", il);
6341
6813
 
6342
6814
  // self-attention
6343
6815
  {
6344
- ggml_tensor * Qcur = nullptr;
6345
- ggml_tensor * Kcur = nullptr;
6346
- ggml_tensor * Vcur = nullptr;
6347
-
6348
- if (model.layers[il].wqkv) {
6349
- cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
6350
- cb(cur, "wqkv", il);
6351
-
6352
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6353
- cb(cur, "bqkv", il);
6354
-
6355
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6356
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6357
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6358
- } else {
6359
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
6360
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
6361
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
6362
- }
6363
-
6816
+ // compute Q and K and RoPE them
6817
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
6364
6818
  cb(Qcur, "Qcur", il);
6819
+
6820
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
6365
6821
  cb(Kcur, "Kcur", il);
6822
+
6823
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
6366
6824
  cb(Vcur, "Vcur", il);
6367
6825
 
6368
6826
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6369
6827
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6370
6828
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6371
6829
 
6830
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
6831
+ cb(Qcur, "Qcur_normed", il);
6832
+
6372
6833
  Qcur = ggml_rope_ext(
6373
6834
  ctx0, Qcur, inp_pos, nullptr,
6374
6835
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6375
6836
  ext_factor, attn_factor, beta_fast, beta_slow
6376
6837
  );
6377
6838
 
6839
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
6840
+ cb(Kcur, "Kcur_normed", il);
6841
+
6378
6842
  Kcur = ggml_rope_ext(
6379
6843
  ctx0, Kcur, inp_pos, nullptr,
6380
6844
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -6385,36 +6849,36 @@ struct llm_build_phi2 : public llm_graph_context {
6385
6849
  cb(Kcur, "Kcur", il);
6386
6850
  cb(Vcur, "Vcur", il);
6387
6851
 
6388
- // with phi2, we scale the Q to avoid precision issues
6389
- // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
6390
- Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
6391
-
6392
6852
  cur = build_attn(inp_attn, gf,
6393
6853
  model.layers[il].wo, model.layers[il].bo,
6394
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
6854
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6395
6855
  }
6396
6856
 
6397
6857
  if (il == n_layer - 1) {
6398
6858
  // skip computing output for unused tokens
6399
6859
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6400
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6401
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6402
- attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
6860
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6861
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6403
6862
  }
6404
6863
 
6405
- // FF
6406
- {
6407
- ffn_output = build_ffn(attn_norm_output,
6408
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
6409
- NULL, NULL, NULL,
6410
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
6411
- NULL,
6412
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
6413
- cb(ffn_output, "ffn_out", il);
6414
- }
6864
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6865
+ cb(ffn_inp, "ffn_inp", il);
6415
6866
 
6416
- cur = ggml_add(ctx0, cur, ffn_output);
6417
- cur = ggml_add(ctx0, cur, inpL);
6867
+ // feed-forward network
6868
+ cur = build_norm(ffn_inp,
6869
+ model.layers[il].ffn_norm, NULL,
6870
+ LLM_NORM_RMS, il);
6871
+ cb(cur, "ffn_norm", il);
6872
+
6873
+ cur = build_ffn(cur,
6874
+ model.layers[il].ffn_up, NULL, NULL,
6875
+ model.layers[il].ffn_gate, NULL, NULL,
6876
+ model.layers[il].ffn_down, NULL, NULL,
6877
+ NULL,
6878
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
6879
+ cb(cur, "ffn_out", il);
6880
+
6881
+ cur = ggml_add(ctx0, cur, ffn_inp);
6418
6882
 
6419
6883
  cur = build_cvec(cur, il);
6420
6884
  cb(cur, "l_out", il);
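The new llm_build_qwen3 graph above RMS-normalizes each Q and K head (attn_q_norm / attn_k_norm) before applying RoPE, and drops the Qwen2-style Q/K/V bias adds entirely. A self-contained sketch of an RMS norm over a single head vector; the eps value here is assumed (the graph takes it from hparams.f_norm_rms_eps):

#include <cmath>
#include <cstdio>
#include <vector>

// RMS norm of one head: x / sqrt(mean(x^2) + eps), then the learned per-channel scale
static void rms_norm_head(std::vector<float> & x, const std::vector<float> & w, float eps) {
    float ss = 0.0f;
    for (float v : x) ss += v * v;
    const float inv_rms = 1.0f / std::sqrt(ss / (float) x.size() + eps);
    for (size_t i = 0; i < x.size(); ++i) x[i] = x[i] * inv_rms * w[i];
}

int main() {
    const float eps = 1e-6f; // assumed; hparams.f_norm_rms_eps in the real graph
    std::vector<float> q_head = {0.5f, -1.0f, 2.0f, 0.25f};
    std::vector<float> weight = {1.0f,  1.0f, 1.0f, 1.0f};
    rms_norm_head(q_head, weight, eps);
    for (float v : q_head) printf("%.4f ", v);
    printf("\n");
    return 0;
}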
@@ -6423,18 +6887,17 @@ struct llm_build_phi2 : public llm_graph_context {
6423
6887
  inpL = cur;
6424
6888
  }
6425
6889
 
6426
- cur = build_norm(inpL,
6427
- model.output_norm,
6428
- model.output_norm_b,
6429
- LLM_NORM, -1);
6890
+ cur = inpL;
6891
+
6892
+ cur = build_norm(cur,
6893
+ model.output_norm, NULL,
6894
+ LLM_NORM_RMS, -1);
6430
6895
 
6431
6896
  cb(cur, "result_norm", -1);
6432
6897
  res->t_embd = cur;
6433
6898
 
6899
+ // lm_head
6434
6900
  cur = build_lora_mm(model.output, cur);
6435
- cb(cur, "result_output_no_bias", -1);
6436
-
6437
- cur = ggml_add(ctx0, cur, model.output_b);
6438
6901
 
6439
6902
  cb(cur, "result_output", -1);
6440
6903
  res->t_logits = cur;
@@ -6443,10 +6906,268 @@ struct llm_build_phi2 : public llm_graph_context {
6443
6906
  }
6444
6907
  };
6445
6908
 
6446
- struct llm_build_phi3 : public llm_graph_context {
6447
- llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6909
+ struct llm_build_qwen3moe : public llm_graph_context {
6910
+ llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6448
6911
  const int64_t n_embd_head = hparams.n_embd_head_v;
6449
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6912
+
6913
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6914
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6915
+
6916
+ ggml_tensor * cur;
6917
+ ggml_tensor * inpL;
6918
+
6919
+ inpL = build_inp_embd(model.tok_embd);
6920
+
6921
+ // inp_pos - contains the positions
6922
+ ggml_tensor * inp_pos = build_inp_pos();
6923
+
6924
+ auto * inp_attn = build_attn_inp_kv_unified();
6925
+
6926
+ for (int il = 0; il < n_layer; ++il) {
6927
+ ggml_tensor * inpSA = inpL;
6928
+
6929
+ // norm
6930
+ cur = build_norm(inpL,
6931
+ model.layers[il].attn_norm, NULL,
6932
+ LLM_NORM_RMS, il);
6933
+ cb(cur, "attn_norm", il);
6934
+
6935
+ // self_attention
6936
+ {
6937
+ // compute Q and K and RoPE them
6938
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
6939
+ cb(Qcur, "Qcur", il);
6940
+
6941
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
6942
+ cb(Kcur, "Kcur", il);
6943
+
6944
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
6945
+ cb(Vcur, "Vcur", il);
6946
+
6947
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6948
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6949
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6950
+
6951
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
6952
+ cb(Qcur, "Qcur_normed", il);
6953
+
6954
+ Qcur = ggml_rope_ext(
6955
+ ctx0, Qcur, inp_pos, nullptr,
6956
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6957
+ ext_factor, attn_factor, beta_fast, beta_slow
6958
+ );
6959
+
6960
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
6961
+ cb(Kcur, "Kcur_normed", il);
6962
+
6963
+ Kcur = ggml_rope_ext(
6964
+ ctx0, Kcur, inp_pos, nullptr,
6965
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6966
+ ext_factor, attn_factor, beta_fast, beta_slow
6967
+ );
6968
+
6969
+ cb(Qcur, "Qcur", il);
6970
+ cb(Kcur, "Kcur", il);
6971
+ cb(Vcur, "Vcur", il);
6972
+
6973
+ cur = build_attn(inp_attn, gf,
6974
+ model.layers[il].wo, model.layers[il].bo,
6975
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6976
+ }
6977
+
6978
+ if (il == n_layer - 1) {
6979
+ // skip computing output for unused tokens
6980
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6981
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6982
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6983
+ }
6984
+
6985
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6986
+ cb(ffn_inp, "ffn_inp", il);
6987
+
6988
+ // MoE branch
6989
+ cur = build_norm(ffn_inp,
6990
+ model.layers[il].ffn_norm, NULL,
6991
+ LLM_NORM_RMS, il);
6992
+ cb(cur, "ffn_norm", il);
6993
+
6994
+ ggml_tensor * moe_out =
6995
+ build_moe_ffn(cur,
6996
+ model.layers[il].ffn_gate_inp,
6997
+ model.layers[il].ffn_up_exps,
6998
+ model.layers[il].ffn_gate_exps,
6999
+ model.layers[il].ffn_down_exps,
7000
+ nullptr,
7001
+ n_expert, n_expert_used,
7002
+ LLM_FFN_SILU, true,
7003
+ false, 0.0,
7004
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
7005
+ il);
7006
+ cb(moe_out, "ffn_moe_out", il);
7007
+ cur = moe_out;
7008
+
7009
+ cur = ggml_add(ctx0, cur, ffn_inp);
7010
+
7011
+ cur = build_cvec(cur, il);
7012
+ cb(cur, "l_out", il);
7013
+
7014
+ // input for next layer
7015
+ inpL = cur;
7016
+ }
7017
+
7018
+ cur = inpL;
7019
+
7020
+ cur = build_norm(cur,
7021
+ model.output_norm, NULL,
7022
+ LLM_NORM_RMS, -1);
7023
+
7024
+ cb(cur, "result_norm", -1);
7025
+ res->t_embd = cur;
7026
+
7027
+ // lm_head
7028
+ cur = build_lora_mm(model.output, cur);
7029
+
7030
+ cb(cur, "result_output", -1);
7031
+ res->t_logits = cur;
7032
+
7033
+ ggml_build_forward_expand(gf, cur);
7034
+ }
7035
+ };
7036
+
7037
+ struct llm_build_phi2 : public llm_graph_context {
7038
+ llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7039
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7040
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
7041
+
7042
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7043
+
7044
+ ggml_tensor * cur;
7045
+ ggml_tensor * attn_norm_output;
7046
+ ggml_tensor * ffn_output;
7047
+ ggml_tensor * inpL;
7048
+
7049
+ inpL = build_inp_embd(model.tok_embd);
7050
+
7051
+ // inp_pos - contains the positions
7052
+ ggml_tensor * inp_pos = build_inp_pos();
7053
+
7054
+ auto * inp_attn = build_attn_inp_kv_unified();
7055
+
7056
+ for (int il = 0; il < n_layer; ++il) {
7057
+ attn_norm_output = build_norm(inpL,
7058
+ model.layers[il].attn_norm,
7059
+ model.layers[il].attn_norm_b,
7060
+ LLM_NORM, il);
7061
+ cb(attn_norm_output, "attn_norm", il);
7062
+
7063
+ // self-attention
7064
+ {
7065
+ ggml_tensor * Qcur = nullptr;
7066
+ ggml_tensor * Kcur = nullptr;
7067
+ ggml_tensor * Vcur = nullptr;
7068
+
7069
+ if (model.layers[il].wqkv) {
7070
+ cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
7071
+ cb(cur, "wqkv", il);
7072
+
7073
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7074
+ cb(cur, "bqkv", il);
7075
+
7076
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
7077
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
7078
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7079
+ } else {
7080
+ Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
7081
+ Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
7082
+ Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
7083
+ }
7084
+
7085
+ cb(Qcur, "Qcur", il);
7086
+ cb(Kcur, "Kcur", il);
7087
+ cb(Vcur, "Vcur", il);
7088
+
7089
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7090
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7091
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7092
+
7093
+ Qcur = ggml_rope_ext(
7094
+ ctx0, Qcur, inp_pos, nullptr,
7095
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7096
+ ext_factor, attn_factor, beta_fast, beta_slow
7097
+ );
7098
+
7099
+ Kcur = ggml_rope_ext(
7100
+ ctx0, Kcur, inp_pos, nullptr,
7101
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7102
+ ext_factor, attn_factor, beta_fast, beta_slow
7103
+ );
7104
+
7105
+ cb(Qcur, "Qcur", il);
7106
+ cb(Kcur, "Kcur", il);
7107
+ cb(Vcur, "Vcur", il);
7108
+
7109
+ // with phi2, we scale the Q to avoid precision issues
7110
+ // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
7111
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
7112
+
7113
+ cur = build_attn(inp_attn, gf,
7114
+ model.layers[il].wo, model.layers[il].bo,
7115
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7116
+ }
7117
+
7118
+ if (il == n_layer - 1) {
7119
+ // skip computing output for unused tokens
7120
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7121
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7122
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7123
+ attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
7124
+ }
7125
+
7126
+ // FF
7127
+ {
7128
+ ffn_output = build_ffn(attn_norm_output,
7129
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
7130
+ NULL, NULL, NULL,
7131
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
7132
+ NULL,
7133
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
7134
+ cb(ffn_output, "ffn_out", il);
7135
+ }
7136
+
7137
+ cur = ggml_add(ctx0, cur, ffn_output);
7138
+ cur = ggml_add(ctx0, cur, inpL);
7139
+
7140
+ cur = build_cvec(cur, il);
7141
+ cb(cur, "l_out", il);
7142
+
7143
+ // input for next layer
7144
+ inpL = cur;
7145
+ }
7146
+
7147
+ cur = build_norm(inpL,
7148
+ model.output_norm,
7149
+ model.output_norm_b,
7150
+ LLM_NORM, -1);
7151
+
7152
+ cb(cur, "result_norm", -1);
7153
+ res->t_embd = cur;
7154
+
7155
+ cur = build_lora_mm(model.output, cur);
7156
+ cb(cur, "result_output_no_bias", -1);
7157
+
7158
+ cur = ggml_add(ctx0, cur, model.output_b);
7159
+
7160
+ cb(cur, "result_output", -1);
7161
+ res->t_logits = cur;
7162
+
7163
+ ggml_build_forward_expand(gf, cur);
7164
+ }
7165
+ };
7166
+
7167
+ struct llm_build_phi3 : public llm_graph_context {
7168
+ llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7169
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7170
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6450
7171
 
6451
7172
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6452
7173
 
@@ -6520,7 +7241,7 @@ struct llm_build_phi3 : public llm_graph_context {
6520
7241
 
6521
7242
  cur = build_attn(inp_attn, gf,
6522
7243
  model.layers[il].wo, model.layers[il].bo,
6523
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
7244
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
6524
7245
  }
6525
7246
 
6526
7247
  if (il == n_layer - 1) {
@@ -6655,7 +7376,7 @@ struct llm_build_plamo : public llm_graph_context {
6655
7376
 
6656
7377
  cur = build_attn(inp_attn, gf,
6657
7378
  model.layers[il].wo, NULL,
6658
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7379
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6659
7380
  }
6660
7381
  ggml_tensor * sa_out = cur;
6661
7382
 
@@ -6762,7 +7483,7 @@ struct llm_build_gpt2 : public llm_graph_context {
6762
7483
 
6763
7484
  cur = build_attn(inp_attn, gf,
6764
7485
  model.layers[il].wo, model.layers[il].bo,
6765
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7486
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6766
7487
  }
6767
7488
 
6768
7489
  if (il == n_layer - 1) {
@@ -6878,7 +7599,7 @@ struct llm_build_codeshell : public llm_graph_context {
6878
7599
 
6879
7600
  cur = build_attn(inp_attn, gf,
6880
7601
  model.layers[il].wo, model.layers[il].bo,
6881
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7602
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6882
7603
  }
6883
7604
 
6884
7605
  if (il == n_layer - 1) {
@@ -7007,7 +7728,7 @@ struct llm_build_orion : public llm_graph_context {
7007
7728
 
7008
7729
  cur = build_attn(inp_attn, gf,
7009
7730
  model.layers[il].wo, NULL,
7010
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7731
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7011
7732
  }
7012
7733
 
7013
7734
  if (il == n_layer - 1) {
@@ -7134,7 +7855,7 @@ struct llm_build_internlm2 : public llm_graph_context {
7134
7855
 
7135
7856
  cur = build_attn(inp_attn, gf,
7136
7857
  model.layers[il].wo, model.layers[il].bo,
7137
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7858
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7138
7859
  }
7139
7860
 
7140
7861
  if (il == n_layer - 1) {
@@ -7331,7 +8052,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
7331
8052
 
7332
8053
  cur = build_attn(inp_attn, gf,
7333
8054
  model.layers[il].wo, NULL,
7334
- q_states, k_states, v_states, nullptr, kq_scale, il);
8055
+ q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
7335
8056
  }
7336
8057
 
7337
8058
  if (il == n_layer - 1) {
@@ -7461,7 +8182,7 @@ struct llm_build_gemma : public llm_graph_context {
7461
8182
 
7462
8183
  cur = build_attn(inp_attn, gf,
7463
8184
  model.layers[il].wo, NULL,
7464
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
8185
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7465
8186
  }
7466
8187
 
7467
8188
  if (il == n_layer - 1) {
@@ -7583,7 +8304,7 @@ struct llm_build_gemma2 : public llm_graph_context {
7583
8304
 
7584
8305
  cur = build_attn(inp_attn, gf,
7585
8306
  model.layers[il].wo, NULL,
7586
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
8307
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7587
8308
  }
7588
8309
 
7589
8310
  cur = build_norm(cur,
@@ -7724,7 +8445,7 @@ struct llm_build_gemma3 : public llm_graph_context {
7724
8445
 
7725
8446
  cur = build_attn(inp_attn, gf,
7726
8447
  model.layers[il].wo, NULL,
7727
- Qcur, Kcur, Vcur, nullptr, hparams.f_attention_scale, il);
8448
+ Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
7728
8449
  }
7729
8450
 
7730
8451
  cur = build_norm(cur,
@@ -7864,7 +8585,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
7864
8585
 
7865
8586
  cur = build_attn(inp_attn, gf,
7866
8587
  model.layers[il].wo, model.layers[il].bo,
7867
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8588
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7868
8589
  }
7869
8590
 
7870
8591
  if (il == n_layer - 1) {
@@ -8199,7 +8920,7 @@ struct llm_build_command_r : public llm_graph_context {
8199
8920
 
8200
8921
  cur = build_attn(inp_attn, gf,
8201
8922
  model.layers[il].wo, model.layers[il].bo,
8202
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8923
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8203
8924
  }
8204
8925
 
8205
8926
  if (il == n_layer - 1) {
@@ -8334,7 +9055,7 @@ struct llm_build_cohere2 : public llm_graph_context {
8334
9055
 
8335
9056
  cur = build_attn(inp_attn, gf,
8336
9057
  model.layers[il].wo, model.layers[il].bo,
8337
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9058
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8338
9059
  }
8339
9060
 
8340
9061
  if (il == n_layer - 1) {
@@ -8465,7 +9186,7 @@ struct llm_build_olmo : public llm_graph_context {
8465
9186
 
8466
9187
  cur = build_attn(inp_attn, gf,
8467
9188
  model.layers[il].wo, nullptr,
8468
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9189
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8469
9190
  }
8470
9191
 
8471
9192
  if (il == n_layer - 1) {
@@ -8585,7 +9306,7 @@ struct llm_build_olmo2 : public llm_graph_context {
8585
9306
 
8586
9307
  cur = build_attn(inp_attn, gf,
8587
9308
  model.layers[il].wo, NULL,
8588
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9309
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8589
9310
  }
8590
9311
 
8591
9312
  cur = build_norm(cur,
@@ -8718,7 +9439,7 @@ struct llm_build_olmoe : public llm_graph_context {
8718
9439
 
8719
9440
  cur = build_attn(inp_attn, gf,
8720
9441
  model.layers[il].wo, NULL,
8721
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9442
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8722
9443
  }
8723
9444
 
8724
9445
  if (il == n_layer - 1) {
@@ -8851,7 +9572,7 @@ struct llm_build_openelm : public llm_graph_context {
8851
9572
 
8852
9573
  cur = build_attn(inp_attn, gf,
8853
9574
  model.layers[il].wo, NULL,
8854
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9575
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8855
9576
  }
8856
9577
 
8857
9578
  if (il == n_layer - 1) {
@@ -8965,7 +9686,7 @@ struct llm_build_gptneox : public llm_graph_context {
8965
9686
 
8966
9687
  cur = build_attn(inp_attn, gf,
8967
9688
  model.layers[il].wo, model.layers[il].bo,
8968
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9689
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8969
9690
  }
8970
9691
 
8971
9692
  if (il == n_layer - 1) {
@@ -9115,7 +9836,7 @@ struct llm_build_arctic : public llm_graph_context {
9115
9836
 
9116
9837
  cur = build_attn(inp_attn, gf,
9117
9838
  model.layers[il].wo, NULL,
9118
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9839
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9119
9840
  }
9120
9841
 
9121
9842
  if (il == n_layer - 1) {
@@ -9270,7 +9991,7 @@ struct llm_build_deepseek : public llm_graph_context {
9270
9991
 
9271
9992
  cur = build_attn(inp_attn, gf,
9272
9993
  model.layers[il].wo, model.layers[il].bo,
9273
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
9994
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
9274
9995
  }
9275
9996
 
9276
9997
  if (il == n_layer - 1) {
@@ -9360,16 +10081,23 @@ struct llm_build_deepseek2 : public llm_graph_context {
9360
10081
  llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
9361
10082
  bool is_lite = (hparams.n_layer == 27);
9362
10083
 
9363
- // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
9364
- // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
9365
- const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
9366
- const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
9367
- const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
10084
+ const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
10085
+
10086
+ // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
10087
+ const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
10088
+ const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
10089
+
10090
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
10091
+ const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
9368
10092
 
9369
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
9370
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
9371
10093
  const uint32_t kv_lora_rank = hparams.n_lora_kv;
9372
10094
 
10095
+ // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
10096
+ // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
10097
+ const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
10098
+ const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
10099
+ const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
10100
+
9373
10101
  ggml_tensor * cur;
9374
10102
  ggml_tensor * inpL;
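Restating the relocated pre-scaling block above in math form (this only rewrites the code; see the linked discussion for the derivation). With s = 1/freq_scale and d_k the K head size actually used here (n_embd_head_k_mla when the MLA head sizes are present, otherwise n_embd_head_k):

$$
m = \mathrm{attn\_factor}\,\bigl(1 + \mathrm{rope\_yarn\_log\_mul}\cdot\ln s\bigr),
\qquad
\mathrm{kq\_scale} = \frac{m^{2}}{\sqrt{d_k}},
\qquad
\mathrm{attn\_factor}' = \frac{1}{1 + 0.1\,\ln s}
$$

Per the linked discussion, the square on m reflects that the YaRN magnitude correction would otherwise be applied to both Q and K inside RoPE; folding m^2 into kq_scale and handing the RoPE calls the divided-out attn_factor' keeps the net logit scaling the same.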
9375
10103
 
@@ -9394,16 +10122,14 @@ struct llm_build_deepseek2 : public llm_graph_context {
9394
10122
  {
9395
10123
  ggml_tensor * q = NULL;
9396
10124
  if (!is_lite) {
9397
- // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
9398
10125
  q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
9399
10126
  cb(q, "q", il);
9400
10127
 
9401
10128
  q = build_norm(q,
9402
- model.layers[il].attn_q_a_norm, NULL,
10129
+ model.layers[il].attn_q_a_norm, nullptr,
9403
10130
  LLM_NORM_RMS, il);
9404
10131
  cb(q, "q", il);
9405
10132
 
9406
- // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
9407
10133
  q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
9408
10134
  cb(q, "q", il);
9409
10135
  } else {
@@ -9411,96 +10137,125 @@ struct llm_build_deepseek2 : public llm_graph_context {
9411
10137
  cb(q, "q", il);
9412
10138
  }
9413
10139
 
9414
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
9415
- ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
9416
- ggml_row_size(q->type, hparams.n_embd_head_k),
9417
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
10140
+ // split into {n_embd_head_qk_nope, n_head, n_tokens}
10141
+ ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
10142
+ n_embd_head_qk_nope, n_head, n_tokens,
10143
+ ggml_row_size(q->type, n_embd_head_k),
10144
+ ggml_row_size(q->type, n_embd_head_k) * n_head,
9418
10145
  0);
9419
10146
  cb(q_nope, "q_nope", il);
9420
10147
 
9421
- // and {n_head * n_embd_head_qk_rope, n_tokens}
9422
- ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
9423
- ggml_row_size(q->type, hparams.n_embd_head_k),
9424
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
10148
+ // and {n_embd_head_qk_rope, n_head, n_tokens}
10149
+ ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
10150
+ n_embd_head_qk_rope, n_head, n_tokens,
10151
+ ggml_row_size(q->type, n_embd_head_k),
10152
+ ggml_row_size(q->type, n_embd_head_k) * n_head,
9425
10153
  ggml_row_size(q->type, n_embd_head_qk_nope));
9426
10154
  cb(q_pe, "q_pe", il);
9427
10155
 
9428
- // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
9429
- ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
9430
- cb(kv_pe_compresseed, "kv_pe_compresseed", il);
10156
+ ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
10157
+ cb(kv_cmpr_pe, "kv_cmpr_pe", il);
9431
10158
 
9432
10159
  // split into {kv_lora_rank, n_tokens}
9433
- ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
9434
- kv_pe_compresseed->nb[1],
10160
+ ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
10161
+ kv_lora_rank, n_tokens,
10162
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
9435
10163
  0);
9436
- cb(kv_compressed, "kv_compressed", il);
10164
+ cb(kv_cmpr, "kv_cmpr", il);
10165
+
10166
+ // and {n_embd_head_qk_rope, 1, n_tokens}
10167
+ ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
10168
+ n_embd_head_qk_rope, 1, n_tokens,
10169
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
10170
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
10171
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
10172
+ cb(k_pe, "k_pe", il);
9437
10173
 
9438
- // and {n_embd_head_qk_rope, n_tokens}
9439
- ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
9440
- kv_pe_compresseed->nb[1],
9441
- kv_pe_compresseed->nb[1],
9442
- ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
10174
+ q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
10175
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10176
+ ext_factor, attn_factor, beta_fast, beta_slow
10177
+ );
10178
+ cb(q_pe, "q_pe", il);
10179
+
10180
+ k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
10181
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10182
+ ext_factor, attn_factor, beta_fast, beta_slow
10183
+ );
9443
10184
  cb(k_pe, "k_pe", il);
9444
10185
 
9445
- // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
9446
- kv_compressed = ggml_cont(ctx0, kv_compressed);
9447
- kv_compressed = build_norm(kv_compressed,
9448
- model.layers[il].attn_kv_a_norm, NULL,
10186
+ kv_cmpr = build_norm(kv_cmpr,
10187
+ model.layers[il].attn_kv_a_norm, nullptr,
9449
10188
  LLM_NORM_RMS, il);
9450
- cb(kv_compressed, "kv_compressed", il);
10189
+ cb(kv_cmpr, "kv_cmpr", il);
9451
10190
 
9452
- // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
9453
- ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
9454
- cb(kv, "kv", il);
10191
+ if (is_mla) {
10192
+ // {n_embd_head_qk_nope, n_tokens, n_head}
10193
+ q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
10194
+ cb(q_nope, "q_nope_perm", il);
9455
10195
 
9456
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
9457
- ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
9458
- ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
9459
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
9460
- 0);
9461
- cb(k_nope, "k_nope", il);
10196
+ // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
10197
+ ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
10198
+ cb(q_nope_absorbed, "q_nope_absorbed", il);
9462
10199
 
9463
- // and {n_head * n_embd_head_v, n_tokens}
9464
- ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
9465
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
9466
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
9467
- ggml_row_size(kv->type, (n_embd_head_qk_nope)));
9468
- cb(v_states, "v_states", il);
10200
+ // {kv_lora_rank, n_head, n_tokens}
10201
+ q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
10202
+ cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
9469
10203
 
9470
- v_states = ggml_cont(ctx0, v_states);
9471
- cb(v_states, "v_states", il);
10204
+ // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
10205
+ // note: rope must go first for in-place context shifting in build_rope_shift()
10206
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
10207
+ cb(Qcur, "Qcur", il);
9472
10208
 
9473
- v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
9474
- ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
9475
- 0);
9476
- cb(v_states, "v_states", il);
10209
+ kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
10210
+ cb(kv_cmpr, "kv_cmpr_reshape", il);
9477
10211
 
9478
- q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
9479
- q_pe = ggml_rope_ext(
9480
- ctx0, q_pe, inp_pos, nullptr,
9481
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9482
- ext_factor, attn_factor_scaled, beta_fast, beta_slow
9483
- );
9484
- cb(q_pe, "q_pe", il);
10212
+ // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
10213
+ ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
10214
+ cb(Kcur, "Kcur", il);
9485
10215
 
9486
- // shared RoPE key
9487
- k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
9488
- k_pe = ggml_rope_ext(
9489
- ctx0, k_pe, inp_pos, nullptr,
9490
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
9491
- ext_factor, attn_factor_scaled, beta_fast, beta_slow
9492
- );
9493
- cb(k_pe, "k_pe", il);
10216
+ // {kv_lora_rank, 1, n_tokens}
10217
+ ggml_tensor * Vcur = kv_cmpr;
10218
+ cb(Vcur, "Vcur", il);
9494
10219
 
9495
- ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
9496
- cb(q_states, "q_states", il);
10220
+ // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
10221
+ cur = build_attn(inp_attn, gf,
10222
+ model.layers[il].wo, NULL,
10223
+ Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
10224
+ } else {
10225
+ ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
10226
+ cb(kv, "kv", il);
10227
+
10228
+ // split into {n_embd_head_qk_nope, n_head, n_tokens}
10229
+ ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
10230
+ n_embd_head_qk_nope, n_head, n_tokens,
10231
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
10232
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
10233
+ 0);
10234
+ cb(k_nope, "k_nope_view", il);
9497
10235
 
9498
- ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
9499
- cb(k_states, "k_states", il);
10236
+ // and {n_embd_head_v, n_head, n_tokens}
10237
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
10238
+ n_embd_head_v, n_head, n_tokens,
10239
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
10240
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
10241
+ ggml_row_size(kv->type, n_embd_head_qk_nope));
10242
+ cb(Vcur, "Vcur_view", il);
9500
10243
 
9501
- cur = build_attn(inp_attn, gf,
9502
- model.layers[il].wo, NULL,
9503
- q_states, k_states, v_states, nullptr, kq_scale, il);
10244
+ Vcur = ggml_cont(ctx0, Vcur);
10245
+ cb(Vcur, "Vcur_cont", il);
10246
+
10247
+ // note: rope must go first for in-place context shifting in build_rope_shift()
10248
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
10249
+ cb(Qcur, "Qcur", il);
10250
+
10251
+ ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
10252
+ cb(Kcur, "Kcur", il);
10253
+
10254
+ // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
10255
+ cur = build_attn(inp_attn, gf,
10256
+ model.layers[il].wo, NULL,
10257
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
10258
+ }
9504
10259
  }
9505
10260
 
9506
10261
  if (il == n_layer - 1) {
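The MLA branch above absorbs wk_b into the query (q_nope_absorbed), so K and V stay in the compressed kv_lora_rank space and are shared by all heads, i.e. the attention behaves like MQA; the else branch first expands the compressed KV through wkv_b and runs regular MHA. A rough, standalone comparison of the per-token KV footprint in the two modes, assuming both the K latent (plus its RoPE part) and the V latent are cached per token as the Kcur/Vcur tensors built above suggest, and using DeepSeek-V2-like numbers that are only illustrative:

#include <cstdint>
#include <cstdio>

int main() {
    // illustrative values; real ones come from the model hparams
    const int64_t n_head              = 128;
    const int64_t n_embd_head_k       = 192; // per-head K size without absorption (nope + rope)
    const int64_t n_embd_head_v       = 128; // per-head V size without absorption
    const int64_t kv_lora_rank        = 512; // compressed latent size
    const int64_t n_embd_head_qk_rope = 64;  // shared RoPE part

    // without absorption: per-head K and V are kept, MHA-style
    const int64_t mha_elems = n_head * (n_embd_head_k + n_embd_head_v);

    // with absorption: one shared {rope + latent} K and one latent V per token, MQA-style
    const int64_t mla_elems = (n_embd_head_qk_rope + kv_lora_rank) + kv_lora_rank;

    printf("per-token KV, no absorption : %lld elements\n", (long long) mha_elems);
    printf("per-token KV, MLA absorption: %lld elements\n", (long long) mla_elems);
    printf("ratio: %.1fx\n", (double) mha_elems / (double) mla_elems);
    return 0;
}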
@@ -9666,7 +10421,7 @@ struct llm_build_bitnet : public llm_graph_context {
9666
10421
 
9667
10422
  cur = build_attn(inp_attn, gf,
9668
10423
  NULL, NULL,
9669
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10424
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9670
10425
 
9671
10426
  cur = build_norm(cur,
9672
10427
  model.layers[il].attn_sub_norm, NULL,
@@ -9789,7 +10544,7 @@ struct llm_build_t5_enc : public llm_graph_context {
9789
10544
 
9790
10545
  cur = build_attn(inp_attn, gf,
9791
10546
  model.layers[il].wo_enc, nullptr,
9792
- Qcur, Kcur, Vcur, kq_b, 1.0f, il);
10547
+ Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
9793
10548
  cb(cur, "kqv_out", il);
9794
10549
  }
9795
10550
 
@@ -9895,7 +10650,7 @@ struct llm_build_t5_dec : public llm_graph_context {
9895
10650
 
9896
10651
  cur = build_attn(inp_attn_self, gf,
9897
10652
  model.layers[il].wo, model.layers[il].bo,
9898
- Qcur, Kcur, Vcur, kq_b, 1.0f, il);
10653
+ Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
9899
10654
  cb(cur, "kqv_out", il);
9900
10655
  }
9901
10656
 
@@ -9927,7 +10682,7 @@ struct llm_build_t5_dec : public llm_graph_context {
9927
10682
 
9928
10683
  cur = build_attn(inp_attn_cross, gf,
9929
10684
  model.layers[il].wo_cross, nullptr,
9930
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
10685
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
9931
10686
  cb(cur, "kqv_out", il);
9932
10687
 
9933
10688
  //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -10060,7 +10815,7 @@ struct llm_build_jais : public llm_graph_context {
10060
10815
 
10061
10816
  cur = build_attn(inp_attn, gf,
10062
10817
  model.layers[il].wo, model.layers[il].bo,
10063
- Qcur, Kcur, Vcur, nullptr, 1.0f/float(n_embd_head), il);
10818
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
10064
10819
  }
10065
10820
 
10066
10821
  if (il == n_layer - 1) {
@@ -10192,7 +10947,7 @@ struct llm_build_chatglm : public llm_graph_context {
10192
10947
 
10193
10948
  cur = build_attn(inp_attn, gf,
10194
10949
  model.layers[il].wo, NULL,
10195
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10950
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10196
10951
  }
10197
10952
 
10198
10953
  if (il == n_layer - 1) {
@@ -10245,6 +11000,157 @@ struct llm_build_chatglm : public llm_graph_context {
10245
11000
  }
10246
11001
  };
10247
11002
 
11003
+ struct llm_build_glm4 : public llm_graph_context {
11004
+ llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
11005
+ const int64_t n_embd_head = hparams.n_embd_head_v;
11006
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
11007
+
11008
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
11009
+
11010
+ ggml_tensor * cur;
11011
+ ggml_tensor * inpL;
11012
+
11013
+ inpL = build_inp_embd(model.tok_embd);
11014
+
11015
+ // inp_pos - contains the positions
11016
+ ggml_tensor * inp_pos = build_inp_pos();
11017
+
11018
+ auto * inp_attn = build_attn_inp_kv_unified();
11019
+
11020
+ for (int il = 0; il < n_layer; ++il) {
11021
+ ggml_tensor * inpSA = inpL;
11022
+
11023
+ // Pre-attention norm
11024
+ cur = build_norm(inpL,
11025
+ model.layers[il].attn_norm,
11026
+ NULL,
11027
+ LLM_NORM_RMS, il);
11028
+ cb(cur, "attn_norm", il);
11029
+
11030
+ // self-attention
11031
+ {
11032
+ ggml_tensor * Qcur = nullptr;
11033
+ ggml_tensor * Kcur = nullptr;
11034
+ ggml_tensor * Vcur = nullptr;
11035
+
11036
+ if (model.layers[il].wqkv == nullptr) {
11037
+ Qcur = build_lora_mm(model.layers[il].wq, cur);
11038
+ if (model.layers[il].bq) {
11039
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
11040
+ }
11041
+ Kcur = build_lora_mm(model.layers[il].wk, cur);
11042
+ if (model.layers[il].bk) {
11043
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
11044
+ }
11045
+ Vcur = build_lora_mm(model.layers[il].wv, cur);
11046
+ if (model.layers[il].bv) {
11047
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
11048
+ }
11049
+ } else {
11050
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
11051
+ cb(cur, "wqkv", il);
11052
+ if (model.layers[il].bqkv) {
11053
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
11054
+ cb(cur, "bqkv", il);
11055
+ }
11056
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
11057
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
11058
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
11059
+ }
11060
+
11061
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
11062
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
11063
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
11064
+
11065
+ Qcur = ggml_rope_ext(
11066
+ ctx0, Qcur, inp_pos, nullptr,
11067
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11068
+ ext_factor, attn_factor, beta_fast, beta_slow
11069
+ );
11070
+
11071
+ Kcur = ggml_rope_ext(
11072
+ ctx0, Kcur, inp_pos, nullptr,
11073
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11074
+ ext_factor, attn_factor, beta_fast, beta_slow
11075
+ );
11076
+
11077
+ cb(Qcur, "Qcur", il);
11078
+ cb(Kcur, "Kcur", il);
11079
+ cb(Vcur, "Vcur", il);
11080
+
11081
+ cur = build_attn(inp_attn, gf,
11082
+ model.layers[il].wo, NULL,
11083
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11084
+ }
11085
+
11086
+ if (il == n_layer - 1) {
11087
+ // skip computing output for unused tokens
11088
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11089
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11090
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11091
+ }
11092
+
11093
+ // Post-attention norm (new!)
11094
+ cur = build_norm(cur,
11095
+ model.layers[il].attn_post_norm,
11096
+ NULL,
11097
+ LLM_NORM_RMS, il);
11098
+ cb(cur, "post_attn_norm", il);
11099
+
11100
+ // Add the input (residual connection after post-attention norm)
11101
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
11102
+ cb(ffn_inp, "ffn_inp", il);
11103
+
11104
+ // FF
11105
+ {
11106
+ // Pre-MLP norm
11107
+ cur = build_norm(ffn_inp,
11108
+ model.layers[il].ffn_norm,
11109
+ NULL,
11110
+ LLM_NORM_RMS, il);
11111
+ cb(cur, "ffn_norm", il);
11112
+
11113
+ // MLP
11114
+ cur = build_ffn(cur,
11115
+ model.layers[il].ffn_up, NULL, NULL,
11116
+ NULL, NULL, NULL,
11117
+ model.layers[il].ffn_down, NULL, NULL,
11118
+ NULL,
11119
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
11120
+ cb(cur, "ffn_out", il);
11121
+
11122
+ // Post-MLP norm
11123
+ cur = build_norm(cur,
11124
+ model.layers[il].ffn_post_norm,
11125
+ NULL,
11126
+ LLM_NORM_RMS, il);
11127
+ cb(cur, "post_mlp_norm", il);
11128
+ }
11129
+
11130
+ // Add residual connection after post-MLP norm
11131
+ inpL = ggml_add(ctx0, cur, ffn_inp);
11132
+ cb(inpL, "l_out", il);
11133
+ }
11134
+
11135
+ // Final norm
11136
+ cur = build_norm(inpL,
11137
+ model.output_norm,
11138
+ NULL,
11139
+ LLM_NORM_RMS, -1);
11140
+
11141
+ cb(cur, "result_norm", -1);
11142
+ res->t_embd = cur;
11143
+
11144
+ // Output projection
11145
+ cur = build_lora_mm(model.output, cur);
11146
+
11147
+ cb(cur, "result_output", -1);
11148
+ res->t_logits = cur;
11149
+
11150
+ ggml_build_forward_expand(gf, cur);
11151
+ }
11152
+ };
11153
+
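Compared to llm_build_chatglm just above, the new llm_build_glm4 graph adds a post-attention and a post-MLP RMSNorm before each residual add (a "sandwich" norm layout). The toy C++ below only illustrates that ordering; rms_norm is unscaled, attention/ffn are identity stand-ins, and none of it calls the ggml API.

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Unscaled RMSNorm over a single vector (toy stand-in for build_norm).
    static std::vector<float> rms_norm(const std::vector<float> & x) {
        float ss = 0.0f;
        for (float v : x) ss += v * v;
        const float scale = 1.0f / std::sqrt(ss / float(x.size()) + 1e-6f);
        std::vector<float> out(x.size());
        for (std::size_t i = 0; i < x.size(); ++i) out[i] = x[i] * scale;
        return out;
    }

    // Identity stand-ins for the attention and SwiGLU FFN sub-blocks.
    static std::vector<float> attention(const std::vector<float> & x) { return x; }
    static std::vector<float> ffn      (const std::vector<float> & x) { return x; }

    // One GLM4-style layer: norm -> block -> norm -> residual, done twice.
    std::vector<float> glm4_layer(const std::vector<float> & inp) {
        std::vector<float> cur = rms_norm(inp); // attn_norm
        cur = attention(cur);                   // self-attention
        cur = rms_norm(cur);                    // attn_post_norm
        std::vector<float> ffn_inp(inp.size());
        for (std::size_t i = 0; i < inp.size(); ++i) ffn_inp[i] = cur[i] + inp[i];

        cur = rms_norm(ffn_inp);                // ffn_norm
        cur = ffn(cur);                         // SwiGLU MLP in the real graph
        cur = rms_norm(cur);                    // ffn_post_norm
        std::vector<float> out(inp.size());
        for (std::size_t i = 0; i < inp.size(); ++i) out[i] = cur[i] + ffn_inp[i];
        return out;
    }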
10248
11154
  struct llm_build_nemotron : public llm_graph_context {
10249
11155
  llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
10250
11156
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -10318,7 +11224,7 @@ struct llm_build_nemotron : public llm_graph_context {
10318
11224
 
10319
11225
  cur = build_attn(inp_attn, gf,
10320
11226
  model.layers[il].wo, model.layers[il].bo,
10321
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11227
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10322
11228
  }
10323
11229
 
10324
11230
  if (il == n_layer - 1) {
@@ -10449,7 +11355,7 @@ struct llm_build_exaone : public llm_graph_context {
10449
11355
 
10450
11356
  cur = build_attn(inp_attn, gf,
10451
11357
  model.layers[il].wo, model.layers[il].bo,
10452
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11358
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10453
11359
  }
10454
11360
 
10455
11361
  if (il == n_layer - 1) {
@@ -11351,7 +12257,7 @@ struct llm_build_chameleon : public llm_graph_context {
11351
12257
 
11352
12258
  cur = build_attn(inp_attn, gf,
11353
12259
  model.layers[il].wo, nullptr,
11354
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12260
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11355
12261
 
11356
12262
  if (hparams.swin_norm) {
11357
12263
  cur = build_norm(cur,
@@ -11588,32 +12494,348 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {
11588
12494
  }
11589
12495
  };
11590
12496
 
11591
- llama_memory_i * llama_model::create_memory() const {
11592
- llama_memory_i * res;
12497
+ struct llm_build_plm : public llm_graph_context {
12498
+ llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
12499
+ const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
11593
12500
 
11594
- switch (arch) {
11595
- case LLM_ARCH_MAMBA:
11596
- case LLM_ARCH_RWKV6:
11597
- case LLM_ARCH_RWKV6QWEN2:
11598
- case LLM_ARCH_RWKV7:
11599
- case LLM_ARCH_ARWKV7:
11600
- {
11601
- res = new llama_kv_cache_unified(hparams, {
11602
- /*.get_rope_factors =*/ nullptr
11603
- });
11604
- } break;
11605
- default:
11606
- {
11607
- res = new llama_kv_cache_unified(hparams, {
11608
- /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
11609
- // choose long/short freq factors based on the context size
11610
- if (layers[il].rope_freqs != nullptr) {
11611
- return layers[il].rope_freqs;
11612
- }
12501
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
12502
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
12503
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
11613
12504
 
11614
- if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
11615
- return layers[il].rope_long;
11616
- }
12505
+ ggml_tensor * cur;
12506
+ ggml_tensor * inpL;
12507
+
12508
+ // {n_embd, n_tokens}
12509
+ inpL = build_inp_embd(model.tok_embd);
12510
+
12511
+ // inp_pos - contains the positions
12512
+ ggml_tensor * inp_pos = build_inp_pos();
12513
+
12514
+ auto * inp_attn = build_attn_inp_kv_unified();
12515
+
12516
+ for (int il = 0; il < n_layer; ++il) {
12517
+ ggml_tensor * inpSA = inpL;
12518
+
12519
+ // norm
12520
+ cur = build_norm(inpL,
12521
+ model.layers[il].attn_norm, NULL,
12522
+ LLM_NORM_RMS, il);
12523
+ cb(cur, "attn_norm", il);
12524
+
12525
+ // self_attention
12526
+ {
12527
+ ggml_tensor * q = NULL;
12528
+ q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
12529
+ cb(q, "q", il);
12530
+
12531
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
12532
+ ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
12533
+ ggml_row_size(q->type, hparams.n_embd_head_k),
12534
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
12535
+ 0);
12536
+ cb(q_nope, "q_nope", il);
12537
+
12538
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
12539
+ ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
12540
+ ggml_row_size(q->type, hparams.n_embd_head_k),
12541
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
12542
+ ggml_row_size(q->type, n_embd_head_qk_nope));
12543
+ cb(q_pe, "q_pe", il);
12544
+
12545
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
12546
+ ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
12547
+ cb(kv_pe_compresseed, "kv_pe_compresseed", il);
12548
+
12549
+ // split into {kv_lora_rank, n_tokens}
12550
+ ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
12551
+ kv_pe_compresseed->nb[1],
12552
+ 0);
12553
+ cb(kv_compressed, "kv_compressed", il);
12554
+
12555
+ // and {n_embd_head_qk_rope, n_tokens}
12556
+ ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
12557
+ kv_pe_compresseed->nb[1],
12558
+ kv_pe_compresseed->nb[1],
12559
+ ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
12560
+ cb(k_pe, "k_pe", il);
12561
+
12562
+ kv_compressed = build_norm(kv_compressed,
12563
+ model.layers[il].attn_kv_a_norm, NULL,
12564
+ LLM_NORM_RMS, il);
12565
+ cb(kv_compressed, "kv_compressed", il);
12566
+
12567
+ // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
12568
+ ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
12569
+ cb(kv, "kv", il);
12570
+
12571
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
12572
+ ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
12573
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
12574
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
12575
+ 0);
12576
+ cb(k_nope, "k_nope", il);
12577
+
12578
+ // and {n_head * n_embd_head_v, n_tokens}
12579
+ ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
12580
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
12581
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
12582
+ ggml_row_size(kv->type, (n_embd_head_qk_nope)));
12583
+ cb(v_states, "v_states", il);
12584
+
12585
+ v_states = ggml_cont(ctx0, v_states);
12586
+ cb(v_states, "v_states", il);
12587
+
12588
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
12589
+ ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
12590
+ 0);
12591
+ cb(v_states, "v_states", il);
12592
+
12593
+ q_pe = ggml_rope_ext(
12594
+ ctx0, q_pe, inp_pos, nullptr,
12595
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
12596
+ ext_factor, attn_factor, beta_fast, beta_slow
12597
+ );
12598
+ cb(q_pe, "q_pe", il);
12599
+
12600
+ // shared RoPE key
12601
+ k_pe = ggml_rope_ext(
12602
+ ctx0, k_pe, inp_pos, nullptr,
12603
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
12604
+ ext_factor, attn_factor, beta_fast, beta_slow
12605
+ );
12606
+ cb(k_pe, "k_pe", il);
12607
+
12608
+ ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
12609
+ cb(q_states, "q_states", il);
12610
+
12611
+ ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
12612
+ cb(k_states, "k_states", il);
12613
+
12614
+ cur = build_attn(inp_attn, gf,
12615
+ model.layers[il].wo, NULL,
12616
+ q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
12617
+ }
12618
+
12619
+ if (il == n_layer - 1) {
12620
+ // skip computing output for unused tokens
12621
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12622
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12623
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
12624
+ }
12625
+
12626
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
12627
+ cb(ffn_inp, "ffn_inp", il);
12628
+
12629
+ cur = build_norm(ffn_inp,
12630
+ model.layers[il].ffn_norm, NULL,
12631
+ LLM_NORM_RMS, il);
12632
+ cb(cur, "ffn_norm", il);
12633
+
12634
+ cur = build_ffn(cur,
12635
+ model.layers[il].ffn_up, NULL, NULL,
12636
+ NULL, NULL, NULL,
12637
+ model.layers[il].ffn_down, NULL, NULL,
12638
+ NULL,
12639
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
12640
+ cb(cur, "ffn_out", il);
12641
+
12642
+ cur = ggml_add(ctx0, cur, ffn_inp);
12643
+
12644
+ cur = build_cvec(cur, il);
12645
+ cb(cur, "l_out", il);
12646
+
12647
+ // input for next layer
12648
+ inpL = cur;
12649
+ }
12650
+
12651
+ cur = inpL;
12652
+
12653
+ cur = build_norm(cur,
12654
+ model.output_norm, NULL,
12655
+ LLM_NORM_RMS, -1);
12656
+
12657
+ cb(cur, "result_norm", -1);
12658
+ res->t_embd = cur;
12659
+
12660
+ cur = build_lora_mm(model.output, cur);
12661
+
12662
+ cb(cur, "result_output", -1);
12663
+ res->t_logits = cur;
12664
+
12665
+ ggml_build_forward_expand(gf, cur);
12666
+ }
12667
+ };
12668
+
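llm_build_plm above reuses the DeepSeek2-style compressed KV projection but selects LLM_FFN_RELU_SQR for its feed-forward activation. As the name suggests, that is a squared ReLU applied element-wise between ffn_up and ffn_down; a stand-alone C++ helper for illustration (not the ggml kernel):

    #include <algorithm>
    #include <vector>

    // Squared ReLU: y = max(x, 0)^2, applied element-wise.
    std::vector<float> relu_sqr(std::vector<float> x) {
        for (float & v : x) {
            v = std::max(v, 0.0f);
            v = v * v;
        }
        return x;
    }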
12669
+ struct llm_build_bailingmoe : public llm_graph_context {
12670
+ llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
12671
+ ggml_tensor * cur;
12672
+ ggml_tensor * inpL;
12673
+
12674
+ inpL = build_inp_embd(model.tok_embd);
12675
+
12676
+ // inp_pos - contains the positions
12677
+ ggml_tensor * inp_pos = build_inp_pos();
12678
+
12679
+ auto * inp_attn = build_attn_inp_kv_unified();
12680
+
12681
+ for (int il = 0; il < n_layer; ++il) {
12682
+ ggml_tensor * inpSA = inpL;
12683
+
12684
+ // norm
12685
+ cur = build_norm(inpL,
12686
+ model.layers[il].attn_norm, NULL,
12687
+ LLM_NORM_RMS, il);
12688
+ cb(cur, "attn_norm", il);
12689
+
12690
+ // self-attention
12691
+ {
12692
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
12693
+ ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
12694
+
12695
+ // compute Q and K and RoPE them
12696
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
12697
+ cb(Qcur, "Qcur", il);
12698
+ if (model.layers[il].bq) {
12699
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
12700
+ cb(Qcur, "Qcur", il);
12701
+ }
12702
+
12703
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
12704
+ cb(Kcur, "Kcur", il);
12705
+ if (model.layers[il].bk) {
12706
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
12707
+ cb(Kcur, "Kcur", il);
12708
+ }
12709
+
12710
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
12711
+ cb(Vcur, "Vcur", il);
12712
+ if (model.layers[il].bv) {
12713
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
12714
+ cb(Vcur, "Vcur", il);
12715
+ }
12716
+
12717
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
12718
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
12719
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
12720
+
12721
+ Qcur = ggml_rope_ext(
12722
+ ctx0, Qcur, inp_pos, rope_factors,
12723
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
12724
+ ext_factor, attn_factor, beta_fast, beta_slow
12725
+ );
12726
+
12727
+ Kcur = ggml_rope_ext(
12728
+ ctx0, Kcur, inp_pos, rope_factors,
12729
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
12730
+ ext_factor, attn_factor, beta_fast, beta_slow
12731
+ );
12732
+
12733
+ cb(Qcur, "Qcur", il);
12734
+ cb(Kcur, "Kcur", il);
12735
+ cb(Vcur, "Vcur", il);
12736
+
12737
+ cur = build_attn(inp_attn, gf,
12738
+ model.layers[il].wo, model.layers[il].bo,
12739
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
12740
+ }
12741
+
12742
+ if (il == n_layer - 1) {
12743
+ // skip computing output for unused tokens
12744
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12745
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12746
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
12747
+ }
12748
+
12749
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
12750
+ cb(ffn_inp, "ffn_inp", il);
12751
+
12752
+ cur = build_norm(ffn_inp,
12753
+ model.layers[il].ffn_norm, NULL,
12754
+ LLM_NORM_RMS, il);
12755
+ cb(cur, "ffn_norm", il);
12756
+
12757
+ ggml_tensor * moe_out =
12758
+ build_moe_ffn(cur,
12759
+ model.layers[il].ffn_gate_inp,
12760
+ model.layers[il].ffn_up_exps,
12761
+ model.layers[il].ffn_gate_exps,
12762
+ model.layers[il].ffn_down_exps,
12763
+ nullptr,
12764
+ n_expert, n_expert_used,
12765
+ LLM_FFN_SILU, hparams.expert_weights_norm,
12766
+ false, hparams.expert_weights_scale,
12767
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
12768
+ il);
12769
+ cb(moe_out, "ffn_moe_out", il);
12770
+
12771
+ // FFN shared expert
12772
+ {
12773
+ ggml_tensor * ffn_shexp = build_ffn(cur,
12774
+ model.layers[il].ffn_up_shexp, NULL, NULL,
12775
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
12776
+ model.layers[il].ffn_down_shexp, NULL, NULL,
12777
+ NULL,
12778
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
12779
+ cb(ffn_shexp, "ffn_shexp", il);
12780
+
12781
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
12782
+ cb(cur, "ffn_out", il);
12783
+ }
12784
+
12785
+ cur = ggml_add(ctx0, cur, ffn_inp);
12786
+
12787
+ cur = build_cvec(cur, il);
12788
+ cb(cur, "l_out", il);
12789
+
12790
+ // input for next layer
12791
+ inpL = cur;
12792
+ }
12793
+
12794
+ cur = inpL;
12795
+
12796
+ cur = build_norm(cur,
12797
+ model.output_norm, NULL,
12798
+ LLM_NORM_RMS, -1);
12799
+
12800
+ cb(cur, "result_norm", -1);
12801
+ res->t_embd = cur;
12802
+
12803
+ // lm_head
12804
+ cur = build_lora_mm(model.output, cur);
12805
+
12806
+ cb(cur, "result_output", -1);
12807
+ res->t_logits = cur;
12808
+
12809
+ ggml_build_forward_expand(gf, cur);
12810
+ }
12811
+ };
12812
+
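llm_build_bailingmoe above routes each token through softmax-gated experts (build_moe_ffn with LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX) and then adds a shared-expert FFN to the routed mix. The sketch below restates the gating step in plain C++ with hypothetical names; the real routing is done tensor-wise inside build_moe_ffn.

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <numeric>
    #include <utility>
    #include <vector>

    // Toy softmax-gated router: softmax over expert logits, keep the top
    // n_expert_used weights, optionally renormalize them to sum to 1.
    // Assumes n_expert_used <= logits.size().
    std::vector<std::pair<int, float>> route(const std::vector<float> & logits,
                                             int n_expert_used,
                                             bool norm_weights) {
        std::vector<float> p(logits.size());
        const float m = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (std::size_t i = 0; i < logits.size(); ++i) { p[i] = std::exp(logits[i] - m); sum += p[i]; }
        for (float & v : p) v /= sum;

        std::vector<int> idx(logits.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](int a, int b) { return p[a] > p[b]; });

        std::vector<std::pair<int, float>> picked;
        float picked_sum = 0.0f;
        for (int i = 0; i < n_expert_used; ++i) {
            picked.push_back({idx[i], p[idx[i]]});
            picked_sum += p[idx[i]];
        }
        if (norm_weights) {
            for (auto & e : picked) e.second /= picked_sum;
        }
        return picked;
    }

Each selected expert's FFN output is scaled by its weight and summed, and the shared expert's output is then added on top (cur = ggml_add(ctx0, moe_out, ffn_shexp) in the graph above).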
12813
+ llama_memory_i * llama_model::create_memory() const {
12814
+ llama_memory_i * res;
12815
+
12816
+ switch (arch) {
12817
+ case LLM_ARCH_MAMBA:
12818
+ case LLM_ARCH_RWKV6:
12819
+ case LLM_ARCH_RWKV6QWEN2:
12820
+ case LLM_ARCH_RWKV7:
12821
+ case LLM_ARCH_ARWKV7:
12822
+ {
12823
+ res = new llama_kv_cache_unified(hparams, {
12824
+ /*.get_rope_factors =*/ nullptr
12825
+ });
12826
+ } break;
12827
+ default:
12828
+ {
12829
+ res = new llama_kv_cache_unified(hparams, {
12830
+ /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
12831
+ // choose long/short freq factors based on the context size
12832
+ if (layers[il].rope_freqs != nullptr) {
12833
+ return layers[il].rope_freqs;
12834
+ }
12835
+
12836
+ if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
12837
+ return layers[il].rope_long;
12838
+ }
11617
12839
 
11618
12840
  return layers[il].rope_short;
11619
12841
  }
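The get_rope_factors callback installed above prefers the per-layer rope_freqs tensor when present and otherwise switches between long and short YaRN frequency factors based on whether the per-sequence context exceeds n_ctx_orig_yarn. Restated as an ordinary function over a hypothetical layer struct (illustrative only):

    #include <cstdint>

    // Hypothetical stand-in for the per-layer rope tensors referenced above.
    struct rope_factors_t {
        const float * rope_freqs = nullptr; // always wins when present
        const float * rope_long  = nullptr; // used for extended contexts
        const float * rope_short = nullptr; // used otherwise
    };

    const float * pick_rope_factors(const rope_factors_t & layer,
                                    uint32_t n_ctx_per_seq,
                                    uint32_t n_ctx_orig_yarn) {
        if (layer.rope_freqs != nullptr) {
            return layer.rope_freqs;
        }
        if (n_ctx_per_seq > n_ctx_orig_yarn) {
            return layer.rope_long;
        }
        return layer.rope_short;
    }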
@@ -11632,6 +12854,7 @@ llm_graph_result_ptr llama_model::build_graph(
11632
12854
 
11633
12855
  switch (arch) {
11634
12856
  case LLM_ARCH_LLAMA:
12857
+ case LLM_ARCH_LLAMA4:
11635
12858
  case LLM_ARCH_MINICPM:
11636
12859
  case LLM_ARCH_GRANITE:
11637
12860
  case LLM_ARCH_GRANITE_MOE:
@@ -11665,6 +12888,7 @@ llm_graph_result_ptr llama_model::build_graph(
11665
12888
  case LLM_ARCH_BERT:
11666
12889
  case LLM_ARCH_JINA_BERT_V2:
11667
12890
  case LLM_ARCH_NOMIC_BERT:
12891
+ case LLM_ARCH_NOMIC_BERT_MOE:
11668
12892
  {
11669
12893
  llm = std::make_unique<llm_build_bert>(*this, params, gf);
11670
12894
  } break;
@@ -11696,6 +12920,14 @@ llm_graph_result_ptr llama_model::build_graph(
11696
12920
  {
11697
12921
  llm = std::make_unique<llm_build_qwen2moe>(*this, params, gf);
11698
12922
  } break;
12923
+ case LLM_ARCH_QWEN3:
12924
+ {
12925
+ llm = std::make_unique<llm_build_qwen3>(*this, params, gf);
12926
+ } break;
12927
+ case LLM_ARCH_QWEN3MOE:
12928
+ {
12929
+ llm = std::make_unique<llm_build_qwen3moe>(*this, params, gf);
12930
+ } break;
11699
12931
  case LLM_ARCH_PHI2:
11700
12932
  {
11701
12933
  llm = std::make_unique<llm_build_phi2>(*this, params, gf);
@@ -11801,6 +13033,10 @@ llm_graph_result_ptr llama_model::build_graph(
11801
13033
  {
11802
13034
  llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
11803
13035
  } break;
13036
+ case LLM_ARCH_GLM4:
13037
+ {
13038
+ llm = std::make_unique<llm_build_glm4>(*this, params, gf);
13039
+ } break;
11804
13040
  case LLM_ARCH_BITNET:
11805
13041
  {
11806
13042
  llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
@@ -11819,10 +13055,11 @@ llm_graph_result_ptr llama_model::build_graph(
11819
13055
  GGML_ABORT("invalid graph type");
11820
13056
  };
11821
13057
  } break;
11822
- //case LLM_ARCH_T5ENCODER:
11823
- // {
11824
- // llm.build_t5_enc(gf);
11825
- // } break;
13058
+ case LLM_ARCH_T5ENCODER:
13059
+ {
13060
+ llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
13061
+ }
13062
+ break;
11826
13063
  case LLM_ARCH_JAIS:
11827
13064
  {
11828
13065
  llm = std::make_unique<llm_build_jais>(*this, params, gf);
@@ -11859,6 +13096,14 @@ llm_graph_result_ptr llama_model::build_graph(
11859
13096
  {
11860
13097
  llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
11861
13098
  } break;
13099
+ case LLM_ARCH_PLM:
13100
+ {
13101
+ llm = std::make_unique<llm_build_plm>(*this, params, gf);
13102
+ } break;
13103
+ case LLM_ARCH_BAILINGMOE:
13104
+ {
13105
+ llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
13106
+ } break;
11862
13107
  default:
11863
13108
  GGML_ABORT("fatal error");
11864
13109
  }
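The new LLM_ARCH_PLM and LLM_ARCH_BAILINGMOE cases above extend build_graph's dispatch switch, which maps each architecture enum value to its dedicated graph-builder struct through std::make_unique. A minimal sketch of that pattern with stand-in types (not the llama.cpp ones):

    #include <memory>

    // Each architecture value maps to its own builder type; adding a model
    // means adding a struct and one switch case, as in the hunk above.
    enum class arch_t { plm, bailingmoe };

    struct graph_builder { virtual ~graph_builder() = default; };
    struct build_plm        : graph_builder {};
    struct build_bailingmoe : graph_builder {};

    std::unique_ptr<graph_builder> make_builder(arch_t arch) {
        switch (arch) {
            case arch_t::plm:        return std::make_unique<build_plm>();
            case arch_t::bailingmoe: return std::make_unique<build_bailingmoe>();
        }
        return nullptr; // unreachable with the enum above
    }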
@@ -11876,6 +13121,7 @@ llm_graph_result_ptr llama_model::build_graph(
11876
13121
  llama_model_params llama_model_default_params() {
11877
13122
  llama_model_params result = {
11878
13123
  /*.devices =*/ nullptr,
13124
+ /*.tensor_buft_overrides =*/ nullptr,
11879
13125
  /*.n_gpu_layers =*/ 0,
11880
13126
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
11881
13127
  /*.main_gpu =*/ 0,
@@ -11971,6 +13217,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
11971
13217
 
11972
13218
  // use what we call a normal RoPE, operating on pairs of consecutive head values
11973
13219
  case LLM_ARCH_LLAMA:
13220
+ case LLM_ARCH_LLAMA4:
11974
13221
  case LLM_ARCH_DECI:
11975
13222
  case LLM_ARCH_BAICHUAN:
11976
13223
  case LLM_ARCH_STARCODER:
@@ -11985,10 +13232,13 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
11985
13232
  case LLM_ARCH_ARCTIC:
11986
13233
  case LLM_ARCH_DEEPSEEK:
11987
13234
  case LLM_ARCH_DEEPSEEK2:
13235
+ case LLM_ARCH_PLM:
11988
13236
  case LLM_ARCH_CHATGLM:
13237
+ case LLM_ARCH_GLM4:
11989
13238
  case LLM_ARCH_GRANITE:
11990
13239
  case LLM_ARCH_GRANITE_MOE:
11991
13240
  case LLM_ARCH_CHAMELEON:
13241
+ case LLM_ARCH_BAILINGMOE:
11992
13242
  return LLAMA_ROPE_TYPE_NORM;
11993
13243
 
11994
13244
  // the pairs of head values are offset by n_rot/2
@@ -11997,11 +13247,14 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
11997
13247
  case LLM_ARCH_DBRX:
11998
13248
  case LLM_ARCH_BERT:
11999
13249
  case LLM_ARCH_NOMIC_BERT:
13250
+ case LLM_ARCH_NOMIC_BERT_MOE:
12000
13251
  case LLM_ARCH_STABLELM:
12001
13252
  case LLM_ARCH_BITNET:
12002
13253
  case LLM_ARCH_QWEN:
12003
13254
  case LLM_ARCH_QWEN2:
12004
13255
  case LLM_ARCH_QWEN2MOE:
13256
+ case LLM_ARCH_QWEN3:
13257
+ case LLM_ARCH_QWEN3MOE:
12005
13258
  case LLM_ARCH_OLMO2:
12006
13259
  case LLM_ARCH_OLMOE:
12007
13260
  case LLM_ARCH_PHI2: