@fugood/llama.node 0.3.16 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
@@ -17,6 +17,7 @@
  #include <cmath>
  #include <functional>
  #include <map>
+ #include <regex>
  #include <sstream>
  #include <stdexcept>
 
@@ -42,11 +43,14 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_770M: return "770M";
  case LLM_TYPE_780M: return "780M";
  case LLM_TYPE_0_5B: return "0.5B";
+ case LLM_TYPE_0_6B: return "0.6B";
  case LLM_TYPE_1B: return "1B";
  case LLM_TYPE_1_3B: return "1.3B";
  case LLM_TYPE_1_4B: return "1.4B";
  case LLM_TYPE_1_5B: return "1.5B";
  case LLM_TYPE_1_6B: return "1.6B";
+ case LLM_TYPE_1_7B: return "1.7B";
+ case LLM_TYPE_1_8B: return "1.8B";
  case LLM_TYPE_2B: return "2B";
  case LLM_TYPE_2_8B: return "2.8B";
  case LLM_TYPE_2_9B: return "2.9B";
@@ -64,6 +68,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_15B: return "15B";
  case LLM_TYPE_16B: return "16B";
  case LLM_TYPE_20B: return "20B";
+ case LLM_TYPE_27B: return "27B";
  case LLM_TYPE_30B: return "30B";
  case LLM_TYPE_32B: return "32B";
  case LLM_TYPE_34B: return "34B";
@@ -72,6 +77,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_65B: return "65B";
  case LLM_TYPE_70B: return "70B";
  case LLM_TYPE_236B: return "236B";
+ case LLM_TYPE_290B: return "290B";
  case LLM_TYPE_314B: return "314B";
  case LLM_TYPE_671B: return "671B";
  case LLM_TYPE_SMALL: return "0.1B";
@@ -86,7 +92,10 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_16x3_8B: return "16x3.8B";
  case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
  case LLM_TYPE_57B_A14B: return "57B.A14B";
- case LLM_TYPE_27B: return "27B";
+ case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
+ case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
+ case LLM_TYPE_30B_A3B: return "30B.A3B";
+ case LLM_TYPE_235B_A22B: return "235B.A22B";
  default: return "?B";
  }
  }
@@ -255,7 +264,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
  return nullptr;
  }
 
- // CPU: ACCEL -> CPU extra -> GPU host -> CPU
+ // CPU: ACCEL -> GPU host -> CPU extra -> CPU
  static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
  buft_list_t buft_list;
 
@@ -271,32 +280,6 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
  }
  }
 
- bool has_gpu_device = false;
- for (auto * dev : devices) {
- if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
- has_gpu_device = true;
- break;
- }
- }
-
- // add extra buffer types, only if no GPU device is present
- // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
- if (!has_gpu_device) {
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
- auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
- ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
- if (ggml_backend_dev_get_extra_bufts_fn) {
- ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
- while (extra_bufts && *extra_bufts) {
- buft_list.emplace_back(cpu_dev, *extra_bufts);
- ++extra_bufts;
- }
- }
- } else {
- LLAMA_LOG_WARN("%s: disabling extra buffer types (i.e. repacking) since a GPU device is available\n", __func__);
- }
-
  // add a host buffer type
  // storing the tensors in a host buffer is useful when the processing of large batches
  // is offloaded to a GPU device, since it reduces the time spent on data transfers
@@ -311,6 +294,20 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
  }
  }
 
+ // add extra buffer types, only if no GPU device is present
+ // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+ ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+ if (ggml_backend_dev_get_extra_bufts_fn) {
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+ while (extra_bufts && *extra_bufts) {
+ buft_list.emplace_back(cpu_dev, *extra_bufts);
+ ++extra_bufts;
+ }
+ }
+
  // add the CPU buffer type
  for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
  ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -388,9 +385,12 @@ struct llama_model::impl {
  layer_dev dev_input = {};
  layer_dev dev_output = {};
  std::vector<layer_dev> dev_layer;
+
+ bool has_tensor_overrides;
  };
 
  llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
+ pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
  }
 
  llama_model::~llama_model() {}
@@ -556,6 +556,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  }
  } break;
+ case LLM_ARCH_LLAMA4:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
+ hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
+ hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
+ hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
+
+ switch (hparams.n_expert) {
+ case 16: type = LLM_TYPE_17B_16E; break;
+ case 128: type = LLM_TYPE_17B_128E; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+
+ if (type == LLM_TYPE_17B_128E) {
+ hparams.use_kq_norm = false;
+ }
+ } break;
  case LLM_ARCH_DECI:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -680,10 +699,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  } break;
  case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
 
  if (hparams.n_layer == 12 && hparams.n_embd == 768) {
  type = LLM_TYPE_137M;
@@ -772,6 +793,28 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_QWEN3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+ case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+ case 40: type = LLM_TYPE_14B; break;
+ case 64: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_QWEN3MOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_30B_A3B; break;
+ case 94: type = LLM_TYPE_235B_A22B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_PHI2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1125,6 +1168,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  }
  ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
@@ -1144,6 +1189,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_PLM:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_1_8B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_CHATGLM:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1165,6 +1219,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GLM4:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 40: type = LLM_TYPE_9B; break;
+ case 61: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_BITNET:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1330,6 +1393,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  } break;
+ case LLM_ARCH_BAILINGMOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+
+ switch (hparams.n_layer) {
+ case 28: type = LLM_TYPE_16B; break;
+ case 88: type = LLM_TYPE_290B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  default: throw std::runtime_error("unsupported model architecture");
  }
 
@@ -1557,9 +1635,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
  }
 
- ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list);
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ // check overrides
+ if (ml.tensor_buft_overrides) {
+ std::string tensor_name = tn.str();
+ for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
+ std::regex pattern(overrides->pattern);
+ if (std::regex_search(tensor_name, pattern)) {
+ LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
+ buft = overrides->buft;
+ break;
+ }
+ }
+ }
+
  if (!buft) {
- throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+ buft = select_weight_buft(hparams, t_meta, op, *buft_list);
+ if (!buft) {
+ throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+ }
  }
 
  // avoid using a host buffer when using mmap
@@ -1655,6 +1750,56 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  }
  } break;
+ case LLM_ARCH_LLAMA4:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
+ for (int i = 0; i < n_layer; ++i) {
+ bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
+
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+ if (is_moe_layer) {
+ int n_ff_exp = hparams.n_ff_exp;
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert
+ const int64_t n_ff_shexp = n_ff_exp;
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+ } else {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ }
+ } break;
  case LLM_ARCH_DECI:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -1924,6 +2069,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_BERT:
  case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
@@ -1957,20 +2103,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  }
 
+ if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+ }
+
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
  layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
 
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-
- if (arch == LLM_ARCH_BERT) {
+ if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
  layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  } else {
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+
+ if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+ } else {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ }
  }
 
  layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
@@ -2254,6 +2411,77 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  }
  } break;
+ case LLM_ARCH_QWEN3:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_QWEN3MOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
+ }
+
+ // MoE branch
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ }
+ } break;
  case LLM_ARCH_PHI2:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3003,8 +3231,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  {
  const bool is_lite = (hparams.n_layer == 27);
 
+ const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
+
+ // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+ const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
+ const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
+
  const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
 
  const int64_t q_lora_rank = hparams.n_lora_q;
  const int64_t kv_lora_rank = hparams.n_lora_kv;
@@ -3030,14 +3264,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
  if (!is_lite) {
  layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
- layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
+ layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
  } else {
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
  }
 
- layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
+
+ // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
+ if (is_mla) {
+ layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
+ layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
+ } else {
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
 
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
@@ -3068,6 +3310,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  }
  } break;
+ case LLM_ARCH_PLM:
+ {
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
+ layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
  case LLM_ARCH_BITNET:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3254,6 +3525,45 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  }
  } break;
+ case LLM_ARCH_GLM4:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+ if (layer.wqkv == nullptr) {
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
+
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
  case LLM_ARCH_NEMOTRON:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3712,8 +4022,48 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
  output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
  } break;
- default:
- throw std::runtime_error("unknown architecture");
+ case LLM_ARCH_BAILINGMOE:
+ {
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0");
+ }
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ }
+ } break;
+ default:
+ throw std::runtime_error("unknown architecture");
  }
 
  if (n_moved_tensors > 0) {
@@ -3980,6 +4330,8 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
  LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+ LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
+ LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
@@ -3993,12 +4345,24 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  }
 
+ if (arch == LLM_ARCH_QWEN3MOE) {
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ }
+
  if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
  }
 
+ if (arch == LLM_ARCH_BAILINGMOE) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+ }
+
  vocab.print_info();
  }
 
@@ -4060,6 +4424,10 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
  });
  }
 
+ bool llama_model::has_tensor_overrides() const {
+ return pimpl->has_tensor_overrides;
+ }
+
  const ggml_tensor * llama_model::get_tensor(const char * name) const {
  auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
  [name](const std::pair<std::string, ggml_tensor *> & it) {
@@ -4087,12 +4455,22 @@ struct llm_build_llama : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();
 
+ // temperature tuning
+ ggml_tensor * inp_attn_scale = nullptr;
+ if (arch == LLM_ARCH_LLAMA4) {
+ inp_attn_scale = build_inp_attn_scale();
+ }
+
  auto * inp_attn = build_attn_inp_kv_unified();
 
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  for (int il = 0; il < n_layer; ++il) {
  ggml_tensor * inpSA = inpL;
 
+ bool use_rope = arch == LLM_ARCH_LLAMA4
+ ? (il + 1) % hparams.n_no_rope_layer_step != 0
+ : true;
+
  // norm
  cur = build_norm(inpL,
  model.layers[il].attn_norm, NULL,
@@ -4130,25 +4508,38 @@ struct llm_build_llama : public llm_graph_context {
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
- Qcur = ggml_rope_ext(
- ctx0, Qcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
+ if (use_rope) {
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
 
- Kcur = ggml_rope_ext(
- ctx0, Kcur, inp_pos, rope_factors,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor, beta_fast, beta_slow
- );
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ } else if (inp_attn_scale) {
+ Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+ }
 
  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);
 
+ if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
+ // Llama4TextL2Norm
+ Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
+ Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+ cb(Qcur, "Qcur_normed", il);
+ cb(Kcur, "Kcur_normed", il);
+ }
+
  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
  }
 
  if (il == n_layer - 1) {
@@ -4166,7 +4557,7 @@ struct llm_build_llama : public llm_graph_context {
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  cb(ffn_inp, "ffn_inp", il);
 
- // feed-forward network
+ // feed-forward network (non-MoE)
  if (model.layers[il].ffn_gate_inp == nullptr) {
 
  cur = build_norm(ffn_inp,
@@ -4181,6 +4572,38 @@ struct llm_build_llama : public llm_graph_context {
  NULL,
  LLM_FFN_SILU, LLM_FFN_PAR, il);
  cb(cur, "ffn_out", il);
+
+ } else if (arch == LLM_ARCH_LLAMA4) {
+ // llama4 MoE
+ ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+ il);
+
+ // Shared experts
+ ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(shexp_out, "ffn_moe_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, shexp_out);
+ cb(cur, "ffn_moe_out_merged", il);
+
  } else {
  // MoE branch
  cur = build_norm(ffn_inp,
@@ -4328,7 +4751,7 @@ struct llm_build_deci : public llm_graph_context {
 
  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
  }
 
  if (il == n_layer - 1) {
@@ -4470,7 +4893,7 @@ struct llm_build_baichuan : public llm_graph_context {
 
  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }
 
  if (il == n_layer - 1) {
@@ -4585,7 +5008,7 @@ struct llm_build_xverse : public llm_graph_context {
 
  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }
 
  if (il == n_layer - 1) {
@@ -4710,7 +5133,7 @@ struct llm_build_falcon : public llm_graph_context {
 
  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }
 
  if (il == n_layer - 1) {
@@ -4840,7 +5263,7 @@ struct llm_build_grok : public llm_graph_context {
 
  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  }
 
  if (il == n_layer - 1) {
@@ -4991,7 +5414,7 @@ struct llm_build_dbrx : public llm_graph_context {
 
  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }
 
  if (il == n_layer - 1) {
@@ -5105,7 +5528,7 @@ struct llm_build_starcoder : public llm_graph_context {
 
  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }
 
  if (il == n_layer - 1) {
@@ -5204,7 +5627,7 @@ struct llm_build_refact : public llm_graph_context {
 
  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }
 
  if (il == n_layer - 1) {
@@ -5331,6 +5754,11 @@ struct llm_build_bert : public llm_graph_context {
  cur = build_lora_mm(model.layers[il].wqkv, cur);
  cb(cur, "wqkv", il);
 
+ if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
+
  Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -5358,7 +5786,7 @@ struct llm_build_bert : public llm_graph_context {
5358
5786
 
5359
5787
  cur = build_attn(inp_attn, gf,
5360
5788
  model.layers[il].wo, model.layers[il].bo,
5361
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5789
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5362
5790
  cb(cur, "kqv_out", il);
5363
5791
 
5364
5792
  if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
@@ -5383,13 +5811,29 @@ struct llm_build_bert : public llm_graph_context {
5383
5811
  cb(ffn_inp, "ffn_inp", il);
5384
5812
 
5385
5813
  // feed-forward network
5386
- if (model.arch == LLM_ARCH_BERT) {
5814
+ if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
5815
+ // MoE branch
5816
+ cur = build_moe_ffn(cur,
5817
+ model.layers[il].ffn_gate_inp,
5818
+ model.layers[il].ffn_up_exps,
5819
+ nullptr,
5820
+ model.layers[il].ffn_down_exps,
5821
+ nullptr,
5822
+ hparams.n_expert,
5823
+ hparams.n_expert_used,
5824
+ LLM_FFN_GELU,
5825
+ false, false,
5826
+ 0.0f,
5827
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
5828
+ cb(cur, "ffn_moe_out", il);
5829
+ } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
5387
5830
  cur = build_ffn(cur,
5388
5831
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
5389
5832
  NULL, NULL, NULL,
5390
5833
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
5391
5834
  NULL,
5392
5835
  LLM_FFN_GELU, LLM_FFN_SEQ, il);
5836
+ cb(cur, "ffn_out", il);
5393
5837
  } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
5394
5838
  cur = build_ffn(cur,
5395
5839
  model.layers[il].ffn_up, NULL, NULL,
@@ -5397,6 +5841,7 @@ struct llm_build_bert : public llm_graph_context {
5397
5841
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
5398
5842
  NULL,
5399
5843
  LLM_FFN_GELU, LLM_FFN_PAR, il);
5844
+ cb(cur, "ffn_out", il);
5400
5845
  } else {
5401
5846
  cur = build_ffn(cur,
5402
5847
  model.layers[il].ffn_up, NULL, NULL,
@@ -5404,8 +5849,8 @@ struct llm_build_bert : public llm_graph_context {
5404
5849
  model.layers[il].ffn_down, NULL, NULL,
5405
5850
  NULL,
5406
5851
  LLM_FFN_SILU, LLM_FFN_PAR, il);
5852
+ cb(cur, "ffn_out", il);
5407
5853
  }
5408
- cb(cur, "ffn_out", il);
5409
5854
 
5410
5855
  // attentions bypass the intermediate layer
5411
5856
  cur = ggml_add(ctx0, cur, ffn_inp);
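
The new NOMIC_BERT_MOE handling above routes only some layers through the expert branch: the condition hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1 selects every n-th layer, offset by one, and everything else keeps the dense GELU FFN. A tiny standalone C++ sketch of that schedule, using a hypothetical moe_every_n_layers value that is not taken from any real model:

    #include <cstdio>

    int main() {
        const int n_layer            = 6;
        const int moe_every_n_layers = 2; // hypothetical example value

        for (int il = 0; il < n_layer; ++il) {
            // same condition as in the llm_build_bert hunk above
            const bool use_moe = moe_every_n_layers > 0 && il % moe_every_n_layers == 1;
            std::printf("layer %d -> %s\n", il, use_moe ? "MoE FFN (GELU experts)" : "dense FFN");
        }
        return 0;
    }
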
@@ -5475,7 +5920,7 @@ struct llm_build_bloom : public llm_graph_context {
5475
5920
 
5476
5921
  cur = build_attn(inp_attn, gf,
5477
5922
  model.layers[il].wo, model.layers[il].bo,
5478
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5923
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5479
5924
  }
5480
5925
 
5481
5926
  if (il == n_layer - 1) {
@@ -5616,7 +6061,7 @@ struct llm_build_mpt : public llm_graph_context {
5616
6061
 
5617
6062
  cur = build_attn(inp_attn, gf,
5618
6063
  model.layers[il].wo, model.layers[il].bo,
5619
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6064
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5620
6065
  }
5621
6066
 
5622
6067
  if (il == n_layer - 1) {
@@ -5762,7 +6207,7 @@ struct llm_build_stablelm : public llm_graph_context {
5762
6207
 
5763
6208
  cur = build_attn(inp_attn, gf,
5764
6209
  model.layers[il].wo, NULL,
5765
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6210
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5766
6211
  }
5767
6212
 
5768
6213
  if (il == n_layer - 1) {
@@ -5885,7 +6330,7 @@ struct llm_build_qwen : public llm_graph_context {
5885
6330
 
5886
6331
  cur = build_attn(inp_attn, gf,
5887
6332
  model.layers[il].wo, NULL,
5888
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6333
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5889
6334
  }
5890
6335
 
5891
6336
  if (il == n_layer - 1) {
@@ -6005,7 +6450,7 @@ struct llm_build_qwen2 : public llm_graph_context {
6005
6450
 
6006
6451
  cur = build_attn(inp_attn, gf,
6007
6452
  model.layers[il].wo, model.layers[il].bo,
6008
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6453
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6009
6454
  }
6010
6455
 
6011
6456
  if (il == n_layer - 1) {
@@ -6126,7 +6571,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
6126
6571
 
6127
6572
  cur = build_attn(inp_attn, gf,
6128
6573
  model.layers[il].wo, model.layers[il].bo,
6129
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6574
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6130
6575
  }
6131
6576
 
6132
6577
  if (il == n_layer - 1) {
@@ -6253,7 +6698,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
6253
6698
 
6254
6699
  cur = build_attn(inp_attn, gf,
6255
6700
  model.layers[il].wo, model.layers[il].bo,
6256
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6701
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6257
6702
  }
6258
6703
 
6259
6704
  if (il == n_layer - 1) {
@@ -6284,7 +6729,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
6284
6729
  false, 0.0,
6285
6730
  LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
6286
6731
  il);
6287
- cb(cur, "ffn_moe_out", il);
6732
+ cb(moe_out, "ffn_moe_out", il);
6288
6733
 
6289
6734
  // FFN shared expert
6290
6735
  {
@@ -6340,16 +6785,14 @@ struct llm_build_qwen2moe : public llm_graph_context {
6340
6785
  }
6341
6786
  };
6342
6787
 
6343
- struct llm_build_phi2 : public llm_graph_context {
6344
- llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6788
+ struct llm_build_qwen3 : public llm_graph_context {
6789
+ llm_build_qwen3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6345
6790
  const int64_t n_embd_head = hparams.n_embd_head_v;
6346
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6347
6791
 
6348
6792
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6793
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6349
6794
 
6350
6795
  ggml_tensor * cur;
6351
- ggml_tensor * attn_norm_output;
6352
- ggml_tensor * ffn_output;
6353
6796
  ggml_tensor * inpL;
6354
6797
 
6355
6798
  inpL = build_inp_embd(model.tok_embd);
@@ -6360,48 +6803,42 @@ struct llm_build_phi2 : public llm_graph_context {
6360
6803
  auto * inp_attn = build_attn_inp_kv_unified();
6361
6804
 
6362
6805
  for (int il = 0; il < n_layer; ++il) {
6363
- attn_norm_output = build_norm(inpL,
6364
- model.layers[il].attn_norm,
6365
- model.layers[il].attn_norm_b,
6366
- LLM_NORM, il);
6367
- cb(attn_norm_output, "attn_norm", il);
6806
+ ggml_tensor * inpSA = inpL;
6807
+
6808
+ // norm
6809
+ cur = build_norm(inpL,
6810
+ model.layers[il].attn_norm, NULL,
6811
+ LLM_NORM_RMS, il);
6812
+ cb(cur, "attn_norm", il);
6368
6813
 
6369
6814
  // self-attention
6370
6815
  {
6371
- ggml_tensor * Qcur = nullptr;
6372
- ggml_tensor * Kcur = nullptr;
6373
- ggml_tensor * Vcur = nullptr;
6374
-
6375
- if (model.layers[il].wqkv) {
6376
- cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
6377
- cb(cur, "wqkv", il);
6378
-
6379
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6380
- cb(cur, "bqkv", il);
6381
-
6382
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6383
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6384
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6385
- } else {
6386
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
6387
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
6388
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
6389
- }
6390
-
6816
+ // compute Q and K and RoPE them
6817
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
6391
6818
  cb(Qcur, "Qcur", il);
6819
+
6820
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
6392
6821
  cb(Kcur, "Kcur", il);
6822
+
6823
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
6393
6824
  cb(Vcur, "Vcur", il);
6394
6825
 
6395
6826
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6396
6827
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6397
6828
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6398
6829
 
6830
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
6831
+ cb(Qcur, "Qcur_normed", il);
6832
+
6399
6833
  Qcur = ggml_rope_ext(
6400
6834
  ctx0, Qcur, inp_pos, nullptr,
6401
6835
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6402
6836
  ext_factor, attn_factor, beta_fast, beta_slow
6403
6837
  );
6404
6838
 
6839
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
6840
+ cb(Kcur, "Kcur_normed", il);
6841
+
6405
6842
  Kcur = ggml_rope_ext(
6406
6843
  ctx0, Kcur, inp_pos, nullptr,
6407
6844
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -6412,36 +6849,36 @@ struct llm_build_phi2 : public llm_graph_context {
6412
6849
  cb(Kcur, "Kcur", il);
6413
6850
  cb(Vcur, "Vcur", il);
6414
6851
 
6415
- // with phi2, we scale the Q to avoid precision issues
6416
- // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
6417
- Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
6418
-
6419
6852
  cur = build_attn(inp_attn, gf,
6420
6853
  model.layers[il].wo, model.layers[il].bo,
6421
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
6854
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6422
6855
  }
6423
6856
 
6424
6857
  if (il == n_layer - 1) {
6425
6858
  // skip computing output for unused tokens
6426
6859
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6427
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6428
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6429
- attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
6860
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6861
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6430
6862
  }
6431
6863
 
6432
- // FF
6433
- {
6434
- ffn_output = build_ffn(attn_norm_output,
6435
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
6436
- NULL, NULL, NULL,
6437
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
6438
- NULL,
6439
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
6440
- cb(ffn_output, "ffn_out", il);
6441
- }
6864
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6865
+ cb(ffn_inp, "ffn_inp", il);
6442
6866
 
6443
- cur = ggml_add(ctx0, cur, ffn_output);
6444
- cur = ggml_add(ctx0, cur, inpL);
6867
+ // feed-forward network
6868
+ cur = build_norm(ffn_inp,
6869
+ model.layers[il].ffn_norm, NULL,
6870
+ LLM_NORM_RMS, il);
6871
+ cb(cur, "ffn_norm", il);
6872
+
6873
+ cur = build_ffn(cur,
6874
+ model.layers[il].ffn_up, NULL, NULL,
6875
+ model.layers[il].ffn_gate, NULL, NULL,
6876
+ model.layers[il].ffn_down, NULL, NULL,
6877
+ NULL,
6878
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
6879
+ cb(cur, "ffn_out", il);
6880
+
6881
+ cur = ggml_add(ctx0, cur, ffn_inp);
6445
6882
 
6446
6883
  cur = build_cvec(cur, il);
6447
6884
  cb(cur, "l_out", il);
@@ -6450,18 +6887,17 @@ struct llm_build_phi2 : public llm_graph_context {
6450
6887
  inpL = cur;
6451
6888
  }
6452
6889
 
6453
- cur = build_norm(inpL,
6454
- model.output_norm,
6455
- model.output_norm_b,
6456
- LLM_NORM, -1);
6890
+ cur = inpL;
6891
+
6892
+ cur = build_norm(cur,
6893
+ model.output_norm, NULL,
6894
+ LLM_NORM_RMS, -1);
6457
6895
 
6458
6896
  cb(cur, "result_norm", -1);
6459
6897
  res->t_embd = cur;
6460
6898
 
6899
+ // lm_head
6461
6900
  cur = build_lora_mm(model.output, cur);
6462
- cb(cur, "result_output_no_bias", -1);
6463
-
6464
- cur = ggml_add(ctx0, cur, model.output_b);
6465
6901
 
6466
6902
  cb(cur, "result_output", -1);
6467
6903
  res->t_logits = cur;
@@ -6470,12 +6906,12 @@ struct llm_build_phi2 : public llm_graph_context {
6470
6906
  }
6471
6907
  };
6472
6908
 
6473
- struct llm_build_phi3 : public llm_graph_context {
6474
- llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6909
+ struct llm_build_qwen3moe : public llm_graph_context {
6910
+ llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6475
6911
  const int64_t n_embd_head = hparams.n_embd_head_v;
6476
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6477
6912
 
6478
6913
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6914
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
6479
6915
 
6480
6916
  ggml_tensor * cur;
6481
6917
  ggml_tensor * inpL;
@@ -6488,52 +6924,44 @@ struct llm_build_phi3 : public llm_graph_context {
6488
6924
  auto * inp_attn = build_attn_inp_kv_unified();
6489
6925
 
6490
6926
  for (int il = 0; il < n_layer; ++il) {
6491
- auto * residual = inpL;
6492
-
6493
- // self-attention
6494
- {
6495
- // rope freq factors for 128k context
6496
- ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
6497
-
6498
- ggml_tensor* attn_norm_output = build_norm(inpL,
6499
- model.layers[il].attn_norm,
6500
- model.layers[il].attn_norm_b,
6501
- LLM_NORM_RMS, il);
6502
- cb(attn_norm_output, "attn_norm", il);
6503
-
6504
- ggml_tensor * Qcur = nullptr;
6505
- ggml_tensor * Kcur = nullptr;
6506
- ggml_tensor * Vcur = nullptr;
6507
-
6508
- if (model.layers[il].wqkv) {
6509
- cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
6510
- cb(cur, "wqkv", il);
6927
+ ggml_tensor * inpSA = inpL;
6511
6928
 
6512
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
6513
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
6514
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
6515
- } else {
6516
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
6517
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
6518
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
6519
- }
6929
+ // norm
6930
+ cur = build_norm(inpL,
6931
+ model.layers[il].attn_norm, NULL,
6932
+ LLM_NORM_RMS, il);
6933
+ cb(cur, "attn_norm", il);
6520
6934
 
6935
+ // self_attention
6936
+ {
6937
+ // compute Q and K and RoPE them
6938
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
6521
6939
  cb(Qcur, "Qcur", il);
6940
+
6941
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
6522
6942
  cb(Kcur, "Kcur", il);
6943
+
6944
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
6523
6945
  cb(Vcur, "Vcur", il);
6524
6946
 
6525
6947
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6526
6948
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6527
6949
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6528
6950
 
6529
- Qcur = ggml_rope_ext(
6530
- ctx0, Qcur, inp_pos, rope_factors,
6951
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
6952
+ cb(Qcur, "Qcur_normed", il);
6953
+
6954
+ Qcur = ggml_rope_ext(
6955
+ ctx0, Qcur, inp_pos, nullptr,
6531
6956
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6532
6957
  ext_factor, attn_factor, beta_fast, beta_slow
6533
6958
  );
6534
6959
 
6960
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
6961
+ cb(Kcur, "Kcur_normed", il);
6962
+
6535
6963
  Kcur = ggml_rope_ext(
6536
- ctx0, Kcur, inp_pos, rope_factors,
6964
+ ctx0, Kcur, inp_pos, nullptr,
6537
6965
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6538
6966
  ext_factor, attn_factor, beta_fast, beta_slow
6539
6967
  );
@@ -6542,41 +6970,29 @@ struct llm_build_phi3 : public llm_graph_context {
6542
6970
  cb(Kcur, "Kcur", il);
6543
6971
  cb(Vcur, "Vcur", il);
6544
6972
 
6545
- Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
6546
- cb(Qcur, "Qcur", il);
6547
-
6548
6973
  cur = build_attn(inp_attn, gf,
6549
6974
  model.layers[il].wo, model.layers[il].bo,
6550
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
6975
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6551
6976
  }
6552
6977
 
6553
6978
  if (il == n_layer - 1) {
6554
6979
  // skip computing output for unused tokens
6555
- ggml_tensor* inp_out_ids = build_inp_out_ids();
6556
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6557
- residual = ggml_get_rows(ctx0, residual, inp_out_ids);
6980
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6981
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6982
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6558
6983
  }
6559
6984
 
6560
- cur = ggml_add(ctx0, cur, residual);
6561
- residual = cur;
6985
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
6986
+ cb(ffn_inp, "ffn_inp", il);
6562
6987
 
6563
- cur = build_norm(cur,
6564
- model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
6988
+ // MoE branch
6989
+ cur = build_norm(ffn_inp,
6990
+ model.layers[il].ffn_norm, NULL,
6565
6991
  LLM_NORM_RMS, il);
6566
6992
  cb(cur, "ffn_norm", il);
6567
6993
 
6568
- // feed-forward network
6569
- if (model.layers[il].ffn_gate_inp == nullptr) {
6570
- cur = build_ffn(cur,
6571
- model.layers[il].ffn_up, NULL, NULL,
6572
- NULL, NULL, NULL,
6573
- model.layers[il].ffn_down, NULL, NULL,
6574
- NULL,
6575
- LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
6576
- cb(cur, "ffn_out", il);
6577
- } else {
6578
- // MoE branch
6579
- cur = build_moe_ffn(cur,
6994
+ ggml_tensor * moe_out =
6995
+ build_moe_ffn(cur,
6580
6996
  model.layers[il].ffn_gate_inp,
6581
6997
  model.layers[il].ffn_up_exps,
6582
6998
  model.layers[il].ffn_gate_exps,
@@ -6587,10 +7003,10 @@ struct llm_build_phi3 : public llm_graph_context {
6587
7003
  false, 0.0,
6588
7004
  LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
6589
7005
  il);
6590
- cb(cur, "ffn_moe_out", il);
6591
- }
7006
+ cb(moe_out, "ffn_moe_out", il);
7007
+ cur = moe_out;
6592
7008
 
6593
- cur = ggml_add(ctx0, residual, cur);
7009
+ cur = ggml_add(ctx0, cur, ffn_inp);
6594
7010
 
6595
7011
  cur = build_cvec(cur, il);
6596
7012
  cb(cur, "l_out", il);
@@ -6599,21 +7015,18 @@ struct llm_build_phi3 : public llm_graph_context {
6599
7015
  inpL = cur;
6600
7016
  }
6601
7017
 
6602
- cur = build_norm(inpL,
6603
- model.output_norm,
6604
- model.output_norm_b,
7018
+ cur = inpL;
7019
+
7020
+ cur = build_norm(cur,
7021
+ model.output_norm, NULL,
6605
7022
  LLM_NORM_RMS, -1);
6606
7023
 
6607
7024
  cb(cur, "result_norm", -1);
6608
7025
  res->t_embd = cur;
6609
7026
 
7027
+ // lm_head
6610
7028
  cur = build_lora_mm(model.output, cur);
6611
7029
 
6612
- if (model.output_b != nullptr) {
6613
- cb(cur, "result_output_no_bias", -1);
6614
- cur = ggml_add(ctx0, cur, model.output_b);
6615
- }
6616
-
6617
7030
  cb(cur, "result_output", -1);
6618
7031
  res->t_logits = cur;
6619
7032
 
@@ -6621,14 +7034,16 @@ struct llm_build_phi3 : public llm_graph_context {
6621
7034
  }
6622
7035
  };
6623
7036
 
6624
- struct llm_build_plamo : public llm_graph_context {
6625
- llm_build_plamo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7037
+ struct llm_build_phi2 : public llm_graph_context {
7038
+ llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6626
7039
  const int64_t n_embd_head = hparams.n_embd_head_v;
7040
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6627
7041
 
6628
7042
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6629
- GGML_ASSERT(n_embd_head == hparams.n_rot);
6630
7043
 
6631
7044
  ggml_tensor * cur;
7045
+ ggml_tensor * attn_norm_output;
7046
+ ggml_tensor * ffn_output;
6632
7047
  ggml_tensor * inpL;
6633
7048
 
6634
7049
  inpL = build_inp_embd(model.tok_embd);
@@ -6639,25 +7054,36 @@ struct llm_build_plamo : public llm_graph_context {
6639
7054
  auto * inp_attn = build_attn_inp_kv_unified();
6640
7055
 
6641
7056
  for (int il = 0; il < n_layer; ++il) {
6642
-
6643
- // norm
6644
- cur = build_norm(inpL,
6645
- model.layers[il].attn_norm, NULL,
6646
- LLM_NORM_RMS, il);
6647
- cb(cur, "attn_norm", il);
6648
-
6649
- ggml_tensor * attention_norm = cur;
7057
+ attn_norm_output = build_norm(inpL,
7058
+ model.layers[il].attn_norm,
7059
+ model.layers[il].attn_norm_b,
7060
+ LLM_NORM, il);
7061
+ cb(attn_norm_output, "attn_norm", il);
6650
7062
 
6651
7063
  // self-attention
6652
7064
  {
6653
- // compute Q and K and RoPE them
6654
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
6655
- cb(Qcur, "Qcur", il);
7065
+ ggml_tensor * Qcur = nullptr;
7066
+ ggml_tensor * Kcur = nullptr;
7067
+ ggml_tensor * Vcur = nullptr;
6656
7068
 
6657
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
6658
- cb(Kcur, "Kcur", il);
7069
+ if (model.layers[il].wqkv) {
7070
+ cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
7071
+ cb(cur, "wqkv", il);
6659
7072
 
6660
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
7073
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7074
+ cb(cur, "bqkv", il);
7075
+
7076
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
7077
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
7078
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7079
+ } else {
7080
+ Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
7081
+ Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
7082
+ Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
7083
+ }
7084
+
7085
+ cb(Qcur, "Qcur", il);
7086
+ cb(Kcur, "Kcur", il);
6661
7087
  cb(Vcur, "Vcur", il);
6662
7088
 
6663
7089
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -6666,13 +7092,13 @@ struct llm_build_plamo : public llm_graph_context {
6666
7092
 
6667
7093
  Qcur = ggml_rope_ext(
6668
7094
  ctx0, Qcur, inp_pos, nullptr,
6669
- n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
7095
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6670
7096
  ext_factor, attn_factor, beta_fast, beta_slow
6671
7097
  );
6672
7098
 
6673
7099
  Kcur = ggml_rope_ext(
6674
7100
  ctx0, Kcur, inp_pos, nullptr,
6675
- n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
7101
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6676
7102
  ext_factor, attn_factor, beta_fast, beta_slow
6677
7103
  );
6678
7104
 
@@ -6680,34 +7106,35 @@ struct llm_build_plamo : public llm_graph_context {
6680
7106
  cb(Kcur, "Kcur", il);
6681
7107
  cb(Vcur, "Vcur", il);
6682
7108
 
7109
+ // with phi2, we scale the Q to avoid precision issues
7110
+ // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
7111
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
7112
+
6683
7113
  cur = build_attn(inp_attn, gf,
6684
- model.layers[il].wo, NULL,
6685
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7114
+ model.layers[il].wo, model.layers[il].bo,
7115
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
6686
7116
  }
6687
- ggml_tensor * sa_out = cur;
6688
-
6689
- cur = attention_norm;
6690
7117
 
6691
7118
  if (il == n_layer - 1) {
6692
7119
  // skip computing output for unused tokens
6693
7120
  ggml_tensor * inp_out_ids = build_inp_out_ids();
6694
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6695
- sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
6696
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7121
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7122
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7123
+ attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
6697
7124
  }
6698
7125
 
6699
- // feed-forward network
7126
+ // FF
6700
7127
  {
6701
- cur = build_ffn(cur,
6702
- model.layers[il].ffn_up, NULL, NULL,
6703
- model.layers[il].ffn_gate, NULL, NULL,
6704
- model.layers[il].ffn_down, NULL, NULL,
7128
+ ffn_output = build_ffn(attn_norm_output,
7129
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
7130
+ NULL, NULL, NULL,
7131
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
6705
7132
  NULL,
6706
- LLM_FFN_SILU, LLM_FFN_PAR, il);
6707
- cb(cur, "ffn_out", il);
7133
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
7134
+ cb(ffn_output, "ffn_out", il);
6708
7135
  }
6709
7136
 
6710
- cur = ggml_add(ctx0, cur, sa_out);
7137
+ cur = ggml_add(ctx0, cur, ffn_output);
6711
7138
  cur = ggml_add(ctx0, cur, inpL);
6712
7139
 
6713
7140
  cur = build_cvec(cur, il);
@@ -6717,17 +7144,18 @@ struct llm_build_plamo : public llm_graph_context {
6717
7144
  inpL = cur;
6718
7145
  }
6719
7146
 
6720
- cur = inpL;
6721
-
6722
- cur = build_norm(cur,
6723
- model.output_norm, NULL,
6724
- LLM_NORM_RMS, -1);
7147
+ cur = build_norm(inpL,
7148
+ model.output_norm,
7149
+ model.output_norm_b,
7150
+ LLM_NORM, -1);
6725
7151
 
6726
7152
  cb(cur, "result_norm", -1);
6727
7153
  res->t_embd = cur;
6728
7154
 
6729
- // lm_head
6730
7155
  cur = build_lora_mm(model.output, cur);
7156
+ cb(cur, "result_output_no_bias", -1);
7157
+
7158
+ cur = ggml_add(ctx0, cur, model.output_b);
6731
7159
 
6732
7160
  cb(cur, "result_output", -1);
6733
7161
  res->t_logits = cur;
@@ -6736,15 +7164,14 @@ struct llm_build_plamo : public llm_graph_context {
6736
7164
  }
6737
7165
  };
6738
7166
 
6739
- struct llm_build_gpt2 : public llm_graph_context {
6740
- llm_build_gpt2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7167
+ struct llm_build_phi3 : public llm_graph_context {
7168
+ llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6741
7169
  const int64_t n_embd_head = hparams.n_embd_head_v;
6742
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
7170
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6743
7171
 
6744
7172
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6745
7173
 
6746
7174
  ggml_tensor * cur;
6747
- ggml_tensor * pos;
6748
7175
  ggml_tensor * inpL;
6749
7176
 
6750
7177
  inpL = build_inp_embd(model.tok_embd);
@@ -6754,30 +7181,36 @@ struct llm_build_gpt2 : public llm_graph_context {
6754
7181
 
6755
7182
  auto * inp_attn = build_attn_inp_kv_unified();
6756
7183
 
6757
- pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
6758
- cb(pos, "pos_embd", -1);
6759
-
6760
- inpL = ggml_add(ctx0, inpL, pos);
6761
- cb(inpL, "inpL", -1);
6762
-
6763
7184
  for (int il = 0; il < n_layer; ++il) {
6764
- cur = build_norm(inpL,
6765
- model.layers[il].attn_norm,
6766
- model.layers[il].attn_norm_b,
6767
- LLM_NORM, il);
6768
- cb(cur, "attn_norm", il);
7185
+ auto * residual = inpL;
6769
7186
 
6770
7187
  // self-attention
6771
7188
  {
6772
- cur = build_lora_mm(model.layers[il].wqkv, cur);
6773
- cb(cur, "wqkv", il);
7189
+ // rope freq factors for 128k context
7190
+ ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
6774
7191
 
6775
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6776
- cb(cur, "bqkv", il);
7192
+ ggml_tensor* attn_norm_output = build_norm(inpL,
7193
+ model.layers[il].attn_norm,
7194
+ model.layers[il].attn_norm_b,
7195
+ LLM_NORM_RMS, il);
7196
+ cb(attn_norm_output, "attn_norm", il);
6777
7197
 
6778
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6779
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6780
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7198
+ ggml_tensor * Qcur = nullptr;
7199
+ ggml_tensor * Kcur = nullptr;
7200
+ ggml_tensor * Vcur = nullptr;
7201
+
7202
+ if (model.layers[il].wqkv) {
7203
+ cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
7204
+ cb(cur, "wqkv", il);
7205
+
7206
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
7207
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
7208
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
7209
+ } else {
7210
+ Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
7211
+ Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
7212
+ Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
7213
+ }
6781
7214
 
6782
7215
  cb(Qcur, "Qcur", il);
6783
7216
  cb(Kcur, "Kcur", il);
@@ -6787,39 +7220,300 @@ struct llm_build_gpt2 : public llm_graph_context {
6787
7220
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6788
7221
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6789
7222
 
7223
+ Qcur = ggml_rope_ext(
7224
+ ctx0, Qcur, inp_pos, rope_factors,
7225
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7226
+ ext_factor, attn_factor, beta_fast, beta_slow
7227
+ );
7228
+
7229
+ Kcur = ggml_rope_ext(
7230
+ ctx0, Kcur, inp_pos, rope_factors,
7231
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7232
+ ext_factor, attn_factor, beta_fast, beta_slow
7233
+ );
7234
+
7235
+ cb(Qcur, "Qcur", il);
7236
+ cb(Kcur, "Kcur", il);
7237
+ cb(Vcur, "Vcur", il);
7238
+
7239
+ Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
7240
+ cb(Qcur, "Qcur", il);
7241
+
6790
7242
  cur = build_attn(inp_attn, gf,
6791
7243
  model.layers[il].wo, model.layers[il].bo,
6792
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7244
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
6793
7245
  }
6794
7246
 
6795
7247
  if (il == n_layer - 1) {
6796
7248
  // skip computing output for unused tokens
6797
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6798
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6799
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7249
+ ggml_tensor* inp_out_ids = build_inp_out_ids();
7250
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7251
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
6800
7252
  }
6801
7253
 
6802
- // add the input
6803
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
6804
- cb(ffn_inp, "ffn_inp", il);
7254
+ cur = ggml_add(ctx0, cur, residual);
7255
+ residual = cur;
6805
7256
 
6806
- // FF
6807
- {
6808
- cur = build_norm(ffn_inp,
6809
- model.layers[il].ffn_norm,
6810
- model.layers[il].ffn_norm_b,
6811
- LLM_NORM, il);
6812
- cb(cur, "ffn_norm", il);
7257
+ cur = build_norm(cur,
7258
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
7259
+ LLM_NORM_RMS, il);
7260
+ cb(cur, "ffn_norm", il);
6813
7261
 
7262
+ // feed-forward network
7263
+ if (model.layers[il].ffn_gate_inp == nullptr) {
6814
7264
  cur = build_ffn(cur,
6815
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
6816
- NULL, NULL, NULL,
6817
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
7265
+ model.layers[il].ffn_up, NULL, NULL,
7266
+ NULL, NULL, NULL,
7267
+ model.layers[il].ffn_down, NULL, NULL,
6818
7268
  NULL,
6819
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
7269
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
6820
7270
  cb(cur, "ffn_out", il);
6821
- }
6822
-
7271
+ } else {
7272
+ // MoE branch
7273
+ cur = build_moe_ffn(cur,
7274
+ model.layers[il].ffn_gate_inp,
7275
+ model.layers[il].ffn_up_exps,
7276
+ model.layers[il].ffn_gate_exps,
7277
+ model.layers[il].ffn_down_exps,
7278
+ nullptr,
7279
+ n_expert, n_expert_used,
7280
+ LLM_FFN_SILU, true,
7281
+ false, 0.0,
7282
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
7283
+ il);
7284
+ cb(cur, "ffn_moe_out", il);
7285
+ }
7286
+
7287
+ cur = ggml_add(ctx0, residual, cur);
7288
+
7289
+ cur = build_cvec(cur, il);
7290
+ cb(cur, "l_out", il);
7291
+
7292
+ // input for next layer
7293
+ inpL = cur;
7294
+ }
7295
+
7296
+ cur = build_norm(inpL,
7297
+ model.output_norm,
7298
+ model.output_norm_b,
7299
+ LLM_NORM_RMS, -1);
7300
+
7301
+ cb(cur, "result_norm", -1);
7302
+ res->t_embd = cur;
7303
+
7304
+ cur = build_lora_mm(model.output, cur);
7305
+
7306
+ if (model.output_b != nullptr) {
7307
+ cb(cur, "result_output_no_bias", -1);
7308
+ cur = ggml_add(ctx0, cur, model.output_b);
7309
+ }
7310
+
7311
+ cb(cur, "result_output", -1);
7312
+ res->t_logits = cur;
7313
+
7314
+ ggml_build_forward_expand(gf, cur);
7315
+ }
7316
+ };
7317
+
7318
+ struct llm_build_plamo : public llm_graph_context {
7319
+ llm_build_plamo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7320
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7321
+
7322
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7323
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
7324
+
7325
+ ggml_tensor * cur;
7326
+ ggml_tensor * inpL;
7327
+
7328
+ inpL = build_inp_embd(model.tok_embd);
7329
+
7330
+ // inp_pos - contains the positions
7331
+ ggml_tensor * inp_pos = build_inp_pos();
7332
+
7333
+ auto * inp_attn = build_attn_inp_kv_unified();
7334
+
7335
+ for (int il = 0; il < n_layer; ++il) {
7336
+
7337
+ // norm
7338
+ cur = build_norm(inpL,
7339
+ model.layers[il].attn_norm, NULL,
7340
+ LLM_NORM_RMS, il);
7341
+ cb(cur, "attn_norm", il);
7342
+
7343
+ ggml_tensor * attention_norm = cur;
7344
+
7345
+ // self-attention
7346
+ {
7347
+ // compute Q and K and RoPE them
7348
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
7349
+ cb(Qcur, "Qcur", il);
7350
+
7351
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
7352
+ cb(Kcur, "Kcur", il);
7353
+
7354
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
7355
+ cb(Vcur, "Vcur", il);
7356
+
7357
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7358
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7359
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7360
+
7361
+ Qcur = ggml_rope_ext(
7362
+ ctx0, Qcur, inp_pos, nullptr,
7363
+ n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
7364
+ ext_factor, attn_factor, beta_fast, beta_slow
7365
+ );
7366
+
7367
+ Kcur = ggml_rope_ext(
7368
+ ctx0, Kcur, inp_pos, nullptr,
7369
+ n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
7370
+ ext_factor, attn_factor, beta_fast, beta_slow
7371
+ );
7372
+
7373
+ cb(Qcur, "Qcur", il);
7374
+ cb(Kcur, "Kcur", il);
7375
+ cb(Vcur, "Vcur", il);
7376
+
7377
+ cur = build_attn(inp_attn, gf,
7378
+ model.layers[il].wo, NULL,
7379
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7380
+ }
7381
+ ggml_tensor * sa_out = cur;
7382
+
7383
+ cur = attention_norm;
7384
+
7385
+ if (il == n_layer - 1) {
7386
+ // skip computing output for unused tokens
7387
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7388
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7389
+ sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
7390
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7391
+ }
7392
+
7393
+ // feed-forward network
7394
+ {
7395
+ cur = build_ffn(cur,
7396
+ model.layers[il].ffn_up, NULL, NULL,
7397
+ model.layers[il].ffn_gate, NULL, NULL,
7398
+ model.layers[il].ffn_down, NULL, NULL,
7399
+ NULL,
7400
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
7401
+ cb(cur, "ffn_out", il);
7402
+ }
7403
+
7404
+ cur = ggml_add(ctx0, cur, sa_out);
7405
+ cur = ggml_add(ctx0, cur, inpL);
7406
+
7407
+ cur = build_cvec(cur, il);
7408
+ cb(cur, "l_out", il);
7409
+
7410
+ // input for next layer
7411
+ inpL = cur;
7412
+ }
7413
+
7414
+ cur = inpL;
7415
+
7416
+ cur = build_norm(cur,
7417
+ model.output_norm, NULL,
7418
+ LLM_NORM_RMS, -1);
7419
+
7420
+ cb(cur, "result_norm", -1);
7421
+ res->t_embd = cur;
7422
+
7423
+ // lm_head
7424
+ cur = build_lora_mm(model.output, cur);
7425
+
7426
+ cb(cur, "result_output", -1);
7427
+ res->t_logits = cur;
7428
+
7429
+ ggml_build_forward_expand(gf, cur);
7430
+ }
7431
+ };
7432
+
7433
+ struct llm_build_gpt2 : public llm_graph_context {
7434
+ llm_build_gpt2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7435
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7436
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
7437
+
7438
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7439
+
7440
+ ggml_tensor * cur;
7441
+ ggml_tensor * pos;
7442
+ ggml_tensor * inpL;
7443
+
7444
+ inpL = build_inp_embd(model.tok_embd);
7445
+
7446
+ // inp_pos - contains the positions
7447
+ ggml_tensor * inp_pos = build_inp_pos();
7448
+
7449
+ auto * inp_attn = build_attn_inp_kv_unified();
7450
+
7451
+ pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
7452
+ cb(pos, "pos_embd", -1);
7453
+
7454
+ inpL = ggml_add(ctx0, inpL, pos);
7455
+ cb(inpL, "inpL", -1);
7456
+
7457
+ for (int il = 0; il < n_layer; ++il) {
7458
+ cur = build_norm(inpL,
7459
+ model.layers[il].attn_norm,
7460
+ model.layers[il].attn_norm_b,
7461
+ LLM_NORM, il);
7462
+ cb(cur, "attn_norm", il);
7463
+
7464
+ // self-attention
7465
+ {
7466
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
7467
+ cb(cur, "wqkv", il);
7468
+
7469
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7470
+ cb(cur, "bqkv", il);
7471
+
7472
+ ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
7473
+ ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
7474
+ ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7475
+
7476
+ cb(Qcur, "Qcur", il);
7477
+ cb(Kcur, "Kcur", il);
7478
+ cb(Vcur, "Vcur", il);
7479
+
7480
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7481
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7482
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7483
+
7484
+ cur = build_attn(inp_attn, gf,
7485
+ model.layers[il].wo, model.layers[il].bo,
7486
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7487
+ }
7488
+
7489
+ if (il == n_layer - 1) {
7490
+ // skip computing output for unused tokens
7491
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7492
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7493
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7494
+ }
7495
+
7496
+ // add the input
7497
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
7498
+ cb(ffn_inp, "ffn_inp", il);
7499
+
7500
+ // FF
7501
+ {
7502
+ cur = build_norm(ffn_inp,
7503
+ model.layers[il].ffn_norm,
7504
+ model.layers[il].ffn_norm_b,
7505
+ LLM_NORM, il);
7506
+ cb(cur, "ffn_norm", il);
7507
+
7508
+ cur = build_ffn(cur,
7509
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
7510
+ NULL, NULL, NULL,
7511
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
7512
+ NULL,
7513
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
7514
+ cb(cur, "ffn_out", il);
7515
+ }
7516
+
6823
7517
  cur = ggml_add(ctx0, cur, ffn_inp);
6824
7518
 
6825
7519
  cur = build_cvec(cur, il);
@@ -6905,7 +7599,7 @@ struct llm_build_codeshell : public llm_graph_context {
6905
7599
 
6906
7600
  cur = build_attn(inp_attn, gf,
6907
7601
  model.layers[il].wo, model.layers[il].bo,
6908
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7602
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6909
7603
  }
6910
7604
 
6911
7605
  if (il == n_layer - 1) {
@@ -7034,7 +7728,7 @@ struct llm_build_orion : public llm_graph_context {
7034
7728
 
7035
7729
  cur = build_attn(inp_attn, gf,
7036
7730
  model.layers[il].wo, NULL,
7037
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7731
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7038
7732
  }
7039
7733
 
7040
7734
  if (il == n_layer - 1) {
@@ -7161,7 +7855,7 @@ struct llm_build_internlm2 : public llm_graph_context {
7161
7855
 
7162
7856
  cur = build_attn(inp_attn, gf,
7163
7857
  model.layers[il].wo, model.layers[il].bo,
7164
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7858
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7165
7859
  }
7166
7860
 
7167
7861
  if (il == n_layer - 1) {
@@ -7358,7 +8052,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
7358
8052
 
7359
8053
  cur = build_attn(inp_attn, gf,
7360
8054
  model.layers[il].wo, NULL,
7361
- q_states, k_states, v_states, nullptr, kq_scale, il);
8055
+ q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
7362
8056
  }
7363
8057
 
7364
8058
  if (il == n_layer - 1) {
@@ -7488,7 +8182,7 @@ struct llm_build_gemma : public llm_graph_context {
7488
8182
 
7489
8183
  cur = build_attn(inp_attn, gf,
7490
8184
  model.layers[il].wo, NULL,
7491
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
8185
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7492
8186
  }
7493
8187
 
7494
8188
  if (il == n_layer - 1) {
@@ -7610,7 +8304,7 @@ struct llm_build_gemma2 : public llm_graph_context {
7610
8304
 
7611
8305
  cur = build_attn(inp_attn, gf,
7612
8306
  model.layers[il].wo, NULL,
7613
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
8307
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7614
8308
  }
7615
8309
 
7616
8310
  cur = build_norm(cur,
@@ -7751,7 +8445,7 @@ struct llm_build_gemma3 : public llm_graph_context {
7751
8445
 
7752
8446
  cur = build_attn(inp_attn, gf,
7753
8447
  model.layers[il].wo, NULL,
7754
- Qcur, Kcur, Vcur, nullptr, hparams.f_attention_scale, il);
8448
+ Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
7755
8449
  }
7756
8450
 
7757
8451
  cur = build_norm(cur,
@@ -7891,7 +8585,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
7891
8585
 
7892
8586
  cur = build_attn(inp_attn, gf,
7893
8587
  model.layers[il].wo, model.layers[il].bo,
7894
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8588
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7895
8589
  }
7896
8590
 
7897
8591
  if (il == n_layer - 1) {
@@ -8226,7 +8920,7 @@ struct llm_build_command_r : public llm_graph_context {
8226
8920
 
8227
8921
  cur = build_attn(inp_attn, gf,
8228
8922
  model.layers[il].wo, model.layers[il].bo,
8229
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8923
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8230
8924
  }
8231
8925
 
8232
8926
  if (il == n_layer - 1) {
@@ -8361,7 +9055,7 @@ struct llm_build_cohere2 : public llm_graph_context {
8361
9055
 
8362
9056
  cur = build_attn(inp_attn, gf,
8363
9057
  model.layers[il].wo, model.layers[il].bo,
8364
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9058
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8365
9059
  }
8366
9060
 
8367
9061
  if (il == n_layer - 1) {
@@ -8492,7 +9186,7 @@ struct llm_build_olmo : public llm_graph_context {
8492
9186
 
8493
9187
  cur = build_attn(inp_attn, gf,
8494
9188
  model.layers[il].wo, nullptr,
8495
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9189
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8496
9190
  }
8497
9191
 
8498
9192
  if (il == n_layer - 1) {
@@ -8612,7 +9306,7 @@ struct llm_build_olmo2 : public llm_graph_context {
8612
9306
 
8613
9307
  cur = build_attn(inp_attn, gf,
8614
9308
  model.layers[il].wo, NULL,
8615
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9309
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8616
9310
  }
8617
9311
 
8618
9312
  cur = build_norm(cur,
@@ -8745,7 +9439,7 @@ struct llm_build_olmoe : public llm_graph_context {
8745
9439
 
8746
9440
  cur = build_attn(inp_attn, gf,
8747
9441
  model.layers[il].wo, NULL,
8748
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9442
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8749
9443
  }
8750
9444
 
8751
9445
  if (il == n_layer - 1) {
@@ -8878,7 +9572,7 @@ struct llm_build_openelm : public llm_graph_context {
8878
9572
 
8879
9573
  cur = build_attn(inp_attn, gf,
8880
9574
  model.layers[il].wo, NULL,
8881
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9575
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8882
9576
  }
8883
9577
 
8884
9578
  if (il == n_layer - 1) {
@@ -8992,7 +9686,7 @@ struct llm_build_gptneox : public llm_graph_context {
8992
9686
 
8993
9687
  cur = build_attn(inp_attn, gf,
8994
9688
  model.layers[il].wo, model.layers[il].bo,
8995
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9689
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8996
9690
  }
8997
9691
 
8998
9692
  if (il == n_layer - 1) {
@@ -9142,7 +9836,7 @@ struct llm_build_arctic : public llm_graph_context {
9142
9836
 
9143
9837
  cur = build_attn(inp_attn, gf,
9144
9838
  model.layers[il].wo, NULL,
9145
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9839
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9146
9840
  }
9147
9841
 
9148
9842
  if (il == n_layer - 1) {
@@ -9297,7 +9991,7 @@ struct llm_build_deepseek : public llm_graph_context {
9297
9991
 
9298
9992
  cur = build_attn(inp_attn, gf,
9299
9993
  model.layers[il].wo, model.layers[il].bo,
9300
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
9994
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
9301
9995
  }
9302
9996
 
9303
9997
  if (il == n_layer - 1) {
@@ -9387,15 +10081,22 @@ struct llm_build_deepseek2 : public llm_graph_context {
9387
10081
  llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
9388
10082
  bool is_lite = (hparams.n_layer == 27);
9389
10083
 
10084
+ const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
10085
+
10086
+ // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
10087
+ const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
10088
+ const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
10089
+
10090
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
10091
+ const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
10092
+
10093
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
10094
+
9390
10095
  // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
9391
10096
  // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
9392
10097
  const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
9393
- const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
9394
- const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
9395
-
9396
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
9397
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
9398
- const uint32_t kv_lora_rank = hparams.n_lora_kv;
10098
+ const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
10099
+ const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
9399
10100
 
9400
10101
  ggml_tensor * cur;
9401
10102
  ggml_tensor * inpL;
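
The restructured constants above fold the YaRN magnitude correction into the attention scale before any graph is built: mscale grows logarithmically as freq_scale shrinks, and kq_scale becomes mscale^2 / sqrt(n_embd_head_k), using the MLA "decompressed" key head size when MLA metadata is present. A minimal, self-contained sketch of that arithmetic with placeholder hyperparameters (the values are illustrative only, not read from any GGUF):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float freq_scale        = 0.25f; // e.g. context extended 4x with YaRN
        const float rope_yarn_log_mul = 0.1f;  // hypothetical value
        const float attn_factor       = 1.0f;
        const int   n_embd_head_k     = 192;   // hypothetical decompressed key head size

        // mirrors the pre-scaling in the llm_build_deepseek2 hunk above
        const float mscale   = attn_factor * (1.0f + rope_yarn_log_mul * std::log(1.0f / freq_scale));
        const float kq_scale = 1.0f * mscale * mscale / std::sqrt((float) n_embd_head_k);

        std::printf("mscale = %.4f  kq_scale = %.6f\n", mscale, kq_scale);
        return 0;
    }
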
@@ -9421,16 +10122,14 @@ struct llm_build_deepseek2 : public llm_graph_context {
9421
10122
  {
9422
10123
  ggml_tensor * q = NULL;
9423
10124
  if (!is_lite) {
9424
- // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
9425
10125
  q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
9426
10126
  cb(q, "q", il);
9427
10127
 
9428
10128
  q = build_norm(q,
9429
- model.layers[il].attn_q_a_norm, NULL,
10129
+ model.layers[il].attn_q_a_norm, nullptr,
9430
10130
  LLM_NORM_RMS, il);
9431
10131
  cb(q, "q", il);
9432
10132
 
9433
- // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
9434
10133
  q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
9435
10134
  cb(q, "q", il);
9436
10135
  } else {
@@ -9438,96 +10137,125 @@ struct llm_build_deepseek2 : public llm_graph_context {
9438
10137
  cb(q, "q", il);
9439
10138
  }
9440
10139
 
9441
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
9442
- ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
9443
- ggml_row_size(q->type, hparams.n_embd_head_k),
9444
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
10140
+ // split into {n_embd_head_qk_nope, n_head, n_tokens}
10141
+ ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
10142
+ n_embd_head_qk_nope, n_head, n_tokens,
10143
+ ggml_row_size(q->type, n_embd_head_k),
10144
+ ggml_row_size(q->type, n_embd_head_k) * n_head,
9445
10145
  0);
9446
10146
  cb(q_nope, "q_nope", il);
9447
10147
 
9448
- // and {n_head * n_embd_head_qk_rope, n_tokens}
9449
- ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
9450
- ggml_row_size(q->type, hparams.n_embd_head_k),
9451
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
10148
+ // and {n_embd_head_qk_rope, n_head, n_tokens}
10149
+ ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
10150
+ n_embd_head_qk_rope, n_head, n_tokens,
10151
+ ggml_row_size(q->type, n_embd_head_k),
10152
+ ggml_row_size(q->type, n_embd_head_k) * n_head,
9452
10153
  ggml_row_size(q->type, n_embd_head_qk_nope));
9453
10154
  cb(q_pe, "q_pe", il);
9454
10155
 
9455
- // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
9456
- ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
9457
- cb(kv_pe_compresseed, "kv_pe_compresseed", il);
10156
+ ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
10157
+ cb(kv_cmpr_pe, "kv_cmpr_pe", il);
9458
10158
 
9459
10159
  // split into {kv_lora_rank, n_tokens}
9460
- ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
9461
- kv_pe_compresseed->nb[1],
10160
+ ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
10161
+ kv_lora_rank, n_tokens,
10162
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
9462
10163
  0);
9463
- cb(kv_compressed, "kv_compressed", il);
10164
+ cb(kv_cmpr, "kv_cmpr", il);
10165
+
10166
+ // and {n_embd_head_qk_rope, 1, n_tokens}
10167
+ ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
10168
+ n_embd_head_qk_rope, 1, n_tokens,
10169
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
10170
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
10171
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
10172
+ cb(k_pe, "k_pe", il);
9464
10173
 
9465
- // and {n_embd_head_qk_rope, n_tokens}
9466
- ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
9467
- kv_pe_compresseed->nb[1],
9468
- kv_pe_compresseed->nb[1],
9469
- ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
10174
+ q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
10175
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10176
+ ext_factor, attn_factor, beta_fast, beta_slow
10177
+ );
10178
+ cb(q_pe, "q_pe", il);
10179
+
10180
+ k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
10181
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10182
+ ext_factor, attn_factor, beta_fast, beta_slow
10183
+ );
9470
10184
  cb(k_pe, "k_pe", il);
9471
10185
 
9472
- // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
9473
- kv_compressed = ggml_cont(ctx0, kv_compressed);
9474
- kv_compressed = build_norm(kv_compressed,
9475
- model.layers[il].attn_kv_a_norm, NULL,
10186
+ kv_cmpr = build_norm(kv_cmpr,
10187
+ model.layers[il].attn_kv_a_norm, nullptr,
9476
  LLM_NORM_RMS, il);
- cb(kv_compressed, "kv_compressed", il);
+ cb(kv_cmpr, "kv_cmpr", il);
 
- // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
- ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
- cb(kv, "kv", il);
+ if (is_mla) {
+ // {n_embd_head_qk_nope, n_tokens, n_head}
+ q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
+ cb(q_nope, "q_nope_perm", il);
 
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
- ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
- ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- 0);
- cb(k_nope, "k_nope", il);
+ // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
+ ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
+ cb(q_nope_absorbed, "q_nope_absorbed", il);
 
- // and {n_head * n_embd_head_v, n_tokens}
- ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
- ggml_row_size(kv->type, (n_embd_head_qk_nope)));
- cb(v_states, "v_states", il);
+ // {kv_lora_rank, n_head, n_tokens}
+ q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
+ cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
 
- v_states = ggml_cont(ctx0, v_states);
- cb(v_states, "v_states", il);
+ // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
+ // note: rope must go first for in-place context shifting in build_rope_shift()
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
+ cb(Qcur, "Qcur", il);
 
- v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
- ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
- 0);
- cb(v_states, "v_states", il);
+ kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
+ cb(kv_cmpr, "kv_cmpr_reshape", il);
 
- q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
- q_pe = ggml_rope_ext(
- ctx0, q_pe, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor_scaled, beta_fast, beta_slow
- );
- cb(q_pe, "q_pe", il);
+ // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
+ ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
+ cb(Kcur, "Kcur", il);
 
- // shared RoPE key
- k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
- k_pe = ggml_rope_ext(
- ctx0, k_pe, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor_scaled, beta_fast, beta_slow
- );
- cb(k_pe, "k_pe", il);
+ // {kv_lora_rank, 1, n_tokens}
+ ggml_tensor * Vcur = kv_cmpr;
+ cb(Vcur, "Vcur", il);
 
- ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
- cb(q_states, "q_states", il);
+ // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
+ } else {
+ ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
+ cb(kv, "kv", il);
+
+ // split into {n_embd_head_qk_nope, n_head, n_tokens}
+ ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
+ n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
+ 0);
+ cb(k_nope, "k_nope_view", il);
 
- ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
- cb(k_states, "k_states", il);
+ // and {n_embd_head_v, n_head, n_tokens}
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
+ n_embd_head_v, n_head, n_tokens,
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
+ ggml_row_size(kv->type, n_embd_head_qk_nope));
+ cb(Vcur, "Vcur_view", il);
 
- cur = build_attn(inp_attn, gf,
- model.layers[il].wo, NULL,
- q_states, k_states, v_states, nullptr, kq_scale, il);
+ Vcur = ggml_cont(ctx0, Vcur);
+ cb(Vcur, "Vcur_cont", il);
+
+ // note: rope must go first for in-place context shifting in build_rope_shift()
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
+ cb(Kcur, "Kcur", il);
+
+ // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ }
 }
 
 if (il == n_layer - 1) {
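The is_mla path above rests on matrix-multiplication associativity: rather than decompressing the cached latent with wkv_b and comparing the query against full per-head keys, the no-RoPE part of the query is first multiplied by wk_b ("absorbed"), so the dot products can be taken directly against the kv_lora_rank-sized latent plus the shared RoPE slice; the per-head values are then presumably re-expanded from that same latent via the extra wv_b tensor handed to build_attn. The standalone C++ sketch below only checks that identity on tiny, made-up dimensions and data; it is not llama.cpp code.

    // Illustration of the "absorption" identity:
    //   (wk_b * kv_cmpr) . q_nope  ==  kv_cmpr . (wk_b^T * q_nope)
    // Hypothetical sizes: kv_lora_rank = 3, n_embd_head_qk_nope = 2.
    #include <cstdio>

    int main() {
        const int R = 3; // stands in for kv_lora_rank
        const int D = 2; // stands in for n_embd_head_qk_nope

        const float wk_b[D][R]  = {{0.5f, -1.0f, 2.0f}, {1.5f, 0.25f, -0.75f}};
        const float kv_cmpr[R]  = {0.1f, -0.2f, 0.3f};
        const float q_nope[D]   = {0.7f, -0.4f};

        // left-hand side: decompress the key, then dot with the query
        float k_full[D] = {0};
        for (int d = 0; d < D; ++d)
            for (int r = 0; r < R; ++r)
                k_full[d] += wk_b[d][r] * kv_cmpr[r];
        float lhs = 0;
        for (int d = 0; d < D; ++d) lhs += k_full[d] * q_nope[d];

        // right-hand side: absorb wk_b into the query, then dot with the compressed key
        float q_abs[R] = {0};
        for (int r = 0; r < R; ++r)
            for (int d = 0; d < D; ++d)
                q_abs[r] += wk_b[d][r] * q_nope[d];
        float rhs = 0;
        for (int r = 0; r < R; ++r) rhs += q_abs[r] * kv_cmpr[r];

        printf("decompress-then-dot: %f\nabsorb-then-dot:     %f\n", lhs, rhs);
        return 0;
    }

Because keys and values both collapse to the single shared latent per token, every head attends to the same K/V stream, which is why the comment in the hunk describes the absorbed form as MQA.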
@@ -9693,7 +10421,7 @@ struct llm_build_bitnet : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 NULL, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 
 cur = build_norm(cur,
 model.layers[il].attn_sub_norm, NULL,
@@ -9816,7 +10544,7 @@ struct llm_build_t5_enc : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo_enc, nullptr,
- Qcur, Kcur, Vcur, kq_b, 1.0f, il);
+ Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -9922,7 +10650,7 @@ struct llm_build_t5_dec : public llm_graph_context {
 
 cur = build_attn(inp_attn_self, gf,
 model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, kq_b, 1.0f, il);
+ Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -9954,7 +10682,7 @@ struct llm_build_t5_dec : public llm_graph_context {
 
 cur = build_attn(inp_attn_cross, gf,
 model.layers[il].wo_cross, nullptr,
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
 cb(cur, "kqv_out", il);
 
 //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -10087,7 +10815,7 @@ struct llm_build_jais : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/float(n_embd_head), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
 }
 
 if (il == n_layer - 1) {
@@ -10219,7 +10947,7 @@ struct llm_build_chatglm : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -10272,6 +11000,157 @@ struct llm_build_chatglm : public llm_graph_context {
 }
 };
 
+ struct llm_build_glm4 : public llm_graph_context {
+ llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // Pre-attention norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = nullptr;
+ ggml_tensor * Kcur = nullptr;
+ ggml_tensor * Vcur = nullptr;
+
+ if (model.layers[il].wqkv == nullptr) {
+ Qcur = build_lora_mm(model.layers[il].wq, cur);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ }
+ Kcur = build_lora_mm(model.layers[il].wk, cur);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ }
+ Vcur = build_lora_mm(model.layers[il].wv, cur);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ }
+ } else {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+ if (model.layers[il].bqkv) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // Post-attention norm (new!)
+ cur = build_norm(cur,
+ model.layers[il].attn_post_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "post_attn_norm", il);
+
+ // Add the input (residual connection after post-attention norm)
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FF
+ {
+ // Pre-MLP norm
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // MLP
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ // Post-MLP norm
+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "post_mlp_norm", il);
+ }
+
+ // Add residual connection after post-MLP norm
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
+ }
+
+ // Final norm
+ cur = build_norm(inpL,
+ model.output_norm,
+ NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // Output projection
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
 struct llm_build_nemotron : public llm_graph_context {
 llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
 const int64_t n_embd_head = hparams.n_embd_head_v;
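The new llm_build_glm4 graph above differs from llm_build_chatglm mainly in its normalization layout: each block applies an RMS norm both before and after the attention (attn_norm / attn_post_norm) and both before and after the MLP (ffn_norm / ffn_post_norm), with the residual added after each post-norm, and the MLP uses LLM_FFN_SWIGLU with no separate gate tensor. The toy, self-contained C++ sketch below only mirrors that ordering on a scalar; the functions are stubs, not llama.cpp APIs.

    #include <cstdio>

    static float rms_norm(float x)   { return x; }        // stand-in for build_norm(..., LLM_NORM_RMS, ...)
    static float attention(float x)  { return 2.0f * x; } // stand-in for build_attn(...)
    static float swiglu_mlp(float x) { return x * x; }    // stand-in for build_ffn(..., LLM_FFN_SWIGLU, LLM_FFN_SEQ, ...)

    static float glm4_block(float x) {
        // attn_norm -> attention -> post_attn_norm -> residual add
        float h = x + rms_norm(attention(rms_norm(x)));
        // ffn_norm -> SwiGLU MLP -> post_mlp_norm -> residual add
        return h + rms_norm(swiglu_mlp(rms_norm(h)));
    }

    int main() {
        printf("one glm4-style block on a toy scalar: %f\n", glm4_block(0.5f));
        return 0;
    }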
@@ -10345,7 +11224,7 @@ struct llm_build_nemotron : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -10476,7 +11355,7 @@ struct llm_build_exaone : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1) {
@@ -11378,7 +12257,7 @@ struct llm_build_chameleon : public llm_graph_context {
 
 cur = build_attn(inp_attn, gf,
 model.layers[il].wo, nullptr,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 
 if (hparams.swin_norm) {
 cur = build_norm(cur,
@@ -11397,31 +12276,370 @@ struct llm_build_chameleon : public llm_graph_context {
 ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
 cb(ffn_inp, "ffn_inp", il);
 
- // feed-forward network
- if (!hparams.swin_norm) {
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- }
+ // feed-forward network
+ if (!hparams.swin_norm) {
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+ }
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ if (hparams.swin_norm) {
+ cur = build_norm(cur,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+ cb(cur, "result_output_with_img_logits", -1);
+
+ // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
+ // Needs to be removed once image outputs are supported.
+ int img_token_end_idx = 8196;
+ int img_token_start_idx = 4;
+ int num_img_tokens = img_token_end_idx - img_token_start_idx;
+ // creates 1d tensor of size num_img_tokens and values -FLT_MAX,
+ // which ensures that text token values are always at least larger than image token values
+ ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
+ img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
+ cb(img_logits, "img_logits", -1);
+
+ cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
+ struct llm_build_wavtokenizer_dec : public llm_graph_context {
+ llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
+
+ cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
+ cur = ggml_add(ctx0, cur, model.conv1d_b);
+
+ // posnet
+ for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
+ const auto & layer = model.layers[il].posnet;
+
+ inpL = cur;
+
+ switch (il) {
+ case 0:
+ case 1:
+ case 3:
+ case 4:
+ {
+ cur = build_norm(cur,
+ layer.norm1,
+ layer.norm1_b,
+ LLM_NORM_GROUP, 0);
+
+ cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
+
+ cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
+ cur = ggml_add(ctx0, cur, layer.conv1_b);
+
+ cur = build_norm(cur,
+ layer.norm2,
+ layer.norm2_b,
+ LLM_NORM_GROUP, 0);
+
+ cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
+
+ cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
+ cur = ggml_add(ctx0, cur, layer.conv2_b);
+
+ cur = ggml_add(ctx0, cur, inpL);
+ } break;
+ case 2:
+ {
+ cur = build_norm(cur,
+ layer.attn_norm,
+ layer.attn_norm_b,
+ LLM_NORM_GROUP, 0);
+
+ ggml_tensor * q;
+ ggml_tensor * k;
+ ggml_tensor * v;
+
+ q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
+ k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
+ v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
+
+ q = ggml_add(ctx0, q, layer.attn_q_b);
+ k = ggml_add(ctx0, k, layer.attn_k_b);
+ v = ggml_add(ctx0, v, layer.attn_v_b);
+
+ q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
+ k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
+
+ ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+
+ kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
+
+ cur = ggml_mul_mat(ctx0, kq, v);
+
+ cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
+ cur = ggml_add(ctx0, cur, layer.attn_o_b);
+
+ cur = ggml_add(ctx0, cur, inpL);
+ } break;
+ case 5:
+ {
+ cur = build_norm(cur,
+ layer.norm,
+ layer.norm_b,
+ LLM_NORM_GROUP, 0);
+ } break;
+ default: GGML_ABORT("unknown posnet layer");
+ };
+ }
+
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+ cur = build_norm(cur,
+ model.tok_norm,
+ model.tok_norm_b,
+ LLM_NORM, -1);
+
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+ inpL = cur;
+
+ // convnext
+ for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
+ const auto & layer = model.layers[il].convnext;
+
+ cur = inpL;
+
+ cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
+ cur = ggml_add(ctx0, cur, layer.dw_b);
+
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+ cur = build_norm(cur,
+ layer.norm,
+ layer.norm_b,
+ LLM_NORM, -1);
+
+ cur = build_ffn(cur,
+ layer.pw1, layer.pw1_b, NULL,
+ NULL, NULL, NULL,
+ layer.pw2, layer.pw2_b, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
+
+ cur = ggml_mul(ctx0, cur, layer.gamma);
+
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+ inpL = ggml_add(ctx0, cur, inpL);
+ }
+
+ cur = inpL;
+
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+ cur = build_norm(cur,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, -1);
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cur = ggml_add(ctx0, cur, model.output_b);
+
+ cb(cur, "result_embd", -1);
+ res->t_embd = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
+ struct llm_build_plm : public llm_graph_context {
+ llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
+
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ // {n_embd, n_tokens}
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ ggml_tensor * q = NULL;
+ q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(q, "q", il);
+
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
+ ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(q->type, hparams.n_embd_head_k),
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ 0);
+ cb(q_nope, "q_nope", il);
+
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
+ ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+ ggml_row_size(q->type, hparams.n_embd_head_k),
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, n_embd_head_qk_nope));
+ cb(q_pe, "q_pe", il);
+
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+ ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+ cb(kv_pe_compresseed, "kv_pe_compresseed", il);
+
+ // split into {kv_lora_rank, n_tokens}
+ ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
+ kv_pe_compresseed->nb[1],
+ 0);
+ cb(kv_compressed, "kv_compressed", il);
+
+ // and {n_embd_head_qk_rope, n_tokens}
+ ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
+ kv_pe_compresseed->nb[1],
+ kv_pe_compresseed->nb[1],
+ ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
+ cb(k_pe, "k_pe", il);
+
+ kv_compressed = build_norm(kv_compressed,
+ model.layers[il].attn_kv_a_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(kv_compressed, "kv_compressed", il);
+
+ // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
+ ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
+ cb(kv, "kv", il);
+
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
+ ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ 0);
+ cb(k_nope, "k_nope", il);
+
+ // and {n_head * n_embd_head_v, n_tokens}
+ ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope)));
+ cb(v_states, "v_states", il);
+
+ v_states = ggml_cont(ctx0, v_states);
+ cb(v_states, "v_states", il);
+
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
+ ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+ 0);
+ cb(v_states, "v_states", il);
+
+ q_pe = ggml_rope_ext(
+ ctx0, q_pe, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(q_pe, "q_pe", il);
+
+ // shared RoPE key
+ k_pe = ggml_rope_ext(
+ ctx0, k_pe, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(k_pe, "k_pe", il);
+
+ ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
+ cb(q_states, "q_states", il);
+
+ ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
+ cb(k_states, "k_states", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, NULL,
+ q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
 
 cur = build_ffn(cur,
 model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
+ NULL, NULL, NULL,
 model.layers[il].ffn_down, NULL, NULL,
 NULL,
- LLM_FFN_SILU, LLM_FFN_PAR, il);
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
 cb(cur, "ffn_out", il);
 
- if (hparams.swin_norm) {
- cur = build_norm(cur,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
- }
-
 cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
 
 cur = build_cvec(cur, il);
 cb(cur, "l_out", il);
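In the llm_build_plm feed-forward section above, the gated SiLU FFN of the chameleon code it displaces gives way to a plain two-projection FFN: ffn_up, a squared-ReLU activation (LLM_FFN_RELU_SQR), then ffn_down, applied sequentially (LLM_FFN_SEQ) with the gate slot left NULL. The following C++ illustration uses made-up sizes and weights and is not llama.cpp code; it only shows the activation path.

    #include <cstdio>

    // squared ReLU: max(x, 0)^2
    static float relu_sqr(float x) {
        float r = x > 0.0f ? x : 0.0f;
        return r * r;
    }

    int main() {
        const int n_embd = 2, n_ff = 3;
        const float w_up[n_ff][n_embd]   = {{0.5f, -1.0f}, {1.0f, 0.25f}, {-0.5f, 0.75f}};
        const float w_down[n_embd][n_ff] = {{1.0f, 0.0f, -1.0f}, {0.5f, 0.5f, 0.5f}};
        const float x[n_embd] = {0.3f, -0.2f};

        float h[n_ff] = {0}, y[n_embd] = {0};
        for (int i = 0; i < n_ff; ++i) {
            for (int j = 0; j < n_embd; ++j) h[i] += w_up[i][j] * x[j]; // up-projection
            h[i] = relu_sqr(h[i]);                                      // activation between up and down
        }
        for (int j = 0; j < n_embd; ++j)
            for (int i = 0; i < n_ff; ++i) y[j] += w_down[j][i] * h[i]; // down-projection

        printf("ffn(x) = [%f, %f]\n", y[0], y[1]);
        return 0;
    }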
@@ -11439,22 +12657,7 @@ struct llm_build_chameleon : public llm_graph_context {
 cb(cur, "result_norm", -1);
 res->t_embd = cur;
 
- // lm_head
 cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output_with_img_logits", -1);
-
- // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
- // Needs to be removed once image outputs are supported.
- int img_token_end_idx = 8196;
- int img_token_start_idx = 4;
- int num_img_tokens = img_token_end_idx - img_token_start_idx;
- // creates 1d tensor of size num_img_tokens and values -FLT_MAX,
- // which ensures that text token values are always at least larger than image token values
- ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
- img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
- cb(img_logits, "img_logits", -1);
-
- cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
 
 cb(cur, "result_output", -1);
 res->t_logits = cur;
@@ -11463,153 +12666,145 @@ struct llm_build_chameleon : public llm_graph_context {
 }
 };
 
- struct llm_build_wavtokenizer_dec : public llm_graph_context {
- llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ struct llm_build_bailingmoe : public llm_graph_context {
+ llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
 ggml_tensor * cur;
 ggml_tensor * inpL;
 
 inpL = build_inp_embd(model.tok_embd);
 
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
-
- cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
- cur = ggml_add(ctx0, cur, model.conv1d_b);
-
- // posnet
- for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
- const auto & layer = model.layers[il].posnet;
-
- inpL = cur;
-
- switch (il) {
- case 0:
- case 1:
- case 3:
- case 4:
- {
- cur = build_norm(cur,
- layer.norm1,
- layer.norm1_b,
- LLM_NORM_GROUP, 0);
-
- cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
-
- cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
- cur = ggml_add(ctx0, cur, layer.conv1_b);
-
- cur = build_norm(cur,
- layer.norm2,
- layer.norm2_b,
- LLM_NORM_GROUP, 0);
-
- cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
-
- cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
- cur = ggml_add(ctx0, cur, layer.conv2_b);
-
- cur = ggml_add(ctx0, cur, inpL);
- } break;
- case 2:
- {
- cur = build_norm(cur,
- layer.attn_norm,
- layer.attn_norm_b,
- LLM_NORM_GROUP, 0);
-
- ggml_tensor * q;
- ggml_tensor * k;
- ggml_tensor * v;
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
 
- q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
- k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
- v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
+ auto * inp_attn = build_attn_inp_kv_unified();
 
- q = ggml_add(ctx0, q, layer.attn_q_b);
- k = ggml_add(ctx0, k, layer.attn_k_b);
- v = ggml_add(ctx0, v, layer.attn_v_b);
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
 
- q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
- k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
 
- ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
- kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
 
- cur = ggml_mul_mat(ctx0, kq, v);
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
 
- cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
- cur = ggml_add(ctx0, cur, layer.attn_o_b);
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
 
- cur = ggml_add(ctx0, cur, inpL);
- } break;
- case 5:
- {
- cur = build_norm(cur,
- layer.norm,
- layer.norm_b,
- LLM_NORM_GROUP, 0);
- } break;
- default: GGML_ABORT("unknown posnet layer");
- };
- }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
 
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
 
- cur = build_norm(cur,
- model.tok_norm,
- model.tok_norm_b,
- LLM_NORM, -1);
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
 
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
 
- inpL = cur;
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
+ }
 
- // convnext
- for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
- const auto & layer = model.layers[il].convnext;
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
 
- cur = inpL;
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
 
- cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
- cur = ggml_add(ctx0, cur, layer.dw_b);
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
 
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ false, hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
 
- cur = build_norm(cur,
- layer.norm,
- layer.norm_b,
- LLM_NORM, -1);
+ // FFN shared expert
+ {
+ ggml_tensor * ffn_shexp = build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
 
- cur = build_ffn(cur,
- layer.pw1, layer.pw1_b, NULL,
- NULL, NULL, NULL,
- layer.pw2, layer.pw2_b, NULL,
- NULL,
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
 
- cur = ggml_mul(ctx0, cur, layer.gamma);
+ cur = ggml_add(ctx0, cur, ffn_inp);
 
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
 
- inpL = ggml_add(ctx0, cur, inpL);
+ // input for next layer
+ inpL = cur;
 }
 
 cur = inpL;
 
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
-
 cur = build_norm(cur,
- model.output_norm,
- model.output_norm_b,
- LLM_NORM, -1);
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
 
 // lm_head
 cur = build_lora_mm(model.output, cur);
 
- cur = ggml_add(ctx0, cur, model.output_b);
-
- cb(cur, "result_embd", -1);
- res->t_embd = cur;
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
 
 ggml_build_forward_expand(gf, cur);
 }
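llm_build_bailingmoe above pairs routed experts with an always-on shared expert: build_moe_ffn softmax-gates n_expert experts, keeps n_expert_used of them (renormalization of the kept weights is controlled by hparams.expert_weights_norm), and the routed result is added to ffn_shexp computed by a regular gated FFN. The sketch below mirrors only that combination on a toy scalar state, with invented weights and without the optional renormalization or scaling; it is not llama.cpp code.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_expert = 4, n_expert_used = 2;
        const float x = 0.8f;

        // made-up per-expert transforms and router logits
        const float expert_w[n_expert] = {1.0f, -0.5f, 2.0f, 0.25f};
        const float logits[n_expert]   = {0.1f, 1.2f, -0.3f, 0.7f};

        // softmax gating (the LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX case)
        float denom = 0.0f, probs[n_expert];
        for (int e = 0; e < n_expert; ++e) denom += std::exp(logits[e]);
        for (int e = 0; e < n_expert; ++e) probs[e] = std::exp(logits[e]) / denom;

        // keep the n_expert_used most probable experts
        std::vector<int> order = {0, 1, 2, 3};
        std::sort(order.begin(), order.end(), [&](int a, int b) { return probs[a] > probs[b]; });

        float moe_out = 0.0f;
        for (int i = 0; i < n_expert_used; ++i) {
            const int e = order[i];
            moe_out += probs[e] * (expert_w[e] * x);   // weighted routed expert output
        }

        const float ffn_shexp = 0.5f * x;              // shared expert runs unconditionally
        printf("ffn_out = moe_out + ffn_shexp = %f\n", moe_out + ffn_shexp);
        return 0;
    }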
@@ -11659,6 +12854,7 @@ llm_graph_result_ptr llama_model::build_graph(
 
 switch (arch) {
 case LLM_ARCH_LLAMA:
+ case LLM_ARCH_LLAMA4:
 case LLM_ARCH_MINICPM:
 case LLM_ARCH_GRANITE:
 case LLM_ARCH_GRANITE_MOE:
@@ -11692,6 +12888,7 @@ llm_graph_result_ptr llama_model::build_graph(
 case LLM_ARCH_BERT:
 case LLM_ARCH_JINA_BERT_V2:
 case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
 {
 llm = std::make_unique<llm_build_bert>(*this, params, gf);
 } break;
@@ -11723,6 +12920,14 @@ llm_graph_result_ptr llama_model::build_graph(
 {
 llm = std::make_unique<llm_build_qwen2moe>(*this, params, gf);
 } break;
+ case LLM_ARCH_QWEN3:
+ {
+ llm = std::make_unique<llm_build_qwen3>(*this, params, gf);
+ } break;
+ case LLM_ARCH_QWEN3MOE:
+ {
+ llm = std::make_unique<llm_build_qwen3moe>(*this, params, gf);
+ } break;
 case LLM_ARCH_PHI2:
 {
 llm = std::make_unique<llm_build_phi2>(*this, params, gf);
@@ -11828,6 +13033,10 @@ llm_graph_result_ptr llama_model::build_graph(
 {
 llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
 } break;
+ case LLM_ARCH_GLM4:
+ {
+ llm = std::make_unique<llm_build_glm4>(*this, params, gf);
+ } break;
 case LLM_ARCH_BITNET:
 {
 llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
@@ -11846,10 +13055,11 @@ llm_graph_result_ptr llama_model::build_graph(
 GGML_ABORT("invalid graph type");
 };
 } break;
- //case LLM_ARCH_T5ENCODER:
- // {
- // llm.build_t5_enc(gf);
- // } break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
+ }
+ break;
 case LLM_ARCH_JAIS:
 {
 llm = std::make_unique<llm_build_jais>(*this, params, gf);
@@ -11886,6 +13096,14 @@ llm_graph_result_ptr llama_model::build_graph(
 {
 llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
 } break;
+ case LLM_ARCH_PLM:
+ {
+ llm = std::make_unique<llm_build_plm>(*this, params, gf);
+ } break;
+ case LLM_ARCH_BAILINGMOE:
+ {
+ llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
+ } break;
 default:
 GGML_ABORT("fatal error");
 }
@@ -11903,6 +13121,7 @@ llm_graph_result_ptr llama_model::build_graph(
 llama_model_params llama_model_default_params() {
 llama_model_params result = {
 /*.devices =*/ nullptr,
+ /*.tensor_buft_overrides =*/ nullptr,
 /*.n_gpu_layers =*/ 0,
 /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
 /*.main_gpu =*/ 0,
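llama_model_default_params() fills llama_model_params positionally, with /*.field =*/ comments standing in for designated initializers, so the new tensor_buft_overrides entry has to sit at exactly the position the member occupies in the struct declaration. A hypothetical miniature of that constraint follows; the struct below is illustrative, not the real llama_model_params.

    #include <cstdio>

    struct params_sketch {
        const char ** devices;
        const void  * tensor_buft_overrides; // newly inserted member
        int           n_gpu_layers;
    };

    static params_sketch default_params_sketch() {
        params_sketch result = {
            /*.devices               =*/ nullptr,
            /*.tensor_buft_overrides =*/ nullptr, // must be added at the matching position,
                                                  // or every later value shifts onto the wrong member
            /*.n_gpu_layers          =*/ 0,
        };
        return result;
    }

    int main() {
        params_sketch p = default_params_sketch();
        printf("n_gpu_layers = %d\n", p.n_gpu_layers);
        return 0;
    }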
@@ -11998,6 +13217,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 
 // use what we call a normal RoPE, operating on pairs of consecutive head values
 case LLM_ARCH_LLAMA:
+ case LLM_ARCH_LLAMA4:
 case LLM_ARCH_DECI:
 case LLM_ARCH_BAICHUAN:
 case LLM_ARCH_STARCODER:
@@ -12012,10 +13232,13 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 case LLM_ARCH_ARCTIC:
 case LLM_ARCH_DEEPSEEK:
 case LLM_ARCH_DEEPSEEK2:
+ case LLM_ARCH_PLM:
 case LLM_ARCH_CHATGLM:
+ case LLM_ARCH_GLM4:
 case LLM_ARCH_GRANITE:
 case LLM_ARCH_GRANITE_MOE:
 case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_BAILINGMOE:
 return LLAMA_ROPE_TYPE_NORM;
 
 // the pairs of head values are offset by n_rot/2
@@ -12024,11 +13247,14 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 case LLM_ARCH_DBRX:
 case LLM_ARCH_BERT:
 case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
 case LLM_ARCH_STABLELM:
 case LLM_ARCH_BITNET:
 case LLM_ARCH_QWEN:
 case LLM_ARCH_QWEN2:
 case LLM_ARCH_QWEN2MOE:
+ case LLM_ARCH_QWEN3:
+ case LLM_ARCH_QWEN3MOE:
 case LLM_ARCH_OLMO2:
 case LLM_ARCH_OLMOE:
 case LLM_ARCH_PHI2: