cui-llama.rn 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (285)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +22 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +42 -6
  4. package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
  5. package/android/src/main/jni.cpp +173 -18
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
  16. package/cpp/LICENSE +21 -0
  17. package/cpp/chat.cpp +129 -107
  18. package/cpp/chat.h +2 -0
  19. package/cpp/common.cpp +58 -78
  20. package/cpp/common.h +29 -21
  21. package/cpp/ggml-alloc.c +4 -1
  22. package/cpp/ggml-backend.cpp +9 -5
  23. package/cpp/ggml-backend.h +4 -4
  24. package/cpp/ggml-cpp.h +1 -1
  25. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  26. package/cpp/ggml-cpu/amx/amx.h +8 -0
  27. package/cpp/ggml-cpu/amx/common.h +91 -0
  28. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  29. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  31. package/cpp/ggml-cpu/common.h +72 -0
  32. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -103
  33. package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +306 -6
  34. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +114 -55
  35. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +32 -16
  36. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +353 -173
  37. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  38. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  39. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  40. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  41. package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -6
  42. package/{ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/vec.h +16 -0
  43. package/cpp/ggml-cpu.h +5 -0
  44. package/cpp/ggml-impl.h +16 -9
  45. package/cpp/ggml-llama-sim.metallib +0 -0
  46. package/cpp/ggml-llama.metallib +0 -0
  47. package/cpp/ggml-metal-impl.h +36 -11
  48. package/cpp/ggml-metal.m +810 -176
  49. package/cpp/ggml-opt.cpp +373 -190
  50. package/cpp/ggml-opt.h +49 -28
  51. package/cpp/ggml-quants.c +0 -6
  52. package/cpp/ggml.c +227 -282
  53. package/cpp/ggml.h +82 -101
  54. package/cpp/gguf.cpp +33 -33
  55. package/cpp/json-schema-to-grammar.cpp +3 -0
  56. package/cpp/llama-adapter.cpp +6 -0
  57. package/cpp/llama-arch.cpp +49 -17
  58. package/cpp/llama-arch.h +9 -0
  59. package/cpp/llama-batch.cpp +8 -2
  60. package/cpp/llama-batch.h +2 -1
  61. package/cpp/llama-chat.cpp +39 -16
  62. package/cpp/llama-chat.h +4 -2
  63. package/cpp/llama-context.cpp +440 -611
  64. package/cpp/llama-context.h +44 -33
  65. package/cpp/llama-cparams.h +1 -0
  66. package/cpp/llama-graph.cpp +214 -291
  67. package/cpp/llama-graph.h +69 -21
  68. package/cpp/llama-hparams.cpp +17 -1
  69. package/cpp/llama-hparams.h +39 -5
  70. package/cpp/llama-kv-cache.cpp +2067 -620
  71. package/cpp/llama-kv-cache.h +410 -108
  72. package/cpp/llama-memory.h +12 -1
  73. package/cpp/llama-model-loader.cpp +24 -15
  74. package/cpp/llama-model-saver.cpp +281 -0
  75. package/cpp/llama-model-saver.h +37 -0
  76. package/cpp/llama-model.cpp +1089 -359
  77. package/cpp/llama-model.h +19 -3
  78. package/cpp/llama-sampling.cpp +20 -7
  79. package/cpp/llama-vocab.cpp +54 -9
  80. package/cpp/llama-vocab.h +6 -0
  81. package/cpp/llama.cpp +14 -0
  82. package/cpp/llama.h +86 -142
  83. package/cpp/minja/chat-template.hpp +9 -5
  84. package/cpp/minja/minja.hpp +69 -36
  85. package/cpp/rn-llama.cpp +602 -190
  86. package/cpp/rn-llama.h +34 -8
  87. package/cpp/sampling.cpp +57 -50
  88. package/cpp/tools/mtmd/clip-impl.h +462 -0
  89. package/cpp/tools/mtmd/clip.cpp +4024 -0
  90. package/cpp/tools/mtmd/clip.h +101 -0
  91. package/cpp/tools/mtmd/miniaudio.h +93468 -0
  92. package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  93. package/cpp/tools/mtmd/mtmd-audio.h +62 -0
  94. package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
  95. package/cpp/tools/mtmd/mtmd.cpp +942 -0
  96. package/cpp/tools/mtmd/mtmd.h +362 -0
  97. package/cpp/tools/mtmd/stb_image.h +7988 -0
  98. package/ios/CMakeLists.txt +20 -10
  99. package/ios/RNLlama.h +6 -0
  100. package/ios/RNLlama.mm +82 -3
  101. package/ios/RNLlamaContext.h +5 -1
  102. package/ios/RNLlamaContext.mm +131 -38
  103. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
  104. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +29 -21
  105. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  106. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  107. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  108. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  109. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  110. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  111. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +82 -101
  112. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  113. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  114. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  115. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  116. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  117. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  118. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  119. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  120. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  121. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  122. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +86 -142
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  131. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  132. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  133. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  134. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  135. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  136. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  137. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  138. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  139. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  140. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  141. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  142. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  143. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  144. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  145. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  146. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  147. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  148. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  149. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  150. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  151. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  152. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  153. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  154. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  155. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  156. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  160. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
  161. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +29 -21
  162. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  163. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  164. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +82 -101
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  175. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  176. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  177. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  178. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  179. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  180. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  181. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +86 -142
  182. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  183. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  184. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  185. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  186. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  187. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  188. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  189. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  190. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  191. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  192. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  193. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  194. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  195. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  196. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  197. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  198. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  199. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  200. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  201. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  202. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  203. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  204. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  205. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  206. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  207. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  208. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  209. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  210. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  211. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  212. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  213. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  214. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  215. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  216. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  217. package/jest/mock.js +33 -7
  218. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  219. package/lib/commonjs/index.js +153 -21
  220. package/lib/commonjs/index.js.map +1 -1
  221. package/lib/module/NativeRNLlama.js.map +1 -1
  222. package/lib/module/index.js +152 -20
  223. package/lib/module/index.js.map +1 -1
  224. package/lib/typescript/NativeRNLlama.d.ts +54 -4
  225. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  226. package/lib/typescript/index.d.ts +72 -6
  227. package/lib/typescript/index.d.ts.map +1 -1
  228. package/package.json +1 -1
  229. package/src/NativeRNLlama.ts +72 -4
  230. package/src/index.ts +212 -38
  231. package/cpp/binary-ops.h +0 -16
  232. package/cpp/ops.h +0 -128
  233. package/cpp/simd-mappings.h +0 -888
  234. package/cpp/unary-ops.h +0 -28
  235. package/cpp/vec.h +0 -802
  236. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  237. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  238. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  239. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  240. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  241. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  242. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  243. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  244. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  245. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  246. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  247. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  248. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  249. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  250. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  251. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  252. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  253. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  254. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  255. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  256. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  257. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  258. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  259. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  260. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  261. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  262. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  263. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  264. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  265. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  266. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  267. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  268. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  269. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  270. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  271. package/lib/commonjs/chat.js +0 -37
  272. package/lib/commonjs/chat.js.map +0 -1
  273. package/lib/module/chat.js +0 -33
  274. package/lib/module/chat.js.map +0 -1
  275. package/lib/typescript/chat.d.ts +0 -10
  276. package/lib/typescript/chat.d.ts.map +0 -1
  277. package/src/chat.ts +0 -44
  278. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  279. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  280. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  281. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  282. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  283. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  284. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  285. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
@@ -40,14 +40,17 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_335M: return "335M";
  case LLM_TYPE_410M: return "410M";
  case LLM_TYPE_450M: return "450M";
+ case LLM_TYPE_475M: return "475M";
  case LLM_TYPE_770M: return "770M";
  case LLM_TYPE_780M: return "780M";
  case LLM_TYPE_0_5B: return "0.5B";
+ case LLM_TYPE_0_6B: return "0.6B";
  case LLM_TYPE_1B: return "1B";
  case LLM_TYPE_1_3B: return "1.3B";
  case LLM_TYPE_1_4B: return "1.4B";
  case LLM_TYPE_1_5B: return "1.5B";
  case LLM_TYPE_1_6B: return "1.6B";
+ case LLM_TYPE_1_7B: return "1.7B";
  case LLM_TYPE_1_8B: return "1.8B";
  case LLM_TYPE_2B: return "2B";
  case LLM_TYPE_2_8B: return "2.8B";
@@ -66,6 +69,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_15B: return "15B";
  case LLM_TYPE_16B: return "16B";
  case LLM_TYPE_20B: return "20B";
+ case LLM_TYPE_27B: return "27B";
  case LLM_TYPE_30B: return "30B";
  case LLM_TYPE_32B: return "32B";
  case LLM_TYPE_34B: return "34B";
@@ -74,7 +78,9 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_65B: return "65B";
  case LLM_TYPE_70B: return "70B";
  case LLM_TYPE_236B: return "236B";
+ case LLM_TYPE_290B: return "290B";
  case LLM_TYPE_314B: return "314B";
+ case LLM_TYPE_405B: return "405B";
  case LLM_TYPE_671B: return "671B";
  case LLM_TYPE_SMALL: return "0.1B";
  case LLM_TYPE_MEDIUM: return "0.4B";
@@ -88,10 +94,10 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_16x3_8B: return "16x3.8B";
  case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
  case LLM_TYPE_57B_A14B: return "57B.A14B";
- case LLM_TYPE_27B: return "27B";
- case LLM_TYPE_290B: return "290B";
  case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
  case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
+ case LLM_TYPE_30B_A3B: return "30B.A3B";
+ case LLM_TYPE_235B_A22B: return "235B.A22B";
  default: return "?B";
  }
  }
@@ -111,6 +117,10 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
  { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
  };

+ std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
+ return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
+ }
+
  static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
  for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
  if (kv.second == name) {
@@ -293,6 +303,10 @@ static buft_list_t make_cpu_buft_list(const std::vector<lm_ggml_backend_dev_t> &
  // add extra buffer types, only if no GPU device is present
  // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
  auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (cpu_dev == nullptr) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
+
  auto * cpu_reg = lm_ggml_backend_dev_backend_reg(cpu_dev);
  auto lm_ggml_backend_dev_get_extra_bufts_fn = (lm_ggml_backend_dev_get_extra_bufts_t)
  lm_ggml_backend_reg_get_proc_address(cpu_reg, "lm_ggml_backend_dev_get_extra_bufts");
@@ -449,11 +463,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  LM_GGML_ASSERT(hparams.n_expert_used == 0);
  }

- // zero-out the array hparams
  std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
  std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
  std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);

+ std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
+
+ std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
+
  ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
  ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);

@@ -557,9 +574,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
- hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
- hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
- hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
+
+ hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
+ hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
+ hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full

  switch (hparams.n_expert) {
  case 16: type = LLM_TYPE_17B_16E; break;
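The Llama 4 hunk above swaps the old `n_swa_pattern` / `n_attn_chunk` fields for `swa_type = LLAMA_SWA_TYPE_CHUNKED`, `n_swa = 8192` and `set_swa_pattern(4)`. A minimal sketch of the per-layer layout that a "3 chunked - 1 full" pattern implies (assuming `set_swa_pattern(n)` marks all but every n-th layer as windowed; the real helper lives in llama-hparams.cpp and may differ in detail):

```cpp
// Sketch only: the "3 chunked - 1 full" layout implied by set_swa_pattern(4).
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_pattern = 4;
    for (uint32_t il = 0; il < 8; ++il) {
        // layers 0,1,2 use the chunked window, layer 3 is full attention, then repeat
        const bool is_swa = (il % n_pattern) < (n_pattern - 1);
        std::printf("layer %u: %s\n", il, is_swa ? "chunked window (n_swa = 8192)" : "full attention");
    }
    return 0;
}
```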
@@ -577,6 +595,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  switch (hparams.n_layer) {
  case 32: type = LLM_TYPE_7B; break;
  case 80: type = LLM_TYPE_70B; break;
+ case 162: type = LLM_TYPE_405B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
@@ -695,13 +714,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }
  } break;
  case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);

  if (hparams.n_layer == 12 && hparams.n_embd == 768) {
- type = LLM_TYPE_137M;
+ if (arch == LLM_ARCH_NOMIC_BERT) {
+ type = LLM_TYPE_137M;
+ } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
+ type = LLM_TYPE_475M;
+ }
  }
  } break;
  case LLM_ARCH_BLOOM:
@@ -762,6 +787,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  // fall through
  case LLM_ARCH_QWEN2:
  {
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  switch (hparams.n_layer) {
  case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
@@ -791,6 +817,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  switch (hparams.n_layer) {
+ case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
+ case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
+ case 40: type = LLM_TYPE_14B; break;
+ case 64: type = LLM_TYPE_32B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
@@ -800,6 +830,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {

  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_30B_A3B; break;
+ case 94: type = LLM_TYPE_235B_A22B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
@@ -824,22 +856,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }

- // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
- if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
- // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
- hparams.n_swa = 2047;
- } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
- // default value for Phi-3-mini-128k-instruct
- // note: this seems incorrect because the window is bigger than the train context?
- hparams.n_swa = 262144;
- } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
- // default value for Phi-3-medium-128k-instruct
- // note: this seems incorrect because the window is equal to the train context?
- hparams.n_swa = 131072;
- }
- bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
- if (!found_swa && hparams.n_swa == 0) {
- throw std::runtime_error("invalid value for sliding_window");
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+ if (found_swa && hparams.n_swa > 0) {
+ LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
+
+ // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+ hparams.n_swa = 0;
+ hparams.set_swa_pattern(1);
  }
  } break;
  case LLM_ARCH_PHIMOE:
@@ -909,8 +936,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_GEMMA2:
  {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.n_swa = 4096; // default value of gemma 2
- hparams.n_swa_pattern = 2;
+ hparams.set_swa_pattern(2);
  hparams.attn_soft_cap = true;

  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -927,7 +955,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_GEMMA3:
  {
- hparams.n_swa_pattern = 6;
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(6);

  hparams.rope_freq_base_train_swa = 10000.0f;
  hparams.rope_freq_scale_train_swa = 1.0f;
@@ -1011,7 +1040,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_COHERE2:
  {
- hparams.n_swa_pattern = 4;
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(4);

  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -1156,6 +1186,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  }
  ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
@@ -1205,6 +1237,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GLM4:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 40: type = LLM_TYPE_9B; break;
+ case 61: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_BITNET:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1350,6 +1391,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  // Add additional layer/vocab/etc checks here for other model sizes
  default: type = LLM_TYPE_UNKNOWN;
  }
+
+ // For Granite MoE Shared
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  } break;
  case LLM_ARCH_CHAMELEON:
  {
@@ -1453,6 +1497,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }

  lm_ggml_backend_dev_t cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (cpu_dev == nullptr) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
  const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
  const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
  auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
@@ -1620,8 +1667,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
  std::regex pattern(overrides->pattern);
  if (std::regex_search(tensor_name, pattern)) {
- LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), lm_ggml_backend_buft_name(overrides->buft));
  buft = overrides->buft;
+ LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
+ tensor_name.c_str(),
+ lm_ggml_nbytes(t_meta) / 1024 / 1024, lm_ggml_type_name(t_meta->type),
+ lm_ggml_backend_buft_name(buft));
  break;
  }
  }
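The override loop above matches each tensor name against the user-supplied patterns in `ml.tensor_buft_overrides` and now also logs the tensor's size and type. A standalone sketch of that matching step, with a made-up pattern and tensor name purely for illustration:

```cpp
// Sketch of the regex matching used for tensor buffer-type overrides.
// The pattern and tensor name are illustrative, not taken from the package.
#include <cstdio>
#include <regex>
#include <string>

int main() {
    const std::string tensor_name = "blk.10.ffn_down_exps.weight";
    const std::regex  pattern("blk\\.[0-9]+\\.ffn_.*_exps");

    if (std::regex_search(tensor_name, pattern)) {
        std::printf("override applies to %s\n", tensor_name.c_str());
    }
    return 0;
}
```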
@@ -1638,6 +1688,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  auto * buft_dev = lm_ggml_backend_buft_get_device(buft);
  if (ml.use_mmap && buft_dev && buft == lm_ggml_backend_dev_host_buffer_type(buft_dev)) {
  auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (!cpu_dev) {
+ throw std::runtime_error("no CPU backend found");
+ }
  buft = lm_ggml_backend_dev_buffer_type(cpu_dev);
  }

@@ -1724,6 +1777,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+ // For Granite MoE Shared
+ if (hparams.n_ff_shexp > 0) {
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+ }
  }
  }
  } break;
@@ -1819,7 +1879,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ if (n_ff > 0) {
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ }

  if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
@@ -1829,9 +1891,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  }

- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ if (n_ff > 0) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }

  // optional MLP bias
  layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
@@ -2046,6 +2110,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_BERT:
  case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
@@ -2079,20 +2144,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  }

+ if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+ }
+
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

  layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);

- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-
- if (arch == LLM_ARCH_BERT) {
+ if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
  layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  } else {
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+
+ if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+ } else {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ }
  }

  layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
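In the BERT/NOMIC_BERT branch above, a layer only gets expert tensors (`ffn_*_exps`, `ffn_gate_inp`) when `i % moe_every_n_layers == 1`; the remaining layers keep the dense FFN. A small sketch of which layers that selects when `moe_every_n_layers == 2`, assuming a hypothetical 12-layer model in the style of the 475M nomic variant:

```cpp
// Sketch: layer selection for moe_every_n_layers == 2 in an assumed 12-layer model.
#include <cstdio>

int main() {
    const int n_layer = 12;
    const int moe_every_n_layers = 2;
    for (int i = 0; i < n_layer; ++i) {
        // same condition as in the diff above: odd layers become MoE
        const bool is_moe = moe_every_n_layers > 0 && i % moe_every_n_layers == 1;
        std::printf("layer %2d: %s\n", i, is_moe ? "MoE experts" : "dense FFN");
    }
    return 0;
}
```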
@@ -3196,8 +3272,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  {
  const bool is_lite = (hparams.n_layer == 27);

+ const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
+
+ // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+ const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
+ const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
+
  const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;

  const int64_t q_lora_rank = hparams.n_lora_q;
  const int64_t kv_lora_rank = hparams.n_lora_kv;
@@ -3223,14 +3305,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  if (!is_lite) {
  layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
- layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
+ layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
  } else {
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
  }

- layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
+
+ // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
+ if (is_mla) {
+ layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
+ layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
+ } else {
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);

  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

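With the MLA metadata present, the head sizes above come from the new `*_mla` keys and the non-rotary part of each K head is `n_embd_head_k_mla - n_rot`. A worked example with illustrative DeepSeek-V2-style numbers (the concrete values here are assumptions for the example, not read from the package):

```cpp
// Illustrative numbers only: how the MLA head sizes decompose.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_rot             = 64;   // rotary (RoPE) part of each K head (assumed)
    const int64_t n_embd_head_k_mla = 192;  // from LLM_KV_ATTENTION_KEY_LENGTH_MLA (assumed)
    const int64_t n_embd_head_v_mla = 128;  // from LLM_KV_ATTENTION_VALUE_LENGTH_MLA (assumed)

    const int64_t n_embd_head_qk_rope = n_rot;
    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128

    std::printf("qk_nope = %lld, qk_rope = %lld, v = %lld\n",
                (long long) n_embd_head_qk_nope,
                (long long) n_embd_head_qk_rope,
                (long long) n_embd_head_v_mla);
    return 0;
}
```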
@@ -3449,7 +3539,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  // output
  output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }

  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];
@@ -3476,6 +3570,45 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  }
  } break;
+ case LLM_ARCH_GLM4:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+ if (layer.wqkv == nullptr) {
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
+
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
  case LLM_ARCH_NEMOTRON:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4015,6 +4148,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  if (!dev) {
  // FIXME: workaround for CPU backend buft having a NULL device
  dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (!dev) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
  }
  lm_ggml_backend_dev_props props;
  lm_ggml_backend_dev_get_props(dev, &props);
@@ -4144,7 +4280,7 @@ uint64_t llama_model::n_elements() const {
  }

  void llama_model::print_info() const {
- const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
+ const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);

  auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
  bool is_var = false;
@@ -4187,7 +4323,7 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
  LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
  LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
- LLAMA_LOG_INFO("%s: n_swa_pattern = %u\n", __func__, hparams.n_swa_pattern);
+ LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
  LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
  LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
@@ -4205,7 +4341,7 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
  LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
  LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
- LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
+ LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
@@ -4242,6 +4378,8 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
  LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+ LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
+ LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
@@ -4259,10 +4397,13 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  }

- if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
+ if (arch == LLM_ARCH_MINICPM ||
+ arch == LLM_ARCH_GRANITE ||
+ arch == LLM_ARCH_GRANITE_MOE) {
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  }

  if (arch == LLM_ARCH_BAILINGMOE) {
@@ -4350,6 +4491,29 @@ const lm_ggml_tensor * llama_model::get_tensor(const char * name) const {
  return it->second;
  }

+ float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
+ return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
+ }
+
+ float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
+ return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+ }
+
+ lm_ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
+ const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+
+ // choose long/short freq factors based on the context size
+ if (layers[il].rope_freqs != nullptr) {
+ return layers[il].rope_freqs;
+ }
+
+ if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+ return layers[il].rope_long;
+ }
+
+ return layers[il].rope_short;
+ }
+
  struct llm_build_llama : public llm_graph_context {
  llm_build_llama(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
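The new `llama_model::get_rope_factors` above replaces the KV-cache callback previously used by the graph builders (see the `model.get_rope_factors(cparams, il)` call later in this diff). A sketch of its long/short selection rule with illustrative context sizes (all three numbers below are assumptions for the example):

```cpp
// Sketch of the long/short RoPE-factor choice in get_rope_factors, with made-up sizes.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_ctx           = 32768; // total context (assumed)
    const uint32_t n_seq_max       = 4;     // parallel sequences (assumed)
    const uint32_t n_ctx_orig_yarn = 4096;  // original training context (assumed)

    const uint32_t n_ctx_per_seq = n_ctx / n_seq_max; // 8192
    std::printf("n_ctx_per_seq = %u -> use %s factors\n",
                n_ctx_per_seq, n_ctx_per_seq > n_ctx_orig_yarn ? "rope_long" : "rope_short");
    return 0;
}
```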
@@ -4365,22 +4529,13 @@ struct llm_build_llama : public llm_graph_context {
4365
4529
  // inp_pos - contains the positions
4366
4530
  lm_ggml_tensor * inp_pos = build_inp_pos();
4367
4531
 
4368
- // temperature tuning
4369
- lm_ggml_tensor * inp_attn_scale = nullptr;
4370
- if (arch == LLM_ARCH_LLAMA4) {
4371
- inp_attn_scale = build_inp_attn_scale();
4372
- }
4373
-
4374
4532
  auto * inp_attn = build_attn_inp_kv_unified();
4375
4533
 
4376
4534
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
4535
+
4377
4536
  for (int il = 0; il < n_layer; ++il) {
4378
4537
  lm_ggml_tensor * inpSA = inpL;
4379
4538
 
4380
- bool use_rope = arch == LLM_ARCH_LLAMA4
4381
- ? (il + 1) % hparams.n_no_rope_layer_step != 0
4382
- : true;
4383
-
4384
4539
  // norm
4385
4540
  cur = build_norm(inpL,
4386
4541
  model.layers[il].attn_norm, NULL,
@@ -4390,7 +4545,7 @@ struct llm_build_llama : public llm_graph_context {
4390
4545
  // self-attention
4391
4546
  {
4392
4547
  // rope freq factors for llama3; may return nullptr for llama2 and other models
4393
- lm_ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
4548
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
4394
4549
 
4395
4550
  // compute Q and K and RoPE them
4396
4551
  lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4418,37 +4573,25 @@ struct llm_build_llama : public llm_graph_context {
4418
4573
  Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
4419
4574
  Vcur = lm_ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
4420
4575
 
4421
- if (use_rope) {
4422
- Qcur = lm_ggml_rope_ext(
4423
- ctx0, Qcur, inp_pos, rope_factors,
4424
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4425
- ext_factor, attn_factor, beta_fast, beta_slow
4426
- );
4576
+ Qcur = lm_ggml_rope_ext(
4577
+ ctx0, Qcur, inp_pos, rope_factors,
4578
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4579
+ ext_factor, attn_factor, beta_fast, beta_slow
4580
+ );
4427
4581
 
4428
- Kcur = lm_ggml_rope_ext(
4429
- ctx0, Kcur, inp_pos, rope_factors,
4430
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4431
- ext_factor, attn_factor, beta_fast, beta_slow
4432
- );
4433
- } else if (inp_attn_scale) {
4434
- Qcur = lm_ggml_mul(ctx0, Qcur, inp_attn_scale);
4435
- }
4582
+ Kcur = lm_ggml_rope_ext(
4583
+ ctx0, Kcur, inp_pos, rope_factors,
4584
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4585
+ ext_factor, attn_factor, beta_fast, beta_slow
4586
+ );
4436
4587
 
4437
4588
  cb(Qcur, "Qcur", il);
4438
4589
  cb(Kcur, "Kcur", il);
4439
4590
  cb(Vcur, "Vcur", il);
4440
4591
 
4441
- if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
4442
- // Llama4TextL2Norm
4443
- Qcur = lm_ggml_rms_norm(ctx0, Qcur, 1e-6);
4444
- Kcur = lm_ggml_rms_norm(ctx0, Kcur, 1e-6);
4445
- cb(Qcur, "Qcur_normed", il);
4446
- cb(Kcur, "Kcur_normed", il);
4447
- }
4448
-
4449
4592
  cur = build_attn(inp_attn, gf,
4450
4593
  model.layers[il].wo, model.layers[il].bo,
4451
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
4594
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
4452
4595
  cb(cur, "attn_out", il);
4453
4596
  }
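Throughout this diff every build_attn call grows by one argument: a new tensor slot between the existing kq_b position and kq_scale, passed as nullptr everywhere except the DeepSeek-2 MLA branch further down, which hands in model.layers[il].wv_b so the attention result can be expanded back out of the latent space. A hedged sketch of that optional-trailing-tensor pattern with made-up names (this is not the real llm_graph_context::build_attn signature):

    #include <cstdio>

    struct tensor { const char * name; };

    // hypothetical wrapper: v_decomp stays nullptr for ordinary attention and is only set when
    // the value vectors have to be expanded from a low-rank latent (the MLA path below)
    static void attn(const tensor * q, const tensor * k, const tensor * v,
                     const tensor * kq_b, const tensor * v_decomp, float kq_scale) {
        (void) q; (void) k; (void) v; (void) kq_b;
        std::printf("attn(scale=%.3f)%s\n", kq_scale,
                    v_decomp ? " with latent decompression through a wv_b-style tensor" : "");
    }

    int main() {
        tensor q{"Qcur"}, k{"Kcur"}, v{"Vcur"}, wv_b{"wv_b"};
        attn(&q, &k, &v, nullptr, nullptr, 0.125f); // the common case in this diff
        attn(&q, &k, &v, nullptr, &wv_b,  0.088f);  // the DeepSeek-2 MLA case
    }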
4454
4597
 
@@ -4459,11 +4602,6 @@ struct llm_build_llama : public llm_graph_context {
4459
4602
  inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
4460
4603
  }
4461
4604
 
4462
- // For Granite architecture
4463
- if (hparams.f_residual_scale) {
4464
- cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
4465
- }
4466
-
4467
4605
  lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
4468
4606
  cb(ffn_inp, "ffn_inp", il);
4469
4607
 
@@ -4482,38 +4620,6 @@ struct llm_build_llama : public llm_graph_context {
4482
4620
  NULL,
4483
4621
  LLM_FFN_SILU, LLM_FFN_PAR, il);
4484
4622
  cb(cur, "ffn_out", il);
4485
-
4486
- } else if (arch == LLM_ARCH_LLAMA4) {
4487
- // llama4 MoE
4488
- lm_ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
4489
- model.layers[il].ffn_norm, NULL,
4490
- LLM_NORM_RMS, il);
4491
- cb(cur, "ffn_norm", il);
4492
-
4493
- lm_ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
4494
- model.layers[il].ffn_gate_inp,
4495
- model.layers[il].ffn_up_exps,
4496
- model.layers[il].ffn_gate_exps,
4497
- model.layers[il].ffn_down_exps,
4498
- nullptr,
4499
- n_expert, n_expert_used,
4500
- LLM_FFN_SILU, false,
4501
- false, 0.0,
4502
- LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
4503
- il);
4504
-
4505
- // Shared experts
4506
- lm_ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
4507
- model.layers[il].ffn_up_shexp, NULL, NULL,
4508
- model.layers[il].ffn_gate_shexp, NULL, NULL,
4509
- model.layers[il].ffn_down_shexp, NULL, NULL,
4510
- NULL,
4511
- LLM_FFN_SILU, LLM_FFN_PAR, il);
4512
- cb(shexp_out, "ffn_moe_shexp", il);
4513
-
4514
- cur = lm_ggml_add(ctx0, moe_out, shexp_out);
4515
- cb(cur, "ffn_moe_out_merged", il);
4516
-
4517
4623
  } else {
4518
4624
  // MoE branch
4519
4625
  cur = build_norm(ffn_inp,
@@ -4535,11 +4641,6 @@ struct llm_build_llama : public llm_graph_context {
4535
4641
  cb(cur, "ffn_moe_out", il);
4536
4642
  }
4537
4643
 
4538
- // For Granite architecture
4539
- if (hparams.f_residual_scale) {
4540
- cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
4541
- }
4542
-
4543
4644
  cur = lm_ggml_add(ctx0, cur, ffn_inp);
4544
4645
  cb(cur, "ffn_out", il);
4545
4646
 
@@ -4562,11 +4663,6 @@ struct llm_build_llama : public llm_graph_context {
4562
4663
  // lm_head
4563
4664
  cur = build_lora_mm(model.output, cur);
4564
4665
 
4565
- // For Granite architecture
4566
- if (hparams.f_logit_scale) {
4567
- cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
4568
- }
4569
-
4570
4666
  cb(cur, "result_output", -1);
4571
4667
  res->t_logits = cur;
4572
4668
 
@@ -4574,8 +4670,8 @@ struct llm_build_llama : public llm_graph_context {
4574
4670
  }
4575
4671
  };
4576
4672
 
4577
- struct llm_build_deci : public llm_graph_context {
4578
- llm_build_deci(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
4673
+ struct llm_build_llama_iswa : public llm_graph_context {
4674
+ llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
4579
4675
  const int64_t n_embd_head = hparams.n_embd_head_v;
4580
4676
 
4581
4677
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -4589,33 +4685,29 @@ struct llm_build_deci : public llm_graph_context {
4589
4685
  // inp_pos - contains the positions
4590
4686
  lm_ggml_tensor * inp_pos = build_inp_pos();
4591
4687
 
4592
- auto * inp_attn = build_attn_inp_kv_unified();
4688
+ // temperature tuning
4689
+ lm_ggml_tensor * inp_attn_scale = nullptr;
4690
+ inp_attn_scale = build_inp_attn_scale();
4691
+
4692
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
4593
4693
 
4594
4694
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
4695
+
4595
4696
  for (int il = 0; il < n_layer; ++il) {
4596
4697
  lm_ggml_tensor * inpSA = inpL;
4597
- const int64_t n_head_kv = hparams.n_head_kv(il);
4598
- const int64_t n_head = hparams.n_head(il);
4599
4698
 
4600
- if (n_head == 0) {
4601
- // attention-free layer of Llama-3_1-Nemotron-51B
4602
- cur = inpL;
4603
- } else {
4604
- // norm
4605
- cur = build_norm(inpL,
4606
- model.layers[il].attn_norm, NULL,
4607
- LLM_NORM_RMS, il);
4608
- cb(cur, "attn_norm", il);
4609
- }
4699
+ const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
4610
4700
 
4611
- if (n_head > 0 && n_head_kv == 0) {
4612
- // "linear attention" of Llama-3_1-Nemotron-51B
4613
- cur = build_lora_mm(model.layers[il].wo, cur);
4614
- cb(cur, "wo", il);
4615
- } else if (n_head > 0) {
4616
- // self-attention
4701
+ // norm
4702
+ cur = build_norm(inpL,
4703
+ model.layers[il].attn_norm, NULL,
4704
+ LLM_NORM_RMS, il);
4705
+ cb(cur, "attn_norm", il);
4706
+
4707
+ // self-attention
4708
+ {
4617
4709
  // rope freq factors for llama3; may return nullptr for llama2 and other models
4618
- lm_ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
4710
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
4619
4711
 
4620
4712
  // compute Q and K and RoPE them
4621
4713
  lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4643,25 +4735,38 @@ struct llm_build_deci : public llm_graph_context {
4643
4735
  Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
4644
4736
  Vcur = lm_ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
4645
4737
 
4646
- Qcur = lm_ggml_rope_ext(
4647
- ctx0, Qcur, inp_pos, rope_factors,
4648
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4649
- ext_factor, attn_factor, beta_fast, beta_slow
4650
- );
4738
+ if (use_rope) {
4739
+ Qcur = lm_ggml_rope_ext(
4740
+ ctx0, Qcur, inp_pos, rope_factors,
4741
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4742
+ ext_factor, attn_factor, beta_fast, beta_slow
4743
+ );
4651
4744
 
4652
- Kcur = lm_ggml_rope_ext(
4653
- ctx0, Kcur, inp_pos, rope_factors,
4654
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4655
- ext_factor, attn_factor, beta_fast, beta_slow
4656
- );
4745
+ Kcur = lm_ggml_rope_ext(
4746
+ ctx0, Kcur, inp_pos, rope_factors,
4747
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4748
+ ext_factor, attn_factor, beta_fast, beta_slow
4749
+ );
4750
+ } else if (inp_attn_scale) {
4751
+ Qcur = lm_ggml_mul(ctx0, Qcur, inp_attn_scale);
4752
+ }
4657
4753
 
4658
4754
  cb(Qcur, "Qcur", il);
4659
4755
  cb(Kcur, "Kcur", il);
4660
4756
  cb(Vcur, "Vcur", il);
4661
4757
 
4758
+ if (use_rope && hparams.use_kq_norm) {
4759
+ // Llama4TextL2Norm
4760
+ Qcur = lm_ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
4761
+ Kcur = lm_ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
4762
+ cb(Qcur, "Qcur_normed", il);
4763
+ cb(Kcur, "Kcur_normed", il);
4764
+ }
4765
+
4662
4766
  cur = build_attn(inp_attn, gf,
4663
4767
  model.layers[il].wo, model.layers[il].bo,
4664
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
4768
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
4769
+ cb(cur, "attn_out", il);
4665
4770
  }
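In the new llm_build_llama_iswa graph above, a layer applies RoPE only when (il + 1) % hparams.n_no_rope_layer_step != 0; on the remaining layers Q is instead multiplied by the temperature-tuning input, and when use_kq_norm is set the rotated Q/K additionally pass through an RMS norm. A tiny sketch of the layer-selection rule, assuming an illustrative step of 4:

    #include <cstdio>

    int main() {
        const int n_layer              = 12;
        const int n_no_rope_layer_step = 4; // assumed value, for illustration only

        for (int il = 0; il < n_layer; ++il) {
            // mirrors: use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0
            const bool use_rope = (il + 1) % n_no_rope_layer_step != 0;
            std::printf("layer %2d: %s\n", il, use_rope ? "RoPE" : "NoPE (scale Q by inp_attn_scale)");
        }
        // with a step of 4, layers 3, 7 and 11 come out as NoPE layers
    }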
4666
4771
 
4667
4772
  if (il == n_layer - 1) {
@@ -4671,19 +4776,10 @@ struct llm_build_deci : public llm_graph_context {
4671
4776
  inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
4672
4777
  }
4673
4778
 
4674
- // For Granite architecture
4675
- if (hparams.f_residual_scale) {
4676
- cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
4677
- }
4678
-
4679
- // modified to support attention-free layer of Llama-3_1-Nemotron-51B
4680
- lm_ggml_tensor * ffn_inp = cur;
4681
- if (n_head > 0) {
4682
- ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
4683
- cb(ffn_inp, "ffn_inp", il);
4684
- }
4779
+ lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
4780
+ cb(ffn_inp, "ffn_inp", il);
4685
4781
 
4686
- // feed-forward network
4782
+ // feed-forward network (non-MoE)
4687
4783
  if (model.layers[il].ffn_gate_inp == nullptr) {
4688
4784
  cur = build_norm(ffn_inp,
4689
4785
  model.layers[il].ffn_norm, NULL,
@@ -4697,12 +4793,36 @@ struct llm_build_deci : public llm_graph_context {
4697
4793
  NULL,
4698
4794
  LLM_FFN_SILU, LLM_FFN_PAR, il);
4699
4795
  cb(cur, "ffn_out", il);
4700
- }
4701
-
4702
- // For Granite architecture
4703
- if (hparams.f_residual_scale) {
4704
- cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
4705
- }
4796
+ } else {
4797
+ lm_ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
4798
+ model.layers[il].ffn_norm, NULL,
4799
+ LLM_NORM_RMS, il);
4800
+ cb(cur, "ffn_norm", il);
4801
+
4802
+ lm_ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
4803
+ model.layers[il].ffn_gate_inp,
4804
+ model.layers[il].ffn_up_exps,
4805
+ model.layers[il].ffn_gate_exps,
4806
+ model.layers[il].ffn_down_exps,
4807
+ nullptr,
4808
+ n_expert, n_expert_used,
4809
+ LLM_FFN_SILU, false,
4810
+ false, 0.0,
4811
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
4812
+ il);
4813
+
4814
+ // Shared experts
4815
+ lm_ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
4816
+ model.layers[il].ffn_up_shexp, NULL, NULL,
4817
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
4818
+ model.layers[il].ffn_down_shexp, NULL, NULL,
4819
+ NULL,
4820
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
4821
+ cb(shexp_out, "ffn_moe_shexp", il);
4822
+
4823
+ cur = lm_ggml_add(ctx0, moe_out, shexp_out);
4824
+ cb(cur, "ffn_moe_out_merged", il);
4825
+ }
4706
4826
 
4707
4827
  cur = lm_ggml_add(ctx0, cur, ffn_inp);
4708
4828
  cb(cur, "ffn_out", il);
@@ -4726,11 +4846,161 @@ struct llm_build_deci : public llm_graph_context {
4726
4846
  // lm_head
4727
4847
  cur = build_lora_mm(model.output, cur);
4728
4848
 
4729
- // For Granite architecture
4730
- if (hparams.f_logit_scale) {
4731
- cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
4849
+ cb(cur, "result_output", -1);
4850
+ res->t_logits = cur;
4851
+
4852
+ lm_ggml_build_forward_expand(gf, cur);
4853
+ }
4854
+ };
4855
+
4856
+ struct llm_build_deci : public llm_graph_context {
4857
+ llm_build_deci(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
4858
+ const int64_t n_embd_head = hparams.n_embd_head_v;
4859
+
4860
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
4861
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
4862
+
4863
+ lm_ggml_tensor * cur;
4864
+ lm_ggml_tensor * inpL;
4865
+
4866
+ inpL = build_inp_embd(model.tok_embd);
4867
+
4868
+ // inp_pos - contains the positions
4869
+ lm_ggml_tensor * inp_pos = build_inp_pos();
4870
+
4871
+ auto * inp_attn = build_attn_inp_kv_unified();
4872
+
4873
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
4874
+ for (int il = 0; il < n_layer; ++il) {
4875
+ lm_ggml_tensor * inpSA = inpL;
4876
+ const int64_t n_head_kv = hparams.n_head_kv(il);
4877
+ const int64_t n_head = hparams.n_head(il);
4878
+ const int64_t n_ff = hparams.n_ff(il);
4879
+
4880
+ if (n_head == 0) {
4881
+ // attention-free layer of Llama-3_1-Nemotron-51B
4882
+ cur = inpL;
4883
+ } else {
4884
+ // norm
4885
+ cur = build_norm(inpL,
4886
+ model.layers[il].attn_norm, NULL,
4887
+ LLM_NORM_RMS, il);
4888
+ cb(cur, "attn_norm", il);
4889
+ }
4890
+
4891
+ if (n_head > 0 && n_head_kv == 0) {
4892
+ // "linear attention" of Llama-3_1-Nemotron-51B
4893
+ cur = build_lora_mm(model.layers[il].wo, cur);
4894
+ cb(cur, "wo", il);
4895
+ } else if (n_head > 0) {
4896
+ // self-attention
4897
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
4898
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
4899
+
4900
+ // compute Q and K and RoPE them
4901
+ lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
4902
+ cb(Qcur, "Qcur", il);
4903
+ if (model.layers[il].bq) {
4904
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
4905
+ cb(Qcur, "Qcur", il);
4906
+ }
4907
+
4908
+ lm_ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
4909
+ cb(Kcur, "Kcur", il);
4910
+ if (model.layers[il].bk) {
4911
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
4912
+ cb(Kcur, "Kcur", il);
4913
+ }
4914
+
4915
+ lm_ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
4916
+ cb(Vcur, "Vcur", il);
4917
+ if (model.layers[il].bv) {
4918
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
4919
+ cb(Vcur, "Vcur", il);
4920
+ }
4921
+
4922
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
4923
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
4924
+ Vcur = lm_ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
4925
+
4926
+ Qcur = lm_ggml_rope_ext(
4927
+ ctx0, Qcur, inp_pos, rope_factors,
4928
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4929
+ ext_factor, attn_factor, beta_fast, beta_slow
4930
+ );
4931
+
4932
+ Kcur = lm_ggml_rope_ext(
4933
+ ctx0, Kcur, inp_pos, rope_factors,
4934
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4935
+ ext_factor, attn_factor, beta_fast, beta_slow
4936
+ );
4937
+
4938
+ cb(Qcur, "Qcur", il);
4939
+ cb(Kcur, "Kcur", il);
4940
+ cb(Vcur, "Vcur", il);
4941
+
4942
+ cur = build_attn(inp_attn, gf,
4943
+ model.layers[il].wo, model.layers[il].bo,
4944
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
4945
+ }
4946
+
4947
+ if (il == n_layer - 1) {
4948
+ // skip computing output for unused tokens
4949
+ lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
4950
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
4951
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
4952
+ }
4953
+
4954
+ // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
4955
+ if (n_ff == 0) {
4956
+ continue;
4957
+ }
4958
+
4959
+ // modified to support attention-free layer of Llama-3_1-Nemotron-51B
4960
+ lm_ggml_tensor * ffn_inp = cur;
4961
+ if (n_head > 0) {
4962
+ ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
4963
+ cb(ffn_inp, "ffn_inp", il);
4964
+ }
4965
+
4966
+ // feed-forward network
4967
+ if (model.layers[il].ffn_gate_inp == nullptr) {
4968
+ cur = build_norm(ffn_inp,
4969
+ model.layers[il].ffn_norm, NULL,
4970
+ LLM_NORM_RMS, il);
4971
+ cb(cur, "ffn_norm", il);
4972
+
4973
+ cur = build_ffn(cur,
4974
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
4975
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
4976
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
4977
+ NULL,
4978
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
4979
+ cb(cur, "ffn_out", il);
4980
+ }
4981
+
4982
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
4983
+ cb(cur, "ffn_out", il);
4984
+
4985
+ cur = build_cvec(cur, il);
4986
+ cb(cur, "l_out", il);
4987
+
4988
+ // input for next layer
4989
+ inpL = cur;
4732
4990
  }
4733
4991
 
4992
+ cur = inpL;
4993
+
4994
+ cur = build_norm(cur,
4995
+ model.output_norm, NULL,
4996
+ LLM_NORM_RMS, -1);
4997
+
4998
+ cb(cur, "result_norm", -1);
4999
+ res->t_embd = cur;
5000
+
5001
+ // lm_head
5002
+ cur = build_lora_mm(model.output, cur);
5003
+
4734
5004
  cb(cur, "result_output", -1);
4735
5005
  res->t_logits = cur;
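The rewritten llm_build_deci loop above special-cases NAS-pruned Nemotron layers: n_head == 0 means the layer has no attention at all, n_head > 0 with n_head_kv == 0 reduces attention to the wo projection, and n_ff == 0 skips the feed-forward block via continue. A compact sketch of that per-layer dispatch, with made-up layer configurations:

    #include <cstdio>
    #include <vector>

    struct layer_cfg { int n_head, n_head_kv, n_ff; };

    int main() {
        // made-up per-layer configs in the spirit of the Llama-3_1-Nemotron variants
        const std::vector<layer_cfg> layers = {
            {32, 8, 14336},  // regular attention + FFN
            { 0, 0, 14336},  // attention-free layer
            {32, 0, 14336},  // "linear attention": only the wo projection
            {32, 8,     0},  // FFN-free layer
        };

        for (size_t il = 0; il < layers.size(); ++il) {
            const layer_cfg & l = layers[il];
            const char * attn = l.n_head == 0    ? "skip attention"
                              : l.n_head_kv == 0 ? "linear attention (wo only)"
                                                 : "full self-attention";
            const char * ffn  = l.n_ff == 0 ? "skip FFN" : "FFN";
            std::printf("layer %zu: %s, %s\n", il, attn, ffn);
        }
    }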
4736
5006
 
@@ -4803,7 +5073,7 @@ struct llm_build_baichuan : public llm_graph_context {
4803
5073
 
4804
5074
  cur = build_attn(inp_attn, gf,
4805
5075
  model.layers[il].wo, NULL,
4806
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5076
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
4807
5077
  }
4808
5078
 
4809
5079
  if (il == n_layer - 1) {
@@ -4918,7 +5188,7 @@ struct llm_build_xverse : public llm_graph_context {
4918
5188
 
4919
5189
  cur = build_attn(inp_attn, gf,
4920
5190
  model.layers[il].wo, NULL,
4921
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5191
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
4922
5192
  }
4923
5193
 
4924
5194
  if (il == n_layer - 1) {
@@ -5043,7 +5313,7 @@ struct llm_build_falcon : public llm_graph_context {
5043
5313
 
5044
5314
  cur = build_attn(inp_attn, gf,
5045
5315
  model.layers[il].wo, NULL,
5046
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5316
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5047
5317
  }
5048
5318
 
5049
5319
  if (il == n_layer - 1) {
@@ -5173,7 +5443,7 @@ struct llm_build_grok : public llm_graph_context {
5173
5443
 
5174
5444
  cur = build_attn(inp_attn, gf,
5175
5445
  model.layers[il].wo, model.layers[il].bo,
5176
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
5446
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
5177
5447
  }
5178
5448
 
5179
5449
  if (il == n_layer - 1) {
@@ -5324,7 +5594,7 @@ struct llm_build_dbrx : public llm_graph_context {
5324
5594
 
5325
5595
  cur = build_attn(inp_attn, gf,
5326
5596
  model.layers[il].wo, NULL,
5327
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5597
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5328
5598
  }
5329
5599
 
5330
5600
  if (il == n_layer - 1) {
@@ -5438,7 +5708,7 @@ struct llm_build_starcoder : public llm_graph_context {
5438
5708
 
5439
5709
  cur = build_attn(inp_attn, gf,
5440
5710
  model.layers[il].wo, model.layers[il].bo,
5441
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5711
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5442
5712
  }
5443
5713
 
5444
5714
  if (il == n_layer - 1) {
@@ -5537,7 +5807,7 @@ struct llm_build_refact : public llm_graph_context {
5537
5807
 
5538
5808
  cur = build_attn(inp_attn, gf,
5539
5809
  model.layers[il].wo, NULL,
5540
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5810
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5541
5811
  }
5542
5812
 
5543
5813
  if (il == n_layer - 1) {
@@ -5664,6 +5934,11 @@ struct llm_build_bert : public llm_graph_context {
5664
5934
  cur = build_lora_mm(model.layers[il].wqkv, cur);
5665
5935
  cb(cur, "wqkv", il);
5666
5936
 
5937
+ if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
5938
+ cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
5939
+ cb(cur, "bqkv", il);
5940
+ }
5941
+
5667
5942
  Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5668
5943
  Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5669
5944
  Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -5691,7 +5966,7 @@ struct llm_build_bert : public llm_graph_context {
5691
5966
 
5692
5967
  cur = build_attn(inp_attn, gf,
5693
5968
  model.layers[il].wo, model.layers[il].bo,
5694
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5969
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5695
5970
  cb(cur, "kqv_out", il);
5696
5971
 
5697
5972
  if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
@@ -5716,13 +5991,29 @@ struct llm_build_bert : public llm_graph_context {
5716
5991
  cb(ffn_inp, "ffn_inp", il);
5717
5992
 
5718
5993
  // feed-forward network
5719
- if (model.arch == LLM_ARCH_BERT) {
5994
+ if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
5995
+ // MoE branch
5996
+ cur = build_moe_ffn(cur,
5997
+ model.layers[il].ffn_gate_inp,
5998
+ model.layers[il].ffn_up_exps,
5999
+ nullptr,
6000
+ model.layers[il].ffn_down_exps,
6001
+ nullptr,
6002
+ hparams.n_expert,
6003
+ hparams.n_expert_used,
6004
+ LLM_FFN_GELU,
6005
+ false, false,
6006
+ 0.0f,
6007
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
6008
+ cb(cur, "ffn_moe_out", il);
6009
+ } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
5720
6010
  cur = build_ffn(cur,
5721
6011
  model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
5722
6012
  NULL, NULL, NULL,
5723
6013
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
5724
6014
  NULL,
5725
6015
  LLM_FFN_GELU, LLM_FFN_SEQ, il);
6016
+ cb(cur, "ffn_out", il);
5726
6017
  } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
5727
6018
  cur = build_ffn(cur,
5728
6019
  model.layers[il].ffn_up, NULL, NULL,
@@ -5730,6 +6021,7 @@ struct llm_build_bert : public llm_graph_context {
5730
6021
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
5731
6022
  NULL,
5732
6023
  LLM_FFN_GELU, LLM_FFN_PAR, il);
6024
+ cb(cur, "ffn_out", il);
5733
6025
  } else {
5734
6026
  cur = build_ffn(cur,
5735
6027
  model.layers[il].ffn_up, NULL, NULL,
@@ -5737,8 +6029,8 @@ struct llm_build_bert : public llm_graph_context {
5737
6029
  model.layers[il].ffn_down, NULL, NULL,
5738
6030
  NULL,
5739
6031
  LLM_FFN_SILU, LLM_FFN_PAR, il);
6032
+ cb(cur, "ffn_out", il);
5740
6033
  }
5741
- cb(cur, "ffn_out", il);
5742
6034
 
5743
6035
  // attentions bypass the intermediate layer
5744
6036
  cur = lm_ggml_add(ctx0, cur, ffn_inp);
@@ -5808,7 +6100,7 @@ struct llm_build_bloom : public llm_graph_context {
5808
6100
 
5809
6101
  cur = build_attn(inp_attn, gf,
5810
6102
  model.layers[il].wo, model.layers[il].bo,
5811
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6103
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5812
6104
  }
5813
6105
 
5814
6106
  if (il == n_layer - 1) {
@@ -5949,7 +6241,7 @@ struct llm_build_mpt : public llm_graph_context {
5949
6241
 
5950
6242
  cur = build_attn(inp_attn, gf,
5951
6243
  model.layers[il].wo, model.layers[il].bo,
5952
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6244
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5953
6245
  }
5954
6246
 
5955
6247
  if (il == n_layer - 1) {
@@ -6095,7 +6387,7 @@ struct llm_build_stablelm : public llm_graph_context {
6095
6387
 
6096
6388
  cur = build_attn(inp_attn, gf,
6097
6389
  model.layers[il].wo, NULL,
6098
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6390
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6099
6391
  }
6100
6392
 
6101
6393
  if (il == n_layer - 1) {
@@ -6218,7 +6510,7 @@ struct llm_build_qwen : public llm_graph_context {
6218
6510
 
6219
6511
  cur = build_attn(inp_attn, gf,
6220
6512
  model.layers[il].wo, NULL,
6221
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6513
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6222
6514
  }
6223
6515
 
6224
6516
  if (il == n_layer - 1) {
@@ -6338,7 +6630,7 @@ struct llm_build_qwen2 : public llm_graph_context {
6338
6630
 
6339
6631
  cur = build_attn(inp_attn, gf,
6340
6632
  model.layers[il].wo, model.layers[il].bo,
6341
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6633
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6342
6634
  }
6343
6635
 
6344
6636
  if (il == n_layer - 1) {
@@ -6459,7 +6751,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
6459
6751
 
6460
6752
  cur = build_attn(inp_attn, gf,
6461
6753
  model.layers[il].wo, model.layers[il].bo,
6462
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6754
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6463
6755
  }
6464
6756
 
6465
6757
  if (il == n_layer - 1) {
@@ -6586,7 +6878,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
6586
6878
 
6587
6879
  cur = build_attn(inp_attn, gf,
6588
6880
  model.layers[il].wo, model.layers[il].bo,
6589
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6881
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6590
6882
  }
6591
6883
 
6592
6884
  if (il == n_layer - 1) {
@@ -6739,7 +7031,7 @@ struct llm_build_qwen3 : public llm_graph_context {
6739
7031
 
6740
7032
  cur = build_attn(inp_attn, gf,
6741
7033
  model.layers[il].wo, model.layers[il].bo,
6742
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7034
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6743
7035
  }
6744
7036
 
6745
7037
  if (il == n_layer - 1) {
@@ -6860,7 +7152,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
6860
7152
 
6861
7153
  cur = build_attn(inp_attn, gf,
6862
7154
  model.layers[il].wo, model.layers[il].bo,
6863
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7155
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6864
7156
  }
6865
7157
 
6866
7158
  if (il == n_layer - 1) {
@@ -7000,7 +7292,7 @@ struct llm_build_phi2 : public llm_graph_context {
7000
7292
 
7001
7293
  cur = build_attn(inp_attn, gf,
7002
7294
  model.layers[il].wo, model.layers[il].bo,
7003
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
7295
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7004
7296
  }
7005
7297
 
7006
7298
  if (il == n_layer - 1) {
@@ -7052,6 +7344,7 @@ struct llm_build_phi2 : public llm_graph_context {
7052
7344
  }
7053
7345
  };
7054
7346
 
7347
+ template<bool iswa>
7055
7348
  struct llm_build_phi3 : public llm_graph_context {
7056
7349
  llm_build_phi3(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
7057
7350
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -7067,7 +7360,14 @@ struct llm_build_phi3 : public llm_graph_context {
7067
7360
  // inp_pos - contains the positions
7068
7361
  lm_ggml_tensor * inp_pos = build_inp_pos();
7069
7362
 
7070
- auto * inp_attn = build_attn_inp_kv_unified();
7363
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
7364
+ inp_attn_type * inp_attn = nullptr;
7365
+
7366
+ if constexpr (iswa) {
7367
+ inp_attn = build_attn_inp_kv_unified_iswa();
7368
+ } else {
7369
+ inp_attn = build_attn_inp_kv_unified();
7370
+ }
7071
7371
 
7072
7372
  for (int il = 0; il < n_layer; ++il) {
7073
7373
  auto * residual = inpL;
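llm_build_phi3 is now a template over a bool iswa flag: std::conditional_t picks the attention-input type and if constexpr picks the matching builder, all at compile time, so the same graph code serves the unified and the sliding-window KV-cache variants. A minimal standalone sketch of the same dispatch pattern with hypothetical input types:

    #include <cstdio>
    #include <type_traits>

    // hypothetical stand-ins for llm_graph_input_attn_kv_unified and ..._iswa
    struct attn_inp_unified      { const char * name() const { return "kv_unified"; } };
    struct attn_inp_unified_iswa { const char * name() const { return "kv_unified_iswa"; } };

    template <bool iswa>
    void build_graph() {
        // same trick as above: both the input type and the construction path are chosen at compile time
        using inp_attn_type = std::conditional_t<iswa, attn_inp_unified_iswa, attn_inp_unified>;
        inp_attn_type inp;

        if constexpr (iswa) {
            std::printf("building with %s (sliding-window KV cache)\n", inp.name());
        } else {
            std::printf("building with %s\n", inp.name());
        }
    }

    int main() {
        build_graph<false>(); // plain unified KV cache
        build_graph<true>();  // interleaved sliding-window variant
    }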
@@ -7075,7 +7375,7 @@ struct llm_build_phi3 : public llm_graph_context {
7075
7375
  // self-attention
7076
7376
  {
7077
7377
  // rope freq factors for 128k context
7078
- lm_ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
7378
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
7079
7379
 
7080
7380
  lm_ggml_tensor* attn_norm_output = build_norm(inpL,
7081
7381
  model.layers[il].attn_norm,
@@ -7129,7 +7429,7 @@ struct llm_build_phi3 : public llm_graph_context {
7129
7429
 
7130
7430
  cur = build_attn(inp_attn, gf,
7131
7431
  model.layers[il].wo, model.layers[il].bo,
7132
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
7432
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7133
7433
  }
7134
7434
 
7135
7435
  if (il == n_layer - 1) {
@@ -7264,7 +7564,7 @@ struct llm_build_plamo : public llm_graph_context {
7264
7564
 
7265
7565
  cur = build_attn(inp_attn, gf,
7266
7566
  model.layers[il].wo, NULL,
7267
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7567
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7268
7568
  }
7269
7569
  lm_ggml_tensor * sa_out = cur;
7270
7570
 
@@ -7371,7 +7671,7 @@ struct llm_build_gpt2 : public llm_graph_context {
7371
7671
 
7372
7672
  cur = build_attn(inp_attn, gf,
7373
7673
  model.layers[il].wo, model.layers[il].bo,
7374
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7674
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7375
7675
  }
7376
7676
 
7377
7677
  if (il == n_layer - 1) {
@@ -7487,7 +7787,7 @@ struct llm_build_codeshell : public llm_graph_context {
7487
7787
 
7488
7788
  cur = build_attn(inp_attn, gf,
7489
7789
  model.layers[il].wo, model.layers[il].bo,
7490
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7790
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7491
7791
  }
7492
7792
 
7493
7793
  if (il == n_layer - 1) {
@@ -7616,7 +7916,7 @@ struct llm_build_orion : public llm_graph_context {
7616
7916
 
7617
7917
  cur = build_attn(inp_attn, gf,
7618
7918
  model.layers[il].wo, NULL,
7619
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7919
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7620
7920
  }
7621
7921
 
7622
7922
  if (il == n_layer - 1) {
@@ -7743,7 +8043,7 @@ struct llm_build_internlm2 : public llm_graph_context {
7743
8043
 
7744
8044
  cur = build_attn(inp_attn, gf,
7745
8045
  model.layers[il].wo, model.layers[il].bo,
7746
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8046
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7747
8047
  }
7748
8048
 
7749
8049
  if (il == n_layer - 1) {
@@ -7827,7 +8127,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
7827
8127
  for (int il = 0; il < n_layer; ++il) {
7828
8128
  lm_ggml_tensor * inpSA = inpL;
7829
8129
 
7830
- lm_ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
8130
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
7831
8131
 
7832
8132
  // norm
7833
8133
  cur = build_norm(inpL,
@@ -7940,7 +8240,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
7940
8240
 
7941
8241
  cur = build_attn(inp_attn, gf,
7942
8242
  model.layers[il].wo, NULL,
7943
- q_states, k_states, v_states, nullptr, kq_scale, il);
8243
+ q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
7944
8244
  }
7945
8245
 
7946
8246
  if (il == n_layer - 1) {
@@ -8070,7 +8370,7 @@ struct llm_build_gemma : public llm_graph_context {
8070
8370
 
8071
8371
  cur = build_attn(inp_attn, gf,
8072
8372
  model.layers[il].wo, NULL,
8073
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
8373
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
8074
8374
  }
8075
8375
 
8076
8376
  if (il == n_layer - 1) {
@@ -8127,8 +8427,8 @@ struct llm_build_gemma : public llm_graph_context {
8127
8427
  }
8128
8428
  };
8129
8429
 
8130
- struct llm_build_gemma2 : public llm_graph_context {
8131
- llm_build_gemma2(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
8430
+ struct llm_build_gemma2_iswa : public llm_graph_context {
8431
+ llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
8132
8432
  const int64_t n_embd_head = hparams.n_embd_head_k;
8133
8433
 
8134
8434
  lm_ggml_tensor * cur;
@@ -8142,7 +8442,7 @@ struct llm_build_gemma2 : public llm_graph_context {
8142
8442
  // inp_pos - contains the positions
8143
8443
  lm_ggml_tensor * inp_pos = build_inp_pos();
8144
8444
 
8145
- auto * inp_attn = build_attn_inp_kv_unified();
8445
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
8146
8446
 
8147
8447
  for (int il = 0; il < n_layer; ++il) {
8148
8448
  // norm
@@ -8192,7 +8492,7 @@ struct llm_build_gemma2 : public llm_graph_context {
8192
8492
 
8193
8493
  cur = build_attn(inp_attn, gf,
8194
8494
  model.layers[il].wo, NULL,
8195
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
8495
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
8196
8496
  }
8197
8497
 
8198
8498
  cur = build_norm(cur,
@@ -8264,8 +8564,8 @@ struct llm_build_gemma2 : public llm_graph_context {
8264
8564
  }
8265
8565
  };
8266
8566
 
8267
- struct llm_build_gemma3 : public llm_graph_context {
8268
- llm_build_gemma3(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
8567
+ struct llm_build_gemma3_iswa : public llm_graph_context {
8568
+ llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
8269
8569
  const int64_t n_embd_head = hparams.n_embd_head_k;
8270
8570
 
8271
8571
  lm_ggml_tensor * cur;
@@ -8283,13 +8583,11 @@ struct llm_build_gemma3 : public llm_graph_context {
8283
8583
  lm_ggml_tensor * inp_pos = build_inp_pos();
8284
8584
 
8285
8585
  // TODO: is causal == true correct? might need some changes
8286
- auto * inp_attn = build_attn_inp_kv_unified();
8586
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
8287
8587
 
8288
8588
  for (int il = 0; il < n_layer; ++il) {
8289
- const bool is_swa = hparams.is_swa(il);
8290
-
8291
- const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
8292
- const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
8589
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
8590
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
8293
8591
 
8294
8592
  // norm
8295
8593
  cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
@@ -8333,7 +8631,7 @@ struct llm_build_gemma3 : public llm_graph_context {
8333
8631
 
8334
8632
  cur = build_attn(inp_attn, gf,
8335
8633
  model.layers[il].wo, NULL,
8336
- Qcur, Kcur, Vcur, nullptr, hparams.f_attention_scale, il);
8634
+ Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
8337
8635
  }
8338
8636
 
8339
8637
  cur = build_norm(cur,
@@ -8473,7 +8771,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
8473
8771
 
8474
8772
  cur = build_attn(inp_attn, gf,
8475
8773
  model.layers[il].wo, model.layers[il].bo,
8476
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8774
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8477
8775
  }
8478
8776
 
8479
8777
  if (il == n_layer - 1) {
@@ -8594,7 +8892,7 @@ struct llm_build_mamba : public llm_graph_context {
8594
8892
  lm_ggml_tensor * state_mask,
8595
8893
  const llama_ubatch & ubatch,
8596
8894
  int il) const {
8597
- const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
8895
+ const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
8598
8896
 
8599
8897
  const auto kv_head = kv_self->head;
8600
8898
 
@@ -8808,7 +9106,7 @@ struct llm_build_command_r : public llm_graph_context {
8808
9106
 
8809
9107
  cur = build_attn(inp_attn, gf,
8810
9108
  model.layers[il].wo, model.layers[il].bo,
8811
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9109
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8812
9110
  }
8813
9111
 
8814
9112
  if (il == n_layer - 1) {
@@ -8866,8 +9164,8 @@ struct llm_build_command_r : public llm_graph_context {
8866
9164
  }
8867
9165
  };
8868
9166
 
8869
- struct llm_build_cohere2 : public llm_graph_context {
8870
- llm_build_cohere2(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
9167
+ struct llm_build_cohere2_iswa : public llm_graph_context {
9168
+ llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
8871
9169
  const int64_t n_embd_head = hparams.n_embd_head_v;
8872
9170
 
8873
9171
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8882,7 +9180,7 @@ struct llm_build_cohere2 : public llm_graph_context {
8882
9180
  // inp_pos - contains the positions
8883
9181
  lm_ggml_tensor * inp_pos = build_inp_pos();
8884
9182
 
8885
- auto * inp_attn = build_attn_inp_kv_unified();
9183
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
8886
9184
 
8887
9185
  for (int il = 0; il < n_layer; ++il) {
8888
9186
  const bool is_swa = hparams.is_swa(il);
@@ -8895,7 +9193,7 @@ struct llm_build_cohere2 : public llm_graph_context {
8895
9193
  // self-attention
8896
9194
  {
8897
9195
  // rope freq factors for 128k context
8898
- lm_ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
9196
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
8899
9197
 
8900
9198
  // compute Q and K and RoPE them
8901
9199
  lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -8943,7 +9241,7 @@ struct llm_build_cohere2 : public llm_graph_context {
8943
9241
 
8944
9242
  cur = build_attn(inp_attn, gf,
8945
9243
  model.layers[il].wo, model.layers[il].bo,
8946
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9244
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8947
9245
  }
8948
9246
 
8949
9247
  if (il == n_layer - 1) {
@@ -9074,7 +9372,7 @@ struct llm_build_olmo : public llm_graph_context {
9074
9372
 
9075
9373
  cur = build_attn(inp_attn, gf,
9076
9374
  model.layers[il].wo, nullptr,
9077
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9375
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9078
9376
  }
9079
9377
 
9080
9378
  if (il == n_layer - 1) {
@@ -9194,7 +9492,7 @@ struct llm_build_olmo2 : public llm_graph_context {
9194
9492
 
9195
9493
  cur = build_attn(inp_attn, gf,
9196
9494
  model.layers[il].wo, NULL,
9197
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9495
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9198
9496
  }
9199
9497
 
9200
9498
  cur = build_norm(cur,
@@ -9327,7 +9625,7 @@ struct llm_build_olmoe : public llm_graph_context {
9327
9625
 
9328
9626
  cur = build_attn(inp_attn, gf,
9329
9627
  model.layers[il].wo, NULL,
9330
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9628
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9331
9629
  }
9332
9630
 
9333
9631
  if (il == n_layer - 1) {
@@ -9460,7 +9758,7 @@ struct llm_build_openelm : public llm_graph_context {
9460
9758
 
9461
9759
  cur = build_attn(inp_attn, gf,
9462
9760
  model.layers[il].wo, NULL,
9463
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9761
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9464
9762
  }
9465
9763
 
9466
9764
  if (il == n_layer - 1) {
@@ -9574,7 +9872,7 @@ struct llm_build_gptneox : public llm_graph_context {
9574
9872
 
9575
9873
  cur = build_attn(inp_attn, gf,
9576
9874
  model.layers[il].wo, model.layers[il].bo,
9577
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9875
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9578
9876
  }
9579
9877
 
9580
9878
  if (il == n_layer - 1) {
@@ -9724,7 +10022,7 @@ struct llm_build_arctic : public llm_graph_context {
9724
10022
 
9725
10023
  cur = build_attn(inp_attn, gf,
9726
10024
  model.layers[il].wo, NULL,
9727
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10025
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9728
10026
  }
9729
10027
 
9730
10028
  if (il == n_layer - 1) {
@@ -9833,7 +10131,7 @@ struct llm_build_deepseek : public llm_graph_context {
9833
10131
  // self-attention
9834
10132
  {
9835
10133
  // rope freq factors for llama3; may return nullptr for llama2 and other models
9836
- lm_ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
10134
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
9837
10135
 
9838
10136
  // compute Q and K and RoPE them
9839
10137
  lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9879,7 +10177,7 @@ struct llm_build_deepseek : public llm_graph_context {
9879
10177
 
9880
10178
  cur = build_attn(inp_attn, gf,
9881
10179
  model.layers[il].wo, model.layers[il].bo,
9882
- Qcur, Kcur, Vcur, nullptr, kq_scale, il);
10180
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
9883
10181
  }
9884
10182
 
9885
10183
  if (il == n_layer - 1) {
@@ -9969,15 +10267,22 @@ struct llm_build_deepseek2 : public llm_graph_context {
9969
10267
  llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
9970
10268
  bool is_lite = (hparams.n_layer == 27);
9971
10269
 
10270
+ const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
10271
+
10272
+ // note: these are the actual head sizes you get when treating the attention as MHA, or after "decompression" using wv_b for MLA
10273
+ const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
10274
+ const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
10275
+
10276
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
10277
+ const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
10278
+
10279
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
10280
+
9972
10281
  // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
9973
10282
  // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
9974
10283
  const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
9975
- const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
9976
- const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
9977
-
9978
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
9979
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
9980
- const uint32_t kv_lora_rank = hparams.n_lora_kv;
10284
+ const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
10285
+ const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
9981
10286
 
9982
10287
  lm_ggml_tensor * cur;
9983
10288
  lm_ggml_tensor * inpL;
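The reworked DeepSeek-2 prologue above derives the MHA/MLA head sizes and pre-scales the attention factor so YaRN RoPE stays consistent (see the linked llama.cpp discussion). A short worked computation of kq_scale under assumed hyperparameters (n_embd_head_k = 192, rope_yarn_log_mul = 0.1, freq_scale = 0.25, attn_factor = 1.0):

    #include <cmath>
    #include <cstdio>

    int main() {
        // assumed values, for illustration only
        const float attn_factor       = 1.0f;
        const float rope_yarn_log_mul = 0.1f;
        const float freq_scale        = 0.25f;  // i.e. a 4x context extension
        const int   n_embd_head_k     = 192;

        // mirrors the diff: mscale folds the YaRN magnitude correction into the KQ scale
        const float mscale   = attn_factor * (1.0f + rope_yarn_log_mul * logf(1.0f / freq_scale));
        const float kq_scale = 1.0f * mscale * mscale / sqrtf((float) n_embd_head_k);

        // and the attn_factor handed to rope_ext becomes 1 / (1 + 0.1 * log(1/freq_scale))
        const float attn_factor_rope = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));

        std::printf("mscale=%.4f kq_scale=%.4f attn_factor_rope=%.4f\n",
                    mscale, kq_scale, attn_factor_rope);
        // mscale ~= 1.1386, kq_scale ~= 1.2965 / 13.856 ~= 0.0936, attn_factor_rope ~= 0.8782
    }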
@@ -10003,16 +10308,14 @@ struct llm_build_deepseek2 : public llm_graph_context {
10003
10308
  {
10004
10309
  lm_ggml_tensor * q = NULL;
10005
10310
  if (!is_lite) {
10006
- // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
10007
10311
  q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
10008
10312
  cb(q, "q", il);
10009
10313
 
10010
10314
  q = build_norm(q,
10011
- model.layers[il].attn_q_a_norm, NULL,
10315
+ model.layers[il].attn_q_a_norm, nullptr,
10012
10316
  LLM_NORM_RMS, il);
10013
10317
  cb(q, "q", il);
10014
10318
 
10015
- // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
10016
10319
  q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
10017
10320
  cb(q, "q", il);
10018
10321
  } else {
@@ -10020,96 +10323,125 @@ struct llm_build_deepseek2 : public llm_graph_context {
10020
10323
  cb(q, "q", il);
10021
10324
  }
10022
10325
 
10023
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
10024
- lm_ggml_tensor * q_nope = lm_ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
10025
- lm_ggml_row_size(q->type, hparams.n_embd_head_k),
10026
- lm_ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
10326
+ // split into {n_embd_head_qk_nope, n_head, n_tokens}
10327
+ lm_ggml_tensor * q_nope = lm_ggml_view_3d(ctx0, q,
10328
+ n_embd_head_qk_nope, n_head, n_tokens,
10329
+ lm_ggml_row_size(q->type, n_embd_head_k),
10330
+ lm_ggml_row_size(q->type, n_embd_head_k) * n_head,
10027
10331
  0);
10028
10332
  cb(q_nope, "q_nope", il);
10029
10333
 
10030
- // and {n_head * n_embd_head_qk_rope, n_tokens}
10031
- lm_ggml_tensor * q_pe = lm_ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
10032
- lm_ggml_row_size(q->type, hparams.n_embd_head_k),
10033
- lm_ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
10334
+ // and {n_embd_head_qk_rope, n_head, n_tokens}
10335
+ lm_ggml_tensor * q_pe = lm_ggml_view_3d(ctx0, q,
10336
+ n_embd_head_qk_rope, n_head, n_tokens,
10337
+ lm_ggml_row_size(q->type, n_embd_head_k),
10338
+ lm_ggml_row_size(q->type, n_embd_head_k) * n_head,
10034
10339
  lm_ggml_row_size(q->type, n_embd_head_qk_nope));
10035
10340
  cb(q_pe, "q_pe", il);
10036
10341
 
10037
- // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
10038
- lm_ggml_tensor * kv_pe_compresseed = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
10039
- cb(kv_pe_compresseed, "kv_pe_compresseed", il);
10342
+ lm_ggml_tensor * kv_cmpr_pe = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
10343
+ cb(kv_cmpr_pe, "kv_cmpr_pe", il);
10040
10344
 
10041
10345
  // split into {kv_lora_rank, n_tokens}
10042
- lm_ggml_tensor * kv_compressed = lm_ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
10043
- kv_pe_compresseed->nb[1],
10346
+ lm_ggml_tensor * kv_cmpr = lm_ggml_view_2d(ctx0, kv_cmpr_pe,
10347
+ kv_lora_rank, n_tokens,
10348
+ lm_ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
10044
10349
  0);
10045
- cb(kv_compressed, "kv_compressed", il);
10350
+ cb(kv_cmpr, "kv_cmpr", il);
10351
+
10352
+ // and {n_embd_head_qk_rope, 1, n_tokens}
10353
+ lm_ggml_tensor * k_pe = lm_ggml_view_3d(ctx0, kv_cmpr_pe,
10354
+ n_embd_head_qk_rope, 1, n_tokens,
10355
+ lm_ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
10356
+ lm_ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
10357
+ lm_ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
10358
+ cb(k_pe, "k_pe", il);
10046
10359
 
10047
- // and {n_embd_head_qk_rope, n_tokens}
10048
- lm_ggml_tensor * k_pe = lm_ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
10049
- kv_pe_compresseed->nb[1],
10050
- kv_pe_compresseed->nb[1],
10051
- lm_ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
10360
+ q_pe = lm_ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
10361
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10362
+ ext_factor, attn_factor, beta_fast, beta_slow
10363
+ );
10364
+ cb(q_pe, "q_pe", il);
10365
+
10366
+ k_pe = lm_ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
10367
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10368
+ ext_factor, attn_factor, beta_fast, beta_slow
10369
+ );
10052
10370
  cb(k_pe, "k_pe", il);
10053
10371
 
10054
- // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing lm_ggml_cont
10055
- kv_compressed = lm_ggml_cont(ctx0, kv_compressed);
10056
- kv_compressed = build_norm(kv_compressed,
10057
- model.layers[il].attn_kv_a_norm, NULL,
10372
+ kv_cmpr = build_norm(kv_cmpr,
10373
+ model.layers[il].attn_kv_a_norm, nullptr,
10058
10374
  LLM_NORM_RMS, il);
10059
- cb(kv_compressed, "kv_compressed", il);
10375
+ cb(kv_cmpr, "kv_cmpr", il);
10060
10376
 
10061
- // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
10062
- lm_ggml_tensor * kv = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
10063
- cb(kv, "kv", il);
10377
+ if (is_mla) {
10378
+ // {n_embd_head_qk_nope, n_tokens, n_head}
10379
+ q_nope = lm_ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
10380
+ cb(q_nope, "q_nope_perm", il);
10064
10381
 
10065
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
10066
- lm_ggml_tensor * k_nope = lm_ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
10067
- lm_ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
10068
- lm_ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
10069
- 0);
10070
- cb(k_nope, "k_nope", il);
10382
+ // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
10383
+ lm_ggml_tensor * q_nope_absorbed = lm_ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
10384
+ cb(q_nope_absorbed, "q_nope_absorbed", il);
10071
10385
 
10072
- // and {n_head * n_embd_head_v, n_tokens}
10073
- lm_ggml_tensor * v_states = lm_ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
10074
- lm_ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
10075
- lm_ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
10076
- lm_ggml_row_size(kv->type, (n_embd_head_qk_nope)));
10077
- cb(v_states, "v_states", il);
10386
+ // {kv_lora_rank, n_head, n_tokens}
10387
+ q_nope_absorbed = lm_ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
10388
+ cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
10078
10389
 
10079
- v_states = lm_ggml_cont(ctx0, v_states);
10080
- cb(v_states, "v_states", il);
10390
+ // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
10391
+ // note: rope must go first for in-place context shifting in build_rope_shift()
10392
+ lm_ggml_tensor * Qcur = lm_ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
10393
+ cb(Qcur, "Qcur", il);
10081
10394
 
10082
- v_states = lm_ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
10083
- lm_ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
10084
- 0);
10085
- cb(v_states, "v_states", il);
10395
+ kv_cmpr = lm_ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
10396
+ cb(kv_cmpr, "kv_cmpr_reshape", il);
10086
10397
 
10087
- q_pe = lm_ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
10088
- q_pe = lm_ggml_rope_ext(
10089
- ctx0, q_pe, inp_pos, nullptr,
10090
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10091
- ext_factor, attn_factor_scaled, beta_fast, beta_slow
10092
- );
10093
- cb(q_pe, "q_pe", il);
10398
+ // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
10399
+ lm_ggml_tensor * Kcur = lm_ggml_concat(ctx0, k_pe, kv_cmpr, 0);
10400
+ cb(Kcur, "Kcur", il);
10094
10401
 
10095
- // shared RoPE key
10096
- k_pe = lm_ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
10097
- k_pe = lm_ggml_rope_ext(
10098
- ctx0, k_pe, inp_pos, nullptr,
10099
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
10100
- ext_factor, attn_factor_scaled, beta_fast, beta_slow
10101
- );
10102
- cb(k_pe, "k_pe", il);
10402
+ // {kv_lora_rank, 1, n_tokens}
10403
+ lm_ggml_tensor * Vcur = kv_cmpr;
10404
+ cb(Vcur, "Vcur", il);
10103
10405
 
10104
- lm_ggml_tensor * q_states = lm_ggml_concat(ctx0, q_nope, q_pe, 0);
10105
- cb(q_states, "q_states", il);
10406
+ // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
10407
+ cur = build_attn(inp_attn, gf,
10408
+ model.layers[il].wo, NULL,
10409
+ Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
10410
+ } else {
10411
+ lm_ggml_tensor * kv = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
10412
+ cb(kv, "kv", il);
10413
+
10414
+ // split into {n_embd_head_qk_nope, n_head, n_tokens}
10415
+ lm_ggml_tensor * k_nope = lm_ggml_view_3d(ctx0, kv,
10416
+ n_embd_head_qk_nope, n_head, n_tokens,
10417
+ lm_ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
10418
+ lm_ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
10419
+ 0);
10420
+ cb(k_nope, "k_nope_view", il);
10106
10421
 
10107
- lm_ggml_tensor * k_states = lm_ggml_concat(ctx0, k_nope, lm_ggml_repeat(ctx0, k_pe, q_pe), 0);
10108
- cb(k_states, "k_states", il);
10422
+ // and {n_embd_head_v, n_head, n_tokens}
10423
+ lm_ggml_tensor * Vcur = lm_ggml_view_3d(ctx0, kv,
10424
+ n_embd_head_v, n_head, n_tokens,
10425
+ lm_ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
10426
+ lm_ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
10427
+ lm_ggml_row_size(kv->type, n_embd_head_qk_nope));
10428
+ cb(Vcur, "Vcur_view", il);
10109
10429
 
10110
- cur = build_attn(inp_attn, gf,
10111
- model.layers[il].wo, NULL,
10112
- q_states, k_states, v_states, nullptr, kq_scale, il);
10430
+ Vcur = lm_ggml_cont(ctx0, Vcur);
10431
+ cb(Vcur, "Vcur_cont", il);
10432
+
10433
+ // note: rope must go first for in-place context shifting in build_rope_shift()
10434
+ lm_ggml_tensor * Qcur = lm_ggml_concat(ctx0, q_pe, q_nope, 0);
10435
+ cb(Qcur, "Qcur", il);
10436
+
10437
+ lm_ggml_tensor * Kcur = lm_ggml_concat(ctx0, lm_ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
10438
+ cb(Kcur, "Kcur", il);
10439
+
10440
+ // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
10441
+ cur = build_attn(inp_attn, gf,
10442
+ model.layers[il].wo, NULL,
10443
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
10444
+ }
10113
10445
  }
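The two branches above trade memory for compute: with absorption (is_mla), K and V collapse onto the shared kv_lora_rank-wide latent and attention runs as MQA with wv_b handed to build_attn for decompression; without it, wkv_b re-expands the latent into per-head K/V and attention runs as ordinary MHA. A rough per-token size comparison under assumed DeepSeek-style dimensions (n_head = 128, kv_lora_rank = 512, n_embd_head_qk_rope = 64, n_embd_head_k = 192, n_embd_head_v = 128 — illustrative numbers, not read from this diff):

    #include <cstdio>

    int main() {
        // assumed DeepSeek-2-style dimensions, for illustration only
        const int n_head              = 128;
        const int kv_lora_rank        = 512;
        const int n_embd_head_qk_rope = 64;
        const int n_embd_head_k       = 192;  // qk_nope (128) + qk_rope (64)
        const int n_embd_head_v       = 128;

        // MLA + absorption: one shared K "head" of width rope + latent (MQA, a single KV group)
        const int width_mla = n_embd_head_qk_rope + kv_lora_rank;        // floats per token
        // no absorption: full per-head K and V are materialized (MHA)
        const int width_mha = n_head * (n_embd_head_k + n_embd_head_v);  // floats per token

        std::printf("per-token K+V floats: MLA=%d  MHA=%d  (about %.0fx larger)\n",
                    width_mla, width_mha, (float) width_mha / width_mla);
        // MLA=576 vs MHA=40960 -> roughly 71x more K/V data without the absorption optimization
    }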
10114
10446
 
10115
10447
  if (il == n_layer - 1) {
@@ -10275,7 +10607,7 @@ struct llm_build_bitnet : public llm_graph_context {
10275
10607
 
10276
10608
  cur = build_attn(inp_attn, gf,
10277
10609
  NULL, NULL,
10278
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10610
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10279
10611
 
10280
10612
  cur = build_norm(cur,
10281
10613
  model.layers[il].attn_sub_norm, NULL,
@@ -10398,7 +10730,7 @@ struct llm_build_t5_enc : public llm_graph_context {
10398
10730
 
10399
10731
  cur = build_attn(inp_attn, gf,
10400
10732
  model.layers[il].wo_enc, nullptr,
10401
- Qcur, Kcur, Vcur, kq_b, 1.0f, il);
10733
+ Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
10402
10734
  cb(cur, "kqv_out", il);
10403
10735
  }
10404
10736
 
@@ -10504,7 +10836,7 @@ struct llm_build_t5_dec : public llm_graph_context {
10504
10836
 
10505
10837
  cur = build_attn(inp_attn_self, gf,
10506
10838
  model.layers[il].wo, model.layers[il].bo,
10507
- Qcur, Kcur, Vcur, kq_b, 1.0f, il);
10839
+ Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
10508
10840
  cb(cur, "kqv_out", il);
10509
10841
  }
10510
10842
 
@@ -10536,7 +10868,7 @@ struct llm_build_t5_dec : public llm_graph_context {
10536
10868
 
10537
10869
  cur = build_attn(inp_attn_cross, gf,
10538
10870
  model.layers[il].wo_cross, nullptr,
10539
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
10871
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
10540
10872
  cb(cur, "kqv_out", il);
10541
10873
 
10542
10874
  //lm_ggml_tensor * q = lm_ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -10669,7 +11001,7 @@ struct llm_build_jais : public llm_graph_context {
10669
11001
 
10670
11002
  cur = build_attn(inp_attn, gf,
10671
11003
  model.layers[il].wo, model.layers[il].bo,
10672
- Qcur, Kcur, Vcur, nullptr, 1.0f/float(n_embd_head), il);
11004
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
10673
11005
  }
10674
11006
 
10675
11007
  if (il == n_layer - 1) {
@@ -10801,7 +11133,7 @@ struct llm_build_chatglm : public llm_graph_context {
10801
11133
 
10802
11134
  cur = build_attn(inp_attn, gf,
10803
11135
  model.layers[il].wo, NULL,
10804
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11136
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10805
11137
  }
10806
11138
 
10807
11139
  if (il == n_layer - 1) {
@@ -10854,6 +11186,157 @@ struct llm_build_chatglm : public llm_graph_context {
  }
  };

+ struct llm_build_glm4 : public llm_graph_context {
+ llm_build_glm4(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ lm_ggml_tensor * cur;
+ lm_ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ lm_ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ for (int il = 0; il < n_layer; ++il) {
+ lm_ggml_tensor * inpSA = inpL;
+
+ // Pre-attention norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ lm_ggml_tensor * Qcur = nullptr;
+ lm_ggml_tensor * Kcur = nullptr;
+ lm_ggml_tensor * Vcur = nullptr;
+
+ if (model.layers[il].wqkv == nullptr) {
+ Qcur = build_lora_mm(model.layers[il].wq, cur);
+ if (model.layers[il].bq) {
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+ }
+ Kcur = build_lora_mm(model.layers[il].wk, cur);
+ if (model.layers[il].bk) {
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+ }
+ Vcur = build_lora_mm(model.layers[il].wv, cur);
+ if (model.layers[il].bv) {
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+ }
+ } else {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+ if (model.layers[il].bqkv) {
+ cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
+ Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ }
+
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = lm_ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = lm_ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = lm_ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // Post-attention norm (new!)
+ cur = build_norm(cur,
+ model.layers[il].attn_post_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "post_attn_norm", il);
+
+ // Add the input (residual connection after post-attention norm)
+ lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FF
+ {
+ // Pre-MLP norm
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // MLP
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ // Post-MLP norm
+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "post_mlp_norm", il);
+ }
+
+ // Add residual connection after post-MLP norm
+ inpL = lm_ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
+ }
+
+ // Final norm
+ cur = build_norm(inpL,
+ model.output_norm,
+ NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // Output projection
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ lm_ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  struct llm_build_nemotron : public llm_graph_context {
  llm_build_nemotron(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
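The new llm_build_glm4 graph uses a "sandwich" norm layout: RMS norms run both before and after the attention block and before and after the SwiGLU MLP, and each residual is added only after the corresponding post-norm. A compact sketch of one layer's op order, with stub types standing in for lm_ggml_tensor and the build_* helpers (the names below are placeholders, not library identifiers):

    // Placeholder types/helpers; the real graph uses lm_ggml_tensor and build_norm/build_attn/build_ffn.
    struct T {};
    T * rms_norm(T * x, T * w) { (void) w; return x; } // stands in for build_norm(..., LLM_NORM_RMS, ...)
    T * attn    (T * x)        { return x; }           // stands in for the self-attention block
    T * swiglu  (T * x)        { return x; }           // stands in for build_ffn(..., LLM_FFN_SWIGLU, LLM_FFN_SEQ, ...)
    T * add     (T * a, T * b) { (void) b; return a; } // stands in for lm_ggml_add

    T * glm4_layer(T * inpL, T * w_attn, T * w_attn_post, T * w_ffn, T * w_ffn_post) {
        T * cur     = rms_norm(inpL, w_attn);       // pre-attention norm
        cur         = attn(cur);                    // self-attention (RoPE'd Q/K, unified KV cache)
        cur         = rms_norm(cur, w_attn_post);   // post-attention norm
        T * ffn_inp = add(cur, inpL);               // residual added after the post-norm

        cur = rms_norm(ffn_inp, w_ffn);             // pre-MLP norm
        cur = swiglu(cur);                          // SwiGLU MLP
        cur = rms_norm(cur, w_ffn_post);            // post-MLP norm
        return add(cur, ffn_inp);                   // residual added after the post-norm
    }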
@@ -10927,7 +11410,7 @@ struct llm_build_nemotron : public llm_graph_context {

  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1) {
@@ -11012,7 +11495,7 @@ struct llm_build_exaone : public llm_graph_context {
  // self-attention
  {
  // rope freq factors for llama3; may return nullptr for llama2 and other models
- lm_ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

  // compute Q and K and RoPE them
  lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11058,7 +11541,7 @@ struct llm_build_exaone : public llm_graph_context {

  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1) {
@@ -11157,7 +11640,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
  lm_ggml_tensor * state_mask,
  const llama_ubatch & ubatch,
  int il) const {
- const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+ const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

  const auto n_tokens = ubatch.n_tokens;
  const auto n_seqs = ubatch.n_seqs;
@@ -11553,7 +12036,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
  lm_ggml_tensor *& first_layer_value,
  const llama_ubatch & ubatch,
  int il) const {
- const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+ const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

  const auto n_tokens = ubatch.n_tokens;
  const auto n_seqs = ubatch.n_seqs;
@@ -11862,6 +12345,194 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
  }
  };

+
+ struct llm_build_granite : public llm_graph_context {
+ llm_build_granite(
+ const llama_model & model,
+ const llm_graph_params & params,
+ lm_ggml_cgraph * gf,
+ const bool use_rope = true)
+ : llm_graph_context(params) {
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ lm_ggml_tensor * cur;
+ lm_ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - built only if rope enabled
+ lm_ggml_tensor * inp_pos = nullptr;
+ if (use_rope) {
+ inp_pos = build_inp_pos();
+ }
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+ for (int il = 0; il < n_layer; ++il) {
+ lm_ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and (optionally) RoPE them
+ lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ lm_ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ lm_ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = lm_ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ if (use_rope) {
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+ Qcur = lm_ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = lm_ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ }
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // For Granite architectures - scale residual
+ cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
+ lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network (non-MoE)
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ } else {
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ lm_ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // For Granite MoE Shared
+ if (hparams.n_ff_shexp > 0) {
+ lm_ggml_tensor * ffn_shexp = build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = lm_ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ } else {
+ cur = moe_out;
+ }
+ }
+
+ // For Granite architectures - scale residual
+ cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ // For Granite architectures - scale logits
+ cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ lm_ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  // ref: https://github.com/facebookresearch/chameleon
  // based on the original build_llama() function, changes:
  // * qk-norm
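Granite and Granite MoE now get a dedicated builder (llm_build_granite) instead of reusing the generic LLaMA graph. Its distinguishing pieces, visible above, are: optional RoPE (use_rope), a kq_scale taken from hparams.f_attention_scale when set, block outputs scaled by hparams.f_residual_scale before each residual add, an optional shared-expert FFN added onto the MoE output, and logits scaled by 1/hparams.f_logit_scale. A small sketch of those scaling rules in isolation, using plain floats as stand-ins for tensors:

    #include <cmath>

    struct GraniteScales {
        float f_residual_scale;   // hparams.f_residual_scale
        float f_logit_scale;      // hparams.f_logit_scale
        float f_attention_scale;  // hparams.f_attention_scale (0 => default 1/sqrt(head_dim))
    };

    // kq_scale falls back to 1/sqrt(n_embd_head) when f_attention_scale is unset (0.0f).
    inline float granite_kq_scale(const GraniteScales & s, int n_embd_head) {
        return s.f_attention_scale == 0.0f ? 1.0f / std::sqrt(float(n_embd_head)) : s.f_attention_scale;
    }

    // Each block output is scaled before it is added back onto the residual stream.
    inline float granite_residual_add(const GraniteScales & s, float block_out, float residual) {
        return block_out * s.f_residual_scale + residual;
    }

    // Logits from the lm_head are divided by f_logit_scale.
    inline float granite_scale_logit(const GraniteScales & s, float logit) {
        return logit * (1.0f / s.f_logit_scale);
    }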
@@ -11960,7 +12631,7 @@ struct llm_build_chameleon : public llm_graph_context {

  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, nullptr,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);

  if (hparams.swin_norm) {
  cur = build_norm(cur,
@@ -12316,7 +12987,7 @@ struct llm_build_plm : public llm_graph_context {

  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, NULL,
- q_states, k_states, v_states, nullptr, kq_scale, il);
+ q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
  }

  if (il == n_layer - 1) {
@@ -12393,7 +13064,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
  // self-attention
  {
  // rope freq factors for llama3; may return nullptr for llama2 and other models
- lm_ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

  // compute Q and K and RoPE them
  lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12439,7 +13110,7 @@ struct llm_build_bailingmoe : public llm_graph_context {

  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_rot)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
  }

  if (il == n_layer - 1) {
@@ -12513,36 +13184,70 @@ struct llm_build_bailingmoe : public llm_graph_context {
  }
  };

- llama_memory_i * llama_model::create_memory() const {
+ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
  llama_memory_i * res;

  switch (arch) {
+ case LLM_ARCH_BERT:
+ case LLM_ARCH_JINA_BERT_V2:
+ case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
+ case LLM_ARCH_WAVTOKENIZER_DEC:
+ {
+ res = nullptr;
+ } break;
  case LLM_ARCH_MAMBA:
  case LLM_ARCH_RWKV6:
  case LLM_ARCH_RWKV6QWEN2:
  case LLM_ARCH_RWKV7:
  case LLM_ARCH_ARWKV7:
  {
- res = new llama_kv_cache_unified(hparams, {
- /*.get_rope_factors =*/ nullptr
- });
+ res = new llama_kv_cache_recurrent(
+ *this,
+ LM_GGML_TYPE_F32,
+ LM_GGML_TYPE_F32,
+ cparams.offload_kqv,
+ std::max((uint32_t) 1, cparams.n_seq_max),
+ cparams.n_seq_max);
  } break;
  default:
  {
- res = new llama_kv_cache_unified(hparams, {
- /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
- // choose long/short freq factors based on the context size
- if (layers[il].rope_freqs != nullptr) {
- return layers[il].rope_freqs;
- }
+ const auto padding = llama_kv_cache_unified::get_padding(cparams);

- if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
- return layers[il].rope_long;
- }
+ cparams.n_ctx = LM_GGML_PAD(cparams.n_ctx, padding);

- return layers[il].rope_short;
- }
- });
+ LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
+
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ LM_GGML_ASSERT(hparams.is_swa_any());
+
+ res = new llama_kv_cache_unified_iswa(
+ *this,
+ params.type_k,
+ params.type_v,
+ !cparams.flash_attn,
+ cparams.offload_kqv,
+ params.swa_full,
+ cparams.n_ctx,
+ cparams.n_seq_max,
+ cparams.n_batch,
+ padding);
+ } else {
+ LM_GGML_ASSERT(!hparams.is_swa_any());
+
+ res = new llama_kv_cache_unified(
+ *this,
+ nullptr,
+ params.type_k,
+ params.type_v,
+ !cparams.flash_attn,
+ cparams.offload_kqv,
+ cparams.n_ctx,
+ cparams.n_seq_max,
+ padding,
+ hparams.n_swa,
+ hparams.swa_type);
+ }
  }
  }
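llama_model::create_memory() now receives the memory params and cparams and chooses a cache implementation per architecture: embedding-style models (the BERT family and the WavTokenizer decoder) get no KV cache, recurrent architectures (Mamba and the RWKV variants) get llama_kv_cache_recurrent, and everything else gets either the sliding-window-aware llama_kv_cache_unified_iswa or the plain llama_kv_cache_unified after cparams.n_ctx is padded. A simplified sketch of that decision tree (the enum names are stand-ins; the real code constructs the caches with the argument lists shown above):

    // Simplified decision tree mirroring llama_model::create_memory above.
    enum class Arch { BERT_LIKE, RECURRENT, OTHER };
    enum class CacheKind { NONE, RECURRENT, UNIFIED_ISWA, UNIFIED };

    CacheKind pick_cache(Arch arch, bool has_swa /* hparams.swa_type != LLAMA_SWA_TYPE_NONE */) {
        switch (arch) {
            case Arch::BERT_LIKE: return CacheKind::NONE;       // embedding-style models: no KV cache
            case Arch::RECURRENT: return CacheKind::RECURRENT;  // Mamba / RWKV: recurrent state cache
            default:              return has_swa ? CacheKind::UNIFIED_ISWA : CacheKind::UNIFIED;
        }
    }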
 
@@ -12557,13 +13262,14 @@ llm_graph_result_ptr llama_model::build_graph(

  switch (arch) {
  case LLM_ARCH_LLAMA:
- case LLM_ARCH_LLAMA4:
  case LLM_ARCH_MINICPM:
- case LLM_ARCH_GRANITE:
- case LLM_ARCH_GRANITE_MOE:
  {
  llm = std::make_unique<llm_build_llama>(*this, params, gf);
  } break;
+ case LLM_ARCH_LLAMA4:
+ {
+ llm = std::make_unique<llm_build_llama_iswa>(*this, params, gf);
+ } break;
  case LLM_ARCH_DECI:
  {
  llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -12591,6 +13297,7 @@ llm_graph_result_ptr llama_model::build_graph(
  case LLM_ARCH_BERT:
  case LLM_ARCH_JINA_BERT_V2:
  case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
  {
  llm = std::make_unique<llm_build_bert>(*this, params, gf);
  } break;
@@ -12637,7 +13344,11 @@ llm_graph_result_ptr llama_model::build_graph(
  case LLM_ARCH_PHI3:
  case LLM_ARCH_PHIMOE:
  {
- llm = std::make_unique<llm_build_phi3>(*this, params, gf);
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ llm = std::make_unique<llm_build_phi3<true>> (*this, params, gf);
+ } else {
+ llm = std::make_unique<llm_build_phi3<false>>(*this, params, gf);
+ }
  } break;
  case LLM_ARCH_PLAMO:
  {
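The Phi-3 builder is now templated on whether sliding-window attention is active, so the SWA/non-SWA choice is fixed once at graph-construction time rather than re-checked inside the builder. The same runtime-flag-to-compile-time-constant dispatch pattern, shown in isolation (the type names below are illustrative, not library identifiers):

    #include <memory>

    struct GraphBuilderBase { virtual ~GraphBuilderBase() = default; };

    template <bool SWA>
    struct Phi3Builder : GraphBuilderBase {
        // SWA is a compile-time constant here, so SWA-specific paths compile away in the <false> case.
    };

    std::unique_ptr<GraphBuilderBase> make_phi3(bool use_swa) {
        if (use_swa) {
            return std::make_unique<Phi3Builder<true>>();
        }
        return std::make_unique<Phi3Builder<false>>();
    }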
@@ -12669,11 +13380,11 @@ llm_graph_result_ptr llama_model::build_graph(
  } break;
  case LLM_ARCH_GEMMA2:
  {
- llm = std::make_unique<llm_build_gemma2>(*this, params, gf);
+ llm = std::make_unique<llm_build_gemma2_iswa>(*this, params, gf);
  } break;
  case LLM_ARCH_GEMMA3:
  {
- llm = std::make_unique<llm_build_gemma3>(*this, params, gf);
+ llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
  } break;
  case LLM_ARCH_STARCODER2:
  {
@@ -12693,7 +13404,7 @@ llm_graph_result_ptr llama_model::build_graph(
  } break;
  case LLM_ARCH_COHERE2:
  {
- llm = std::make_unique<llm_build_cohere2>(*this, params, gf);
+ llm = std::make_unique<llm_build_cohere2_iswa>(*this, params, gf);
  } break;
  case LLM_ARCH_DBRX:
  {
@@ -12735,6 +13446,10 @@ llm_graph_result_ptr llama_model::build_graph(
  {
  llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
  } break;
+ case LLM_ARCH_GLM4:
+ {
+ llm = std::make_unique<llm_build_glm4>(*this, params, gf);
+ } break;
  case LLM_ARCH_BITNET:
  {
  llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
@@ -12786,6 +13501,11 @@ llm_graph_result_ptr llama_model::build_graph(
  {
  llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
  } break;
+ case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
+ {
+ llm = std::make_unique<llm_build_granite>(*this, params, gf);
+ } break;
  case LLM_ARCH_CHAMELEON:
  {
  llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
@@ -12919,8 +13639,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_DECI:
  case LLM_ARCH_BAICHUAN:
  case LLM_ARCH_STARCODER:
- case LLM_ARCH_PLAMO:
- case LLM_ARCH_ORION:
  case LLM_ARCH_INTERNLM2:
  case LLM_ARCH_MINICPM:
  case LLM_ARCH_XVERSE:
@@ -12932,6 +13650,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_DEEPSEEK2:
  case LLM_ARCH_PLM:
  case LLM_ARCH_CHATGLM:
+ case LLM_ARCH_GLM4:
  case LLM_ARCH_GRANITE:
  case LLM_ARCH_GRANITE_MOE:
  case LLM_ARCH_CHAMELEON:
@@ -12944,6 +13663,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_DBRX:
  case LLM_ARCH_BERT:
  case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
  case LLM_ARCH_STABLELM:
  case LLM_ARCH_BITNET:
  case LLM_ARCH_QWEN:
@@ -12956,6 +13676,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_PHI2:
  case LLM_ARCH_PHI3:
  case LLM_ARCH_PHIMOE:
+ case LLM_ARCH_PLAMO:
  case LLM_ARCH_GEMMA:
  case LLM_ARCH_GEMMA2:
  case LLM_ARCH_GEMMA3:
@@ -12963,6 +13684,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_OPENELM:
  case LLM_ARCH_GPTNEOX:
  case LLM_ARCH_CODESHELL:
+ case LLM_ARCH_ORION:
  case LLM_ARCH_NEMOTRON:
  case LLM_ARCH_EXAONE:
  case LLM_ARCH_MINICPM3:
@@ -13035,6 +13757,14 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
  : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
  const auto & it = model->lm_gguf_kv.find(key);
  if (it == model->lm_gguf_kv.end()) {
+ // one-off fix for very popular models (so we are not flooded with issues)
+ // do not extend this list unless absolutely necessary
+ // Mistral-Small-2503 does not have built-in chat template
+ llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
+ if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
+ return "mistral-v7-tekken";
+ }
+
  return nullptr;
  }
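With this hunk, llama_model_chat_template() returns the built-in "mistral-v7-tekken" template as a fallback when the GGUF carries no chat template but the vocab pre-type is TEKKEN and the model has 40 layers (the Mistral-Small-2503 case called out in the comments). A usage sketch of the public accessor; model loading is omitted and the include path for the C API header shipped with the package is assumed:

    #include <cstdio>
    #include "llama.h" // C API header bundled in package/cpp (assumed include path)

    void print_chat_template(const llama_model * model) {
        // name == nullptr selects the default template key; with 1.7.0 this can now
        // return "mistral-v7-tekken" for Mistral-Small-2503-style GGUFs without one.
        const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
        std::printf("chat template: %s\n", tmpl ? tmpl : "(none)");
    }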