@novastera-oss/llamarn 0.2.6 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253)
  1. package/android/src/main/cpp/include/llama.h +141 -38
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +58 -24
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +37 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +53 -40
  26. package/cpp/llama.cpp/common/common.h +6 -2
  27. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  28. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  29. package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
  30. package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
  31. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  32. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  33. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  34. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
  35. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  38. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  88. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  90. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  91. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
  93. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
  94. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
  97. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  105. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  115. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  117. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
  139. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  140. package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
  141. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
  142. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
  143. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  144. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  145. package/cpp/llama.cpp/include/llama.h +141 -38
  146. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  147. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  148. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  149. package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
  150. package/cpp/llama.cpp/src/llama-arch.h +25 -1
  151. package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
  152. package/cpp/llama.cpp/src/llama-batch.h +110 -57
  153. package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
  154. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  155. package/cpp/llama.cpp/src/llama-context.cpp +360 -266
  156. package/cpp/llama.cpp/src/llama-context.h +27 -23
  157. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  158. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  159. package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
  160. package/cpp/llama.cpp/src/llama-graph.h +126 -58
  161. package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
  162. package/cpp/llama.cpp/src/llama-hparams.h +16 -2
  163. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
  164. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
  165. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
  166. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
  167. package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
  168. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  169. package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
  170. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
  171. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
  172. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  173. package/cpp/llama.cpp/src/llama-memory.h +73 -36
  174. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  175. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  176. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  177. package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
  178. package/cpp/llama.cpp/src/llama-model.h +26 -0
  179. package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
  180. package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
  181. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  182. package/cpp/llama.cpp/src/llama.cpp +11 -7
  183. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  184. package/cpp/rn-completion.cpp +2 -2
  185. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  186. package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
  187. package/ios/include/chat.h +1 -1
  188. package/ios/include/common.h +6 -2
  189. package/ios/include/llama.h +141 -38
  190. package/ios/libs/llama.xcframework/Info.plist +15 -15
  191. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  192. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  193. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  194. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  195. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
  196. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  197. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  198. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  199. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  200. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  201. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  202. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  203. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  204. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  205. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  206. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
  207. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  208. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  209. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
  210. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  211. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  219. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  220. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  221. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  222. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  223. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
  224. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  225. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  226. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  227. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  228. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  231. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  232. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  233. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
  234. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  235. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  236. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
  237. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  238. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  239. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
  240. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
  241. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  242. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  243. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  244. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  245. package/package.json +1 -2
  246. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  247. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  248. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  249. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  250. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  251. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  252. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  253. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -8,7 +8,8 @@
8
8
 
9
9
  #include "llama-kv-cache-unified.h"
10
10
  #include "llama-kv-cache-unified-iswa.h"
11
- #include "llama-kv-cache-recurrent.h"
11
+ #include "llama-memory-hybrid.h"
12
+ #include "llama-memory-recurrent.h"
12
13
 
13
14
  #include "ggml-cpp.h"
14
15
 
@@ -80,6 +81,7 @@ const char * llm_type_name(llm_type type) {
80
81
  case LLM_TYPE_40B: return "40B";
81
82
  case LLM_TYPE_65B: return "65B";
82
83
  case LLM_TYPE_70B: return "70B";
84
+ case LLM_TYPE_142B: return "142B";
83
85
  case LLM_TYPE_236B: return "236B";
84
86
  case LLM_TYPE_290B: return "290B";
85
87
  case LLM_TYPE_314B: return "314B";
@@ -101,6 +103,8 @@ const char * llm_type_name(llm_type type) {
101
103
  case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
102
104
  case LLM_TYPE_30B_A3B: return "30B.A3B";
103
105
  case LLM_TYPE_235B_A22B: return "235B.A22B";
106
+ case LLM_TYPE_E2B: return "E2B";
107
+ case LLM_TYPE_E4B: return "E4B";
104
108
  default: return "?B";
105
109
  }
106
110
  }
@@ -469,6 +473,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
469
473
  std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
470
474
  std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
471
475
  std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
476
+ std::fill(
477
+ hparams.recurrent_layer_arr.begin(),
478
+ hparams.recurrent_layer_arr.end(),
479
+ llm_arch_is_recurrent(ml.get_arch()));
472
480
 
473
481
  std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
474
482
 
@@ -543,6 +551,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
543
551
  uint32_t n_vocab = 0;
544
552
  ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
545
553
 
554
+ // for classifier models
555
+ ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
556
+ if (!classifier_labels.empty()) {
557
+ hparams.n_cls_out = classifier_labels.size();
558
+ }
559
+
546
560
  // arch-specific KVs
547
561
  switch (arch) {
548
562
  case LLM_ARCH_LLAMA:
@@ -592,6 +606,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
592
606
  hparams.use_kq_norm = false;
593
607
  }
594
608
  } break;
609
+ case LLM_ARCH_ARCEE:
610
+ {
611
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
612
+
613
+ // Arcee uses the same structure as Llama
614
+ switch (hparams.n_layer) {
615
+ case 36: type = LLM_TYPE_4B; break;
616
+ default: type = LLM_TYPE_UNKNOWN;
617
+ }
618
+ } break;
595
619
  case LLM_ARCH_DECI:
596
620
  {
597
621
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -686,7 +710,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
686
710
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
687
711
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
688
712
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
689
- ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false);
690
713
 
691
714
  switch (hparams.n_layer) {
692
715
  case 3:
@@ -733,6 +756,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
733
756
  }
734
757
  }
735
758
  } break;
759
+ case LLM_ARCH_NEO_BERT:
760
+ {
761
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
762
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
763
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
764
+
765
+ if (hparams.n_layer == 28) {
766
+ type = LLM_TYPE_250M;
767
+ }
768
+ } break;
736
769
  case LLM_ARCH_BLOOM:
737
770
  {
738
771
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -956,6 +989,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
956
989
  case 46: type = LLM_TYPE_27B; break;
957
990
  default: type = LLM_TYPE_UNKNOWN;
958
991
  }
992
+
993
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
994
+ hparams.f_attention_scale = type == LLM_TYPE_27B
995
+ ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
996
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
959
997
  } break;
960
998
  case LLM_ARCH_GEMMA3:
961
999
  {
@@ -976,10 +1014,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
976
1014
  default: type = LLM_TYPE_UNKNOWN;
977
1015
  }
978
1016
 
1017
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
979
1018
  hparams.f_attention_scale = type == LLM_TYPE_27B
980
1019
  ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
981
1020
  : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
982
1021
  } break;
1022
+ case LLM_ARCH_GEMMA3N:
1023
+ {
1024
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1025
+ hparams.set_swa_pattern(5);
1026
+
1027
+ hparams.rope_freq_base_train_swa = 10000.0f;
1028
+ hparams.rope_freq_scale_train_swa = 1.0f;
1029
+ hparams.f_attention_scale = 1.0f;
1030
+
1031
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1032
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1033
+
1034
+ switch (hparams.n_layer) {
1035
+ case 30: type = LLM_TYPE_E2B; break;
1036
+ case 35: type = LLM_TYPE_E4B; break;
1037
+ default: type = LLM_TYPE_UNKNOWN;
1038
+ }
1039
+ } break;
983
1040
  case LLM_ARCH_STARCODER2:
984
1041
  {
985
1042
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1433,6 +1490,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1433
1490
  default: type = LLM_TYPE_UNKNOWN;
1434
1491
  }
1435
1492
  } break;
1493
+ case LLM_ARCH_DOTS1:
1494
+ {
1495
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1496
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
1497
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1498
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1499
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
1500
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1501
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1502
+ switch (hparams.n_layer) {
1503
+ case 62: type = LLM_TYPE_142B; break;
1504
+ default: type = LLM_TYPE_UNKNOWN;
1505
+ }
1506
+ } break;
1436
1507
  default: throw std::runtime_error("unsupported model architecture");
1437
1508
  }
1438
1509
 
@@ -2176,6 +2247,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2176
2247
  layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
2177
2248
  }
2178
2249
  } break;
2250
+ case LLM_ARCH_NEO_BERT:
2251
+ {
2252
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2253
+
2254
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
2255
+ cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
2256
+
2257
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
2258
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
2259
+
2260
+ output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
2261
+
2262
+ for (int i = 0; i < n_layer; ++i) {
2263
+ auto & layer = layers[i];
2264
+
2265
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2266
+
2267
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2268
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2269
+
2270
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2271
+
2272
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
2273
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2274
+ }
2275
+ } break;
2179
2276
  case LLM_ARCH_JINA_BERT_V2:
2180
2277
  {
2181
2278
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
@@ -2213,8 +2310,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2213
2310
  layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
2214
2311
  layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2215
2312
 
2216
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2217
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2313
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
2314
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
2218
2315
 
2219
2316
  layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2220
2317
  layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
@@ -2873,6 +2970,62 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2873
2970
  layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
2874
2971
  }
2875
2972
  } break;
2973
+ case LLM_ARCH_GEMMA3N:
2974
+ {
2975
+ const int64_t n_altup = hparams.n_altup;
2976
+ const int64_t laurel_rank = hparams.laurel_rank;
2977
+ const int64_t n_embd_altup = hparams.n_embd_altup;
2978
+
2979
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2980
+ // if output is NULL, init from the input tok embed
2981
+ if (output == NULL) {
2982
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2983
+ }
2984
+
2985
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2986
+ tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0);
2987
+
2988
+ altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
2989
+ altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0);
2990
+ per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0);
2991
+ per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0);
2992
+
2993
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2994
+
2995
+ for (int i = 0; i < n_layer; ++i) {
2996
+ auto & layer = layers[i];
2997
+
2998
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2999
+
3000
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
3001
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
3002
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
3003
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
3004
+
3005
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
3006
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
3007
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
3008
+
3009
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3010
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3011
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3012
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3013
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
3014
+
3015
+ // altup & laurel
3016
+ layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, n_embd_altup}, 0);
3017
+ layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0);
3018
+ layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0);
3019
+ layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0);
3020
+ layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0);
3021
+ layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0);
3022
+ layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0);
3023
+ layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0);
3024
+ layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0);
3025
+ layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0);
3026
+ layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0);
3027
+ }
3028
+ } break;
2876
3029
  case LLM_ARCH_STARCODER2:
2877
3030
  {
2878
3031
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4112,6 +4265,89 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4112
4265
  layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4113
4266
  }
4114
4267
  } break;
4268
+ case LLM_ARCH_DOTS1:
4269
+ {
4270
+ const int64_t n_ff_exp = hparams.n_ff_exp;
4271
+ const int64_t n_expert_shared = hparams.n_expert_shared;
4272
+
4273
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4274
+
4275
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4276
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4277
+
4278
+ for (int i = 0; i < n_layer; ++i) {
4279
+ auto & layer = layers[i];
4280
+
4281
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4282
+
4283
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4284
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4285
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4286
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4287
+
4288
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
4289
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
4290
+
4291
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4292
+
4293
+ if (i < (int) hparams.n_layer_dense_lead) {
4294
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4295
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4296
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4297
+ } else {
4298
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4299
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
4300
+
4301
+ if (n_expert == 0) {
4302
+ throw std::runtime_error("n_expert must be > 0");
4303
+ }
4304
+ if (n_expert_used == 0) {
4305
+ throw std::runtime_error("n_expert_used must be > 0");
4306
+ }
4307
+
4308
+ // MoE branch
4309
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4310
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
4311
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4312
+
4313
+ // Shared expert branch
4314
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4315
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
4316
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4317
+ }
4318
+ }
4319
+ } break;
4320
+ case LLM_ARCH_ARCEE:
4321
+ {
4322
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4323
+
4324
+ // output
4325
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4326
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4327
+
4328
+ // if output is NULL, init from the input tok embed
4329
+ if (output == NULL) {
4330
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4331
+ }
4332
+
4333
+ for (int i = 0; i < n_layer; ++i) {
4334
+ auto & layer = layers[i];
4335
+
4336
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4337
+
4338
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4339
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4340
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4341
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4342
+
4343
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4344
+
4345
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4346
+
4347
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4348
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4349
+ }
4350
+ } break;
4115
4351
  default:
4116
4352
  throw std::runtime_error("unknown architecture");
4117
4353
  }
@@ -4356,6 +4592,15 @@ void llama_model::print_info() const {
4356
4592
  LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
4357
4593
  LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
4358
4594
  LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
4595
+
4596
+ if (!classifier_labels.empty()) {
4597
+ LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
4598
+
4599
+ size_t i = 0;
4600
+ for (auto label : classifier_labels) {
4601
+ LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
4602
+ }
4603
+ }
4359
4604
  }
4360
4605
 
4361
4606
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
@@ -4538,6 +4783,8 @@ struct llm_build_llama : public llm_graph_context {
4538
4783
 
4539
4784
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
4540
4785
 
4786
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
4787
+
4541
4788
  for (int il = 0; il < n_layer; ++il) {
4542
4789
  ggml_tensor * inpSA = inpL;
4543
4790
 
@@ -4600,9 +4847,7 @@ struct llm_build_llama : public llm_graph_context {
4600
4847
  cb(cur, "attn_out", il);
4601
4848
  }
4602
4849
 
4603
- if (il == n_layer - 1) {
4604
- // skip computing output for unused tokens
4605
- ggml_tensor * inp_out_ids = build_inp_out_ids();
4850
+ if (il == n_layer - 1 && inp_out_ids) {
4606
4851
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
4607
4852
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
4608
4853
  }
@@ -4698,6 +4943,8 @@ struct llm_build_llama_iswa : public llm_graph_context {
4698
4943
 
4699
4944
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
4700
4945
 
4946
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
4947
+
4701
4948
  for (int il = 0; il < n_layer; ++il) {
4702
4949
  ggml_tensor * inpSA = inpL;
4703
4950
 
@@ -4774,9 +5021,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
4774
5021
  cb(cur, "attn_out", il);
4775
5022
  }
4776
5023
 
4777
- if (il == n_layer - 1) {
4778
- // skip computing output for unused tokens
4779
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5024
+ if (il == n_layer - 1 && inp_out_ids) {
4780
5025
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
4781
5026
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
4782
5027
  }
@@ -4876,6 +5121,9 @@ struct llm_build_deci : public llm_graph_context {
4876
5121
  auto * inp_attn = build_attn_inp_kv_unified();
4877
5122
 
4878
5123
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
5124
+
5125
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5126
+
4879
5127
  for (int il = 0; il < n_layer; ++il) {
4880
5128
  ggml_tensor * inpSA = inpL;
4881
5129
  const int64_t n_head_kv = hparams.n_head_kv(il);
@@ -4949,9 +5197,7 @@ struct llm_build_deci : public llm_graph_context {
4949
5197
  Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
4950
5198
  }
4951
5199
 
4952
- if (il == n_layer - 1) {
4953
- // skip computing output for unused tokens
4954
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5200
+ if (il == n_layer - 1 && inp_out_ids) {
4955
5201
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
4956
5202
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
4957
5203
  }
@@ -5030,6 +5276,8 @@ struct llm_build_baichuan : public llm_graph_context {
5030
5276
 
5031
5277
  auto * inp_attn = build_attn_inp_kv_unified();
5032
5278
 
5279
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5280
+
5033
5281
  for (int il = 0; il < n_layer; ++il) {
5034
5282
  ggml_tensor * inpSA = inpL;
5035
5283
 
@@ -5081,9 +5329,7 @@ struct llm_build_baichuan : public llm_graph_context {
5081
5329
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5082
5330
  }
5083
5331
 
5084
- if (il == n_layer - 1) {
5085
- // skip computing output for unused tokens
5086
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5332
+ if (il == n_layer - 1 && inp_out_ids) {
5087
5333
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5088
5334
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5089
5335
  }
@@ -5152,6 +5398,8 @@ struct llm_build_xverse : public llm_graph_context {
5152
5398
 
5153
5399
  auto * inp_attn = build_attn_inp_kv_unified();
5154
5400
 
5401
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5402
+
5155
5403
  for (int il = 0; il < n_layer; ++il) {
5156
5404
  ggml_tensor * inpSA = inpL;
5157
5405
 
@@ -5196,9 +5444,7 @@ struct llm_build_xverse : public llm_graph_context {
5196
5444
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5197
5445
  }
5198
5446
 
5199
- if (il == n_layer - 1) {
5200
- // skip computing output for unused tokens
5201
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5447
+ if (il == n_layer - 1 && inp_out_ids) {
5202
5448
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5203
5449
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5204
5450
  }
@@ -5266,6 +5512,8 @@ struct llm_build_falcon : public llm_graph_context {
5266
5512
 
5267
5513
  auto * inp_attn = build_attn_inp_kv_unified();
5268
5514
 
5515
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5516
+
5269
5517
  for (int il = 0; il < n_layer; ++il) {
5270
5518
  ggml_tensor * attn_norm;
5271
5519
 
@@ -5321,9 +5569,7 @@ struct llm_build_falcon : public llm_graph_context {
5321
5569
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5322
5570
  }
5323
5571
 
5324
- if (il == n_layer - 1) {
5325
- // skip computing output for unused tokens
5326
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5572
+ if (il == n_layer - 1 && inp_out_ids) {
5327
5573
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5328
5574
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
5329
5575
  attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
@@ -5392,6 +5638,8 @@ struct llm_build_grok : public llm_graph_context {
5392
5638
 
5393
5639
  auto * inp_attn = build_attn_inp_kv_unified();
5394
5640
 
5641
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5642
+
5395
5643
  for (int il = 0; il < n_layer; ++il) {
5396
5644
  ggml_tensor * inpSA = inpL;
5397
5645
 
@@ -5451,9 +5699,7 @@ struct llm_build_grok : public llm_graph_context {
5451
5699
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
5452
5700
  }
5453
5701
 
5454
- if (il == n_layer - 1) {
5455
- // skip computing output for unused tokens
5456
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5702
+ if (il == n_layer - 1 && inp_out_ids) {
5457
5703
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5458
5704
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5459
5705
  }
@@ -5552,6 +5798,8 @@ struct llm_build_dbrx : public llm_graph_context {
5552
5798
 
5553
5799
  auto * inp_attn = build_attn_inp_kv_unified();
5554
5800
 
5801
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5802
+
5555
5803
  for (int il = 0; il < n_layer; ++il) {
5556
5804
  ggml_tensor * inpSA = inpL;
5557
5805
 
@@ -5602,9 +5850,7 @@ struct llm_build_dbrx : public llm_graph_context {
5602
5850
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5603
5851
  }
5604
5852
 
5605
- if (il == n_layer - 1) {
5606
- // skip computing output for unused tokens
5607
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5853
+ if (il == n_layer - 1 && inp_out_ids) {
5608
5854
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5609
5855
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5610
5856
  }
@@ -5684,6 +5930,8 @@ struct llm_build_starcoder : public llm_graph_context {
5684
5930
  inpL = ggml_add(ctx0, inpL, pos);
5685
5931
  cb(inpL, "inpL", -1);
5686
5932
 
5933
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
5934
+
5687
5935
  for (int il = 0; il < n_layer; ++il) {
5688
5936
  cur = build_norm(inpL,
5689
5937
  model.layers[il].attn_norm,
@@ -5716,9 +5964,7 @@ struct llm_build_starcoder : public llm_graph_context {
5716
5964
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5717
5965
  }
5718
5966
 
5719
- if (il == n_layer - 1) {
5720
- // skip computing output for unused tokens
5721
- ggml_tensor * inp_out_ids = build_inp_out_ids();
5967
+ if (il == n_layer - 1 && inp_out_ids) {
5722
5968
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5723
5969
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
5724
5970
  }
@@ -5783,6 +6029,8 @@ struct llm_build_refact : public llm_graph_context {
5783
6029
 
5784
6030
  auto * inp_attn = build_attn_inp_kv_unified();
5785
6031
 
6032
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6033
+
5786
6034
  for (int il = 0; il < n_layer; ++il) {
5787
6035
  ggml_tensor * inpSA = inpL;
5788
6036
 
@@ -5815,9 +6063,7 @@ struct llm_build_refact : public llm_graph_context {
5815
6063
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5816
6064
  }
5817
6065
 
5818
- if (il == n_layer - 1) {
5819
- // skip computing output for unused tokens
5820
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6066
+ if (il == n_layer - 1 && inp_out_ids) {
5821
6067
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5822
6068
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
5823
6069
  }
@@ -5903,78 +6149,79 @@ struct llm_build_bert : public llm_graph_context {
5903
6149
 
5904
6150
  auto * inp_attn = build_attn_inp_no_cache();
5905
6151
 
5906
- // iterate layers
6152
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6153
+
5907
6154
  for (int il = 0; il < n_layer; ++il) {
5908
6155
  ggml_tensor * cur = inpL;
5909
6156
 
5910
- ggml_tensor * Qcur;
5911
- ggml_tensor * Kcur;
5912
- ggml_tensor * Vcur;
6157
+ {
6158
+ ggml_tensor * Qcur;
6159
+ ggml_tensor * Kcur;
6160
+ ggml_tensor * Vcur;
5913
6161
 
5914
- // self-attention
5915
- if (model.layers[il].wqkv) {
5916
- cur = build_lora_mm(model.layers[il].wqkv, cur);
5917
- cb(cur, "wqkv", il);
6162
+ // self-attention
6163
+ if (model.layers[il].wqkv) {
6164
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
6165
+ cb(cur, "wqkv", il);
5918
6166
 
5919
- if (model.layers[il].bqkv) {
5920
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5921
- cb(cur, "bqkv", il);
6167
+ if (model.layers[il].bqkv) {
6168
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6169
+ cb(cur, "bqkv", il);
6170
+ }
6171
+
6172
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6173
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6174
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6175
+ } else {
6176
+ Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
6177
+ Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
6178
+ Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
5922
6179
  }
5923
6180
 
5924
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5925
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5926
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5927
- } else {
5928
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
5929
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
5930
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
5931
- }
6181
+ if (model.layers[il].attn_q_norm) {
6182
+ Qcur = build_norm(Qcur,
6183
+ model.layers[il].attn_q_norm,
6184
+ model.layers[il].attn_q_norm_b,
6185
+ LLM_NORM, il);
6186
+ }
5932
6187
 
5933
- if (model.layers[il].attn_q_norm) {
5934
- Qcur = build_norm(Qcur,
5935
- model.layers[il].attn_q_norm,
5936
- model.layers[il].attn_q_norm_b,
5937
- LLM_NORM, il);
5938
- }
6188
+ if (model.layers[il].attn_k_norm) {
6189
+ Kcur = build_norm(Kcur,
6190
+ model.layers[il].attn_k_norm,
6191
+ model.layers[il].attn_k_norm_b,
6192
+ LLM_NORM, il);
6193
+ }
5939
6194
 
5940
- if (model.layers[il].attn_k_norm) {
5941
- Kcur = build_norm(Kcur,
5942
- model.layers[il].attn_k_norm,
5943
- model.layers[il].attn_k_norm_b,
5944
- LLM_NORM, il);
5945
- }
6195
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6196
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6197
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
5946
6198
 
5947
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5948
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5949
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6199
+ // RoPE
6200
+ if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
6201
+ Qcur = ggml_rope_ext(
6202
+ ctx0, Qcur, inp_pos, nullptr,
6203
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6204
+ ext_factor, attn_factor, beta_fast, beta_slow
6205
+ );
5950
6206
 
5951
- // RoPE
5952
- if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
5953
- Qcur = ggml_rope_ext(
5954
- ctx0, Qcur, inp_pos, nullptr,
5955
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
5956
- ext_factor, attn_factor, beta_fast, beta_slow
5957
- );
6207
+ Kcur = ggml_rope_ext(
6208
+ ctx0, Kcur, inp_pos, nullptr,
6209
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6210
+ ext_factor, attn_factor, beta_fast, beta_slow
6211
+ );
6212
+ }
5958
6213
 
5959
- Kcur = ggml_rope_ext(
5960
- ctx0, Kcur, inp_pos, nullptr,
5961
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
5962
- ext_factor, attn_factor, beta_fast, beta_slow
5963
- );
5964
- }
5965
-
5966
- cb(Qcur, "Qcur", il);
5967
- cb(Kcur, "Kcur", il);
5968
- cb(Vcur, "Vcur", il);
6214
+ cb(Qcur, "Qcur", il);
6215
+ cb(Kcur, "Kcur", il);
6216
+ cb(Vcur, "Vcur", il);
5969
6217
 
5970
- cur = build_attn(inp_attn, gf,
5971
- model.layers[il].wo, model.layers[il].bo,
5972
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
5973
- cb(cur, "kqv_out", il);
6218
+ cur = build_attn(inp_attn, gf,
6219
+ model.layers[il].wo, model.layers[il].bo,
6220
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6221
+ cb(cur, "kqv_out", il);
6222
+ }
5974
6223
 
5975
- if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
5976
- // skip computing output for unused tokens
5977
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6224
+ if (il == n_layer - 1 && inp_out_ids) {
5978
6225
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
5979
6226
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
5980
6227
  }
@@ -6023,7 +6270,7 @@ struct llm_build_bert : public llm_graph_context {
6023
6270
  model.layers[il].ffn_gate, NULL, NULL,
6024
6271
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
6025
6272
  NULL,
6026
- LLM_FFN_GELU, LLM_FFN_PAR, il);
6273
+ model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
6027
6274
  cb(cur, "ffn_out", il);
6028
6275
  } else {
6029
6276
  cur = build_ffn(cur,
@@ -6054,6 +6301,118 @@ struct llm_build_bert : public llm_graph_context {
6054
6301
  }
6055
6302
  };
6056
6303
 
6304
+ struct llm_build_neo_bert : public llm_graph_context {
6305
+ llm_build_neo_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6306
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6307
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6308
+
6309
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6310
+
6311
+ ggml_tensor * cur;
6312
+ ggml_tensor * inpL;
6313
+ ggml_tensor * inp_pos = build_inp_pos();
6314
+
6315
+ // construct input embeddings (token, type, position)
6316
+ inpL = build_inp_embd(model.tok_embd);
6317
+ cb(inpL, "inp_embd", -1);
6318
+
6319
+ auto * inp_attn = build_attn_inp_no_cache();
6320
+
6321
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6322
+
6323
+ for (int il = 0; il < n_layer; ++il) {
6324
+ ggml_tensor * cur = inpL;
6325
+
6326
+ // pre-norm
6327
+ cur = build_norm(inpL,
6328
+ model.layers[il].attn_norm, NULL,
6329
+ LLM_NORM_RMS, il);
6330
+
6331
+ {
6332
+ ggml_tensor * Qcur;
6333
+ ggml_tensor * Kcur;
6334
+ ggml_tensor * Vcur;
6335
+
6336
+ // self-attention
6337
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
6338
+ cb(cur, "wqkv", il);
6339
+
6340
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6341
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6342
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6343
+
6344
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6345
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6346
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6347
+
6348
+ // RoPE
6349
+ Qcur = ggml_rope_ext(
6350
+ ctx0, Qcur, inp_pos, nullptr,
6351
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6352
+ ext_factor, attn_factor, beta_fast, beta_slow
6353
+ );
6354
+
6355
+ Kcur = ggml_rope_ext(
6356
+ ctx0, Kcur, inp_pos, nullptr,
6357
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6358
+ ext_factor, attn_factor, beta_fast, beta_slow
6359
+ );
6360
+
6361
+ cb(Qcur, "Qcur", il);
6362
+ cb(Kcur, "Kcur", il);
6363
+ cb(Vcur, "Vcur", il);
6364
+
6365
+ cur = build_attn(inp_attn, gf,
6366
+ model.layers[il].wo, nullptr,
6367
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6368
+ cb(cur, "kqv_out", il);
6369
+ }
6370
+
6371
+ if (il == n_layer - 1 && inp_out_ids) {
6372
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6373
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6374
+ }
6375
+
6376
+ // re-add the layer input
6377
+ cur = ggml_add(ctx0, cur, inpL);
6378
+
6379
+ ggml_tensor * ffn_inp = cur;
6380
+ cb(ffn_inp, "ffn_inp", il);
6381
+
6382
+ // pre-norm
6383
+ cur = build_norm(ffn_inp,
6384
+ model.layers[il].ffn_norm, NULL,
6385
+ LLM_NORM_RMS, il);
6386
+ cb(cur, "ffn_norm", il);
6387
+
6388
+ // feed-forward network
6389
+ cur = build_ffn(cur,
6390
+ model.layers[il].ffn_up,
6391
+ NULL, NULL, NULL, NULL, NULL,
6392
+ model.layers[il].ffn_down,
6393
+ NULL, NULL, NULL,
6394
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
6395
+
6396
+ // attentions bypass the intermediate layer
6397
+ cur = ggml_add(ctx0, cur, ffn_inp);
6398
+
6399
+ // input for next layer
6400
+ inpL = cur;
6401
+ }
6402
+
6403
+ cur = inpL;
6404
+
6405
+ cur = build_norm(cur,
6406
+ model.output_norm_enc, NULL,
6407
+ LLM_NORM_RMS, -1);
6408
+
6409
+ cb(cur, "result_embd", -1);
6410
+ res->t_embd = cur;
6411
+
6412
+ ggml_build_forward_expand(gf, cur);
6413
+ }
6414
+ };
6415
+
6057
6416
  struct llm_build_bloom : public llm_graph_context {
6058
6417
  llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6059
6418
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -6074,6 +6433,8 @@ struct llm_build_bloom : public llm_graph_context {
6074
6433
  LLM_NORM, -1);
6075
6434
  cb(inpL, "inp_norm", -1);
6076
6435
 
6436
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6437
+
6077
6438
  for (int il = 0; il < n_layer; ++il) {
6078
6439
  cur = build_norm(inpL,
6079
6440
  model.layers[il].attn_norm,
@@ -6106,9 +6467,7 @@ struct llm_build_bloom : public llm_graph_context {
6106
6467
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6107
6468
  }
6108
6469
 
6109
- if (il == n_layer - 1) {
6110
- // skip computing output for unused tokens
6111
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6470
+ if (il == n_layer - 1 && inp_out_ids) {
6112
6471
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6113
6472
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6114
6473
  }
@@ -6185,6 +6544,8 @@ struct llm_build_mpt : public llm_graph_context {
6185
6544
  cb(inpL, "inpL", -1);
6186
6545
  }
6187
6546
 
6547
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6548
+
6188
6549
  for (int il = 0; il < n_layer; ++il) {
6189
6550
  ggml_tensor * attn_norm;
6190
6551
 
@@ -6247,9 +6608,7 @@ struct llm_build_mpt : public llm_graph_context {
6247
6608
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6248
6609
  }
6249
6610
 
6250
- if (il == n_layer - 1) {
6251
- // skip computing output for unused tokens
6252
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6611
+ if (il == n_layer - 1 && inp_out_ids) {
6253
6612
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6254
6613
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6255
6614
  }
@@ -6318,6 +6677,8 @@ struct llm_build_stablelm : public llm_graph_context {
6318
6677
 
6319
6678
  auto * inp_attn = build_attn_inp_kv_unified();
6320
6679
 
6680
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6681
+
6321
6682
  for (int il = 0; il < n_layer; ++il) {
6322
6683
  // norm
6323
6684
  cur = build_norm(inpL,
@@ -6393,9 +6754,7 @@ struct llm_build_stablelm : public llm_graph_context {
6393
6754
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6394
6755
  }
6395
6756
 
6396
- if (il == n_layer - 1) {
6397
- // skip computing output for unused tokens
6398
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6757
+ if (il == n_layer - 1 && inp_out_ids) {
6399
6758
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6400
6759
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6401
6760
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -6470,6 +6829,8 @@ struct llm_build_qwen : public llm_graph_context {
6470
6829
 
6471
6830
  auto * inp_attn = build_attn_inp_kv_unified();
6472
6831
 
6832
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6833
+
6473
6834
  for (int il = 0; il < n_layer; ++il) {
6474
6835
  ggml_tensor * inpSA = inpL;
6475
6836
 
@@ -6516,9 +6877,7 @@ struct llm_build_qwen : public llm_graph_context {
6516
6877
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6517
6878
  }
6518
6879
 
6519
- if (il == n_layer - 1) {
6520
- // skip computing output for unused tokens
6521
- ggml_tensor * inp_out_ids = build_inp_out_ids();
6880
+ if (il == n_layer - 1 && inp_out_ids) {
6522
6881
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6523
6882
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6524
6883
  }
@@ -6587,6 +6946,8 @@ struct llm_build_qwen2 : public llm_graph_context {
6587
6946
 
6588
6947
  auto * inp_attn = build_attn_inp_kv_unified();
6589
6948
 
6949
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6950
+
6590
6951
  for (int il = 0; il < n_layer; ++il) {
6591
6952
  ggml_tensor * inpSA = inpL;
6592
6953
 
@@ -6636,9 +6997,7 @@ struct llm_build_qwen2 : public llm_graph_context {
6636
6997
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6637
6998
  }
6638
6999
 
6639
- if (il == n_layer - 1) {
6640
- // skip computing output for unused tokens
6641
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7000
+ if (il == n_layer - 1 && inp_out_ids) {
6642
7001
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6643
7002
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6644
7003
  }
@@ -6708,6 +7067,8 @@ struct llm_build_qwen2vl : public llm_graph_context {
6708
7067
  int sections[4];
6709
7068
  std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
6710
7069
 
7070
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7071
+
6711
7072
  for (int il = 0; il < n_layer; ++il) {
6712
7073
  ggml_tensor * inpSA = inpL;
6713
7074
 
@@ -6757,9 +7118,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
6757
7118
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6758
7119
  }
6759
7120
 
6760
- if (il == n_layer - 1) {
6761
- // skip computing output for unused tokens
6762
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7121
+ if (il == n_layer - 1 && inp_out_ids) {
6763
7122
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6764
7123
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6765
7124
  }
@@ -6826,6 +7185,8 @@ struct llm_build_qwen2moe : public llm_graph_context {
6826
7185
 
6827
7186
  auto * inp_attn = build_attn_inp_kv_unified();
6828
7187
 
7188
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7189
+
6829
7190
  for (int il = 0; il < n_layer; ++il) {
6830
7191
  ggml_tensor * inpSA = inpL;
6831
7192
 
@@ -6884,9 +7245,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
6884
7245
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6885
7246
  }
6886
7247
 
6887
- if (il == n_layer - 1) {
6888
- // skip computing output for unused tokens
6889
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7248
+ if (il == n_layer - 1 && inp_out_ids) {
6890
7249
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6891
7250
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
6892
7251
  }
@@ -6985,6 +7344,8 @@ struct llm_build_qwen3 : public llm_graph_context {
6985
7344
 
6986
7345
  auto * inp_attn = build_attn_inp_kv_unified();
6987
7346
 
7347
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7348
+
6988
7349
  for (int il = 0; il < n_layer; ++il) {
6989
7350
  ggml_tensor * inpSA = inpL;
6990
7351
 
@@ -7037,9 +7398,7 @@ struct llm_build_qwen3 : public llm_graph_context {
7037
7398
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7038
7399
  }
7039
7400
 
7040
- if (il == n_layer - 1) {
7041
- // skip computing output for unused tokens
7042
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7401
+ if (il == n_layer - 1 && inp_out_ids) {
7043
7402
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7044
7403
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7045
7404
  }
@@ -7106,6 +7465,8 @@ struct llm_build_qwen3moe : public llm_graph_context {
7106
7465
 
7107
7466
  auto * inp_attn = build_attn_inp_kv_unified();
7108
7467
 
7468
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7469
+
7109
7470
  for (int il = 0; il < n_layer; ++il) {
7110
7471
  ggml_tensor * inpSA = inpL;
7111
7472
 
@@ -7158,9 +7519,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
7158
7519
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7159
7520
  }
7160
7521
 
7161
- if (il == n_layer - 1) {
7162
- // skip computing output for unused tokens
7163
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7522
+ if (il == n_layer - 1 && inp_out_ids) {
7164
7523
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7165
7524
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7166
7525
  }
@@ -7236,6 +7595,8 @@ struct llm_build_phi2 : public llm_graph_context {
7236
7595
 
7237
7596
  auto * inp_attn = build_attn_inp_kv_unified();
7238
7597
 
7598
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7599
+
7239
7600
  for (int il = 0; il < n_layer; ++il) {
7240
7601
  attn_norm_output = build_norm(inpL,
7241
7602
  model.layers[il].attn_norm,
@@ -7298,9 +7659,7 @@ struct llm_build_phi2 : public llm_graph_context {
7298
7659
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7299
7660
  }
7300
7661
 
7301
- if (il == n_layer - 1) {
7302
- // skip computing output for unused tokens
7303
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7662
+ if (il == n_layer - 1 && inp_out_ids) {
7304
7663
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7305
7664
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7306
7665
  attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
@@ -7372,6 +7731,8 @@ struct llm_build_phi3 : public llm_graph_context {
7372
7731
  inp_attn = build_attn_inp_kv_unified();
7373
7732
  }
7374
7733
 
7734
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7735
+
7375
7736
  for (int il = 0; il < n_layer; ++il) {
7376
7737
  auto * residual = inpL;
7377
7738
 
@@ -7435,9 +7796,7 @@ struct llm_build_phi3 : public llm_graph_context {
7435
7796
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
7436
7797
  }
7437
7798
 
7438
- if (il == n_layer - 1) {
7439
- // skip computing output for unused tokens
7440
- ggml_tensor* inp_out_ids = build_inp_out_ids();
7799
+ if (il == n_layer - 1 && inp_out_ids) {
7441
7800
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7442
7801
  residual = ggml_get_rows(ctx0, residual, inp_out_ids);
7443
7802
  }
@@ -7523,15 +7882,16 @@ struct llm_build_plamo : public llm_graph_context {
7523
7882
 
7524
7883
  auto * inp_attn = build_attn_inp_kv_unified();
7525
7884
 
7526
- for (int il = 0; il < n_layer; ++il) {
7885
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7527
7886
 
7887
+ for (int il = 0; il < n_layer; ++il) {
7528
7888
  // norm
7529
7889
  cur = build_norm(inpL,
7530
7890
  model.layers[il].attn_norm, NULL,
7531
7891
  LLM_NORM_RMS, il);
7532
7892
  cb(cur, "attn_norm", il);
7533
7893
 
7534
- ggml_tensor * attention_norm = cur;
7894
+ ggml_tensor * sa_inp = cur;
7535
7895
 
7536
7896
  // self-attention
7537
7897
  {
@@ -7569,18 +7929,17 @@ struct llm_build_plamo : public llm_graph_context {
7569
7929
  model.layers[il].wo, NULL,
7570
7930
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7571
7931
  }
7572
- ggml_tensor * sa_out = cur;
7573
-
7574
- cur = attention_norm;
7575
7932
 
7576
- if (il == n_layer - 1) {
7577
- // skip computing output for unused tokens
7578
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7933
+ if (il == n_layer - 1 && inp_out_ids) {
7579
7934
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7580
- sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
7935
+ sa_inp = ggml_get_rows(ctx0, sa_inp, inp_out_ids);
7581
7936
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7582
7937
  }
7583
7938
 
7939
+ ggml_tensor * sa_out = cur;
7940
+
7941
+ cur = sa_inp;
7942
+
7584
7943
  // feed-forward network
7585
7944
  {
7586
7945
  cur = build_ffn(cur,
@@ -7645,6 +8004,8 @@ struct llm_build_gpt2 : public llm_graph_context {
7645
8004
  inpL = ggml_add(ctx0, inpL, pos);
7646
8005
  cb(inpL, "inpL", -1);
7647
8006
 
8007
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8008
+
7648
8009
  for (int il = 0; il < n_layer; ++il) {
7649
8010
  cur = build_norm(inpL,
7650
8011
  model.layers[il].attn_norm,
@@ -7677,9 +8038,7 @@ struct llm_build_gpt2 : public llm_graph_context {
7677
8038
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7678
8039
  }
7679
8040
 
7680
- if (il == n_layer - 1) {
7681
- // skip computing output for unused tokens
7682
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8041
+ if (il == n_layer - 1 && inp_out_ids) {
7683
8042
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7684
8043
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7685
8044
  }
@@ -7749,6 +8108,8 @@ struct llm_build_codeshell : public llm_graph_context {
7749
8108
 
7750
8109
  auto * inp_attn = build_attn_inp_kv_unified();
7751
8110
 
8111
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8112
+
7752
8113
  for (int il = 0; il < n_layer; ++il) {
7753
8114
  cur = build_norm(inpL,
7754
8115
  model.layers[il].attn_norm,
@@ -7793,9 +8154,7 @@ struct llm_build_codeshell : public llm_graph_context {
7793
8154
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7794
8155
  }
7795
8156
 
7796
- if (il == n_layer - 1) {
7797
- // skip computing output for unused tokens
7798
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8157
+ if (il == n_layer - 1 && inp_out_ids) {
7799
8158
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7800
8159
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
7801
8160
  }
@@ -7849,128 +8208,128 @@ struct llm_build_codeshell : public llm_graph_context {
7849
8208
 
7850
8209
  struct llm_build_orion : public llm_graph_context {
7851
8210
  llm_build_orion(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7852
- const int64_t n_embd_head = hparams.n_embd_head_v;
8211
+ const int64_t n_embd_head = hparams.n_embd_head_v;
7853
8212
 
7854
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7855
- GGML_ASSERT(n_embd_head == hparams.n_rot);
8213
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8214
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
7856
8215
 
7857
- ggml_tensor * cur;
7858
- ggml_tensor * inpL;
8216
+ ggml_tensor * cur;
8217
+ ggml_tensor * inpL;
7859
8218
 
7860
- inpL = build_inp_embd(model.tok_embd);
8219
+ inpL = build_inp_embd(model.tok_embd);
7861
8220
 
7862
- // inp_pos - contains the positions
7863
- ggml_tensor * inp_pos = build_inp_pos();
8221
+ // inp_pos - contains the positions
8222
+ ggml_tensor * inp_pos = build_inp_pos();
7864
8223
 
7865
- auto * inp_attn = build_attn_inp_kv_unified();
8224
+ auto * inp_attn = build_attn_inp_kv_unified();
7866
8225
 
7867
- for (int il = 0; il < n_layer; ++il) {
7868
- ggml_tensor * inpSA = inpL;
8226
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
7869
8227
 
7870
- // norm
7871
- cur = build_norm(inpL,
7872
- model.layers[il].attn_norm, model.layers[il].attn_norm_b,
7873
- LLM_NORM, il);
7874
- cb(cur, "attn_norm", il);
8228
+ for (int il = 0; il < n_layer; ++il) {
8229
+ ggml_tensor * inpSA = inpL;
7875
8230
 
7876
- // self-attention
7877
- {
7878
- // compute Q and K and RoPE them
7879
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
7880
- cb(Qcur, "Qcur", il);
7881
- // if (model.layers[il].bq) {
7882
- // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
7883
- // cb(Qcur, "Qcur", il);
7884
- // }
7885
-
7886
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
7887
- cb(Kcur, "Kcur", il);
7888
- // if (model.layers[il].bk) {
7889
- // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
7890
- // cb(Kcur, "Kcur", il);
7891
- // }
7892
-
7893
- ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
7894
- cb(Vcur, "Vcur", il);
7895
- // if (model.layers[il].bv) {
7896
- // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
7897
- // cb(Vcur, "Vcur", il);
7898
- // }
7899
-
7900
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7901
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7902
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7903
-
7904
- Qcur = ggml_rope_ext(
7905
- ctx0, Qcur, inp_pos, nullptr,
7906
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7907
- ext_factor, attn_factor, beta_fast, beta_slow
7908
- );
8231
+ // norm
8232
+ cur = build_norm(inpL,
8233
+ model.layers[il].attn_norm, model.layers[il].attn_norm_b,
8234
+ LLM_NORM, il);
8235
+ cb(cur, "attn_norm", il);
7909
8236
 
7910
- Kcur = ggml_rope_ext(
7911
- ctx0, Kcur, inp_pos, nullptr,
7912
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
7913
- ext_factor, attn_factor, beta_fast, beta_slow
7914
- );
8237
+ // self-attention
8238
+ {
8239
+ // compute Q and K and RoPE them
8240
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
8241
+ cb(Qcur, "Qcur", il);
8242
+ // if (model.layers[il].bq) {
8243
+ // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
8244
+ // cb(Qcur, "Qcur", il);
8245
+ // }
7915
8246
 
7916
- cb(Qcur, "Qcur", il);
7917
- cb(Kcur, "Kcur", il);
7918
- cb(Vcur, "Vcur", il);
8247
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
8248
+ cb(Kcur, "Kcur", il);
8249
+ // if (model.layers[il].bk) {
8250
+ // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
8251
+ // cb(Kcur, "Kcur", il);
8252
+ // }
7919
8253
 
7920
- cur = build_attn(inp_attn, gf,
7921
- model.layers[il].wo, NULL,
7922
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
7923
- }
8254
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
8255
+ cb(Vcur, "Vcur", il);
8256
+ // if (model.layers[il].bv) {
8257
+ // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
8258
+ // cb(Vcur, "Vcur", il);
8259
+ // }
7924
8260
 
7925
- if (il == n_layer - 1) {
7926
- // skip computing output for unused tokens
7927
- ggml_tensor * inp_out_ids = build_inp_out_ids();
7928
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
7929
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
7930
- }
8261
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8262
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8263
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7931
8264
 
7932
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
7933
- cb(ffn_inp, "ffn_inp", il);
8265
+ Qcur = ggml_rope_ext(
8266
+ ctx0, Qcur, inp_pos, nullptr,
8267
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8268
+ ext_factor, attn_factor, beta_fast, beta_slow
8269
+ );
7934
8270
 
7935
- // feed-forward network
7936
- cur = build_norm(ffn_inp,
7937
- model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
7938
- LLM_NORM, il);
7939
- cb(cur, "ffn_norm", il);
8271
+ Kcur = ggml_rope_ext(
8272
+ ctx0, Kcur, inp_pos, nullptr,
8273
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
8274
+ ext_factor, attn_factor, beta_fast, beta_slow
8275
+ );
7940
8276
 
7941
- cur = build_ffn(cur,
7942
- model.layers[il].ffn_up, NULL, NULL,
7943
- model.layers[il].ffn_gate, NULL, NULL,
7944
- model.layers[il].ffn_down, NULL, NULL,
7945
- NULL,
7946
- LLM_FFN_SILU, LLM_FFN_PAR, il);
7947
- cb(cur, "ffn_out", il);
8277
+ cb(Qcur, "Qcur", il);
8278
+ cb(Kcur, "Kcur", il);
8279
+ cb(Vcur, "Vcur", il);
7948
8280
 
7949
- cur = ggml_add(ctx0, cur, ffn_inp);
8281
+ cur = build_attn(inp_attn, gf,
8282
+ model.layers[il].wo, NULL,
8283
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8284
+ }
7950
8285
 
7951
- cur = build_cvec(cur, il);
7952
- cb(cur, "l_out", il);
8286
+ if (il == n_layer - 1 && inp_out_ids) {
8287
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8288
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8289
+ }
7953
8290
 
7954
- // input for next layer
7955
- inpL = cur;
7956
- }
8291
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
8292
+ cb(ffn_inp, "ffn_inp", il);
8293
+
8294
+ // feed-forward network
8295
+ cur = build_norm(ffn_inp,
8296
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
8297
+ LLM_NORM, il);
8298
+ cb(cur, "ffn_norm", il);
8299
+
8300
+ cur = build_ffn(cur,
8301
+ model.layers[il].ffn_up, NULL, NULL,
8302
+ model.layers[il].ffn_gate, NULL, NULL,
8303
+ model.layers[il].ffn_down, NULL, NULL,
8304
+ NULL,
8305
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
8306
+ cb(cur, "ffn_out", il);
8307
+
8308
+ cur = ggml_add(ctx0, cur, ffn_inp);
8309
+
8310
+ cur = build_cvec(cur, il);
8311
+ cb(cur, "l_out", il);
8312
+
8313
+ // input for next layer
8314
+ inpL = cur;
8315
+ }
7957
8316
 
7958
- cur = inpL;
8317
+ cur = inpL;
7959
8318
 
7960
- cur = build_norm(cur,
7961
- model.output_norm, model.output_norm_b,
7962
- LLM_NORM, -1);
8319
+ cur = build_norm(cur,
8320
+ model.output_norm, model.output_norm_b,
8321
+ LLM_NORM, -1);
7963
8322
 
7964
- cb(cur, "result_norm", -1);
7965
- res->t_embd = cur;
8323
+ cb(cur, "result_norm", -1);
8324
+ res->t_embd = cur;
7966
8325
 
7967
- // lm_head
7968
- cur = build_lora_mm(model.output, cur);
8326
+ // lm_head
8327
+ cur = build_lora_mm(model.output, cur);
7969
8328
 
7970
- cb(cur, "result_output", -1);
7971
- res->t_logits = cur;
8329
+ cb(cur, "result_output", -1);
8330
+ res->t_logits = cur;
7972
8331
 
7973
- ggml_build_forward_expand(gf, cur);
8332
+ ggml_build_forward_expand(gf, cur);
7974
8333
  }
7975
8334
  };
7976
8335
 
@@ -7991,6 +8350,8 @@ struct llm_build_internlm2 : public llm_graph_context {
7991
8350
 
7992
8351
  auto * inp_attn = build_attn_inp_kv_unified();
7993
8352
 
8353
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8354
+
7994
8355
  for (int il = 0; il < n_layer; ++il) {
7995
8356
  ggml_tensor * inpSA = inpL;
7996
8357
 
@@ -8049,9 +8410,7 @@ struct llm_build_internlm2 : public llm_graph_context {
8049
8410
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8050
8411
  }
8051
8412
 
8052
- if (il == n_layer - 1) {
8053
- // skip computing output for unused tokens
8054
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8413
+ if (il == n_layer - 1 && inp_out_ids) {
8055
8414
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8056
8415
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8057
8416
  }
@@ -8127,6 +8486,8 @@ struct llm_build_minicpm3 : public llm_graph_context {
8127
8486
 
8128
8487
  auto * inp_attn = build_attn_inp_kv_unified();
8129
8488
 
8489
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8490
+
8130
8491
  for (int il = 0; il < n_layer; ++il) {
8131
8492
  ggml_tensor * inpSA = inpL;
8132
8493
 
@@ -8246,15 +8607,13 @@ struct llm_build_minicpm3 : public llm_graph_context {
8246
8607
  q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
8247
8608
  }
8248
8609
 
8249
- if (il == n_layer - 1) {
8250
- // skip computing output for unused tokens
8251
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8610
+ if (il == n_layer - 1 && inp_out_ids) {
8252
8611
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8253
8612
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8254
8613
  }
8255
8614
 
8256
8615
  // scale_res - scale the hidden states for residual connection
8257
- const float scale_res = scale_depth/sqrtf(float(n_layer));
8616
+ const float scale_res = scale_depth/sqrtf(float(n_layer)); // TODO: is this correct?
8258
8617
  cur = ggml_scale(ctx0, cur, scale_res);
8259
8618
  cb(cur, "hidden_scaled", il);
8260
8619
 
@@ -8331,6 +8690,8 @@ struct llm_build_gemma : public llm_graph_context {
8331
8690
 
8332
8691
  auto * inp_attn = build_attn_inp_kv_unified();
8333
8692
 
8693
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8694
+
8334
8695
  for (int il = 0; il < n_layer; ++il) {
8335
8696
  // norm
8336
8697
  cur = build_norm(inpL,
@@ -8376,9 +8737,7 @@ struct llm_build_gemma : public llm_graph_context {
8376
8737
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
8377
8738
  }
8378
8739
 
8379
- if (il == n_layer - 1) {
8380
- // skip computing output for unused tokens
8381
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8740
+ if (il == n_layer - 1 && inp_out_ids) {
8382
8741
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8383
8742
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8384
8743
  }
@@ -8447,6 +8806,8 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
8447
8806
 
8448
8807
  auto * inp_attn = build_attn_inp_kv_unified_iswa();
8449
8808
 
8809
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8810
+
8450
8811
  for (int il = 0; il < n_layer; ++il) {
8451
8812
  // norm
8452
8813
  cur = build_norm(inpL,
@@ -8484,32 +8845,23 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
8484
8845
  cb(Kcur, "Kcur", il);
8485
8846
  cb(Vcur, "Vcur", il);
8486
8847
 
8487
- // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
8488
- switch (model.type) {
8489
- case LLM_TYPE_2B:
8490
- case LLM_TYPE_9B:
8491
- case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); break;
8492
- default: GGML_ABORT("fatal error");
8493
- };
8494
- cb(Qcur, "Qcur_scaled", il);
8848
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
8495
8849
 
8496
8850
  cur = build_attn(inp_attn, gf,
8497
8851
  model.layers[il].wo, NULL,
8498
8852
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
8499
8853
  }
8500
8854
 
8855
+ if (il == n_layer - 1 && inp_out_ids) {
8856
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8857
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8858
+ }
8859
+
8501
8860
  cur = build_norm(cur,
8502
8861
  model.layers[il].attn_post_norm, NULL,
8503
8862
  LLM_NORM_RMS, il);
8504
8863
  cb(cur, "attn_post_norm", il);
8505
8864
 
8506
- if (il == n_layer - 1) {
8507
- // skip computing output for unused tokens
8508
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8509
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8510
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8511
- }
8512
-
8513
8865
  ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
8514
8866
  cb(sa_out, "sa_out", il);
8515
8867
 
@@ -8588,6 +8940,8 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
8588
8940
  // TODO: is causal == true correct? might need some changes
8589
8941
  auto * inp_attn = build_attn_inp_kv_unified_iswa();
8590
8942
 
8943
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8944
+
8591
8945
  for (int il = 0; il < n_layer; ++il) {
8592
8946
  const float freq_base_l = model.get_rope_freq_base (cparams, il);
8593
8947
  const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
@@ -8632,9 +8986,17 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
8632
8986
  cb(Kcur, "Kcur", il);
8633
8987
  cb(Vcur, "Vcur", il);
8634
8988
 
8989
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
8990
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
8991
+
8635
8992
  cur = build_attn(inp_attn, gf,
8636
8993
  model.layers[il].wo, NULL,
8637
- Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
8994
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
8995
+ }
8996
+
8997
+ if (il == n_layer - 1 && inp_out_ids) {
8998
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8999
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8638
9000
  }
8639
9001
 
8640
9002
  cur = build_norm(cur,
@@ -8642,13 +9004,6 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
8642
9004
  LLM_NORM_RMS, il);
8643
9005
  cb(cur, "attn_post_norm", il);
8644
9006
 
8645
- if (il == n_layer - 1) {
8646
- // skip computing output for unused tokens
8647
- ggml_tensor * inp_out_ids = build_inp_out_ids();
8648
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8649
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8650
- }
8651
-
8652
9007
  ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
8653
9008
  cb(sa_out, "sa_out", il);
8654
9009
 
@@ -8701,8 +9056,444 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
8701
9056
  }
8702
9057
  };
8703
9058
 
8704
- // TODO: move up next to build_starcoder
8705
- struct llm_build_starcoder2 : public llm_graph_context {
9059
+ struct llm_build_gemma3n_iswa : public llm_graph_context {
9060
+ const llama_model & model;
9061
+ ggml_cgraph * gf;
9062
+
9063
+ const int64_t n_embd_head;
9064
+ const int64_t n_embd_altup;
9065
+ const int64_t n_altup;
9066
+ const int i_altup_act;
9067
+ const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
9068
+ const int n_layer_sparsity = 10; // number of layers using activation sparsity
9069
+ const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
9070
+
9071
+ ggml_tensor * one; // containing single element 1.0f
9072
+
9073
+ llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
9074
+ : llm_graph_context(params),
9075
+ model(model),
9076
+ gf(gf),
9077
+ n_embd_head(model.hparams.n_embd_head_k),
9078
+ n_embd_altup(model.hparams.n_embd_altup),
9079
+ n_altup(model.hparams.n_altup),
9080
+ i_altup_act(model.hparams.i_altup_act) {
9081
+ ggml_tensor * cur;
9082
+ ggml_tensor * inpL;
9083
+
9084
+ // TODO: remove this when ggml_scale_add is implemented
9085
+ one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
9086
+ {
9087
+ auto inp = std::make_unique<llm_graph_input_one>();
9088
+ inp->one = one;
9089
+ res->add_input(std::move(inp));
9090
+ }
9091
+
9092
+ inpL = build_inp_embd(model.tok_embd);
9093
+
9094
+ // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
9095
+ if (ubatch.token) {
9096
+ inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
9097
+ cb(inpL, "inp_scaled", -1);
9098
+ }
9099
+
9100
+ // inp_pos - contains the positions
9101
+ ggml_tensor * inp_pos = build_inp_pos();
9102
+
9103
+ // TODO: is causal == true correct? might need some changes
9104
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
9105
+
9106
+ // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
9107
+ ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
9108
+
9109
+ // inpL now has only 1 altup, project it to the rest of the altups
9110
+ // these "added" altups will be concat to the last dim of inpL
9111
+ {
9112
+ ggml_tensor * target_magnitude = calc_magnitude(inpL);
9113
+ ggml_tensor * inp_repeated = ggml_repeat_4d(ctx0, inpL, n_embd, n_tokens, n_altup - 1, 1);
9114
+ ggml_tensor * altup_added = ggml_mul_mat(ctx0, model.altup_proj, inp_repeated); // shape: [n_embd, n_tokens, n_altup - 1]
9115
+ ggml_tensor * new_magnitude = calc_magnitude(altup_added);
9116
+ altup_added = ggml_div(ctx0,
9117
+ ggml_mul(ctx0, altup_added, target_magnitude),
9118
+ new_magnitude);
9119
+ inpL = ggml_concat(ctx0, inpL, altup_added, 2); // shape: [n_embd, n_tokens, n_altup]
9120
+ cb(inpL, "inp_stacked", -1);
9121
+ }
9122
+
9123
+ // inpL now has shape: [n_embd, n_tokens, n_altup]
9124
+ // inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer]
9125
+
9126
+ for (int il = 0; il < n_layer; ++il) {
9127
+ // this block is made to be closely resemble Gemma3p5DecoderLayer on python code
9128
+ const bool has_kv = (il < n_layer_kv);
9129
+
9130
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
9131
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
9132
+
9133
+ ggml_tensor * cur = inpL; // [n_embd, n_tokens, n_altup]
9134
+ ggml_tensor * predictions = altup_predict(cur, il); // [n_embd, n_tokens, n_altup]
9135
+
9136
+ // predicted value will go through self-attention and laurel
9137
+ ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); // [n_embd, n_tokens]
9138
+ cur = active_prediction;
9139
+ cb(cur, "active_prediction", il);
9140
+
9141
+ // norm
9142
+ cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
9143
+ cb(cur, "attn_norm", il);
9144
+
9145
+ // laurel
9146
+ ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
9147
+
9148
+ // self-attention
9149
+ if (has_kv) {
9150
+ // compute Q and K and RoPE them
9151
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
9152
+ cb(Qcur, "Qcur", il);
9153
+
9154
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
9155
+ cb(Kcur, "Kcur", il);
9156
+
9157
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
9158
+ cb(Vcur, "Vcur", il);
9159
+
9160
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9161
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9162
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
9163
+
9164
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
9165
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
9166
+ Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps);
9167
+
9168
+ cb(Qcur, "Qcur_normed", il);
9169
+ cb(Kcur, "Kcur_normed", il);
9170
+ cb(Vcur, "Vcur_normed", il);
9171
+
9172
+ Qcur = ggml_rope_ext(
9173
+ ctx0, Qcur, inp_pos, nullptr,
9174
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
9175
+ ext_factor, attn_factor, beta_fast, beta_slow);
9176
+
9177
+ Kcur = ggml_rope_ext(
9178
+ ctx0, Kcur, inp_pos, nullptr,
9179
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
9180
+ ext_factor, attn_factor, beta_fast, beta_slow);
9181
+
9182
+ cb(Qcur, "Qcur_pos", il);
9183
+ cb(Kcur, "Kcur_pos", il);
9184
+
9185
+ cur = build_attn(inp_attn, gf,
9186
+ model.layers[il].wo, NULL,
9187
+ Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
9188
+ } else {
9189
+ // no KV layers
9190
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
9191
+ cb(Qcur, "Qcur", il);
9192
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
9193
+
9194
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
9195
+ cb(Qcur, "Qcur_normed", il);
9196
+
9197
+ Qcur = ggml_rope_ext(
9198
+ ctx0, Qcur, inp_pos, nullptr,
9199
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
9200
+ ext_factor, attn_factor, beta_fast, beta_slow);
9201
+ cb(Qcur, "Qcur_pos", il);
9202
+
9203
+ cur = build_attn(inp_attn, gf,
9204
+ model.layers[il].wo, NULL,
9205
+ Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
9206
+ }
9207
+
9208
+ cur = build_norm(cur,
9209
+ model.layers[il].attn_post_norm, NULL,
9210
+ LLM_NORM_RMS, il);
9211
+ cb(cur, "attn_post_norm", il);
9212
+
9213
+ cur = ggml_add(ctx0, cur, active_prediction); // [n_embd, n_tokens]
9214
+ cb(cur, "attn_gated", il);
9215
+
9216
+ ggml_tensor * attn_laurel = ggml_scale(ctx0,
9217
+ ggml_add(ctx0, cur, laurel_out),
9218
+ 1.0f / sqrtf(2.0f)); // [n_embd, n_tokens]
9219
+ cb(attn_laurel, "attn_laurel", il);
9220
+
9221
+ cur = build_norm(attn_laurel,
9222
+ model.layers[il].ffn_norm, NULL,
9223
+ LLM_NORM_RMS, il);
9224
+ cb(cur, "ffn_norm", il);
9225
+
9226
+ // feed-forward network
9227
+ {
9228
+ ggml_tensor * up_proj = build_lora_mm(model.layers[il].ffn_up, cur);
9229
+ ggml_tensor * gate_proj = build_lora_mm(model.layers[il].ffn_gate, cur);
9230
+
9231
+ if (il < n_layer_sparsity) {
9232
+ // apply activation sparsity
9233
+ gate_proj = gaussian_topk(gate_proj);
9234
+ }
9235
+ gate_proj = ggml_gelu(ctx0, gate_proj);
9236
+
9237
+ cur = ggml_mul(ctx0, up_proj, gate_proj);
9238
+ cur = build_lora_mm(model.layers[il].ffn_down, cur);
9239
+ cb(cur, "ffn_out", il);
9240
+ }
9241
+
9242
+ cur = build_norm(cur,
9243
+ model.layers[il].ffn_post_norm, NULL,
9244
+ LLM_NORM_RMS, -1);
9245
+ cb(cur, "ffn_post_norm", il);
9246
+
9247
+ ggml_tensor * attn_ffw_laurel_gated = ggml_add(ctx0, cur, attn_laurel); // [n_embd, n_tokens]
9248
+ cb(attn_ffw_laurel_gated, "attn_ffw_laurel_gated", il);
9249
+
9250
+ ggml_tensor * corrected = altup_correct(predictions, attn_ffw_laurel_gated, il); // [n_embd, n_tokens, n_altup]
9251
+
9252
+ ggml_tensor * first_prediction; // [n_embd, n_tokens]
9253
+ {
9254
+ first_prediction = view_2d_slice(corrected, i_altup_act); // [n_embd, n_tokens]
9255
+ first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale);
9256
+ first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction);
9257
+ first_prediction = ggml_gelu(ctx0, first_prediction); // [n_embd_altup, n_tokens]
9258
+ cb(first_prediction, "first_prediction_gated", il);
9259
+ ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_altup, n_tokens]
9260
+ first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer); // [n_embd_altup, n_tokens]
9261
+ cb(first_prediction, "first_prediction_scaled", il);
9262
+
9263
+ first_prediction = build_lora_mm(model.layers[il].per_layer_proj, first_prediction); // [n_embd, n_tokens]
9264
+ first_prediction = build_norm(first_prediction,
9265
+ model.layers[il].per_layer_post_norm, NULL,
9266
+ LLM_NORM_RMS, il);
9267
+ cb(first_prediction, "first_prediction_out", il);
9268
+ }
9269
+
9270
+ // equivalent to python code: corrected_predictions[1:] += first_prediction
9271
+ {
9272
+ ggml_tensor * slice_first = view_2d_slice(corrected, 0);
9273
+ ggml_tensor * slice_rest = ggml_view_3d(ctx0, corrected, n_embd, n_tokens, n_altup - 1,
9274
+ ggml_row_size(corrected->type, n_embd),
9275
+ ggml_row_size(corrected->type, n_embd*n_tokens),
9276
+ n_embd*n_tokens*ggml_element_size(corrected));
9277
+ ggml_tensor * tmp = ggml_add(ctx0, slice_rest, first_prediction); // [n_embd, n_tokens, n_altup - 1]
9278
+ corrected = ggml_concat(ctx0, slice_first, tmp, 2); // [n_embd, n_tokens, n_altup]
9279
+ }
9280
+
9281
+ cur = corrected; // [n_embd, n_tokens, n_altup]
9282
+ cur = build_cvec(cur, il);
9283
+ cb(cur, "l_out", il);
9284
+
9285
+ // input for next layer
9286
+ inpL = cur;
9287
+ }
9288
+
9289
+ cur = inpL; // [n_embd, n_tokens, n_altup]
9290
+
9291
+ // cur now has multiple altup(s), we want to merge them back to 1 altup
9292
+ {
9293
+ ggml_tensor * target_magnitude = calc_magnitude(view_2d_slice(cur, i_altup_act)); // [n_embd, n_tokens]
9294
+ // do a view to skip the first slice (active altup)
9295
+ ggml_tensor * alt_slice = ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1,
9296
+ ggml_row_size(cur->type, n_embd),
9297
+ ggml_row_size(cur->type, n_embd*n_tokens),
9298
+ n_embd*n_tokens*ggml_element_size(cur));
9299
+ ggml_tensor * altup_unembd = ggml_mul_mat(ctx0, model.altup_unembd_proj, alt_slice); // shape: [n_embd, n_tokens, n_altup - 1]
9300
+ ggml_tensor * new_magnitude = calc_magnitude(altup_unembd);
9301
+ altup_unembd = ggml_div(ctx0,
9302
+ ggml_mul(ctx0, altup_unembd, target_magnitude),
9303
+ new_magnitude);
9304
+ cb(altup_unembd, "altup_unembd", -1);
9305
+
9306
+ // equivalent to torch.mean(hidden_states, dim=0)
9307
+ cur = view_2d_slice(cur, 0); // [n_embd, n_tokens]
9308
+ for (int i = 0; i < n_altup - 1; ++i) {
9309
+ cur = ggml_add(ctx0, cur, view_2d_slice(altup_unembd, i));
9310
+ }
9311
+ cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup)); // [n_embd, n_tokens]
9312
+ cb(cur, "unembd_merged", -1);
9313
+ }
9314
+
9315
+ // cur now has shape: [n_embd, n_tokens]
9316
+
9317
+ // TODO: move this to right after the last KV layer
9318
+ {
9319
+ // skip computing output for unused tokens
9320
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9321
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9322
+ }
9323
+
9324
+ cur = build_norm(cur,
9325
+ model.output_norm, NULL,
9326
+ LLM_NORM_RMS, -1);
9327
+
9328
+ cb(cur, "result_norm", -1);
9329
+ res->t_embd = cur;
9330
+
9331
+ cur = build_lora_mm(model.output, cur);
9332
+
9333
+ {
9334
+ // final logit soft-capping
9335
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
9336
+ cur = ggml_tanh(ctx0, cur);
9337
+ cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
9338
+ }
9339
+
9340
+ cb(cur, "result_output", -1);
9341
+ res->t_logits = cur;
9342
+
9343
+ ggml_build_forward_expand(gf, cur);
9344
+ }
9345
+
9346
+ ggml_tensor * calc_magnitude(ggml_tensor * x) {
9347
+ return ggml_sqrt(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, x)));
9348
+ }
9349
+
9350
+ // get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim
9351
+ ggml_tensor * view_2d_slice(ggml_tensor * x, int idx) {
9352
+ GGML_ASSERT(idx < (int)x->ne[2]);
9353
+ return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1],
9354
+ ggml_row_size(x->type, x->ne[0]),
9355
+ idx * x->ne[0] * x->ne[1] * ggml_element_size(x));
9356
+ }
9357
+
9358
+ // equivalent to get_per_layer_inputs() in python code
9359
+ // output shape: [n_embd_altup, n_layer, n_tokens]
9360
+ ggml_tensor * get_per_layer_inputs() {
9361
+ auto inp = std::make_unique<llm_graph_input_embd>();
9362
+ ggml_tensor * inp_per_layer;
9363
+ if (ubatch.token) {
9364
+ inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
9365
+ ggml_set_input(inp->tokens);
9366
+ res->t_tokens = inp->tokens;
9367
+ inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
9368
+ inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
9369
+ inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float)n_embd_altup));
9370
+ cb(inp_per_layer, "inp_per_layer_selected", -1);
9371
+ } else {
9372
+ GGML_ABORT("TODO: support embd input");
9373
+ }
9374
+ res->add_input(std::move(inp));
9375
+ return inp_per_layer;
9376
+ }
9377
+
9378
+ // equivalent to project_per_layer_inputs() in python code
9379
+ // this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim
9380
+ // output shape: [n_embd_altup, n_tokens, n_layer]
9381
+ ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) {
9382
+ const float per_layer_projection_scale = 1.0f / sqrtf((float)n_embd);
9383
+ const float per_layer_input_scale = 1.0f / sqrtf(2.0f);
9384
+
9385
+ ggml_tensor * per_layer_proj = ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds);
9386
+ per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale);
9387
+ per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_altup, n_layer, n_tokens);
9388
+ per_layer_proj = build_norm(per_layer_proj,
9389
+ model.per_layer_proj_norm, NULL,
9390
+ LLM_NORM_RMS, -1); // [n_embd_altup, n_layer, n_tokens]
9391
+ cb(per_layer_proj, "per_layer_proj", -1);
9392
+
9393
+ inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
9394
+ inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
9395
+ cb(inp_per_layer, "inp_per_layer", -1);
9396
+
9397
+ // permute to shape: [n_embd_altup, n_tokens, n_layer]
9398
+ inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3));
9399
+ return inp_per_layer;
9400
+ }
9401
+
9402
+ // input cur shape: [n_altup, n_tokens]
9403
+ // output shape: [n_altup, n_tokens]
9404
+ ggml_tensor * laurel(ggml_tensor * cur, int il) {
9405
+ ggml_tensor * tmp = cur;
9406
+ tmp = build_lora_mm(model.layers[il].laurel_l, tmp);
9407
+ tmp = build_lora_mm(model.layers[il].laurel_r, tmp);
9408
+ tmp = build_norm(tmp, model.layers[il].laurel_post_norm, NULL, LLM_NORM_RMS, il);
9409
+ tmp = ggml_add(ctx0, tmp, cur);
9410
+ cb(tmp, "laurel_out", il);
9411
+ return tmp;
9412
+ }
9413
+
9414
+ // input x shape: [n_embd, n_tokens]
9415
+ // output shape: [n_embd, n_tokens]
9416
+ ggml_tensor * gaussian_topk(ggml_tensor * x) {
9417
+ ggml_tensor * mean = ggml_mean(ctx0, x);
9418
+ ggml_tensor * std = ggml_sqrt(ctx0, ggml_scale(ctx0,
9419
+ ggml_sum_rows(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x, mean))),
9420
+ 1.0f / (float)(x->ne[0] - 1)
9421
+ ));
9422
+ ggml_tensor * cutoff_x = ggml_add(ctx0, mean, ggml_scale(ctx0, std, f_sparsity_std_mul));
9423
+ return ggml_relu(ctx0, ggml_sub(ctx0, x, cutoff_x));
9424
+ }
9425
+
9426
+ //
9427
+ // altup functions
9428
+ //
9429
+
9430
+ // equivalent to compute_router_modalities() in python code
9431
+ // input x shape: [n_embd, n_tokens]
9432
+ // output shape: [n_altup, n_tokens]
9433
+ ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il) {
9434
+ ggml_tensor * router_inputs = build_norm(x,
9435
+ model.layers[il].altup_router_norm, NULL,
9436
+ LLM_NORM_RMS, il);
9437
+
9438
+ // router_input_scale
9439
+ router_inputs = ggml_scale(ctx0, router_inputs, 1.0f / (float)n_embd);
9440
+
9441
+ ggml_tensor * output = ggml_mul_mat(ctx0, model.layers[il].altup_router, router_inputs);
9442
+ return ggml_tanh(ctx0, output); // [n_altup, n_tokens]
9443
+ }
9444
+
9445
+ // input cur shape: [n_embd, n_tokens, n_altup]
9446
+ // output shape: [n_embd, n_tokens, n_altup]
9447
+ ggml_tensor * altup_predict(ggml_tensor * cur, int il) {
9448
+ ggml_tensor * activated = view_2d_slice(cur, i_altup_act); // [n_embd, n_tokens]
9449
+ ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
9450
+ cb(modalities, "modalities", il);
9451
+
9452
+ ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_predict_coef, modalities);
9453
+ cb(all_coefs, "all_coefs", il);
9454
+ // first dim now having n_altup^2 elements, we reshape it to 2D (so we end up with 3D tensor)
9455
+ all_coefs = ggml_reshape_3d(ctx0, all_coefs, n_altup, n_altup, n_tokens);
9456
+
9457
+ // permute to [n_altup, n_embd, n_tokens]
9458
+ ggml_tensor * cur_permuted = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
9459
+ ggml_tensor * predictions = ggml_mul_mat(ctx0, cur_permuted, all_coefs); // [n_altup, n_embd, n_tokens]
9460
+
9461
+ // final shape must be the same as cur: [n_embd, n_tokens, n_altup]
9462
+ predictions = ggml_cont(ctx0, ggml_permute(ctx0, predictions, 0, 2, 1, 3));
9463
+ predictions = ggml_add(ctx0, predictions, cur);
9464
+ cb(predictions, "predictions", il);
9465
+
9466
+ return predictions;
9467
+ }
9468
+
9469
+ // input predictions shape: [n_embd, n_tokens, n_altup]
9470
+ // input activated shape: [n_embd, n_tokens]
9471
+ // output shape: [n_embd, n_tokens, n_altup]
9472
+ ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) {
9473
+ ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens]
9474
+ cb(modalities, "modalities", il);
9475
+
9476
+ ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act);
9477
+ ggml_tensor * innovation = ggml_sub(ctx0, activated, active_prediction); // [n_embd, n_tokens]
9478
+ cb(innovation, "innovation", il);
9479
+
9480
+ ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
9481
+ all_coefs = ggml_add(ctx0, all_coefs, one);
9482
+ cb(all_coefs, "all_coefs", il);
9483
+ all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
9484
+ all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
9485
+
9486
+ innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
9487
+ ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
9488
+ corrected = ggml_add(ctx0, corrected, predictions); // [n_embd, n_tokens, n_altup]
9489
+ cb(corrected, "corrected", il);
9490
+
9491
+ return corrected;
9492
+ }
9493
+ };
9494
+
9495
+ // TODO: move up next to build_starcoder
9496
+ struct llm_build_starcoder2 : public llm_graph_context {
8706
9497
  llm_build_starcoder2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8707
9498
  const int64_t n_embd_head = hparams.n_embd_head_v;
8708
9499
 
@@ -8719,6 +9510,8 @@ struct llm_build_starcoder2 : public llm_graph_context {
8719
9510
 
8720
9511
  auto * inp_attn = build_attn_inp_kv_unified();
8721
9512
 
9513
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9514
+
8722
9515
  for (int il = 0; il < n_layer; ++il) {
8723
9516
  ggml_tensor * inpSA = inpL;
8724
9517
 
@@ -8777,9 +9570,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
8777
9570
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
8778
9571
  }
8779
9572
 
8780
- if (il == n_layer - 1) {
8781
- // skip computing output for unused tokens
8782
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9573
+ if (il == n_layer - 1 && inp_out_ids) {
8783
9574
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8784
9575
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
8785
9576
  }
@@ -8840,8 +9631,9 @@ struct llm_build_mamba : public llm_graph_context {
8840
9631
  // {n_embd, n_tokens}
8841
9632
  inpL = build_inp_embd(model.tok_embd);
8842
9633
 
8843
- ggml_tensor * state_copy = build_inp_s_copy();
8844
- ggml_tensor * state_mask = build_inp_s_mask();
9634
+ auto * rs_inp = build_rs_inp();
9635
+
9636
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
8845
9637
 
8846
9638
  for (int il = 0; il < n_layer; ++il) {
8847
9639
  // norm
@@ -8850,12 +9642,9 @@ struct llm_build_mamba : public llm_graph_context {
8850
9642
  LLM_NORM_RMS, il);
8851
9643
  cb(cur, "attn_norm", il);
8852
9644
 
8853
- //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il);
8854
- cur = build_mamba_layer(gf, cur, state_copy, state_mask, ubatch, il);
9645
+ cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
8855
9646
 
8856
- if (il == n_layer - 1) {
8857
- // skip computing output for unused tokens
8858
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9647
+ if (il == n_layer - 1 && inp_out_ids) {
8859
9648
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
8860
9649
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
8861
9650
  }
@@ -8889,15 +9678,14 @@ struct llm_build_mamba : public llm_graph_context {
8889
9678
 
8890
9679
  // TODO: split
8891
9680
  ggml_tensor * build_mamba_layer(
8892
- ggml_cgraph * gf,
8893
- ggml_tensor * cur,
8894
- ggml_tensor * state_copy,
8895
- ggml_tensor * state_mask,
8896
- const llama_ubatch & ubatch,
8897
- int il) const {
8898
- const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
9681
+ llm_graph_input_rs * inp,
9682
+ ggml_cgraph * gf,
9683
+ ggml_tensor * cur,
9684
+ const llama_ubatch & ubatch,
9685
+ int il) const {
9686
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
8899
9687
 
8900
- const auto kv_head = kv_state->get_head();
9688
+ const auto kv_head = mctx_cur->get_head();
8901
9689
 
8902
9690
  const int64_t d_conv = hparams.ssm_d_conv;
8903
9691
  const int64_t d_inner = hparams.ssm_d_inner;
@@ -8915,17 +9703,17 @@ struct llm_build_mamba : public llm_graph_context {
8915
9703
  GGML_ASSERT(ubatch.equal_seqs);
8916
9704
  GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
8917
9705
 
8918
- ggml_tensor * conv_states_all = kv_state->get_k_l(il);
8919
- ggml_tensor * ssm_states_all = kv_state->get_v_l(il);
9706
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
9707
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
8920
9708
 
8921
9709
  // (ab)using the KV cache to store the states
8922
- ggml_tensor * conv = build_copy_mask_state(
8923
- gf, conv_states_all, state_copy, state_mask,
8924
- hparams.n_embd_k_s(), n_seqs);
9710
+ ggml_tensor * conv = build_rs(
9711
+ inp, gf, conv_states_all,
9712
+ hparams.n_embd_r(), n_seqs);
8925
9713
  conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
8926
- ggml_tensor * ssm = build_copy_mask_state(
8927
- gf, ssm_states_all, state_copy, state_mask,
8928
- hparams.n_embd_v_s(), n_seqs);
9714
+ ggml_tensor * ssm = build_rs(
9715
+ inp, gf, ssm_states_all,
9716
+ hparams.n_embd_s(), n_seqs);
8929
9717
  ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
8930
9718
 
8931
9719
  // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
@@ -9038,13 +9826,15 @@ struct llm_build_command_r : public llm_graph_context {
9038
9826
 
9039
9827
  auto * inp_attn = build_attn_inp_kv_unified();
9040
9828
 
9041
- for (int il = 0; il < n_layer; ++il) {
9829
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9042
9830
 
9831
+ for (int il = 0; il < n_layer; ++il) {
9043
9832
  // norm
9044
9833
  cur = build_norm(inpL,
9045
9834
  model.layers[il].attn_norm, NULL,
9046
9835
  LLM_NORM, il);
9047
9836
  cb(cur, "attn_norm", il);
9837
+
9048
9838
  ggml_tensor * ffn_inp = cur;
9049
9839
 
9050
9840
  // self-attention
@@ -9112,9 +9902,7 @@ struct llm_build_command_r : public llm_graph_context {
9112
9902
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9113
9903
  }
9114
9904
 
9115
- if (il == n_layer - 1) {
9116
- // skip computing output for unused tokens
9117
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9905
+ if (il == n_layer - 1 && inp_out_ids) {
9118
9906
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9119
9907
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9120
9908
  ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
@@ -9185,6 +9973,8 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
9185
9973
 
9186
9974
  auto * inp_attn = build_attn_inp_kv_unified_iswa();
9187
9975
 
9976
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
9977
+
9188
9978
  for (int il = 0; il < n_layer; ++il) {
9189
9979
  const bool is_swa = hparams.is_swa(il);
9190
9980
 
@@ -9247,9 +10037,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
9247
10037
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9248
10038
  }
9249
10039
 
9250
- if (il == n_layer - 1) {
9251
- // skip computing output for unused tokens
9252
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10040
+ if (il == n_layer - 1 && inp_out_ids) {
9253
10041
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9254
10042
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9255
10043
  ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
@@ -9320,6 +10108,8 @@ struct llm_build_olmo : public llm_graph_context {
9320
10108
 
9321
10109
  auto * inp_attn = build_attn_inp_kv_unified();
9322
10110
 
10111
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10112
+
9323
10113
  for (int il = 0; il < n_layer; ++il) {
9324
10114
  ggml_tensor * inpSA = inpL;
9325
10115
 
@@ -9378,9 +10168,7 @@ struct llm_build_olmo : public llm_graph_context {
9378
10168
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9379
10169
  }
9380
10170
 
9381
- if (il == n_layer - 1) {
9382
- // skip computing output for unused tokens
9383
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10171
+ if (il == n_layer - 1 && inp_out_ids) {
9384
10172
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9385
10173
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
9386
10174
  }
@@ -9448,6 +10236,8 @@ struct llm_build_olmo2 : public llm_graph_context {
9448
10236
 
9449
10237
  auto * inp_attn = build_attn_inp_kv_unified();
9450
10238
 
10239
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10240
+
9451
10241
  for (int il = 0; il < n_layer; ++il) {
9452
10242
  ggml_tensor * inpSA = inpL;
9453
10243
 
@@ -9498,18 +10288,16 @@ struct llm_build_olmo2 : public llm_graph_context {
9498
10288
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9499
10289
  }
9500
10290
 
10291
+ if (il == n_layer - 1 && inp_out_ids) {
10292
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10293
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10294
+ }
10295
+
9501
10296
  cur = build_norm(cur,
9502
10297
  model.layers[il].attn_post_norm, NULL,
9503
10298
  LLM_NORM_RMS, il);
9504
10299
  cb(cur, "attn_post_norm", il);
9505
10300
 
9506
- if (il == n_layer - 1) {
9507
- // skip computing output for unused tokens
9508
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9509
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9510
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
9511
- }
9512
-
9513
10301
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
9514
10302
  cb(ffn_inp, "ffn_inp", il);
9515
10303
 
@@ -9577,6 +10365,8 @@ struct llm_build_olmoe : public llm_graph_context {
9577
10365
 
9578
10366
  auto * inp_attn = build_attn_inp_kv_unified();
9579
10367
 
10368
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10369
+
9580
10370
  for (int il = 0; il < n_layer; ++il) {
9581
10371
  ggml_tensor * inpSA = inpL;
9582
10372
 
@@ -9631,9 +10421,7 @@ struct llm_build_olmoe : public llm_graph_context {
9631
10421
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9632
10422
  }
9633
10423
 
9634
- if (il == n_layer - 1) {
9635
- // skip computing output for unused tokens
9636
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10424
+ if (il == n_layer - 1 && inp_out_ids) {
9637
10425
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9638
10426
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
9639
10427
  }
@@ -9703,6 +10491,8 @@ struct llm_build_openelm : public llm_graph_context {
9703
10491
 
9704
10492
  auto * inp_attn = build_attn_inp_kv_unified();
9705
10493
 
10494
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10495
+
9706
10496
  for (int il = 0; il < n_layer; ++il) {
9707
10497
  const int64_t n_head = hparams.n_head(il);
9708
10498
  const int64_t n_head_kv = hparams.n_head_kv(il);
@@ -9764,11 +10554,9 @@ struct llm_build_openelm : public llm_graph_context {
9764
10554
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9765
10555
  }
9766
10556
 
9767
- if (il == n_layer - 1) {
9768
- // skip computing output for unused tokens
9769
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10557
+ if (il == n_layer - 1 && inp_out_ids) {
9770
10558
  residual = ggml_get_rows(ctx0, residual, inp_out_ids);
9771
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10559
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9772
10560
  }
9773
10561
 
9774
10562
  ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
@@ -9834,6 +10622,8 @@ struct llm_build_gptneox : public llm_graph_context {
9834
10622
 
9835
10623
  auto * inp_attn = build_attn_inp_kv_unified();
9836
10624
 
10625
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10626
+
9837
10627
  for (int il = 0; il < n_layer; ++il) {
9838
10628
  cur = build_norm(inpL,
9839
10629
  model.layers[il].attn_norm,
@@ -9878,9 +10668,7 @@ struct llm_build_gptneox : public llm_graph_context {
9878
10668
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
9879
10669
  }
9880
10670
 
9881
- if (il == n_layer - 1) {
9882
- // skip computing output for unused tokens
9883
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10671
+ if (il == n_layer - 1 && inp_out_ids) {
9884
10672
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9885
10673
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9886
10674
  }
@@ -9982,6 +10770,8 @@ struct llm_build_arctic : public llm_graph_context {
9982
10770
 
9983
10771
  auto * inp_attn = build_attn_inp_kv_unified();
9984
10772
 
10773
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10774
+
9985
10775
  for (int il = 0; il < n_layer; ++il) {
9986
10776
  ggml_tensor * inpSA = inpL;
9987
10777
 
@@ -10028,9 +10818,7 @@ struct llm_build_arctic : public llm_graph_context {
10028
10818
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
10029
10819
  }
10030
10820
 
10031
- if (il == n_layer - 1) {
10032
- // skip computing output for unused tokens
10033
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10821
+ if (il == n_layer - 1 && inp_out_ids) {
10034
10822
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10035
10823
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10036
10824
  }
@@ -10122,6 +10910,8 @@ struct llm_build_deepseek : public llm_graph_context {
10122
10910
 
10123
10911
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
10124
10912
 
10913
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10914
+
10125
10915
  for (int il = 0; il < n_layer; ++il) {
10126
10916
  ggml_tensor * inpSA = inpL;
10127
10917
 
@@ -10183,14 +10973,11 @@ struct llm_build_deepseek : public llm_graph_context {
10183
10973
  Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
10184
10974
  }
10185
10975
 
10186
- if (il == n_layer - 1) {
10187
- // skip computing output for unused tokens
10188
- ggml_tensor * inp_out_ids = build_inp_out_ids();
10976
+ if (il == n_layer - 1 && inp_out_ids) {
10189
10977
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10190
10978
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10191
10979
  }
10192
10980
 
10193
-
10194
10981
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
10195
10982
  cb(ffn_inp, "ffn_inp", il);
10196
10983
 
@@ -10298,6 +11085,8 @@ struct llm_build_deepseek2 : public llm_graph_context {
10298
11085
 
10299
11086
  auto * inp_attn = build_attn_inp_kv_unified();
10300
11087
 
11088
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11089
+
10301
11090
  for (int il = 0; il < n_layer; ++il) {
10302
11091
  ggml_tensor * inpSA = inpL;
10303
11092
 
@@ -10447,9 +11236,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
10447
11236
  }
10448
11237
  }
10449
11238
 
10450
- if (il == n_layer - 1) {
10451
- // skip computing output for unused tokens
10452
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11239
+ if (il == n_layer - 1 && inp_out_ids) {
10453
11240
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10454
11241
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10455
11242
  }
@@ -10545,6 +11332,8 @@ struct llm_build_bitnet : public llm_graph_context {
10545
11332
 
10546
11333
  auto * inp_attn = build_attn_inp_kv_unified();
10547
11334
 
11335
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11336
+
10548
11337
  for (int il = 0; il < n_layer; ++il) {
10549
11338
  ggml_tensor * inpSA = inpL;
10550
11339
 
@@ -10627,9 +11416,7 @@ struct llm_build_bitnet : public llm_graph_context {
10627
11416
  cb(cur, "attn_o_out", il);
10628
11417
  }
10629
11418
 
10630
- if (il == n_layer - 1) {
10631
- // skip computing output for unused tokens
10632
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11419
+ if (il == n_layer - 1 && inp_out_ids) {
10633
11420
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10634
11421
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10635
11422
  }
@@ -10704,6 +11491,8 @@ struct llm_build_t5_enc : public llm_graph_context {
10704
11491
 
10705
11492
  auto * inp_attn = build_attn_inp_no_cache();
10706
11493
 
11494
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11495
+
10707
11496
  for (int il = 0; il < n_layer; ++il) {
10708
11497
  ggml_tensor * inpSA = inpL;
10709
11498
 
@@ -10737,9 +11526,7 @@ struct llm_build_t5_enc : public llm_graph_context {
10737
11526
  cb(cur, "kqv_out", il);
10738
11527
  }
10739
11528
 
10740
- if (il == n_layer - 1) {
10741
- // skip computing output for unused tokens
10742
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11529
+ if (il == n_layer - 1 && inp_out_ids) {
10743
11530
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10744
11531
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10745
11532
  }
@@ -10810,6 +11597,8 @@ struct llm_build_t5_dec : public llm_graph_context {
10810
11597
  auto * inp_attn_self = build_attn_inp_kv_unified();
10811
11598
  auto * inp_attn_cross = build_attn_inp_cross();
10812
11599
 
11600
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11601
+
10813
11602
  for (int il = 0; il < n_layer; ++il) {
10814
11603
  ggml_tensor * inpSA = inpL;
10815
11604
 
@@ -10901,11 +11690,8 @@ struct llm_build_t5_dec : public llm_graph_context {
10901
11690
  //cb(cur, "kqv_out", il);
10902
11691
  }
10903
11692
 
10904
- if (il == n_layer - 1) {
10905
- // skip computing output for unused tokens
10906
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11693
+ if (il == n_layer - 1 && inp_out_ids) {
10907
11694
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10908
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
10909
11695
  inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
10910
11696
  }
10911
11697
 
@@ -10975,6 +11761,8 @@ struct llm_build_jais : public llm_graph_context {
10975
11761
 
10976
11762
  auto * inp_attn = build_attn_inp_kv_unified();
10977
11763
 
11764
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11765
+
10978
11766
  for (int il = 0; il < n_layer; ++il) {
10979
11767
  cur = build_norm(inpL,
10980
11768
  model.layers[il].attn_norm,
@@ -11007,9 +11795,7 @@ struct llm_build_jais : public llm_graph_context {
11007
11795
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
11008
11796
  }
11009
11797
 
11010
- if (il == n_layer - 1) {
11011
- // skip computing output for unused tokens
11012
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11798
+ if (il == n_layer - 1 && inp_out_ids) {
11013
11799
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11014
11800
  inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
11015
11801
  }
@@ -11073,6 +11859,8 @@ struct llm_build_chatglm : public llm_graph_context {
11073
11859
 
11074
11860
  auto * inp_attn = build_attn_inp_kv_unified();
11075
11861
 
11862
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11863
+
11076
11864
  for (int il = 0; il < n_layer; ++il) {
11077
11865
  ggml_tensor * inpSA = inpL;
11078
11866
 
@@ -11139,9 +11927,7 @@ struct llm_build_chatglm : public llm_graph_context {
11139
11927
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11140
11928
  }
11141
11929
 
11142
- if (il == n_layer - 1) {
11143
- // skip computing output for unused tokens
11144
- ggml_tensor * inp_out_ids = build_inp_out_ids();
11930
+ if (il == n_layer - 1 && inp_out_ids) {
11145
11931
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11146
11932
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11147
11933
  }
@@ -11206,6 +11992,8 @@ struct llm_build_glm4 : public llm_graph_context {
11206
11992
 
11207
11993
  auto * inp_attn = build_attn_inp_kv_unified();
11208
11994
 
11995
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
11996
+
11209
11997
  for (int il = 0; il < n_layer; ++il) {
11210
11998
  ggml_tensor * inpSA = inpL;
11211
11999
 
@@ -11272,9 +12060,7 @@ struct llm_build_glm4 : public llm_graph_context {
11272
12060
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11273
12061
  }
11274
12062
 
11275
- if (il == n_layer - 1) {
11276
- // skip computing output for unused tokens
11277
- ggml_tensor * inp_out_ids = build_inp_out_ids();
12063
+ if (il == n_layer - 1 && inp_out_ids) {
11278
12064
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11279
12065
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11280
12066
  }
@@ -11357,6 +12143,8 @@ struct llm_build_nemotron : public llm_graph_context {
11357
12143
 
11358
12144
  auto * inp_attn = build_attn_inp_kv_unified();
11359
12145
 
12146
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12147
+
11360
12148
  for (int il = 0; il < n_layer; ++il) {
11361
12149
  ggml_tensor * inpSA = inpL;
11362
12150
 
@@ -11416,9 +12204,7 @@ struct llm_build_nemotron : public llm_graph_context {
11416
12204
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11417
12205
  }
11418
12206
 
11419
- if (il == n_layer - 1) {
11420
- // skip computing output for unused tokens
11421
- ggml_tensor * inp_out_ids = build_inp_out_ids();
12207
+ if (il == n_layer - 1 && inp_out_ids) {
11422
12208
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11423
12209
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11424
12210
  }
@@ -11486,6 +12272,8 @@ struct llm_build_exaone : public llm_graph_context {
11486
12272
 
11487
12273
  auto * inp_attn = build_attn_inp_kv_unified();
11488
12274
 
12275
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12276
+
11489
12277
  for (int il = 0; il < n_layer; ++il) {
11490
12278
  ggml_tensor * inpSA = inpL;
11491
12279
 
@@ -11547,9 +12335,7 @@ struct llm_build_exaone : public llm_graph_context {
11547
12335
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
11548
12336
  }
11549
12337
 
11550
- if (il == n_layer - 1) {
11551
- // skip computing output for unused tokens
11552
- ggml_tensor * inp_out_ids = build_inp_out_ids();
12338
+ if (il == n_layer - 1 && inp_out_ids) {
11553
12339
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11554
12340
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
11555
12341
  }
@@ -11636,14 +12422,13 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11636
12422
  }
11637
12423
 
11638
12424
  ggml_tensor * build_rwkv6_time_mix(
12425
+ llm_graph_input_rs * inp,
11639
12426
  ggml_cgraph * gf,
11640
12427
  ggml_tensor * cur,
11641
12428
  ggml_tensor * x_prev,
11642
- ggml_tensor * state_copy,
11643
- ggml_tensor * state_mask,
11644
12429
  const llama_ubatch & ubatch,
11645
12430
  int il) const {
11646
- const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
12431
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
11647
12432
 
11648
12433
  const auto n_tokens = ubatch.n_tokens;
11649
12434
  const auto n_seqs = ubatch.n_seqs;
@@ -11653,7 +12438,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11653
12438
  const auto n_head = n_embd / head_size;
11654
12439
  const auto n_head_kv = hparams.n_head_kv(il);
11655
12440
 
11656
- const auto kv_head = kv_state->get_head();
12441
+ const auto kv_head = mctx_cur->get_head();
11657
12442
 
11658
12443
  const auto & layer = model.layers[il];
11659
12444
 
@@ -11764,9 +12549,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11764
12549
  k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
11765
12550
  }
11766
12551
 
11767
- ggml_tensor * wkv_state = build_copy_mask_state(
11768
- gf, kv_state->get_v_l(il), state_copy, state_mask,
11769
- hparams.n_embd_v_s(), n_seqs);
12552
+ ggml_tensor * wkv_state = build_rs(
12553
+ inp, gf, mctx_cur->get_s_l(il),
12554
+ hparams.n_embd_s(), n_seqs);
11770
12555
 
11771
12556
  ggml_tensor * wkv_output;
11772
12557
  if (is_qrwkv) {
@@ -11784,9 +12569,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11784
12569
  wkv_state,
11785
12570
  ggml_view_1d(
11786
12571
  ctx0,
11787
- kv_state->get_v_l(il),
11788
- hparams.n_embd_v_s() * n_seqs,
11789
- hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_state->get_v_l(il))
12572
+ mctx_cur->get_s_l(il),
12573
+ hparams.n_embd_s() * n_seqs,
12574
+ hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
11790
12575
  )
11791
12576
  )
11792
12577
  );
@@ -11820,20 +12605,19 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11820
12605
  inpL = build_inp_embd(model.tok_embd);
11821
12606
  inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
11822
12607
 
11823
- ggml_tensor * state_copy = build_inp_s_copy();
11824
- ggml_tensor * state_mask = build_inp_s_mask();
12608
+ auto * rs_inp = build_rs_inp();
11825
12609
 
11826
12610
  const auto n_embd = hparams.n_embd;
11827
12611
  const auto n_seq_tokens = ubatch.n_seq_tokens;
11828
12612
  const auto n_seqs = ubatch.n_seqs;
11829
12613
 
12614
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12615
+
11830
12616
  for (int il = 0; il < n_layer; ++il) {
11831
12617
  const llama_layer * layer = &model.layers[il];
11832
12618
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
11833
12619
 
11834
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
11835
- gf, state_copy, state_mask, ubatch, il
11836
- );
12620
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
11837
12621
 
11838
12622
  ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
11839
12623
  ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
@@ -11848,7 +12632,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11848
12632
  1
11849
12633
  );
11850
12634
 
11851
- cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
12635
+ cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
11852
12636
 
11853
12637
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
11854
12638
  cb(ffn_inp, "ffn_inp", il);
@@ -11870,13 +12654,16 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11870
12654
  );
11871
12655
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
11872
12656
 
11873
- if (il == n_layer - 1) {
11874
- // skip computing output for unused tokens
11875
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
11876
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
11877
- ffn_norm = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens), inp_out_ids);
11878
- x_prev = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids);
11879
- cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
12657
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
12658
+ ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
12659
+ x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
12660
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
12661
+
12662
+ if (il == n_layer - 1 && inp_out_ids) {
12663
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
12664
+ ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
12665
+ x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
12666
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
11880
12667
  }
11881
12668
 
11882
12669
  cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6);
@@ -11911,27 +12698,26 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11911
12698
  // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
11912
12699
  struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
11913
12700
  llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
11914
- GGML_ASSERT(n_embd == hparams.n_embd_k_s());
12701
+ GGML_ASSERT(n_embd == hparams.n_embd_r());
11915
12702
 
11916
12703
  ggml_tensor * cur;
11917
12704
  ggml_tensor * inpL;
11918
12705
 
11919
12706
  inpL = build_inp_embd(model.tok_embd);
11920
12707
 
11921
- ggml_tensor * state_copy = build_inp_s_copy();
11922
- ggml_tensor * state_mask = build_inp_s_mask();
12708
+ auto * rs_inp = build_rs_inp();
11923
12709
 
11924
12710
  const auto n_embd = hparams.n_embd;
11925
12711
  const auto n_seq_tokens = ubatch.n_seq_tokens;
11926
12712
  const auto n_seqs = ubatch.n_seqs;
11927
12713
 
12714
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12715
+
11928
12716
  for (int il = 0; il < n_layer; ++il) {
11929
12717
  const llama_layer * layer = &model.layers[il];
11930
12718
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
11931
12719
 
11932
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
11933
- gf, state_copy, state_mask, ubatch, il
11934
- );
12720
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
11935
12721
 
11936
12722
  ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
11937
12723
  cb(att_norm, "attn_norm", il);
@@ -11943,7 +12729,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
11943
12729
  1
11944
12730
  );
11945
12731
 
11946
- cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
12732
+ cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
11947
12733
 
11948
12734
  token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
11949
12735
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -11951,11 +12737,12 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
11951
12737
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
11952
12738
  cb(ffn_inp, "ffn_inp", il);
11953
12739
 
11954
- if (il == n_layer - 1) {
11955
- // skip computing output for unused tokens
11956
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
11957
- cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
11958
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
12740
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
12741
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
12742
+
12743
+ if (il == n_layer - 1 && inp_out_ids) {
12744
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12745
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
11959
12746
  }
11960
12747
 
11961
12748
  // feed-forward network
@@ -12031,15 +12818,14 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12031
12818
  }
12032
12819
 
12033
12820
  ggml_tensor * build_rwkv7_time_mix(
12821
+ llm_graph_input_rs * inp,
12034
12822
  ggml_cgraph * gf,
12035
12823
  ggml_tensor * cur,
12036
12824
  ggml_tensor * x_prev,
12037
- ggml_tensor * state_copy,
12038
- ggml_tensor * state_mask,
12039
12825
  ggml_tensor *& first_layer_value,
12040
12826
  const llama_ubatch & ubatch,
12041
12827
  int il) const {
12042
- const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
12828
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
12043
12829
 
12044
12830
  const auto n_tokens = ubatch.n_tokens;
12045
12831
  const auto n_seqs = ubatch.n_seqs;
@@ -12048,7 +12834,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12048
12834
  const auto head_count = n_embd / head_size;
12049
12835
  const auto n_seq_tokens = ubatch.n_seq_tokens;
12050
12836
 
12051
- const auto kv_head = kv_state->get_head();
12837
+ const auto kv_head = mctx_cur->get_head();
12052
12838
 
12053
12839
  const auto & layer = model.layers[il];
12054
12840
 
@@ -12118,9 +12904,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12118
12904
  v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
12119
12905
  a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
12120
12906
 
12121
- ggml_tensor * wkv_state = build_copy_mask_state(
12122
- gf, kv_state->get_v_l(il), state_copy, state_mask,
12123
- hparams.n_embd_v_s(), n_seqs);
12907
+ ggml_tensor * wkv_state = build_rs(
12908
+ inp, gf, mctx_cur->get_s_l(il),
12909
+ hparams.n_embd_s(), n_seqs);
12124
12910
 
12125
12911
  ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
12126
12912
  cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
@@ -12133,9 +12919,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12133
12919
  wkv_state,
12134
12920
  ggml_view_1d(
12135
12921
  ctx0,
12136
- kv_state->get_v_l(il),
12137
- hparams.n_embd_v_s() * n_seqs,
12138
- hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_state->get_v_l(il))
12922
+ mctx_cur->get_s_l(il),
12923
+ hparams.n_embd_s() * n_seqs,
12924
+ hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il))
12139
12925
  )
12140
12926
  )
12141
12927
  );
@@ -12176,20 +12962,19 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12176
12962
  inpL = build_inp_embd(model.tok_embd);
12177
12963
  inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
12178
12964
 
12179
- ggml_tensor * state_copy = build_inp_s_copy();
12180
- ggml_tensor * state_mask = build_inp_s_mask();
12965
+ auto * rs_inp = build_rs_inp();
12181
12966
 
12182
12967
  const auto n_embd = hparams.n_embd;
12183
12968
  const auto n_seq_tokens = ubatch.n_seq_tokens;
12184
12969
  const auto n_seqs = ubatch.n_seqs;
12185
12970
 
12971
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
12972
+
12186
12973
  for (int il = 0; il < n_layer; ++il) {
12187
12974
  const llama_layer * layer = &model.layers[il];
12188
12975
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
12189
12976
 
12190
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
12191
- gf, state_copy, state_mask, ubatch, il
12192
- );
12977
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
12193
12978
 
12194
12979
  ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
12195
12980
  ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
@@ -12204,7 +12989,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12204
12989
  1
12205
12990
  );
12206
12991
 
12207
- cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
12992
+ cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
12208
12993
 
12209
12994
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
12210
12995
  cb(ffn_inp, "ffn_inp", il);
@@ -12226,12 +13011,14 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12226
13011
  );
12227
13012
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
12228
13013
 
12229
- if (il == n_layer - 1) {
12230
- // skip computing output for unused tokens
12231
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
12232
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
12233
- ffn_norm = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens), inp_out_ids);
12234
- x_prev = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids);
13014
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
13015
+ ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
13016
+ x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
13017
+
13018
+ if (il == n_layer - 1 && inp_out_ids) {
13019
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
13020
+ ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
13021
+ x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
12235
13022
  }
12236
13023
 
12237
13024
  cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
@@ -12262,7 +13049,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12262
13049
 
12263
13050
  struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12264
13051
  llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
12265
- GGML_ASSERT(n_embd == hparams.n_embd_k_s());
13052
+ GGML_ASSERT(n_embd == hparams.n_embd_r());
12266
13053
 
12267
13054
  ggml_tensor * cur;
12268
13055
  ggml_tensor * inpL;
@@ -12270,20 +13057,19 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12270
13057
 
12271
13058
  inpL = build_inp_embd(model.tok_embd);
12272
13059
 
12273
- ggml_tensor * state_copy = build_inp_s_copy();
12274
- ggml_tensor * state_mask = build_inp_s_mask();
13060
+ auto * rs_inp = build_rs_inp();
12275
13061
 
12276
13062
  const auto n_embd = hparams.n_embd;
12277
13063
  const auto n_seq_tokens = ubatch.n_seq_tokens;
12278
13064
  const auto n_seqs = ubatch.n_seqs;
12279
13065
 
13066
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13067
+
12280
13068
  for (int il = 0; il < n_layer; ++il) {
12281
13069
  const llama_layer * layer = &model.layers[il];
12282
13070
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
12283
13071
 
12284
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
12285
- gf, state_copy, state_mask, ubatch, il
12286
- );
13072
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
12287
13073
 
12288
13074
  ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
12289
13075
  cb(att_norm, "attn_norm", il);
@@ -12295,7 +13081,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12295
13081
  1
12296
13082
  );
12297
13083
 
12298
- cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
13084
+ cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
12299
13085
 
12300
13086
  token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
12301
13087
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -12303,11 +13089,12 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12303
13089
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
12304
13090
  cb(ffn_inp, "ffn_inp", il);
12305
13091
 
12306
- if (il == n_layer - 1) {
12307
- // skip computing output for unused tokens
12308
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
12309
- cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
12310
- ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
13092
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
13093
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
13094
+
13095
+ if (il == n_layer - 1 && inp_out_ids) {
13096
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13097
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
12311
13098
  }
12312
13099
 
12313
13100
  // feed-forward network
@@ -12376,6 +13163,9 @@ struct llm_build_granite : public llm_graph_context {
12376
13163
  auto * inp_attn = build_attn_inp_kv_unified();
12377
13164
 
12378
13165
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
13166
+
13167
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13168
+
12379
13169
  for (int il = 0; il < n_layer; ++il) {
12380
13170
  ggml_tensor * inpSA = inpL;
12381
13171
 
@@ -12438,9 +13228,7 @@ struct llm_build_granite : public llm_graph_context {
12438
13228
  cb(cur, "attn_out", il);
12439
13229
  }
12440
13230
 
12441
- if (il == n_layer - 1) {
12442
- // skip computing output for unused tokens
12443
- ggml_tensor * inp_out_ids = build_inp_out_ids();
13231
+ if (il == n_layer - 1 && inp_out_ids) {
12444
13232
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12445
13233
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
12446
13234
  }
@@ -12559,6 +13347,8 @@ struct llm_build_chameleon : public llm_graph_context {
12559
13347
 
12560
13348
  auto * inp_attn = build_attn_inp_kv_unified();
12561
13349
 
13350
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13351
+
12562
13352
  for (int il = 0; il < n_layer; ++il) {
12563
13353
  ggml_tensor * inpSA = inpL;
12564
13354
 
@@ -12635,21 +13425,19 @@ struct llm_build_chameleon : public llm_graph_context {
12635
13425
  cur = build_attn(inp_attn, gf,
12636
13426
  model.layers[il].wo, nullptr,
12637
13427
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
12638
-
12639
- if (hparams.swin_norm) {
12640
- cur = build_norm(cur,
12641
- model.layers[il].attn_norm, NULL,
12642
- LLM_NORM_RMS, il);
12643
- }
12644
13428
  }
12645
13429
 
12646
- if (il == n_layer - 1) {
12647
- // skip computing output for unused tokens
12648
- ggml_tensor * inp_out_ids = build_inp_out_ids();
13430
+ if (il == n_layer - 1 && inp_out_ids) {
12649
13431
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
12650
13432
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
12651
13433
  }
12652
13434
 
13435
+ if (hparams.swin_norm) {
13436
+ cur = build_norm(cur,
13437
+ model.layers[il].attn_norm, NULL,
13438
+ LLM_NORM_RMS, il);
13439
+ }
13440
+
12653
13441
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
12654
13442
  cb(ffn_inp, "ffn_inp", il);
12655
13443
 
@@ -12890,6 +13678,8 @@ struct llm_build_plm : public llm_graph_context {
12890
13678
 
12891
13679
  auto * inp_attn = build_attn_inp_kv_unified();
12892
13680
 
13681
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13682
+
12893
13683
  for (int il = 0; il < n_layer; ++il) {
12894
13684
  ggml_tensor * inpSA = inpL;
12895
13685
 
@@ -12993,9 +13783,7 @@ struct llm_build_plm : public llm_graph_context {
12993
13783
  q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
12994
13784
  }
12995
13785
 
12996
- if (il == n_layer - 1) {
12997
- // skip computing output for unused tokens
12998
- ggml_tensor * inp_out_ids = build_inp_out_ids();
13786
+ if (il == n_layer - 1 && inp_out_ids) {
12999
13787
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13000
13788
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13001
13789
  }
@@ -13055,6 +13843,8 @@ struct llm_build_bailingmoe : public llm_graph_context {
13055
13843
 
13056
13844
  auto * inp_attn = build_attn_inp_kv_unified();
13057
13845
 
13846
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13847
+
13058
13848
  for (int il = 0; il < n_layer; ++il) {
13059
13849
  ggml_tensor * inpSA = inpL;
13060
13850
 
@@ -13116,9 +13906,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
13116
13906
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
13117
13907
  }
13118
13908
 
13119
- if (il == n_layer - 1) {
13120
- // skip computing output for unused tokens
13121
- ggml_tensor * inp_out_ids = build_inp_out_ids();
13909
+ if (il == n_layer - 1 && inp_out_ids) {
13122
13910
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13123
13911
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13124
13912
  }
@@ -13187,69 +13975,375 @@ struct llm_build_bailingmoe : public llm_graph_context {
13187
13975
  }
13188
13976
  };
13189
13977
 
13190
- llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
13191
- llama_memory_i * res;
13978
+ struct llm_build_dots1 : public llm_graph_context {
13979
+ llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
13980
+ const int64_t n_embd_head = hparams.n_embd_head_v;
13192
13981
 
13193
- switch (arch) {
13194
- case LLM_ARCH_BERT:
13195
- case LLM_ARCH_JINA_BERT_V2:
13196
- case LLM_ARCH_NOMIC_BERT:
13197
- case LLM_ARCH_NOMIC_BERT_MOE:
13198
- case LLM_ARCH_WAVTOKENIZER_DEC:
13199
- {
13200
- res = nullptr;
13201
- } break;
13202
- case LLM_ARCH_MAMBA:
13203
- case LLM_ARCH_RWKV6:
13204
- case LLM_ARCH_RWKV6QWEN2:
13205
- case LLM_ARCH_RWKV7:
13206
- case LLM_ARCH_ARWKV7:
13982
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13983
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
13984
+
13985
+ ggml_tensor * cur;
13986
+ ggml_tensor * inpL;
13987
+
13988
+ inpL = build_inp_embd(model.tok_embd);
13989
+
13990
+ // inp_pos - contains the positions
13991
+ ggml_tensor * inp_pos = build_inp_pos();
13992
+
13993
+ auto * inp_attn = build_attn_inp_kv_unified();
13994
+
13995
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13996
+
13997
+ for (int il = 0; il < n_layer; ++il) {
13998
+ ggml_tensor * inpSA = inpL;
13999
+
14000
+ // norm
14001
+ cur = build_norm(inpL,
14002
+ model.layers[il].attn_norm, NULL,
14003
+ LLM_NORM_RMS, il);
14004
+ cb(cur, "attn_norm", il);
14005
+
14006
+ // self_attention
13207
14007
  {
13208
- res = new llama_kv_cache_recurrent(
13209
- *this,
13210
- GGML_TYPE_F32,
13211
- GGML_TYPE_F32,
13212
- cparams.offload_kqv,
13213
- std::max((uint32_t) 1, cparams.n_seq_max),
13214
- cparams.n_seq_max);
13215
- } break;
13216
- default:
14008
+ // compute Q and K and RoPE them
14009
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
14010
+ cb(Qcur, "Qcur", il);
14011
+
14012
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
14013
+ cb(Kcur, "Kcur", il);
14014
+
14015
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
14016
+ cb(Vcur, "Vcur", il);
14017
+
14018
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
14019
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
14020
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
14021
+
14022
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
14023
+ cb(Qcur, "Qcur_normed", il);
14024
+
14025
+ Qcur = ggml_rope_ext(
14026
+ ctx0, Qcur, inp_pos, nullptr,
14027
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14028
+ ext_factor, attn_factor, beta_fast, beta_slow
14029
+ );
14030
+
14031
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
14032
+ cb(Kcur, "Kcur_normed", il);
14033
+
14034
+ Kcur = ggml_rope_ext(
14035
+ ctx0, Kcur, inp_pos, nullptr,
14036
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14037
+ ext_factor, attn_factor, beta_fast, beta_slow
14038
+ );
14039
+
14040
+ cb(Qcur, "Qcur", il);
14041
+ cb(Kcur, "Kcur", il);
14042
+ cb(Vcur, "Vcur", il);
14043
+
14044
+ cur = build_attn(inp_attn, gf,
14045
+ model.layers[il].wo, model.layers[il].bo,
14046
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14047
+ }
14048
+
14049
+ if (il == n_layer - 1 && inp_out_ids) {
14050
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14051
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
14052
+ }
14053
+
14054
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
14055
+ cb(ffn_inp, "ffn_inp", il);
14056
+
14057
+ // MoE branch
14058
+ cur = build_norm(ffn_inp,
14059
+ model.layers[il].ffn_norm, NULL,
14060
+ LLM_NORM_RMS, il);
14061
+ cb(cur, "ffn_norm", il);
14062
+
14063
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
14064
+ cur = build_ffn(cur,
14065
+ model.layers[il].ffn_up, NULL, NULL,
14066
+ model.layers[il].ffn_gate, NULL, NULL,
14067
+ model.layers[il].ffn_down, NULL, NULL,
14068
+ NULL,
14069
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14070
+ cb(cur, "ffn_out", il);
14071
+ } else {
14072
+ ggml_tensor * moe_out =
14073
+ build_moe_ffn(cur,
14074
+ model.layers[il].ffn_gate_inp,
14075
+ model.layers[il].ffn_up_exps,
14076
+ model.layers[il].ffn_gate_exps,
14077
+ model.layers[il].ffn_down_exps,
14078
+ model.layers[il].ffn_exp_probs_b,
14079
+ n_expert, n_expert_used,
14080
+ LLM_FFN_SILU, hparams.expert_weights_norm,
14081
+ true, hparams.expert_weights_scale,
14082
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
14083
+ il);
14084
+ cb(moe_out, "ffn_moe_out", il);
14085
+
14086
+ {
14087
+ ggml_tensor * ffn_shexp = build_ffn(cur,
14088
+ model.layers[il].ffn_up_shexp, NULL, NULL,
14089
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
14090
+ model.layers[il].ffn_down_shexp, NULL, NULL,
14091
+ NULL,
14092
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14093
+ cb(ffn_shexp, "ffn_shexp", il);
14094
+
14095
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
14096
+ cb(cur, "ffn_out", il);
14097
+ }
14098
+ }
14099
+
14100
+ cur = ggml_add(ctx0, cur, ffn_inp);
14101
+
14102
+ cur = build_cvec(cur, il);
14103
+ cb(cur, "l_out", il);
14104
+
14105
+ // input for next layer
14106
+ inpL = cur;
14107
+ }
14108
+
14109
+ cur = inpL;
14110
+
14111
+ cur = build_norm(cur,
14112
+ model.output_norm, NULL,
14113
+ LLM_NORM_RMS, -1);
14114
+
14115
+ cb(cur, "result_norm", -1);
14116
+ res->t_embd = cur;
14117
+
14118
+ // lm_head
14119
+ cur = build_lora_mm(model.output, cur);
14120
+
14121
+ cb(cur, "result_output", -1);
14122
+ res->t_logits = cur;
14123
+
14124
+ ggml_build_forward_expand(gf, cur);
14125
+ }
14126
+ };
14127
+
14128
+ struct llm_build_arcee : public llm_graph_context {
14129
+ llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
14130
+ const int64_t n_embd_head = hparams.n_embd_head_v;
14131
+
14132
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
14133
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
14134
+
14135
+ ggml_tensor * cur;
14136
+ ggml_tensor * inpL;
14137
+
14138
+ inpL = build_inp_embd(model.tok_embd);
14139
+
14140
+ // inp_pos - contains the positions
14141
+ ggml_tensor * inp_pos = build_inp_pos();
14142
+
14143
+ auto * inp_attn = build_attn_inp_kv_unified();
14144
+
14145
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
14146
+
14147
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
14148
+
14149
+ for (int il = 0; il < n_layer; ++il) {
14150
+ ggml_tensor * inpSA = inpL;
14151
+
14152
+ // norm
14153
+ cur = build_norm(inpL,
14154
+ model.layers[il].attn_norm, NULL,
14155
+ LLM_NORM_RMS, il);
14156
+ cb(cur, "attn_norm", il);
14157
+
14158
+ // self-attention
13217
14159
  {
13218
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
14160
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
14161
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
13219
14162
 
13220
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
14163
+ // compute Q and K and RoPE them
14164
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
14165
+ cb(Qcur, "Qcur", il);
14166
+ if (model.layers[il].bq) {
14167
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
14168
+ cb(Qcur, "Qcur", il);
14169
+ }
13221
14170
 
13222
- LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
14171
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
14172
+ cb(Kcur, "Kcur", il);
14173
+ if (model.layers[il].bk) {
14174
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
14175
+ cb(Kcur, "Kcur", il);
14176
+ }
13223
14177
 
13224
- if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
13225
- GGML_ASSERT(hparams.is_swa_any());
14178
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
14179
+ cb(Vcur, "Vcur", il);
14180
+ if (model.layers[il].bv) {
14181
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
14182
+ cb(Vcur, "Vcur", il);
14183
+ }
13226
14184
 
13227
- res = new llama_kv_cache_unified_iswa(
13228
- *this,
13229
- params.type_k,
13230
- params.type_v,
13231
- !cparams.flash_attn,
13232
- cparams.offload_kqv,
13233
- params.swa_full,
13234
- cparams.n_ctx,
13235
- cparams.n_seq_max,
13236
- cparams.n_ubatch,
13237
- padding);
13238
- } else {
13239
- GGML_ASSERT(!hparams.is_swa_any());
14185
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
14186
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
14187
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
14188
+
14189
+ Qcur = ggml_rope_ext(
14190
+ ctx0, Qcur, inp_pos, rope_factors,
14191
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14192
+ ext_factor, attn_factor, beta_fast, beta_slow
14193
+ );
14194
+
14195
+ Kcur = ggml_rope_ext(
14196
+ ctx0, Kcur, inp_pos, rope_factors,
14197
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14198
+ ext_factor, attn_factor, beta_fast, beta_slow
14199
+ );
14200
+
14201
+ cb(Qcur, "Qcur", il);
14202
+ cb(Kcur, "Kcur", il);
14203
+ cb(Vcur, "Vcur", il);
14204
+
14205
+ cur = build_attn(inp_attn, gf,
14206
+ model.layers[il].wo, model.layers[il].bo,
14207
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
14208
+ cb(cur, "attn_out", il);
14209
+ }
14210
+
14211
+ if (il == n_layer - 1 && inp_out_ids) {
14212
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14213
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
14214
+ }
14215
+
14216
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
14217
+ cb(ffn_inp, "ffn_inp", il);
14218
+
14219
+ // feed-forward network
14220
+ // ARCEE uses relu^2 instead of silu
14221
+ cur = build_norm(ffn_inp,
14222
+ model.layers[il].ffn_norm, NULL,
14223
+ LLM_NORM_RMS, il);
14224
+ cb(cur, "ffn_norm", il);
14225
+
14226
+ cur = build_ffn(cur,
14227
+ model.layers[il].ffn_up, NULL, NULL,
14228
+ NULL, NULL, NULL,
14229
+ model.layers[il].ffn_down, NULL, NULL,
14230
+ NULL,
14231
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
14232
+ cb(cur, "ffn_out", il);
14233
+
14234
+ cur = ggml_add(ctx0, cur, ffn_inp);
14235
+ cb(cur, "ffn_out", il);
14236
+
14237
+ cur = build_cvec(cur, il);
14238
+ cb(cur, "l_out", il);
14239
+
14240
+ // input for next layer
14241
+ inpL = cur;
14242
+ }
14243
+
14244
+ cur = inpL;
14245
+
14246
+ cur = build_norm(cur,
14247
+ model.output_norm, NULL,
14248
+ LLM_NORM_RMS, -1);
14249
+
14250
+ cb(cur, "result_norm", -1);
14251
+ res->t_embd = cur;
13240
14252
 
13241
- res = new llama_kv_cache_unified(
14253
+ // lm_head
14254
+ cur = build_lora_mm(model.output, cur);
14255
+
14256
+ cb(cur, "result_output", -1);
14257
+ res->t_logits = cur;
14258
+
14259
+ ggml_build_forward_expand(gf, cur);
14260
+ }
14261
+ };
14262
+
14263
+ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
14264
+ llama_memory_i * res;
14265
+
14266
+ switch (arch) {
14267
+ // Models that need specific instantiation should be handled in the
14268
+ // switch statement
14269
+ case LLM_ARCH_BERT:
14270
+ case LLM_ARCH_JINA_BERT_V2:
14271
+ case LLM_ARCH_NOMIC_BERT:
14272
+ case LLM_ARCH_NOMIC_BERT_MOE:
14273
+ case LLM_ARCH_NEO_BERT:
14274
+ case LLM_ARCH_WAVTOKENIZER_DEC:
14275
+ {
14276
+ res = nullptr;
14277
+ } break;
14278
+ // Models that need standard caching should rely on recurrent/hybrid
14279
+ // checks
14280
+ default:
14281
+ {
14282
+ if (llm_arch_is_recurrent(arch)) {
14283
+ res = new llama_memory_recurrent(
13242
14284
  *this,
13243
14285
  nullptr,
13244
- params.type_k,
13245
- params.type_v,
13246
- !cparams.flash_attn,
14286
+ GGML_TYPE_F32,
14287
+ GGML_TYPE_F32,
13247
14288
  cparams.offload_kqv,
13248
- cparams.n_ctx,
13249
- cparams.n_seq_max,
13250
- padding,
13251
- hparams.n_swa,
13252
- hparams.swa_type);
14289
+ std::max((uint32_t) 1, cparams.n_seq_max),
14290
+ cparams.n_seq_max);
14291
+ } else if (llm_arch_is_hybrid(arch)) {
14292
+ const auto padding = llama_kv_cache_unified::get_padding(cparams);
14293
+
14294
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
14295
+
14296
+ res = new llama_memory_hybrid(
14297
+ /* model */ *this,
14298
+ /* attn_type_k */ params.type_k,
14299
+ /* attn_type_v */ params.type_v,
14300
+ /* attn_v_trans */ !cparams.flash_attn,
14301
+ /* attn_kv_size */ cparams.n_ctx,
14302
+ /* attn_n_pad */ padding,
14303
+ /* attn_n_swa */ hparams.n_swa,
14304
+ /* attn_swa_type */ hparams.swa_type,
14305
+ /* recurrent_type_k */ GGML_TYPE_F32,
14306
+ /* recurrent_type_v */ GGML_TYPE_F32,
14307
+ /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
14308
+ /* n_seq_max */ cparams.n_seq_max,
14309
+ /* offload */ cparams.offload_kqv);
14310
+ } else {
14311
+ const auto padding = llama_kv_cache_unified::get_padding(cparams);
14312
+
14313
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
14314
+
14315
+ LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
14316
+
14317
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
14318
+ GGML_ASSERT(hparams.is_swa_any());
14319
+
14320
+ res = new llama_kv_cache_unified_iswa(
14321
+ *this,
14322
+ params.type_k,
14323
+ params.type_v,
14324
+ !cparams.flash_attn,
14325
+ cparams.offload_kqv,
14326
+ params.swa_full,
14327
+ cparams.n_ctx,
14328
+ cparams.n_seq_max,
14329
+ cparams.n_ubatch,
14330
+ padding);
14331
+ } else {
14332
+ GGML_ASSERT(!hparams.is_swa_any());
14333
+
14334
+ res = new llama_kv_cache_unified(
14335
+ *this,
14336
+ nullptr,
14337
+ params.type_k,
14338
+ params.type_v,
14339
+ !cparams.flash_attn,
14340
+ cparams.offload_kqv,
14341
+ cparams.n_ctx,
14342
+ cparams.n_seq_max,
14343
+ padding,
14344
+ hparams.n_swa,
14345
+ hparams.swa_type);
14346
+ }
13253
14347
  }
13254
14348
  }
13255
14349
  }
@@ -13303,6 +14397,10 @@ llm_graph_result_ptr llama_model::build_graph(
13303
14397
  {
13304
14398
  llm = std::make_unique<llm_build_bert>(*this, params, gf);
13305
14399
  } break;
14400
+ case LLM_ARCH_NEO_BERT:
14401
+ {
14402
+ llm = std::make_unique<llm_build_neo_bert>(*this, params, gf);
14403
+ } break;
13306
14404
  case LLM_ARCH_BLOOM:
13307
14405
  {
13308
14406
  llm = std::make_unique<llm_build_bloom>(*this, params, gf);
@@ -13388,6 +14486,10 @@ llm_graph_result_ptr llama_model::build_graph(
13388
14486
  {
13389
14487
  llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
13390
14488
  } break;
14489
+ case LLM_ARCH_GEMMA3N:
14490
+ {
14491
+ llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params, gf);
14492
+ } break;
13391
14493
  case LLM_ARCH_STARCODER2:
13392
14494
  {
13393
14495
  llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
@@ -13525,6 +14627,14 @@ llm_graph_result_ptr llama_model::build_graph(
13525
14627
  {
13526
14628
  llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
13527
14629
  } break;
14630
+ case LLM_ARCH_DOTS1:
14631
+ {
14632
+ llm = std::make_unique<llm_build_dots1>(*this, params, gf);
14633
+ } break;
14634
+ case LLM_ARCH_ARCEE:
14635
+ {
14636
+ llm = std::make_unique<llm_build_arcee>(*this, params, gf);
14637
+ } break;
13528
14638
  default:
13529
14639
  GGML_ABORT("fatal error");
13530
14640
  }
@@ -13600,6 +14710,18 @@ int32_t llama_model_n_swa(const llama_model * model) {
13600
14710
  return model->hparams.n_swa;
13601
14711
  }
13602
14712
 
14713
+ uint32_t llama_model_n_cls_out(const struct llama_model * model) {
14714
+ return model->hparams.n_cls_out;
14715
+ }
14716
+
14717
+ const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
14718
+ if (i < model->classifier_labels.size()) {
14719
+ return model->classifier_labels[i].c_str();
14720
+ }
14721
+
14722
+ return nullptr;
14723
+ }
14724
+
13603
14725
  // deprecated
13604
14726
  int32_t llama_n_ctx_train(const llama_model * model) {
13605
14727
  return llama_model_n_ctx_train(model);
@@ -13662,6 +14784,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
13662
14784
  case LLM_ARCH_GRANITE_MOE:
13663
14785
  case LLM_ARCH_CHAMELEON:
13664
14786
  case LLM_ARCH_BAILINGMOE:
14787
+ case LLM_ARCH_NEO_BERT:
14788
+ case LLM_ARCH_ARCEE:
13665
14789
  return LLAMA_ROPE_TYPE_NORM;
13666
14790
 
13667
14791
  // the pairs of head values are offset by n_rot/2
@@ -13687,6 +14811,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
13687
14811
  case LLM_ARCH_GEMMA:
13688
14812
  case LLM_ARCH_GEMMA2:
13689
14813
  case LLM_ARCH_GEMMA3:
14814
+ case LLM_ARCH_GEMMA3N:
13690
14815
  case LLM_ARCH_STARCODER2:
13691
14816
  case LLM_ARCH_OPENELM:
13692
14817
  case LLM_ARCH_GPTNEOX:
@@ -13695,6 +14820,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
13695
14820
  case LLM_ARCH_NEMOTRON:
13696
14821
  case LLM_ARCH_EXAONE:
13697
14822
  case LLM_ARCH_MINICPM3:
14823
+ case LLM_ARCH_DOTS1:
13698
14824
  return LLAMA_ROPE_TYPE_NEOX;
13699
14825
 
13700
14826
  case LLM_ARCH_QWEN2VL:
@@ -13760,7 +14886,7 @@ uint64_t llama_model_size(const llama_model * model) {
13760
14886
  }
13761
14887
 
13762
14888
  const char * llama_model_chat_template(const llama_model * model, const char * name) {
13763
- const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N)
14889
+ const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
13764
14890
  : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
13765
14891
  const auto & it = model->gguf_kv.find(key);
13766
14892
  if (it == model->gguf_kv.end()) {
@@ -13768,7 +14894,7 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
13768
14894
  // do not extend this list unless absolutely necessary
13769
14895
  // Mistral-Small-2503 does not have built-in chat template
13770
14896
  llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
13771
- if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
14897
+ if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
13772
14898
  return "mistral-v7-tekken";
13773
14899
  }
13774
14900
 
@@ -13802,14 +14928,7 @@ llama_token llama_model_decoder_start_token(const llama_model * model) {
13802
14928
  }
13803
14929
 
13804
14930
  bool llama_model_is_recurrent(const llama_model * model) {
13805
- switch (model->arch) {
13806
- case LLM_ARCH_MAMBA: return true;
13807
- case LLM_ARCH_RWKV6: return true;
13808
- case LLM_ARCH_RWKV6QWEN2: return true;
13809
- case LLM_ARCH_RWKV7: return true;
13810
- case LLM_ARCH_ARWKV7: return true;
13811
- default: return false;
13812
- }
14931
+ return llm_arch_is_recurrent(model->arch);
13813
14932
  }
13814
14933
 
13815
14934
  const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {