@novastera-oss/llamarn 0.2.9 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247)
  1. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  5. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  15. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  17. package/cpp/build-info.cpp +2 -2
  18. package/cpp/llama.cpp/CMakeLists.txt +0 -1
  19. package/cpp/llama.cpp/README.md +4 -5
  20. package/cpp/llama.cpp/build-xcframework.sh +1 -1
  21. package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
  22. package/cpp/llama.cpp/common/arg.cpp +17 -0
  23. package/cpp/llama.cpp/common/chat.cpp +37 -20
  24. package/cpp/llama.cpp/common/chat.h +2 -0
  25. package/cpp/llama.cpp/common/common.h +4 -0
  26. package/cpp/llama.cpp/convert_hf_to_gguf.py +745 -6
  27. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
  28. package/cpp/llama.cpp/ggml/CMakeLists.txt +7 -2
  29. package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
  30. package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
  31. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +0 -1
  32. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
  33. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
  34. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
  35. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1203 -163
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
  43. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +17 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  47. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +8 -6
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +185 -79
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
  66. package/cpp/llama.cpp/ggml/src/ggml-impl.h +64 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
  68. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +35 -9
  69. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +167 -39
  70. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +254 -57
  71. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +505 -40
  73. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  83. package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
  84. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +60 -9
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +711 -292
  92. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
  93. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
  94. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
  95. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
  96. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
  97. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
  98. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
  105. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
  106. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  108. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  112. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  113. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
  117. package/cpp/llama.cpp/ggml/src/ggml.c +382 -61
  118. package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
  119. package/cpp/llama.cpp/gguf-py/gguf/constants.py +209 -0
  120. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
  121. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +73 -21
  122. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
  123. package/cpp/llama.cpp/include/llama.h +0 -40
  124. package/cpp/llama.cpp/src/llama-arch.cpp +210 -3
  125. package/cpp/llama.cpp/src/llama-arch.h +18 -1
  126. package/cpp/llama.cpp/src/llama-batch.cpp +27 -1
  127. package/cpp/llama.cpp/src/llama-batch.h +8 -1
  128. package/cpp/llama.cpp/src/llama-chat.cpp +15 -0
  129. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  130. package/cpp/llama.cpp/src/llama-graph.cpp +119 -184
  131. package/cpp/llama.cpp/src/llama-graph.h +47 -60
  132. package/cpp/llama.cpp/src/llama-hparams.cpp +7 -1
  133. package/cpp/llama.cpp/src/llama-hparams.h +3 -0
  134. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
  135. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
  136. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
  137. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +62 -24
  138. package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
  139. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
  140. package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
  141. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +20 -10
  142. package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
  143. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  144. package/cpp/llama.cpp/src/llama-model.cpp +2530 -685
  145. package/cpp/llama.cpp/src/llama-model.h +18 -0
  146. package/cpp/llama.cpp/src/llama-quant.cpp +1 -0
  147. package/cpp/llama.cpp/src/llama-vocab.cpp +13 -2
  148. package/cpp/llama.cpp/src/llama-vocab.h +41 -0
  149. package/ios/include/chat.h +2 -0
  150. package/ios/include/common.h +4 -0
  151. package/ios/include/llama.h +0 -40
  152. package/ios/libs/llama.xcframework/Info.plist +19 -19
  153. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  154. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5055 -4886
  155. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  156. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
  157. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +0 -40
  158. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  159. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  160. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
  161. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
  162. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  163. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  164. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
  165. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  166. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  167. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
  168. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3766
  169. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
  170. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
  171. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -40
  172. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
  173. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
  174. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -40
  175. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  176. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
  177. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
  178. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -40
  179. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  180. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  181. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  182. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4890
  183. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  184. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
  185. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -40
  186. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  187. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  188. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
  189. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
  190. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  191. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  192. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
  193. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  194. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  195. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5091 -4922
  196. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  197. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
  198. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -40
  199. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  200. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  201. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4897
  202. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3794
  203. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  204. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  205. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
  206. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  207. package/package.json +1 -1
  208. package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
  209. package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  210. package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  211. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  212. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  213. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  214. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  215. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  216. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  217. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  218. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  219. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  220. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  221. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  222. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  223. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  224. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  225. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  226. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  227. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  228. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  229. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  230. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  231. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  232. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  233. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  234. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  235. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  236. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  237. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  238. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  239. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  240. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  241. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  242. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  243. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  244. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  245. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  246. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  247. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
@@ -40,16 +40,21 @@ const char * llm_type_name(llm_type type) {
40
40
  case LLM_TYPE_190M: return "190M";
41
41
  case LLM_TYPE_220M: return "220M";
42
42
  case LLM_TYPE_250M: return "250M";
43
+ case LLM_TYPE_256M: return "256M";
43
44
  case LLM_TYPE_270M: return "270M";
44
45
  case LLM_TYPE_335M: return "335M";
46
+ case LLM_TYPE_350M: return "350M";
45
47
  case LLM_TYPE_410M: return "410M";
46
48
  case LLM_TYPE_450M: return "450M";
47
49
  case LLM_TYPE_475M: return "475M";
50
+ case LLM_TYPE_700M: return "700M";
48
51
  case LLM_TYPE_770M: return "770M";
49
52
  case LLM_TYPE_780M: return "780M";
53
+ case LLM_TYPE_0_3B: return "0.3B";
50
54
  case LLM_TYPE_0_5B: return "0.5B";
51
55
  case LLM_TYPE_0_6B: return "0.6B";
52
56
  case LLM_TYPE_1B: return "1B";
57
+ case LLM_TYPE_1_2B: return "1.2B";
53
58
  case LLM_TYPE_1_3B: return "1.3B";
54
59
  case LLM_TYPE_1_4B: return "1.4B";
55
60
  case LLM_TYPE_1_5B: return "1.5B";
@@ -101,6 +106,7 @@ const char * llm_type_name(llm_type type) {
101
106
  case LLM_TYPE_57B_A14B: return "57B.A14B";
102
107
  case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
103
108
  case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
109
+ case LLM_TYPE_A13B: return "A13B";
104
110
  case LLM_TYPE_30B_A3B: return "30B.A3B";
105
111
  case LLM_TYPE_235B_A22B: return "235B.A22B";
106
112
  case LLM_TYPE_E2B: return "E2B";
@@ -207,23 +213,27 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
207
213
  } break;
208
214
  case GGML_OP_SSM_CONV:
209
215
  {
210
- // FIXME
211
- ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
216
+ const int64_t n_seq_tokens = 512;
217
+ const int64_t n_seqs = 3;
218
+ ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
212
219
  op_tensor = ggml_ssm_conv(ctx, conv_x, w);
213
220
  } break;
214
221
  case GGML_OP_SSM_SCAN:
215
222
  {
216
- // FIXME
217
- const int64_t d_state = w->ne[0];
218
- const int64_t d_inner = w->ne[1];
223
+ // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
224
+ const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
225
+ const int64_t n_head = w->ne[1];
226
+ const int64_t head_dim = hparams.ssm_d_inner / n_head;
227
+ const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
219
228
  const int64_t n_seq_tokens = 512;
220
- const int64_t n_seqs = 1;
221
- ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
222
- ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
223
- ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
224
- ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
225
- ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
226
- op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
229
+ const int64_t n_seqs = 3;
230
+ ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
231
+ ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
232
+ ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
233
+ ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
234
+ ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
235
+ ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
236
+ op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
227
237
  } break;
228
238
  case GGML_OP_RWKV_WKV6:
229
239
  {
@@ -575,6 +585,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
575
585
  case 22: type = LLM_TYPE_1B; break;
576
586
  case 26: type = LLM_TYPE_3B; break;
577
587
  case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
588
+ case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
578
589
  // granite uses a vocab with len 49152
579
590
  case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
580
591
  case 36: type = LLM_TYPE_8B; break; // granite
@@ -1080,6 +1091,58 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1080
1091
  default: type = LLM_TYPE_UNKNOWN;
1081
1092
  }
1082
1093
  } break;
1094
+ case LLM_ARCH_MAMBA2:
1095
+ {
1096
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
1097
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
1098
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
1099
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1100
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
1101
+
1102
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1103
+
1104
+ switch (hparams.n_layer) {
1105
+ case 24:
1106
+ switch (hparams.n_embd) {
1107
+ case 768: type = LLM_TYPE_SMALL; break;
1108
+ default: type = LLM_TYPE_UNKNOWN;
1109
+ } break;
1110
+ case 48:
1111
+ switch (hparams.n_embd) {
1112
+ case 1024: type = LLM_TYPE_MEDIUM; break;
1113
+ case 1536: type = LLM_TYPE_LARGE; break;
1114
+ case 2048: type = LLM_TYPE_XL; break;
1115
+ default: type = LLM_TYPE_UNKNOWN;
1116
+ } break;
1117
+ case 64:
1118
+ switch (hparams.n_embd) {
1119
+ case 2560: type = LLM_TYPE_3B; break;
1120
+ case 4096: type = LLM_TYPE_7B; break;
1121
+ default: type = LLM_TYPE_UNKNOWN;
1122
+ } break;
1123
+ default: type = LLM_TYPE_UNKNOWN;
1124
+ }
1125
+ } break;
1126
+ case LLM_ARCH_JAMBA:
1127
+ {
1128
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
1129
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
1130
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
1131
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1132
+
1133
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1134
+
1135
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1136
+ hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
1137
+ }
1138
+
1139
+ switch (hparams.n_layer) {
1140
+ // TODO: Jamba layers are a bit heterogenous, so naming this is hard.
1141
+ case 12: // 900M 8x???M
1142
+ case 32: // 51B 16x?B
1143
+ default: type = LLM_TYPE_UNKNOWN;
1144
+ }
1145
+ } break;
1083
1146
  case LLM_ARCH_XVERSE:
1084
1147
  {
1085
1148
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1446,6 +1509,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1446
1509
  ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
1447
1510
  ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
1448
1511
 
1512
+ // Granite uses rope_finetuned as a switch for rope, so default to true
1513
+ bool rope_finetuned = true;
1514
+ ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
1515
+ hparams.rope_finetuned = rope_finetuned;
1516
+
1449
1517
  switch (hparams.n_layer) {
1450
1518
  case 32: type = LLM_TYPE_3B; break;
1451
1519
  case 40: type = LLM_TYPE_3B; break;
@@ -1453,6 +1521,40 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1453
1521
  default: type = LLM_TYPE_UNKNOWN;
1454
1522
  }
1455
1523
 
1524
+ // For Granite MoE Shared
1525
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
1526
+ } break;
1527
+ case LLM_ARCH_GRANITE_HYBRID:
1528
+ {
1529
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1530
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
1531
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /* required */ false);
1532
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /* required */ false);
1533
+ ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, /* required */ false);
1534
+
1535
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
1536
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
1537
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
1538
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1539
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
1540
+
1541
+ // Granite uses rope_finetuned as a switch for rope, so default to true
1542
+ bool rope_finetuned = true;
1543
+ ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
1544
+ hparams.rope_finetuned = rope_finetuned;
1545
+
1546
+ // A layer is recurrent IFF the n_head_kv value is set to 0
1547
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
1548
+ hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
1549
+ }
1550
+
1551
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1552
+
1553
+ switch (hparams.n_layer) {
1554
+ // TODO: Add llm type label (not sure this is useful)
1555
+ default: type = LLM_TYPE_UNKNOWN;
1556
+ }
1557
+
1456
1558
  // For Granite MoE Shared
1457
1559
  ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
1458
1560
  } break;
@@ -1504,6 +1606,80 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1504
1606
  default: type = LLM_TYPE_UNKNOWN;
1505
1607
  }
1506
1608
  } break;
1609
+ case LLM_ARCH_ERNIE4_5:
1610
+ {
1611
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1612
+ switch (hparams.n_layer) {
1613
+ case 18: type = LLM_TYPE_0_3B; break;
1614
+ default: type = LLM_TYPE_UNKNOWN;
1615
+ }
1616
+ } break;
1617
+ case LLM_ARCH_FALCON_H1:
1618
+ {
1619
+ // Common parameters
1620
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1621
+
1622
+ // SSM parameters
1623
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
1624
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
1625
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
1626
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
1627
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
1628
+
1629
+ std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
1630
+
1631
+ switch (hparams.n_layer) {
1632
+ case 36:
1633
+ type = LLM_TYPE_0_5B; break;
1634
+ case 24:
1635
+ type = LLM_TYPE_1_5B; break;
1636
+ case 66:
1637
+ type = LLM_TYPE_1B; break;
1638
+ case 32:
1639
+ type = LLM_TYPE_3B; break;
1640
+ case 44:
1641
+ type = LLM_TYPE_7B; break;
1642
+ case 72:
1643
+ type = LLM_TYPE_34B; break;
1644
+ default:
1645
+ type = LLM_TYPE_UNKNOWN;
1646
+ }
1647
+ } break;
1648
+ case LLM_ARCH_HUNYUAN_MOE:
1649
+ {
1650
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1651
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1652
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
1653
+
1654
+ switch (hparams.n_layer) {
1655
+ case 32: type = LLM_TYPE_A13B; break;
1656
+ default: type = LLM_TYPE_UNKNOWN;
1657
+ }
1658
+ } break;
1659
+ case LLM_ARCH_SMOLLM3:
1660
+ {
1661
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1662
+ hparams.n_no_rope_layer_step = 4;
1663
+
1664
+ switch (hparams.n_layer) {
1665
+ case 36: type = LLM_TYPE_3B; break;
1666
+ default: type = LLM_TYPE_UNKNOWN;
1667
+ }
1668
+ } break;
1669
+ case LLM_ARCH_LFM2:
1670
+ {
1671
+ ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
1672
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1673
+ for (uint32_t il = 0; il < hparams.n_layer; ++il) {
1674
+ hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
1675
+ }
1676
+ switch (hparams.n_embd) {
1677
+ case 1024: type = LLM_TYPE_350M; break;
1678
+ case 1536: type = LLM_TYPE_700M; break;
1679
+ case 2048: type = LLM_TYPE_1_2B; break;
1680
+ default: type = LLM_TYPE_UNKNOWN;
1681
+ }
1682
+ } break;
1507
1683
  default: throw std::runtime_error("unsupported model architecture");
1508
1684
  }
1509
1685
 
@@ -3115,6 +3291,228 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3115
3291
  layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
3116
3292
  }
3117
3293
  } break;
3294
+ case LLM_ARCH_MAMBA2:
3295
+ {
3296
+ const int64_t d_conv = hparams.ssm_d_conv;
3297
+ const int64_t d_inner = hparams.ssm_d_inner;
3298
+ const int64_t d_state = hparams.ssm_d_state;
3299
+ const int64_t n_head = hparams.ssm_dt_rank;
3300
+ const int64_t n_group = hparams.ssm_n_group;
3301
+ const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
3302
+
3303
+ // only an expansion factor of 2 is supported for now
3304
+ GGML_ASSERT(2 * n_embd == d_inner);
3305
+
3306
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3307
+
3308
+ // output
3309
+ {
3310
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3311
+
3312
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3313
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
3314
+ if (output == NULL) {
3315
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3316
+ }
3317
+ }
3318
+
3319
+ for (int i = 0; i < n_layer; ++i) {
3320
+ auto & layer = layers[i];
3321
+
3322
+ // norm
3323
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3324
+
3325
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
3326
+
3327
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
3328
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
3329
+
3330
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
3331
+
3332
+ // no "weight" suffix for these
3333
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
3334
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
3335
+
3336
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
3337
+
3338
+ // out_proj
3339
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
3340
+ }
3341
+ } break;
3342
+ case LLM_ARCH_JAMBA:
3343
+ {
3344
+ const int64_t d_conv = hparams.ssm_d_conv;
3345
+ const int64_t d_inner = hparams.ssm_d_inner;
3346
+ const int64_t d_state = hparams.ssm_d_state;
3347
+ const int64_t dt_rank = hparams.ssm_dt_rank;
3348
+
3349
+ // only an expansion factor of 2 is supported for now
3350
+ GGML_ASSERT(2 * n_embd == d_inner);
3351
+
3352
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3353
+
3354
+ // output
3355
+ {
3356
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3357
+
3358
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3359
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
3360
+ if (output == NULL) {
3361
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3362
+ }
3363
+ }
3364
+
3365
+ for (int i = 0; i < n_layer; ++i) {
3366
+ const int64_t n_head_kv = hparams.n_head_kv(i);
3367
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
3368
+
3369
+ auto & layer = layers[i];
3370
+
3371
+ // norm
3372
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3373
+
3374
+ if (n_head_kv == 0) {
3375
+ // Mamba layer
3376
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
3377
+
3378
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
3379
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
3380
+
3381
+ layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
3382
+
3383
+ layer.ssm_dt_norm = create_tensor(tn(LLM_TENSOR_SSM_DT_NORM, "weight", i), {dt_rank}, 0);
3384
+
3385
+ layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
3386
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
3387
+
3388
+ layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, "weight", i), {d_state}, 0);
3389
+ layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, "weight", i), {d_state}, 0);
3390
+
3391
+ // no "weight" suffix for these
3392
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
3393
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
3394
+
3395
+ // out_proj
3396
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
3397
+ } else {
3398
+ // Attention layers
3399
+
3400
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
3401
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
3402
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
3403
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
3404
+ }
3405
+
3406
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3407
+
3408
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
3409
+
3410
+ if (layer.ffn_gate_inp) {
3411
+ // MoE
3412
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
3413
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
3414
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
3415
+ } else {
3416
+ // FFN (no MoE)
3417
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3418
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
3419
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3420
+ }
3421
+ }
3422
+ } break;
3423
+ case LLM_ARCH_GRANITE_HYBRID:
3424
+ {
3425
+ // mamba2 Mixer SSM params
3426
+ // NOTE: int64_t for tensor dimensions
3427
+ const int64_t d_conv = hparams.ssm_d_conv;
3428
+ const int64_t d_inner = hparams.ssm_d_inner;
3429
+ const int64_t d_state = hparams.ssm_d_state;
3430
+ const int64_t n_ssm_head = hparams.ssm_dt_rank;
3431
+ const int64_t n_group = hparams.ssm_n_group;
3432
+ const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
3433
+
3434
+ // only an expansion factor of 2 is supported for now
3435
+ GGML_ASSERT(2 * n_embd == d_inner);
3436
+
3437
+ // embeddings
3438
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3439
+
3440
+ // output
3441
+ {
3442
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
3443
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
3444
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
3445
+ if (output == NULL) {
3446
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
3447
+ }
3448
+ }
3449
+
3450
+ for (int i = 0; i < n_layer; ++i) {
3451
+ auto & layer = layers[i];
3452
+
3453
+ // norm
3454
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
3455
+
3456
+ if (hparams.is_recurrent(i)) {
3457
+ // ssm layers
3458
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
3459
+
3460
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
3461
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
3462
+
3463
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
3464
+
3465
+ // no "weight" suffix for these
3466
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
3467
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
3468
+
3469
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
3470
+
3471
+ // out_proj
3472
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
3473
+ } else {
3474
+ // attention layers (with optional bias)
3475
+ const int64_t n_head_i = hparams.n_head(i);
3476
+ const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
3477
+ const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
3478
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
3479
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
3480
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
3481
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
3482
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3483
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
3484
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
3485
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3486
+ }
3487
+
3488
+ // feed forward (w/ optional biases)
3489
+ if (n_expert > 0) {
3490
+ // MoE FFN
3491
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3492
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3493
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
3494
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
3495
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
3496
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
3497
+
3498
+ // For Granite MoE Shared
3499
+ if (hparams.n_ff_shexp > 0) {
3500
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
3501
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
3502
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
3503
+ }
3504
+ } else {
3505
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
3506
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
3507
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
3508
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
3509
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
3510
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
3511
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
3512
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
3513
+ }
3514
+ }
3515
+ } break;
3118
3516
  case LLM_ARCH_XVERSE:
3119
3517
  {
3120
3518
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4348,16 +4746,226 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4348
4746
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4349
4747
  }
4350
4748
  } break;
4351
- default:
4352
- throw std::runtime_error("unknown architecture");
4353
- }
4749
+ case LLM_ARCH_ERNIE4_5:
4750
+ {
4751
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4354
4752
 
4355
- if (n_moved_tensors > 0) {
4356
- LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
4357
- __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
4358
- ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
4359
- }
4360
- }
4753
+ // output
4754
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4755
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4756
+ // if output is NULL, init from the input tok embed
4757
+ if (output == NULL) {
4758
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4759
+ }
4760
+
4761
+ for (int i = 0; i < n_layer; ++i) {
4762
+ auto & layer = layers[i];
4763
+
4764
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4765
+
4766
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4767
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
4768
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
4769
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4770
+
4771
+ // optional bias tensors
4772
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4773
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
4774
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
4775
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
4776
+
4777
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4778
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4779
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4780
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4781
+ }
4782
+ } break;
4783
+ case LLM_ARCH_FALCON_H1:
4784
+ {
4785
+ // Common
4786
+ const int64_t hidden_size = hparams.n_embd; // hidden_size
4787
+
4788
+ // mamba2 Mixer SSM params
4789
+ const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
4790
+ const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
4791
+ const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
4792
+ const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
4793
+ const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
4794
+ const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
4795
+ const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
4796
+
4797
+ // attn params
4798
+ const int64_t attn_num_attention_head = hparams.n_head(0); // attention head count, read from layer 0
4799
+ const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
4800
+
4801
+ // ffn params
4802
+ const int64_t ffn_intermediate_size = hparams.n_ff(0);
4803
+
4804
+ // embeddings
4805
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
4806
+
4807
+ // output
4808
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
4809
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
4810
+
4811
+ // if output is NULL, init from the input tok embed
4812
+ if (output == NULL) {
4813
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
4814
+ }
4815
+
4816
+ for (int i = 0; i < n_layer; ++i) {
4817
+ auto & layer = layers[i];
4818
+
4819
+ /*SSM LAYERS*/
4820
+ // ssm in
4821
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
4822
+ // ssm 1d conv
4823
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
4824
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
4825
+ // ssm_dt
4826
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
4827
+ // no "weight" suffix for these
4828
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
4829
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
4830
+ // ssm_norm
4831
+ layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
4832
+ // out_proj
4833
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
4834
+
4835
+ /*ATTENTION LAYERS*/
4836
+ // attention layers (with optional bias)
4837
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
4838
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
4839
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
4840
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
4841
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
4842
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
4843
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
4844
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
4845
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
4846
+
4847
+
4848
+ // feed forward (w/ optional biases)
4849
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
4850
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4851
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
4852
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
4853
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
4854
+
4855
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
4856
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
4857
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
4858
+ }
4859
+ } break;
4860
+ case LLM_ARCH_HUNYUAN_MOE:
4861
+ {
4862
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4863
+
4864
+ // output
4865
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4866
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4867
+ // if output is NULL, init from the input tok embed
4868
+ if (output == NULL) {
4869
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4870
+ }
4871
+
4872
+ for (int i = 0; i < n_layer; ++i) {
4873
+ auto & layer = layers[i];
4874
+
4875
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4876
+
4877
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4878
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4879
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4880
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4881
+
4882
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
4883
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
4884
+
4885
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4886
+
4887
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4888
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4889
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
4890
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
4891
+
4892
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
4893
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
4894
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
4895
+ }
4896
+ } break;
4897
+ case LLM_ARCH_SMOLLM3:
4898
+ {
4899
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4900
+
4901
+ // output
4902
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4903
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4904
+
4905
+ // if output is NULL, init from the input tok embed
4906
+ if (output == NULL) {
4907
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4908
+ }
4909
+
4910
+ for (int i = 0; i < n_layer; ++i) {
4911
+ auto & layer = layers[i];
4912
+
4913
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4914
+
4915
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4916
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4917
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4918
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4919
+
4920
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4921
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4922
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4923
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4924
+ }
4925
+ } break;
4926
+ case LLM_ARCH_LFM2:
4927
+ {
4928
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4929
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
4930
+
4931
+ for (int i = 0; i < n_layer; ++i) {
4932
+ auto & layer = layers[i];
4933
+ // ffn is same for transformer and conv layers
4934
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4935
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4936
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4937
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4938
+
4939
+ // for operator_norm
4940
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4941
+
4942
+ if (!hparams.is_recurrent(i)) {
4943
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
4944
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
4945
+ GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
4946
+
4947
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
4948
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
4949
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
4950
+
4951
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
4952
+ } else {
4953
+ layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
4954
+ layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
4955
+ layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
4956
+ }
4957
+ }
4958
+ } break;
4959
+ default:
4960
+ throw std::runtime_error("unknown architecture");
4961
+ }
4962
+
4963
+ if (n_moved_tensors > 0) {
4964
+ LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
4965
+ __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
4966
+ ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
4967
+ }
4968
+ }
4361
4969
 
4362
4970
  ml.done_getting_tensors();
4363
4971
 
@@ -4587,12 +5195,6 @@ void llama_model::print_info() const {
4587
5195
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
4588
5196
  LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
4589
5197
  LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
4590
- LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
4591
- LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
4592
- LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
4593
- LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
4594
- LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
4595
-
4596
5198
  if (!classifier_labels.empty()) {
4597
5199
  LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
4598
5200
 
@@ -4603,6 +5205,19 @@ void llama_model::print_info() const {
4603
5205
  }
4604
5206
  }
4605
5207
 
5208
+ if (arch == LLM_ARCH_MAMBA ||
5209
+ arch == LLM_ARCH_MAMBA2 ||
5210
+ arch == LLM_ARCH_JAMBA ||
5211
+ arch == LLM_ARCH_FALCON_H1 ||
5212
+ arch == LLM_ARCH_GRANITE_HYBRID) {
5213
+ LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
5214
+ LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
5215
+ LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
5216
+ LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
5217
+ LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
5218
+ LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
5219
+ }
5220
+
4606
5221
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
4607
5222
  if (pimpl->n_elements >= 1e12) {
4608
5223
  LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
@@ -4649,7 +5264,8 @@ void llama_model::print_info() const {
4649
5264
 
4650
5265
  if (arch == LLM_ARCH_MINICPM ||
4651
5266
  arch == LLM_ARCH_GRANITE ||
4652
- arch == LLM_ARCH_GRANITE_MOE) {
5267
+ arch == LLM_ARCH_GRANITE_MOE ||
5268
+ arch == LLM_ARCH_GRANITE_HYBRID) {
4653
5269
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
4654
5270
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
4655
5271
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -5539,12 +6155,10 @@ struct llm_build_falcon : public llm_graph_context {
5539
6155
  cur = build_lora_mm(model.layers[il].wqkv, cur);
5540
6156
  cb(cur, "wqkv", il);
5541
6157
 
5542
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5543
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6158
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
6159
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
5544
6160
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5545
6161
 
5546
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5547
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5548
6162
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
5549
6163
 
5550
6164
  // using mode = 2 for neox mode
@@ -5821,12 +6435,10 @@ struct llm_build_dbrx : public llm_graph_context {
5821
6435
  cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
5822
6436
  cb(cur, "wqkv_clamped", il);
5823
6437
 
5824
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5825
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6438
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
6439
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
5826
6440
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5827
6441
 
5828
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5829
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5830
6442
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
5831
6443
 
5832
6444
  Qcur = ggml_rope_ext(
@@ -6337,12 +6949,10 @@ struct llm_build_neo_bert : public llm_graph_context {
6337
6949
  cur = build_lora_mm(model.layers[il].wqkv, cur);
6338
6950
  cb(cur, "wqkv", il);
6339
6951
 
6340
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6341
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6952
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
6953
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
6342
6954
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6343
6955
 
6344
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6345
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6346
6956
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6347
6957
 
6348
6958
  // RoPE
@@ -6572,8 +7182,8 @@ struct llm_build_mpt : public llm_graph_context {
6572
7182
  cb(cur, "wqkv_clamped", il);
6573
7183
  }
6574
7184
 
6575
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6576
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
7185
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
7186
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
6577
7187
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6578
7188
 
6579
7189
  cb(Qcur, "Qcur", il);
@@ -6593,6 +7203,12 @@ struct llm_build_mpt : public llm_graph_context {
6593
7203
  model.layers[il].attn_k_norm_b,
6594
7204
  LLM_NORM, il);
6595
7205
  cb(Kcur, "Kcur", il);
7206
+ } else {
7207
+ Qcur = ggml_cont(ctx0, Qcur);
7208
+ cb(Qcur, "Qcur", il);
7209
+
7210
+ Kcur = ggml_cont(ctx0, Kcur);
7211
+ cb(Kcur, "Kcur", il);
6596
7212
  }
6597
7213
 
6598
7214
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -6847,12 +7463,10 @@ struct llm_build_qwen : public llm_graph_context {
6847
7463
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6848
7464
  cb(cur, "bqkv", il);
6849
7465
 
6850
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6851
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
7466
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
7467
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
6852
7468
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
6853
7469
 
6854
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6855
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6856
7470
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6857
7471
 
6858
7472
  // using mode = 2 for neox mode
@@ -7617,21 +8231,21 @@ struct llm_build_phi2 : public llm_graph_context {
7617
8231
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7618
8232
  cb(cur, "bqkv", il);
7619
8233
 
7620
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
7621
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
8234
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
8235
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
7622
8236
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7623
8237
  } else {
7624
8238
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
7625
8239
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
7626
8240
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
8241
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8242
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7627
8243
  }
7628
8244
 
7629
8245
  cb(Qcur, "Qcur", il);
7630
8246
  cb(Kcur, "Kcur", il);
7631
8247
  cb(Vcur, "Vcur", il);
7632
8248
 
7633
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7634
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7635
8249
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7636
8250
 
7637
8251
  Qcur = ggml_rope_ext(
@@ -7755,21 +8369,21 @@ struct llm_build_phi3 : public llm_graph_context {
7755
8369
  cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
7756
8370
  cb(cur, "wqkv", il);
7757
8371
 
7758
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
7759
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
8372
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
8373
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
7760
8374
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
7761
8375
  } else {
7762
8376
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
7763
8377
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
7764
8378
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
8379
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8380
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7765
8381
  }
7766
8382
 
7767
8383
  cb(Qcur, "Qcur", il);
7768
8384
  cb(Kcur, "Kcur", il);
7769
8385
  cb(Vcur, "Vcur", il);
7770
8386
 
7771
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7772
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7773
8387
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7774
8388
 
7775
8389
  Qcur = ggml_rope_ext(
@@ -8125,12 +8739,10 @@ struct llm_build_codeshell : public llm_graph_context {
8125
8739
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
8126
8740
  cb(cur, "bqkv", il);
8127
8741
 
8128
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
8129
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
8742
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
8743
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
8130
8744
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
8131
8745
 
8132
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8133
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8134
8746
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8135
8747
 
8136
8748
  Qcur = ggml_rope_ext(
@@ -8546,8 +9158,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
8546
9158
  ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
8547
9159
  cb(k_pe, "k_pe", il);
8548
9160
 
8549
- // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
8550
- kv_compressed = ggml_cont(ctx0, kv_compressed);
8551
9161
  kv_compressed = build_norm(kv_compressed,
8552
9162
  model.layers[il].attn_kv_a_norm, NULL,
8553
9163
  LLM_NORM_RMS, il);
@@ -8574,12 +9184,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
8574
9184
  v_states = ggml_cont(ctx0, v_states);
8575
9185
  cb(v_states, "v_states", il);
8576
9186
 
8577
- v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
8578
- ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
8579
- 0);
8580
- cb(v_states, "v_states", il);
8581
-
8582
- q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
8583
9187
  q_pe = ggml_rope_ext(
8584
9188
  ctx0, q_pe, inp_pos, rope_factors,
8585
9189
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -8588,7 +9192,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
8588
9192
  cb(q_pe, "q_pe", il);
8589
9193
 
8590
9194
  // shared RoPE key
8591
- k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
8592
9195
  k_pe = ggml_rope_ext(
8593
9196
  ctx0, k_pe, inp_pos, rope_factors,
8594
9197
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9068,8 +9671,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
9068
9671
  const int n_layer_sparsity = 10; // number of layers using activation sparsity
9069
9672
  const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
9070
9673
 
9071
- ggml_tensor * one; // containing single element 1.0f
9072
-
9073
9674
  llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
9074
9675
  : llm_graph_context(params),
9075
9676
  model(model),
@@ -9081,14 +9682,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
9081
9682
  ggml_tensor * cur;
9082
9683
  ggml_tensor * inpL;
9083
9684
 
9084
- // TODO: remove this when ggml_scale_add is implemented
9085
- one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
9086
- {
9087
- auto inp = std::make_unique<llm_graph_input_one>();
9088
- inp->one = one;
9089
- res->add_input(std::move(inp));
9090
- }
9091
-
9092
9685
  inpL = build_inp_embd(model.tok_embd);
9093
9686
 
9094
9687
  // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
@@ -9478,7 +10071,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
9478
10071
  cb(innovation, "innovation", il);
9479
10072
 
9480
10073
  ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
9481
- all_coefs = ggml_add(ctx0, all_coefs, one);
10074
+ all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
9482
10075
  cb(all_coefs, "all_coefs", il);
9483
10076
  all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
9484
10077
  all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
@@ -9621,81 +10214,32 @@ struct llm_build_starcoder2 : public llm_graph_context {
9621
10214
  }
9622
10215
  };
9623
10216
 
9624
- struct llm_build_mamba : public llm_graph_context {
9625
- const llama_model & model;
9626
-
9627
- llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
9628
- ggml_tensor * cur;
9629
- ggml_tensor * inpL;
9630
-
9631
- // {n_embd, n_tokens}
9632
- inpL = build_inp_embd(model.tok_embd);
9633
-
9634
- auto * rs_inp = build_rs_inp();
9635
-
9636
- ggml_tensor * inp_out_ids = build_inp_out_ids();
9637
-
9638
- for (int il = 0; il < n_layer; ++il) {
9639
- // norm
9640
- cur = build_norm(inpL,
9641
- model.layers[il].attn_norm, NULL,
9642
- LLM_NORM_RMS, il);
9643
- cb(cur, "attn_norm", il);
9644
-
9645
- cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
9646
-
9647
- if (il == n_layer - 1 && inp_out_ids) {
9648
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
9649
- inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
9650
- }
9651
-
9652
- // residual
9653
- cur = ggml_add(ctx0, cur, inpL);
9654
-
9655
- cur = build_cvec(cur, il);
9656
- cb(cur, "l_out", il);
9657
-
9658
- // input for next layer
9659
- inpL = cur;
9660
- }
10217
+ struct llm_graph_context_mamba : public llm_graph_context {
10218
+ llm_graph_context_mamba(const llm_graph_params & params) : llm_graph_context(params) {}
9661
10219
 
9662
- // final rmsnorm
9663
- cur = build_norm(inpL,
9664
- model.output_norm, NULL,
9665
- LLM_NORM_RMS, -1);
9666
-
9667
- cb(cur, "result_norm", -1);
9668
- res->t_embd = cur;
9669
-
9670
- // lm_head
9671
- cur = build_lora_mm(model.output, cur);
9672
-
9673
- cb(cur, "result_output", -1);
9674
- res->t_logits = cur;
9675
-
9676
- ggml_build_forward_expand(gf, cur);
9677
- }
9678
-
9679
- // TODO: split
9680
10220
  ggml_tensor * build_mamba_layer(
9681
10221
  llm_graph_input_rs * inp,
9682
10222
  ggml_cgraph * gf,
9683
10223
  ggml_tensor * cur,
10224
+ const llama_model & model,
9684
10225
  const llama_ubatch & ubatch,
9685
- int il) const {
9686
- const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
10226
+ int il) {
10227
+
10228
+ const auto * mctx_cur = inp->mctx;
9687
10229
 
9688
10230
  const auto kv_head = mctx_cur->get_head();
9689
10231
 
10232
+ const auto & layer = model.layers[il];
10233
+
9690
10234
  const int64_t d_conv = hparams.ssm_d_conv;
9691
10235
  const int64_t d_inner = hparams.ssm_d_inner;
9692
10236
  const int64_t d_state = hparams.ssm_d_state;
9693
10237
  const int64_t dt_rank = hparams.ssm_dt_rank;
10238
+ const int64_t n_head = d_inner;
10239
+ const int64_t head_dim = 1;
9694
10240
  const int64_t n_seqs = ubatch.n_seqs;
9695
10241
  // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
9696
10242
  const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
9697
- // Use the same RMS norm as the final layer norm
9698
- const float norm_rms_eps = hparams.f_norm_rms_eps;
9699
10243
 
9700
10244
  const int64_t n_seq_tokens = ubatch.n_seq_tokens;
9701
10245
 
@@ -9706,21 +10250,14 @@ struct llm_build_mamba : public llm_graph_context {
9706
10250
  ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
9707
10251
  ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
9708
10252
 
9709
- // (ab)using the KV cache to store the states
9710
- ggml_tensor * conv = build_rs(
9711
- inp, gf, conv_states_all,
9712
- hparams.n_embd_r(), n_seqs);
10253
+ ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
9713
10254
  conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
9714
- ggml_tensor * ssm = build_rs(
9715
- inp, gf, ssm_states_all,
9716
- hparams.n_embd_s(), n_seqs);
9717
- ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
9718
10255
 
9719
10256
  // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
9720
10257
  cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
9721
10258
 
9722
10259
  // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
9723
- ggml_tensor * xz = build_lora_mm(model.layers[il].ssm_in, cur);
10260
+ ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
9724
10261
  // split the above in two
9725
10262
  // => {d_inner, n_seq_tokens, n_seqs}
9726
10263
  ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
@@ -9749,10 +10286,10 @@ struct llm_build_mamba : public llm_graph_context {
9749
10286
  // then permute away the ne[0] dimension,
9750
10287
  // and then you're left with the resulting x tensor.
9751
10288
  // For simultaneous sequences, all sequences need to have the same length.
9752
- x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
10289
+ x = ggml_ssm_conv(ctx0, conv_x, layer.ssm_conv1d);
9753
10290
 
9754
10291
  // bias
9755
- x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b);
10292
+ x = ggml_add(ctx0, x, layer.ssm_conv1d_b);
9756
10293
 
9757
10294
  x = ggml_silu(ctx0, x);
9758
10295
  }
@@ -9760,55 +10297,366 @@ struct llm_build_mamba : public llm_graph_context {
9760
10297
  // ssm
9761
10298
  {
9762
10299
  // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
9763
- ggml_tensor * x_db = build_lora_mm(model.layers[il].ssm_x, x);
10300
+ ggml_tensor * x_db = build_lora_mm(layer.ssm_x, x);
9764
10301
  // split
9765
10302
  ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
9766
- ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
9767
- ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
9768
-
9769
- // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
9770
- if (ssm_dt_b_c_rms) {
9771
- dt = ggml_rms_norm(ctx0, dt, norm_rms_eps);
9772
- B = ggml_rms_norm(ctx0, B, norm_rms_eps);
9773
- C = ggml_rms_norm(ctx0, C, norm_rms_eps);
10303
+ ggml_tensor * B = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
10304
+ ggml_tensor * C = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
10305
+
10306
+ // Some Mamba variants (e.g. FalconMamba, Jamba) apply RMS norm in B, C & Dt layers
10307
+ if (ssm_dt_b_c_rms || (layer.ssm_dt_norm && layer.ssm_b_norm && layer.ssm_c_norm)) {
10308
+ dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il);
10309
+ B = build_norm(B, layer.ssm_b_norm, NULL, LLM_NORM_RMS, il);
10310
+ C = build_norm(C, layer.ssm_c_norm, NULL, LLM_NORM_RMS, il);
9774
10311
  }
9775
10312
 
9776
10313
  // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
9777
- dt = build_lora_mm(model.layers[il].ssm_dt, dt);
9778
- dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
10314
+ dt = build_lora_mm(layer.ssm_dt, dt);
10315
+ dt = ggml_add(ctx0, dt, layer.ssm_dt_b);
10316
+
10317
+ cur = x;
10318
+ x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
9779
10319
 
9780
- // Custom operator to optimize the parallel associative scan
9781
- // as described in the Annex D of the Mamba paper.
9782
- // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
9783
- ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C);
10320
+ ggml_tensor * A = layer.ssm_a;
10321
+
10322
+ // use the states and the indices provided by build_recurrent_state
10323
+ // (this is necessary in order to properly use the states before they are overwritten,
10324
+ // while avoiding to make unnecessary copies of the states)
10325
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
10326
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
10327
+
10328
+ // Custom operator to optimize the parallel associative scan
10329
+ // as described in the Annex D of the Mamba paper.
10330
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
10331
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
10332
+ };
10333
+
10334
+ ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
9784
10335
 
9785
10336
  // store last states
9786
10337
  ggml_build_forward_expand(gf,
9787
10338
  ggml_cpy(ctx0,
9788
- ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]),
10339
+ ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
9789
10340
  ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
9790
10341
 
9791
- ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0);
10342
+ ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0);
9792
10343
 
9793
10344
  // TODO: skip computing output earlier for unused tokens
9794
10345
 
9795
- // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
9796
- y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
9797
- y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
10346
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d));
10347
+ y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
9798
10348
 
9799
10349
  // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
9800
- cur = build_lora_mm(model.layers[il].ssm_out, y);
10350
+ cur = build_lora_mm(layer.ssm_out, y);
9801
10351
  }
9802
10352
 
9803
10353
  // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
9804
10354
  cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
9805
- //cb(cur, "mamba_out", il);
9806
10355
 
9807
10356
  return cur;
9808
10357
  }
9809
- };
9810
10358
 
9811
- struct llm_build_command_r : public llm_graph_context {
10359
+ ggml_tensor * build_mamba2_layer(
10360
+ llm_graph_input_rs * inp,
10361
+ ggml_cgraph * gf,
10362
+ ggml_tensor * cur,
10363
+ const llama_model & model,
10364
+ const llama_ubatch & ubatch,
10365
+ int il) const {
10366
+
10367
+ const auto * mctx_cur = inp->mctx;
10368
+
10369
+ const auto kv_head = mctx_cur->get_head();
10370
+
10371
+ const int64_t d_conv = hparams.ssm_d_conv;
10372
+ const int64_t d_inner = hparams.ssm_d_inner;
10373
+ const int64_t d_state = hparams.ssm_d_state;
10374
+ const int64_t n_head = hparams.ssm_dt_rank;
10375
+ const int64_t head_dim = d_inner / n_head;
10376
+ const int64_t n_group = hparams.ssm_n_group;
10377
+ const int64_t n_seqs = ubatch.n_seqs;
10378
+
10379
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
10380
+
10381
+ GGML_ASSERT(n_seqs != 0);
10382
+ GGML_ASSERT(ubatch.equal_seqs);
10383
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
10384
+
10385
+ ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
10386
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
10387
+
10388
+ ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
10389
+ conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
10390
+
10391
+ // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
10392
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
10393
+
10394
+ // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
10395
+
10396
+ // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
10397
+ ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
10398
+
10399
+ // split the above in three
10400
+ ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0);
10401
+ ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt));
10402
+ ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt));
10403
+
10404
+ // conv
10405
+ {
10406
+ // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
10407
+ ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
10408
+
10409
+ // copy last (d_conv - 1) columns back into the state cache
10410
+ ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
10411
+
10412
+ ggml_build_forward_expand(gf,
10413
+ ggml_cpy(ctx0, last_conv,
10414
+ ggml_view_1d(ctx0, conv_states_all,
10415
+ (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
10416
+ kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
10417
+
10418
+ // 1D convolution
10419
+ // The equivalent is to make a self-overlapping view of conv_x
10420
+ // over d_conv columns at each stride in the 3rd dimension,
10421
+ // then element-wise multiply that with the conv1d weight,
10422
+ // then sum the elements of each row,
10423
+ // (the last two steps are a dot product over rows (also doable with mul_mat))
10424
+ // then permute away the ne[0] dimension,
10425
+ // and then you're left with the resulting x tensor.
10426
+ // For simultaneous sequences, all sequences need to have the same length.
10427
+ xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
10428
+
10429
+ // bias
10430
+ xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
10431
+
10432
+ xBC = ggml_silu(ctx0, xBC);
10433
+ }
10434
+
10435
+ // ssm
10436
+ {
10437
+ // These correspond to V K Q in SSM/attention duality
10438
+ ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0);
10439
+ ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC));
10440
+ ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC));
10441
+
10442
+ // {n_head, n_seq_tokens, n_seqs}
10443
+ dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
10444
+
10445
+ ggml_tensor * A = model.layers[il].ssm_a;
10446
+
10447
+ // use the states and the indices provided by build_recurrent_state
10448
+ // (this is necessary in order to properly use the states before they are overwritten,
10449
+ // while avoiding to make unnecessary copies of the states)
10450
+ auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
10451
+ ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
10452
+
10453
+ // TODO: use semistructured matrices to implement state-space duality
10454
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
10455
+ return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
10456
+ };
10457
+
10458
+ ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
10459
+
10460
+ // store last states
10461
+ ggml_build_forward_expand(gf,
10462
+ ggml_cpy(ctx0,
10463
+ ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
10464
+ ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
10465
+
10466
+ ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0);
10467
+
10468
+ // TODO: skip computing output earlier for unused tokens
10469
+
10470
+ y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
10471
+ y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
10472
+
10473
+ // grouped RMS norm
10474
+ if (model.layers[il].ssm_norm) {
10475
+ y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
10476
+ y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
10477
+ }
10478
+
10479
+ y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
10480
+
10481
+ // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
10482
+ cur = build_lora_mm(model.layers[il].ssm_out, y);
10483
+ }
10484
+
10485
+ // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
10486
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
10487
+ cb(cur, "mamba_out", il);
10488
+
10489
+ return cur;
10490
+ }
10491
+ };
10492
+
10493
+ struct llm_build_mamba : public llm_graph_context_mamba {
10494
+ llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
10495
+ ggml_tensor * cur;
10496
+ ggml_tensor * inpL;
10497
+
10498
+ // {n_embd, n_tokens}
10499
+ inpL = build_inp_embd(model.tok_embd);
10500
+
10501
+ auto * rs_inp = build_rs_inp();
10502
+
10503
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10504
+
10505
+ for (int il = 0; il < n_layer; ++il) {
10506
+ // norm
10507
+ cur = build_norm(inpL,
10508
+ model.layers[il].attn_norm, NULL,
10509
+ LLM_NORM_RMS, il);
10510
+ cb(cur, "attn_norm", il);
10511
+
10512
+ if (model.arch == LLM_ARCH_MAMBA2) {
10513
+ cur = build_mamba2_layer(rs_inp, gf, cur, model, ubatch, il);
10514
+ } else {
10515
+ cur = build_mamba_layer(rs_inp, gf, cur, model, ubatch, il);
10516
+ }
10517
+
10518
+ if (il == n_layer - 1 && inp_out_ids) {
10519
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10520
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
10521
+ }
10522
+
10523
+ // residual
10524
+ cur = ggml_add(ctx0, cur, inpL);
10525
+
10526
+ cur = build_cvec(cur, il);
10527
+ cb(cur, "l_out", il);
10528
+
10529
+ // input for next layer
10530
+ inpL = cur;
10531
+ }
10532
+
10533
+ // final rmsnorm
10534
+ cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
10535
+
10536
+ cb(cur, "result_norm", -1);
10537
+ res->t_embd = cur;
10538
+
10539
+ // lm_head
10540
+ cur = build_lora_mm(model.output, cur);
10541
+
10542
+ cb(cur, "result_output", -1);
10543
+ res->t_logits = cur;
10544
+
10545
+ ggml_build_forward_expand(gf, cur);
10546
+ }
10547
+
10548
+ };
10549
+
10550
+ struct llm_build_jamba : public llm_graph_context_mamba {
10551
+ llm_build_jamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
10552
+ const int64_t n_embd_head = hparams.n_embd_head_v;
10553
+
10554
+ ggml_tensor * cur;
10555
+ ggml_tensor * inpL;
10556
+
10557
+ // {n_embd, n_tokens}
10558
+ inpL = build_inp_embd(model.tok_embd);
10559
+
10560
+ auto * inp_hybrid = build_inp_mem_hybrid();
10561
+
10562
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
10563
+
10564
+ for (int il = 0; il < n_layer; ++il) {
10565
+ const int64_t n_head_kv = hparams.n_head_kv(il);
10566
+
10567
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
10568
+ cb(cur, "attn_norm", il);
10569
+
10570
+ if (n_head_kv == 0) {
10571
+ cur = build_mamba_layer(inp_hybrid->get_recr(), gf, cur, model, ubatch, il);
10572
+ } else {
10573
+ // Attention
10574
+
10575
+ struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
10576
+ struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
10577
+ struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
10578
+
10579
+ cb(Qcur, "Qcur", il);
10580
+ cb(Kcur, "Kcur", il);
10581
+ cb(Vcur, "Vcur", il);
10582
+
10583
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
10584
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
10585
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
10586
+
10587
+ cb(Qcur, "Qcur", il);
10588
+ cb(Kcur, "Kcur", il);
10589
+ cb(Vcur, "Vcur", il);
10590
+
10591
+ // No RoPE :)
10592
+ cur = build_attn(inp_hybrid->get_attn(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
10593
+ }
10594
+
10595
+ if (il == n_layer - 1 && inp_out_ids) {
10596
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
10597
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
10598
+ }
10599
+
10600
+ // residual
10601
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, inpL, cur);
10602
+ cb(cur, "ffn_inp", il);
10603
+
10604
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
10605
+ cb(cur, "ffn_norm", il);
10606
+
10607
+ // feed-forward network
10608
+ if (model.layers[il].ffn_gate_inp == nullptr) {
10609
+ // FFN
10610
+ cur = build_ffn(cur,
10611
+ model.layers[il].ffn_up, NULL, NULL,
10612
+ model.layers[il].ffn_gate, NULL, NULL,
10613
+ model.layers[il].ffn_down, NULL, NULL,
10614
+ NULL,
10615
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
10616
+ cb(cur, "ffn_out", il);
10617
+ } else {
10618
+ // MoE branch
10619
+ cur = build_moe_ffn(cur,
10620
+ model.layers[il].ffn_gate_inp,
10621
+ model.layers[il].ffn_up_exps,
10622
+ model.layers[il].ffn_gate_exps,
10623
+ model.layers[il].ffn_down_exps,
10624
+ nullptr,
10625
+ n_expert, n_expert_used,
10626
+ LLM_FFN_SILU, false,
10627
+ false, 0.0,
10628
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
10629
+ il);
10630
+ cb(cur, "ffn_moe_out", il);
10631
+ }
10632
+
10633
+ // residual
10634
+ cur = ggml_add(ctx0, ffn_inp, cur);
10635
+
10636
+ cur = build_cvec(cur, il);
10637
+ cb(cur, "l_out", il);
10638
+
10639
+ // input for next layer
10640
+ inpL = cur;
10641
+ }
10642
+
10643
+ // final rmsnorm
10644
+ cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
10645
+
10646
+ cb(cur, "result_norm", -1);
10647
+ res->t_embd = cur;
10648
+
10649
+ // lm_head
10650
+ cur = build_lora_mm(model.output, cur);
10651
+
10652
+ cb(cur, "result_output", -1);
10653
+ res->t_logits = cur;
10654
+
10655
+ ggml_build_forward_expand(gf, cur);
10656
+ }
10657
+ };
10658
+
10659
+ struct llm_build_command_r : public llm_graph_context {
9812
10660
  llm_build_command_r(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
9813
10661
  const int64_t n_embd_head = hparams.n_embd_head_v;
9814
10662
 
@@ -10514,10 +11362,10 @@ struct llm_build_openelm : public llm_graph_context {
10514
11362
 
10515
11363
  cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
10516
11364
 
10517
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0));
11365
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0);
10518
11366
  cb(Qcur, "Qcur", il);
10519
11367
 
10520
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head));
11368
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
10521
11369
  cb(Kcur, "Kcur", il);
10522
11370
 
10523
11371
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
@@ -10639,12 +11487,10 @@ struct llm_build_gptneox : public llm_graph_context {
10639
11487
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
10640
11488
  cb(cur, "bqkv", il);
10641
11489
 
10642
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
10643
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
11490
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
11491
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
10644
11492
  ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
10645
11493
 
10646
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
10647
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
10648
11494
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
10649
11495
 
10650
11496
  Qcur = ggml_rope_ext(
@@ -11889,6 +12735,8 @@ struct llm_build_chatglm : public llm_graph_context {
11889
12735
  if (model.layers[il].bv) {
11890
12736
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
11891
12737
  }
12738
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12739
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
11892
12740
  } else {
11893
12741
  cur = build_lora_mm(model.layers[il].wqkv, cur);
11894
12742
  cb(cur, "wqkv", il);
@@ -11896,13 +12744,11 @@ struct llm_build_chatglm : public llm_graph_context {
11896
12744
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
11897
12745
  cb(cur, "bqkv", il);
11898
12746
  }
11899
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
11900
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
12747
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
12748
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
11901
12749
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
11902
12750
  }
11903
12751
 
11904
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
11905
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
11906
12752
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
11907
12753
 
11908
12754
  //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
@@ -12023,6 +12869,8 @@ struct llm_build_glm4 : public llm_graph_context {
12023
12869
  if (model.layers[il].bv) {
12024
12870
  Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
12025
12871
  }
12872
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12873
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
12026
12874
  } else {
12027
12875
  cur = build_lora_mm(model.layers[il].wqkv, cur);
12028
12876
  cb(cur, "wqkv", il);
@@ -12030,13 +12878,11 @@ struct llm_build_glm4 : public llm_graph_context {
12030
12878
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
12031
12879
  cb(cur, "bqkv", il);
12032
12880
  }
12033
- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
12034
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
12881
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
12882
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
12035
12883
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
12036
12884
  }
12037
12885
 
12038
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12039
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
12040
12886
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
12041
12887
 
12042
12888
  Qcur = ggml_rope_ext(
@@ -13135,13 +13981,11 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
13135
13981
  }
13136
13982
  };
13137
13983
 
13138
-
13139
13984
  struct llm_build_granite : public llm_graph_context {
13140
13985
  llm_build_granite(
13141
13986
  const llama_model & model,
13142
13987
  const llm_graph_params & params,
13143
- ggml_cgraph * gf,
13144
- const bool use_rope = true)
13988
+ ggml_cgraph * gf)
13145
13989
  : llm_graph_context(params) {
13146
13990
 
13147
13991
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13156,14 +14000,12 @@ struct llm_build_granite : public llm_graph_context {
13156
14000
 
13157
14001
  // inp_pos - built only if rope enabled
13158
14002
  ggml_tensor * inp_pos = nullptr;
13159
- if (use_rope) {
14003
+ if (hparams.rope_finetuned) {
13160
14004
  inp_pos = build_inp_pos();
13161
14005
  }
13162
14006
 
13163
14007
  auto * inp_attn = build_attn_inp_kv_unified();
13164
14008
 
13165
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
13166
-
13167
14009
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13168
14010
 
13169
14011
  for (int il = 0; il < n_layer; ++il) {
@@ -13176,21 +14018,956 @@ struct llm_build_granite : public llm_graph_context {
13176
14018
  cb(cur, "attn_norm", il);
13177
14019
 
13178
14020
  // self-attention
13179
- {
13180
- // compute Q and K and (optionally) RoPE them
13181
- ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
13182
- cb(Qcur, "Qcur", il);
13183
- if (model.layers[il].bq) {
13184
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
13185
- cb(Qcur, "Qcur", il);
13186
- }
13187
-
13188
- ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
13189
- cb(Kcur, "Kcur", il);
13190
- if (model.layers[il].bk) {
13191
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
13192
- cb(Kcur, "Kcur", il);
13193
- }
14021
+ cur = build_attention_layer(
14022
+ gf, cur, inp_pos, inp_attn,
14023
+ model, n_embd_head, il);
14024
+
14025
+ if (il == n_layer - 1 && inp_out_ids) {
14026
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14027
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
14028
+ }
14029
+
14030
+ // ffn
14031
+ cur = build_layer_ffn(cur, inpSA, model, il);
14032
+
14033
+ // input for next layer
14034
+ inpL = cur;
14035
+ }
14036
+
14037
+ cur = inpL;
14038
+
14039
+ cur = build_norm(cur,
14040
+ model.output_norm, NULL,
14041
+ LLM_NORM_RMS, -1);
14042
+
14043
+ cb(cur, "result_norm", -1);
14044
+ res->t_embd = cur;
14045
+
14046
+ // lm_head
14047
+ cur = build_lora_mm(model.output, cur);
14048
+
14049
+ // For Granite architectures - scale logits
14050
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
14051
+ cb(cur, "result_output", -1);
14052
+ res->t_logits = cur;
14053
+
14054
+ ggml_build_forward_expand(gf, cur);
14055
+ }
14056
+
14057
+ ggml_tensor * build_attention_layer(
14058
+ ggml_cgraph * gf,
14059
+ ggml_tensor * cur,
14060
+ ggml_tensor * inp_pos,
14061
+ llm_graph_input_attn_kv_unified * inp_attn,
14062
+ const llama_model & model,
14063
+ const int64_t n_embd_head,
14064
+ const int il) {
14065
+
14066
+ // compute Q and K and (optionally) RoPE them
14067
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
14068
+ cb(Qcur, "Qcur", il);
14069
+ if (model.layers[il].bq) {
14070
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
14071
+ cb(Qcur, "Qcur", il);
14072
+ }
14073
+
14074
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
14075
+ cb(Kcur, "Kcur", il);
14076
+ if (model.layers[il].bk) {
14077
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
14078
+ cb(Kcur, "Kcur", il);
14079
+ }
14080
+
14081
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
14082
+ cb(Vcur, "Vcur", il);
14083
+ if (model.layers[il].bv) {
14084
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
14085
+ cb(Vcur, "Vcur", il);
14086
+ }
14087
+
14088
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
14089
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
14090
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
14091
+
14092
+ const bool use_rope = hparams.rope_finetuned;
14093
+ if (use_rope) {
14094
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
14095
+ Qcur = ggml_rope_ext(
14096
+ ctx0, Qcur, inp_pos, rope_factors,
14097
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14098
+ ext_factor, attn_factor, beta_fast, beta_slow
14099
+ );
14100
+
14101
+ Kcur = ggml_rope_ext(
14102
+ ctx0, Kcur, inp_pos, rope_factors,
14103
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14104
+ ext_factor, attn_factor, beta_fast, beta_slow
14105
+ );
14106
+ }
14107
+
14108
+ cb(Qcur, "Qcur", il);
14109
+ cb(Kcur, "Kcur", il);
14110
+ cb(Vcur, "Vcur", il);
14111
+
14112
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
14113
+ cur = build_attn(inp_attn, gf,
14114
+ model.layers[il].wo, model.layers[il].bo,
14115
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
14116
+ cb(cur, "attn_out", il);
14117
+ return cur;
14118
+ }
14119
+
14120
+ ggml_tensor * build_layer_ffn(
14121
+ ggml_tensor * cur,
14122
+ ggml_tensor * inpSA,
14123
+ const llama_model & model,
14124
+ const int il) {
14125
+
14126
+ // For Granite architectures - scale residual
14127
+ if (hparams.f_residual_scale) {
14128
+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
14129
+ }
14130
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
14131
+ cb(ffn_inp, "ffn_inp", il);
14132
+
14133
+ // feed-forward network (non-MoE)
14134
+ if (model.layers[il].ffn_gate_inp == nullptr) {
14135
+
14136
+ cur = build_norm(ffn_inp,
14137
+ model.layers[il].ffn_norm, NULL,
14138
+ LLM_NORM_RMS, il);
14139
+ cb(cur, "ffn_norm", il);
14140
+
14141
+ cur = build_ffn(cur,
14142
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
14143
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
14144
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
14145
+ NULL,
14146
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14147
+ cb(cur, "ffn_out", il);
14148
+
14149
+ } else {
14150
+ // MoE branch
14151
+ cur = build_norm(ffn_inp,
14152
+ model.layers[il].ffn_norm, NULL,
14153
+ LLM_NORM_RMS, il);
14154
+ cb(cur, "ffn_norm", il);
14155
+
14156
+ ggml_tensor * moe_out = build_moe_ffn(cur,
14157
+ model.layers[il].ffn_gate_inp,
14158
+ model.layers[il].ffn_up_exps,
14159
+ model.layers[il].ffn_gate_exps,
14160
+ model.layers[il].ffn_down_exps,
14161
+ nullptr,
14162
+ n_expert, n_expert_used,
14163
+ LLM_FFN_SILU, true,
14164
+ false, 0.0,
14165
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
14166
+ il);
14167
+ cb(moe_out, "ffn_moe_out", il);
14168
+
14169
+ // For Granite MoE Shared
14170
+ if (hparams.n_ff_shexp > 0) {
14171
+ ggml_tensor * ffn_shexp = build_ffn(cur,
14172
+ model.layers[il].ffn_up_shexp, NULL, NULL,
14173
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
14174
+ model.layers[il].ffn_down_shexp, NULL, NULL,
14175
+ NULL,
14176
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14177
+ cb(ffn_shexp, "ffn_shexp", il);
14178
+
14179
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
14180
+ cb(cur, "ffn_out", il);
14181
+ } else {
14182
+ cur = moe_out;
14183
+ }
14184
+ }
14185
+
14186
+ // For Granite architectures - scale residual
14187
+ if (hparams.f_residual_scale) {
14188
+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
14189
+ }
14190
+ cur = ggml_add(ctx0, cur, ffn_inp);
14191
+ cb(cur, "ffn_out", il);
14192
+
14193
+ cur = build_cvec(cur, il);
14194
+ cb(cur, "l_out", il);
14195
+
14196
+ return cur;
14197
+ }
14198
+ };
14199
+
14200
+ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
14201
+
14202
+ llm_build_granite_hybrid(
14203
+ const llama_model & model,
14204
+ const llm_graph_params & params,
14205
+ ggml_cgraph * gf) :
14206
+ llm_graph_context_mamba(params) {
14207
+
14208
+ const int64_t n_embd_head = hparams.n_embd_head_v;
14209
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
14210
+
14211
+ ggml_tensor * cur;
14212
+ ggml_tensor * inpL;
14213
+
14214
+ inpL = build_inp_embd(model.tok_embd);
14215
+
14216
+ auto * inp = build_inp_mem_hybrid();
14217
+
14218
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
14219
+
14220
+ // Positional embeddings populated if rope enabled
14221
+ ggml_tensor * inp_pos = nullptr;
14222
+ if (hparams.rope_finetuned) {
14223
+ inp_pos = build_inp_pos();
14224
+ }
14225
+
14226
+ for (int il = 0; il < n_layer; ++il) {
14227
+ struct ggml_tensor * inpSA = inpL;
14228
+
14229
+ // norm
14230
+ cur = build_norm(inpL,
14231
+ model.layers[il].attn_norm, NULL,
14232
+ LLM_NORM_RMS, il);
14233
+ cb(cur, "attn_norm", il);
14234
+
14235
+ if (hparams.is_recurrent(il)) {
14236
+ // ssm layer //
14237
+ cur = build_mamba2_layer(inp->get_recr(), gf, cur, model, ubatch, il);
14238
+ } else {
14239
+ // attention layer //
14240
+ cur = build_attention_layer(
14241
+ gf, cur, inp_pos, inp->get_attn(), model,
14242
+ n_embd_head, il);
14243
+ }
14244
+
14245
+ if (il == n_layer - 1 && inp_out_ids) {
14246
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14247
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
14248
+ }
14249
+
14250
+ // ffn
14251
+ cur = build_layer_ffn(cur, inpSA, model, il);
14252
+
14253
+ // input for next layer
14254
+ inpL = cur;
14255
+ }
14256
+
14257
+ cur = inpL;
14258
+
14259
+ cur = build_norm(cur,
14260
+ model.output_norm, NULL,
14261
+ LLM_NORM_RMS, -1);
14262
+
14263
+ cb(cur, "result_norm", -1);
14264
+ res->t_embd = cur;
14265
+
14266
+ // lm_head
14267
+ cur = build_lora_mm(model.output, cur);
14268
+
14269
+ // For Granite architectures - scale logits
14270
+ if (hparams.f_logit_scale) {
14271
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
14272
+ }
14273
+ cb(cur, "result_output", -1);
14274
+ res->t_logits = cur;
14275
+
14276
+ ggml_build_forward_expand(gf, cur);
14277
+ }
14278
+
14279
+ ggml_tensor * build_attention_layer(
14280
+ ggml_cgraph * gf,
14281
+ ggml_tensor * cur,
14282
+ ggml_tensor * inp_pos,
14283
+ llm_graph_input_attn_kv_unified * inp_attn,
14284
+ const llama_model & model,
14285
+ const int64_t n_embd_head,
14286
+ const int il) {
14287
+
14288
+ // compute Q and K and (optionally) RoPE them
14289
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
14290
+ cb(Qcur, "Qcur", il);
14291
+ if (model.layers[il].bq) {
14292
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
14293
+ cb(Qcur, "Qcur", il);
14294
+ }
14295
+
14296
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
14297
+ cb(Kcur, "Kcur", il);
14298
+ if (model.layers[il].bk) {
14299
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
14300
+ cb(Kcur, "Kcur", il);
14301
+ }
14302
+
14303
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
14304
+ cb(Vcur, "Vcur", il);
14305
+ if (model.layers[il].bv) {
14306
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
14307
+ cb(Vcur, "Vcur", il);
14308
+ }
14309
+
14310
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
14311
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
14312
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
14313
+
14314
+ const bool use_rope = hparams.rope_finetuned;
14315
+ if (use_rope) {
14316
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
14317
+ Qcur = ggml_rope_ext(
14318
+ ctx0, Qcur, inp_pos, rope_factors,
14319
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14320
+ ext_factor, attn_factor, beta_fast, beta_slow
14321
+ );
14322
+
14323
+ Kcur = ggml_rope_ext(
14324
+ ctx0, Kcur, inp_pos, rope_factors,
14325
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14326
+ ext_factor, attn_factor, beta_fast, beta_slow
14327
+ );
14328
+ }
14329
+
14330
+ cb(Qcur, "Qcur", il);
14331
+ cb(Kcur, "Kcur", il);
14332
+ cb(Vcur, "Vcur", il);
14333
+
14334
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
14335
+ cur = build_attn(inp_attn, gf,
14336
+ model.layers[il].wo, model.layers[il].bo,
14337
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
14338
+ cb(cur, "attn_out", il);
14339
+ return cur;
14340
+ }
14341
+
14342
+ ggml_tensor * build_layer_ffn(
14343
+ ggml_tensor * cur,
14344
+ ggml_tensor * inpSA,
14345
+ const llama_model & model,
14346
+ const int il) {
14347
+
14348
+ // For Granite architectures - scale residual
14349
+ if (hparams.f_residual_scale) {
14350
+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
14351
+ }
14352
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
14353
+ cb(ffn_inp, "ffn_inp", il);
14354
+
14355
+ // feed-forward network (non-MoE)
14356
+ if (model.layers[il].ffn_gate_inp == nullptr) {
14357
+
14358
+ cur = build_norm(ffn_inp,
14359
+ model.layers[il].ffn_norm, NULL,
14360
+ LLM_NORM_RMS, il);
14361
+ cb(cur, "ffn_norm", il);
14362
+
14363
+ cur = build_ffn(cur,
14364
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
14365
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
14366
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
14367
+ NULL,
14368
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14369
+ cb(cur, "ffn_out", il);
14370
+
14371
+ } else {
14372
+ // MoE branch
14373
+ cur = build_norm(ffn_inp,
14374
+ model.layers[il].ffn_norm, NULL,
14375
+ LLM_NORM_RMS, il);
14376
+ cb(cur, "ffn_norm", il);
14377
+
14378
+ ggml_tensor * moe_out = build_moe_ffn(cur,
14379
+ model.layers[il].ffn_gate_inp,
14380
+ model.layers[il].ffn_up_exps,
14381
+ model.layers[il].ffn_gate_exps,
14382
+ model.layers[il].ffn_down_exps,
14383
+ nullptr,
14384
+ n_expert, n_expert_used,
14385
+ LLM_FFN_SILU, true,
14386
+ false, 0.0,
14387
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
14388
+ il);
14389
+ cb(moe_out, "ffn_moe_out", il);
14390
+
14391
+ // For Granite MoE Shared
14392
+ if (hparams.n_ff_shexp > 0) {
14393
+ ggml_tensor * ffn_shexp = build_ffn(cur,
14394
+ model.layers[il].ffn_up_shexp, NULL, NULL,
14395
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
14396
+ model.layers[il].ffn_down_shexp, NULL, NULL,
14397
+ NULL,
14398
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14399
+ cb(ffn_shexp, "ffn_shexp", il);
14400
+
14401
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
14402
+ cb(cur, "ffn_out", il);
14403
+ } else {
14404
+ cur = moe_out;
14405
+ }
14406
+ }
14407
+
14408
+ // For Granite architectures - scale residual
14409
+ if (hparams.f_residual_scale) {
14410
+ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
14411
+ }
14412
+ cur = ggml_add(ctx0, cur, ffn_inp);
14413
+ cb(cur, "ffn_out", il);
14414
+
14415
+ cur = build_cvec(cur, il);
14416
+ cb(cur, "l_out", il);
14417
+
14418
+ return cur;
14419
+ }
14420
+ };
14421
+
14422
+ // ref: https://github.com/facebookresearch/chameleon
14423
+ // based on the original build_llama() function, changes:
14424
+ // * qk-norm
14425
+ // * swin-norm
14426
+ // * removed bias
14427
+ // * removed MoE
14428
+ struct llm_build_chameleon : public llm_graph_context {
14429
+ llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
14430
+ const int64_t n_embd_head = hparams.n_embd_head_v;
14431
+
14432
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
14433
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
14434
+
14435
+ ggml_tensor * cur;
14436
+ ggml_tensor * inpL;
14437
+
14438
+ inpL = build_inp_embd(model.tok_embd);
14439
+
14440
+ // inp_pos - contains the positions
14441
+ ggml_tensor * inp_pos = build_inp_pos();
14442
+
14443
+ auto * inp_attn = build_attn_inp_kv_unified();
14444
+
14445
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
14446
+
14447
+ for (int il = 0; il < n_layer; ++il) {
14448
+ ggml_tensor * inpSA = inpL;
14449
+
14450
+ // norm
14451
+ if (hparams.swin_norm) {
14452
+ cur = inpL;
14453
+ } else {
14454
+ cur = build_norm(inpL,
14455
+ model.layers[il].attn_norm, NULL,
14456
+ LLM_NORM_RMS, il);
14457
+ cb(cur, "attn_norm", il);
14458
+ }
14459
+
14460
+ // self-attention
14461
+ {
14462
+ // compute Q and K and RoPE them
14463
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
14464
+ cb(Qcur, "Qcur", il);
14465
+
14466
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
14467
+ cb(Kcur, "Kcur", il);
14468
+
14469
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
14470
+ cb(Vcur, "Vcur", il);
14471
+
14472
+ if (model.layers[il].attn_q_norm) {
14473
+ Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
14474
+ ggml_element_size(Qcur) * n_embd_head,
14475
+ ggml_element_size(Qcur) * n_embd_head * n_head,
14476
+ 0);
14477
+ cb(Qcur, "Qcur", il);
14478
+
14479
+ Qcur = build_norm(Qcur,
14480
+ model.layers[il].attn_q_norm,
14481
+ model.layers[il].attn_q_norm_b,
14482
+ LLM_NORM, il);
14483
+ cb(Qcur, "Qcur", il);
14484
+ }
14485
+
14486
+ if (model.layers[il].attn_k_norm) {
14487
+ Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
14488
+ ggml_element_size(Kcur) * n_embd_head,
14489
+ ggml_element_size(Kcur) * n_embd_head * n_head_kv,
14490
+ 0);
14491
+ cb(Kcur, "Kcur", il);
14492
+
14493
+ Kcur = build_norm(Kcur,
14494
+ model.layers[il].attn_k_norm,
14495
+ model.layers[il].attn_k_norm_b,
14496
+ LLM_NORM, il);
14497
+ cb(Kcur, "Kcur", il);
14498
+ }
14499
+
14500
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
14501
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
14502
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
14503
+
14504
+ Qcur = ggml_rope_ext(
14505
+ ctx0, Qcur, inp_pos, nullptr,
14506
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14507
+ ext_factor, attn_factor, beta_fast, beta_slow
14508
+ );
14509
+
14510
+ Kcur = ggml_rope_ext(
14511
+ ctx0, Kcur, inp_pos, nullptr,
14512
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14513
+ ext_factor, attn_factor, beta_fast, beta_slow
14514
+ );
14515
+
14516
+ cb(Qcur, "Qcur", il);
14517
+ cb(Kcur, "Kcur", il);
14518
+ cb(Vcur, "Vcur", il);
14519
+
14520
+ cur = build_attn(inp_attn, gf,
14521
+ model.layers[il].wo, nullptr,
14522
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
14523
+ }
14524
+
14525
+ if (il == n_layer - 1 && inp_out_ids) {
14526
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14527
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
14528
+ }
14529
+
14530
+ if (hparams.swin_norm) {
14531
+ cur = build_norm(cur,
14532
+ model.layers[il].attn_norm, NULL,
14533
+ LLM_NORM_RMS, il);
14534
+ }
14535
+
14536
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
14537
+ cb(ffn_inp, "ffn_inp", il);
14538
+
14539
+ // feed-forward network
14540
+ if (!hparams.swin_norm) {
14541
+ cur = build_norm(ffn_inp,
14542
+ model.layers[il].ffn_norm, NULL,
14543
+ LLM_NORM_RMS, il);
14544
+ cb(cur, "ffn_norm", il);
14545
+ }
14546
+
14547
+ cur = build_ffn(cur,
14548
+ model.layers[il].ffn_up, NULL, NULL,
14549
+ model.layers[il].ffn_gate, NULL, NULL,
14550
+ model.layers[il].ffn_down, NULL, NULL,
14551
+ NULL,
14552
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
14553
+ cb(cur, "ffn_out", il);
14554
+
14555
+ if (hparams.swin_norm) {
14556
+ cur = build_norm(cur,
14557
+ model.layers[il].ffn_norm, NULL,
14558
+ LLM_NORM_RMS, il);
14559
+ cb(cur, "ffn_norm", il);
14560
+ }
14561
+
14562
+ cur = ggml_add(ctx0, cur, ffn_inp);
14563
+ cb(cur, "ffn_out", il);
14564
+
14565
+ cur = build_cvec(cur, il);
14566
+ cb(cur, "l_out", il);
14567
+
14568
+ // input for next layer
14569
+ inpL = cur;
14570
+ }
14571
+
14572
+ cur = inpL;
14573
+
14574
+ cur = build_norm(cur,
14575
+ model.output_norm, NULL,
14576
+ LLM_NORM_RMS, -1);
14577
+
14578
+ cb(cur, "result_norm", -1);
14579
+ res->t_embd = cur;
14580
+
14581
+ // lm_head
14582
+ cur = build_lora_mm(model.output, cur);
14583
+ cb(cur, "result_output_with_img_logits", -1);
14584
+
14585
+ // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
14586
+ // Needs to be removed once image outputs are supported.
14587
+ int img_token_end_idx = 8196;
14588
+ int img_token_start_idx = 4;
14589
+ int num_img_tokens = img_token_end_idx - img_token_start_idx;
14590
+ // creates 1d tensor of size num_img_tokens and values -FLT_MAX,
14591
+ // which ensures that text token values are always at least larger than image token values
14592
+ ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
14593
+ img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
14594
+ cb(img_logits, "img_logits", -1);
14595
+
14596
+ cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
14597
+
14598
+ cb(cur, "result_output", -1);
14599
+ res->t_logits = cur;
14600
+
14601
+ ggml_build_forward_expand(gf, cur);
14602
+ }
14603
+ };
14604
+
14605
+ struct llm_build_wavtokenizer_dec : public llm_graph_context {
14606
+ llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
14607
+ ggml_tensor * cur;
14608
+ ggml_tensor * inpL;
14609
+
14610
+ inpL = build_inp_embd(model.tok_embd);
14611
+
14612
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
14613
+
14614
+ cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
14615
+ cur = ggml_add(ctx0, cur, model.conv1d_b);
14616
+
14617
+ // posnet
14618
+ for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
14619
+ const auto & layer = model.layers[il].posnet;
14620
+
14621
+ inpL = cur;
14622
+
14623
+ switch (il) {
14624
+ case 0:
14625
+ case 1:
14626
+ case 3:
14627
+ case 4:
14628
+ {
14629
+ cur = build_norm(cur,
14630
+ layer.norm1,
14631
+ layer.norm1_b,
14632
+ LLM_NORM_GROUP, 0);
14633
+
14634
+ cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
14635
+
14636
+ cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
14637
+ cur = ggml_add(ctx0, cur, layer.conv1_b);
14638
+
14639
+ cur = build_norm(cur,
14640
+ layer.norm2,
14641
+ layer.norm2_b,
14642
+ LLM_NORM_GROUP, 0);
14643
+
14644
+ cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
14645
+
14646
+ cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
14647
+ cur = ggml_add(ctx0, cur, layer.conv2_b);
14648
+
14649
+ cur = ggml_add(ctx0, cur, inpL);
14650
+ } break;
14651
+ case 2:
14652
+ {
14653
+ cur = build_norm(cur,
14654
+ layer.attn_norm,
14655
+ layer.attn_norm_b,
14656
+ LLM_NORM_GROUP, 0);
14657
+
14658
+ ggml_tensor * q;
14659
+ ggml_tensor * k;
14660
+ ggml_tensor * v;
14661
+
14662
+ q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
14663
+ k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
14664
+ v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
14665
+
14666
+ q = ggml_add(ctx0, q, layer.attn_q_b);
14667
+ k = ggml_add(ctx0, k, layer.attn_k_b);
14668
+ v = ggml_add(ctx0, v, layer.attn_v_b);
14669
+
14670
+ q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
14671
+ k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
14672
+
14673
+ ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
14674
+
14675
+ kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
14676
+
14677
+ cur = ggml_mul_mat(ctx0, kq, v);
14678
+
14679
+ cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
14680
+ cur = ggml_add(ctx0, cur, layer.attn_o_b);
14681
+
14682
+ cur = ggml_add(ctx0, cur, inpL);
14683
+ } break;
14684
+ case 5:
14685
+ {
14686
+ cur = build_norm(cur,
14687
+ layer.norm,
14688
+ layer.norm_b,
14689
+ LLM_NORM_GROUP, 0);
14690
+ } break;
14691
+ default: GGML_ABORT("unknown posnet layer");
14692
+ };
14693
+ }
14694
+
14695
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
14696
+
14697
+ cur = build_norm(cur,
14698
+ model.tok_norm,
14699
+ model.tok_norm_b,
14700
+ LLM_NORM, -1);
14701
+
14702
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
14703
+
14704
+ inpL = cur;
14705
+
14706
+ // convnext
14707
+ for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
14708
+ const auto & layer = model.layers[il].convnext;
14709
+
14710
+ cur = inpL;
14711
+
14712
+ cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
14713
+ cur = ggml_add(ctx0, cur, layer.dw_b);
14714
+
14715
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
14716
+
14717
+ cur = build_norm(cur,
14718
+ layer.norm,
14719
+ layer.norm_b,
14720
+ LLM_NORM, -1);
14721
+
14722
+ cur = build_ffn(cur,
14723
+ layer.pw1, layer.pw1_b, NULL,
14724
+ NULL, NULL, NULL,
14725
+ layer.pw2, layer.pw2_b, NULL,
14726
+ NULL,
14727
+ LLM_FFN_GELU, LLM_FFN_SEQ, il);
14728
+
14729
+ cur = ggml_mul(ctx0, cur, layer.gamma);
14730
+
14731
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
14732
+
14733
+ inpL = ggml_add(ctx0, cur, inpL);
14734
+ }
14735
+
14736
+ cur = inpL;
14737
+
14738
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
14739
+
14740
+ cur = build_norm(cur,
14741
+ model.output_norm,
14742
+ model.output_norm_b,
14743
+ LLM_NORM, -1);
14744
+
14745
+ // lm_head
14746
+ cur = build_lora_mm(model.output, cur);
14747
+
14748
+ cur = ggml_add(ctx0, cur, model.output_b);
14749
+
14750
+ cb(cur, "result_embd", -1);
14751
+ res->t_embd = cur;
14752
+
14753
+ ggml_build_forward_expand(gf, cur);
14754
+ }
14755
+ };
14756
+
14757
+ struct llm_build_plm : public llm_graph_context {
14758
+ llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
14759
+ const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
14760
+
14761
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
14762
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
14763
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
14764
+
14765
+ ggml_tensor * cur;
14766
+ ggml_tensor * inpL;
14767
+
14768
+ // {n_embd, n_tokens}
14769
+ inpL = build_inp_embd(model.tok_embd);
14770
+
14771
+ // inp_pos - contains the positions
14772
+ ggml_tensor * inp_pos = build_inp_pos();
14773
+
14774
+ auto * inp_attn = build_attn_inp_kv_unified();
14775
+
14776
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
14777
+
14778
+ for (int il = 0; il < n_layer; ++il) {
14779
+ ggml_tensor * inpSA = inpL;
14780
+
14781
+ // norm
14782
+ cur = build_norm(inpL,
14783
+ model.layers[il].attn_norm, NULL,
14784
+ LLM_NORM_RMS, il);
14785
+ cb(cur, "attn_norm", il);
14786
+
14787
+ // self_attention
14788
+ {
14789
+ ggml_tensor * q = NULL;
14790
+ q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
14791
+ cb(q, "q", il);
14792
+
14793
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
14794
+ ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
14795
+ ggml_row_size(q->type, hparams.n_embd_head_k),
14796
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
14797
+ 0);
14798
+ cb(q_nope, "q_nope", il);
14799
+
14800
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
14801
+ ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
14802
+ ggml_row_size(q->type, hparams.n_embd_head_k),
14803
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
14804
+ ggml_row_size(q->type, n_embd_head_qk_nope));
14805
+ cb(q_pe, "q_pe", il);
14806
+
14807
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
14808
+ ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
14809
+ cb(kv_pe_compresseed, "kv_pe_compresseed", il);
14810
+
14811
+ // split into {kv_lora_rank, n_tokens}
14812
+ ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
14813
+ kv_pe_compresseed->nb[1],
14814
+ 0);
14815
+ cb(kv_compressed, "kv_compressed", il);
14816
+
14817
+ // and {n_embd_head_qk_rope, n_tokens}
14818
+ ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
14819
+ kv_pe_compresseed->nb[1],
14820
+ kv_pe_compresseed->nb[1],
14821
+ ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
14822
+ cb(k_pe, "k_pe", il);
14823
+
14824
+ kv_compressed = build_norm(kv_compressed,
14825
+ model.layers[il].attn_kv_a_norm, NULL,
14826
+ LLM_NORM_RMS, il);
14827
+ cb(kv_compressed, "kv_compressed", il);
14828
+
14829
+ // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
14830
+ ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
14831
+ cb(kv, "kv", il);
14832
+
14833
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
14834
+ ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
14835
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
14836
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
14837
+ 0);
14838
+ cb(k_nope, "k_nope", il);
14839
+
14840
+ // and {n_head * n_embd_head_v, n_tokens}
14841
+ ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
14842
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
14843
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
14844
+ ggml_row_size(kv->type, (n_embd_head_qk_nope)));
14845
+ cb(v_states, "v_states", il);
14846
+
14847
+ v_states = ggml_cont(ctx0, v_states);
14848
+ cb(v_states, "v_states", il);
14849
+
14850
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
14851
+ ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
14852
+ 0);
14853
+ cb(v_states, "v_states", il);
14854
+
14855
+ q_pe = ggml_rope_ext(
14856
+ ctx0, q_pe, inp_pos, nullptr,
14857
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14858
+ ext_factor, attn_factor, beta_fast, beta_slow
14859
+ );
14860
+ cb(q_pe, "q_pe", il);
14861
+
14862
+ // shared RoPE key
14863
+ k_pe = ggml_rope_ext(
14864
+ ctx0, k_pe, inp_pos, nullptr,
14865
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14866
+ ext_factor, attn_factor, beta_fast, beta_slow
14867
+ );
14868
+ cb(k_pe, "k_pe", il);
14869
+
14870
+ ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
14871
+ cb(q_states, "q_states", il);
14872
+
14873
+ ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
14874
+ cb(k_states, "k_states", il);
14875
+
14876
+ cur = build_attn(inp_attn, gf,
14877
+ model.layers[il].wo, NULL,
14878
+ q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
14879
+ }
14880
+
14881
+ if (il == n_layer - 1 && inp_out_ids) {
14882
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14883
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
14884
+ }
14885
+
14886
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
14887
+ cb(ffn_inp, "ffn_inp", il);
14888
+
14889
+ cur = build_norm(ffn_inp,
14890
+ model.layers[il].ffn_norm, NULL,
14891
+ LLM_NORM_RMS, il);
14892
+ cb(cur, "ffn_norm", il);
14893
+
14894
+ cur = build_ffn(cur,
14895
+ model.layers[il].ffn_up, NULL, NULL,
14896
+ NULL, NULL, NULL,
14897
+ model.layers[il].ffn_down, NULL, NULL,
14898
+ NULL,
14899
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
14900
+ cb(cur, "ffn_out", il);
14901
+
14902
+ cur = ggml_add(ctx0, cur, ffn_inp);
14903
+
14904
+ cur = build_cvec(cur, il);
14905
+ cb(cur, "l_out", il);
14906
+
14907
+ // input for next layer
14908
+ inpL = cur;
14909
+ }
14910
+
14911
+ cur = inpL;
14912
+
14913
+ cur = build_norm(cur,
14914
+ model.output_norm, NULL,
14915
+ LLM_NORM_RMS, -1);
14916
+
14917
+ cb(cur, "result_norm", -1);
14918
+ res->t_embd = cur;
14919
+
14920
+ cur = build_lora_mm(model.output, cur);
14921
+
14922
+ cb(cur, "result_output", -1);
14923
+ res->t_logits = cur;
14924
+
14925
+ ggml_build_forward_expand(gf, cur);
14926
+ }
14927
+ };
14928
+
14929
+ struct llm_build_bailingmoe : public llm_graph_context {
14930
+ llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
14931
+ ggml_tensor * cur;
14932
+ ggml_tensor * inpL;
14933
+
14934
+ inpL = build_inp_embd(model.tok_embd);
14935
+
14936
+ // inp_pos - contains the positions
14937
+ ggml_tensor * inp_pos = build_inp_pos();
14938
+
14939
+ auto * inp_attn = build_attn_inp_kv_unified();
14940
+
14941
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
14942
+
14943
+ for (int il = 0; il < n_layer; ++il) {
14944
+ ggml_tensor * inpSA = inpL;
14945
+
14946
+ // norm
14947
+ cur = build_norm(inpL,
14948
+ model.layers[il].attn_norm, NULL,
14949
+ LLM_NORM_RMS, il);
14950
+ cb(cur, "attn_norm", il);
14951
+
14952
+ // self-attention
14953
+ {
14954
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
14955
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
14956
+
14957
+ // compute Q and K and RoPE them
14958
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
14959
+ cb(Qcur, "Qcur", il);
14960
+ if (model.layers[il].bq) {
14961
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
14962
+ cb(Qcur, "Qcur", il);
14963
+ }
14964
+
14965
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
14966
+ cb(Kcur, "Kcur", il);
14967
+ if (model.layers[il].bk) {
14968
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
14969
+ cb(Kcur, "Kcur", il);
14970
+ }
13194
14971
 
13195
14972
  ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
13196
14973
  cb(Vcur, "Vcur", il);
@@ -13199,24 +14976,21 @@ struct llm_build_granite : public llm_graph_context {
13199
14976
  cb(Vcur, "Vcur", il);
13200
14977
  }
13201
14978
 
13202
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13203
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13204
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
14979
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
14980
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
14981
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
13205
14982
 
13206
- if (use_rope) {
13207
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
13208
- Qcur = ggml_rope_ext(
13209
- ctx0, Qcur, inp_pos, rope_factors,
13210
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13211
- ext_factor, attn_factor, beta_fast, beta_slow
13212
- );
14983
+ Qcur = ggml_rope_ext(
14984
+ ctx0, Qcur, inp_pos, rope_factors,
14985
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14986
+ ext_factor, attn_factor, beta_fast, beta_slow
14987
+ );
13213
14988
 
13214
- Kcur = ggml_rope_ext(
13215
- ctx0, Kcur, inp_pos, rope_factors,
13216
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13217
- ext_factor, attn_factor, beta_fast, beta_slow
13218
- );
13219
- }
14989
+ Kcur = ggml_rope_ext(
14990
+ ctx0, Kcur, inp_pos, rope_factors,
14991
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14992
+ ext_factor, attn_factor, beta_fast, beta_slow
14993
+ );
13220
14994
 
13221
14995
  cb(Qcur, "Qcur", il);
13222
14996
  cb(Kcur, "Kcur", il);
@@ -13224,77 +14998,51 @@ struct llm_build_granite : public llm_graph_context {
13224
14998
 
13225
14999
  cur = build_attn(inp_attn, gf,
13226
15000
  model.layers[il].wo, model.layers[il].bo,
13227
- Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
13228
- cb(cur, "attn_out", il);
15001
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
13229
15002
  }
13230
15003
 
13231
15004
  if (il == n_layer - 1 && inp_out_ids) {
13232
15005
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13233
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13234
- }
13235
-
13236
- // For Granite architectures - scale residual
13237
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
13238
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13239
- cb(ffn_inp, "ffn_inp", il);
13240
-
13241
- // feed-forward network (non-MoE)
13242
- if (model.layers[il].ffn_gate_inp == nullptr) {
13243
-
13244
- cur = build_norm(ffn_inp,
13245
- model.layers[il].ffn_norm, NULL,
13246
- LLM_NORM_RMS, il);
13247
- cb(cur, "ffn_norm", il);
13248
-
13249
- cur = build_ffn(cur,
13250
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
13251
- model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
13252
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
13253
- NULL,
13254
- LLM_FFN_SILU, LLM_FFN_PAR, il);
13255
- cb(cur, "ffn_out", il);
15006
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
15007
+ }
13256
15008
 
13257
- } else {
13258
- // MoE branch
13259
- cur = build_norm(ffn_inp,
13260
- model.layers[il].ffn_norm, NULL,
13261
- LLM_NORM_RMS, il);
13262
- cb(cur, "ffn_norm", il);
15009
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
15010
+ cb(ffn_inp, "ffn_inp", il);
15011
+
15012
+ cur = build_norm(ffn_inp,
15013
+ model.layers[il].ffn_norm, NULL,
15014
+ LLM_NORM_RMS, il);
15015
+ cb(cur, "ffn_norm", il);
13263
15016
 
13264
- ggml_tensor * moe_out = build_moe_ffn(cur,
15017
+ ggml_tensor * moe_out =
15018
+ build_moe_ffn(cur,
13265
15019
  model.layers[il].ffn_gate_inp,
13266
15020
  model.layers[il].ffn_up_exps,
13267
15021
  model.layers[il].ffn_gate_exps,
13268
15022
  model.layers[il].ffn_down_exps,
13269
15023
  nullptr,
13270
15024
  n_expert, n_expert_used,
13271
- LLM_FFN_SILU, true,
13272
- false, 0.0,
15025
+ LLM_FFN_SILU, hparams.expert_weights_norm,
15026
+ false, hparams.expert_weights_scale,
13273
15027
  LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
13274
15028
  il);
13275
- cb(moe_out, "ffn_moe_out", il);
15029
+ cb(moe_out, "ffn_moe_out", il);
13276
15030
 
13277
- // For Granite MoE Shared
13278
- if (hparams.n_ff_shexp > 0) {
13279
- ggml_tensor * ffn_shexp = build_ffn(cur,
15031
+ // FFN shared expert
15032
+ {
15033
+ ggml_tensor * ffn_shexp = build_ffn(cur,
13280
15034
  model.layers[il].ffn_up_shexp, NULL, NULL,
13281
15035
  model.layers[il].ffn_gate_shexp, NULL, NULL,
13282
15036
  model.layers[il].ffn_down_shexp, NULL, NULL,
13283
15037
  NULL,
13284
15038
  LLM_FFN_SILU, LLM_FFN_PAR, il);
13285
- cb(ffn_shexp, "ffn_shexp", il);
15039
+ cb(ffn_shexp, "ffn_shexp", il);
13286
15040
 
13287
- cur = ggml_add(ctx0, moe_out, ffn_shexp);
13288
- cb(cur, "ffn_out", il);
13289
- } else {
13290
- cur = moe_out;
13291
- }
15041
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
15042
+ cb(cur, "ffn_out", il);
13292
15043
  }
13293
15044
 
13294
- // For Granite architectures - scale residual
13295
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
13296
15045
  cur = ggml_add(ctx0, cur, ffn_inp);
13297
- cb(cur, "ffn_out", il);
13298
15046
 
13299
15047
  cur = build_cvec(cur, il);
13300
15048
  cb(cur, "l_out", il);
@@ -13315,8 +15063,6 @@ struct llm_build_granite : public llm_graph_context {
13315
15063
  // lm_head
13316
15064
  cur = build_lora_mm(model.output, cur);
13317
15065
 
13318
- // For Granite architectures - scale logits
13319
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
13320
15066
  cb(cur, "result_output", -1);
13321
15067
  res->t_logits = cur;
13322
15068
 
@@ -13324,14 +15070,8 @@ struct llm_build_granite : public llm_graph_context {
13324
15070
  }
13325
15071
  };
13326
15072
 
13327
- // ref: https://github.com/facebookresearch/chameleon
13328
- // based on the original build_llama() function, changes:
13329
- // * qk-norm
13330
- // * swin-norm
13331
- // * removed bias
13332
- // * removed MoE
13333
- struct llm_build_chameleon : public llm_graph_context {
13334
- llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15073
+ struct llm_build_dots1 : public llm_graph_context {
15074
+ llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
13335
15075
  const int64_t n_embd_head = hparams.n_embd_head_v;
13336
15076
 
13337
15077
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -13353,16 +15093,12 @@ struct llm_build_chameleon : public llm_graph_context {
13353
15093
  ggml_tensor * inpSA = inpL;
13354
15094
 
13355
15095
  // norm
13356
- if (hparams.swin_norm) {
13357
- cur = inpL;
13358
- } else {
13359
- cur = build_norm(inpL,
13360
- model.layers[il].attn_norm, NULL,
13361
- LLM_NORM_RMS, il);
13362
- cb(cur, "attn_norm", il);
13363
- }
15096
+ cur = build_norm(inpL,
15097
+ model.layers[il].attn_norm, NULL,
15098
+ LLM_NORM_RMS, il);
15099
+ cb(cur, "attn_norm", il);
13364
15100
 
13365
- // self-attention
15101
+ // self_attention
13366
15102
  {
13367
15103
  // compute Q and K and RoPE them
13368
15104
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -13374,44 +15110,22 @@ struct llm_build_chameleon : public llm_graph_context {
13374
15110
  ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
13375
15111
  cb(Vcur, "Vcur", il);
13376
15112
 
13377
- if (model.layers[il].attn_q_norm) {
13378
- Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
13379
- ggml_element_size(Qcur) * n_embd_head,
13380
- ggml_element_size(Qcur) * n_embd_head * n_head,
13381
- 0);
13382
- cb(Qcur, "Qcur", il);
13383
-
13384
- Qcur = build_norm(Qcur,
13385
- model.layers[il].attn_q_norm,
13386
- model.layers[il].attn_q_norm_b,
13387
- LLM_NORM, il);
13388
- cb(Qcur, "Qcur", il);
13389
- }
13390
-
13391
- if (model.layers[il].attn_k_norm) {
13392
- Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
13393
- ggml_element_size(Kcur) * n_embd_head,
13394
- ggml_element_size(Kcur) * n_embd_head * n_head_kv,
13395
- 0);
13396
- cb(Kcur, "Kcur", il);
13397
-
13398
- Kcur = build_norm(Kcur,
13399
- model.layers[il].attn_k_norm,
13400
- model.layers[il].attn_k_norm_b,
13401
- LLM_NORM, il);
13402
- cb(Kcur, "Kcur", il);
13403
- }
13404
-
13405
15113
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13406
15114
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13407
15115
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13408
15116
 
15117
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
15118
+ cb(Qcur, "Qcur_normed", il);
15119
+
13409
15120
  Qcur = ggml_rope_ext(
13410
15121
  ctx0, Qcur, inp_pos, nullptr,
13411
15122
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13412
15123
  ext_factor, attn_factor, beta_fast, beta_slow
13413
15124
  );
13414
15125
 
15126
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
15127
+ cb(Kcur, "Kcur_normed", il);
15128
+
13415
15129
  Kcur = ggml_rope_ext(
13416
15130
  ctx0, Kcur, inp_pos, nullptr,
13417
15131
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -13423,7 +15137,7 @@ struct llm_build_chameleon : public llm_graph_context {
13423
15137
  cb(Vcur, "Vcur", il);
13424
15138
 
13425
15139
  cur = build_attn(inp_attn, gf,
13426
- model.layers[il].wo, nullptr,
15140
+ model.layers[il].wo, model.layers[il].bo,
13427
15141
  Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13428
15142
  }
13429
15143
 
@@ -13432,40 +15146,53 @@ struct llm_build_chameleon : public llm_graph_context {
13432
15146
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13433
15147
  }
13434
15148
 
13435
- if (hparams.swin_norm) {
13436
- cur = build_norm(cur,
13437
- model.layers[il].attn_norm, NULL,
13438
- LLM_NORM_RMS, il);
13439
- }
13440
-
13441
15149
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13442
15150
  cb(ffn_inp, "ffn_inp", il);
13443
15151
 
13444
- // feed-forward network
13445
- if (!hparams.swin_norm) {
13446
- cur = build_norm(ffn_inp,
13447
- model.layers[il].ffn_norm, NULL,
13448
- LLM_NORM_RMS, il);
13449
- cb(cur, "ffn_norm", il);
13450
- }
15152
+ // MoE branch
15153
+ cur = build_norm(ffn_inp,
15154
+ model.layers[il].ffn_norm, NULL,
15155
+ LLM_NORM_RMS, il);
15156
+ cb(cur, "ffn_norm", il);
13451
15157
 
13452
- cur = build_ffn(cur,
13453
- model.layers[il].ffn_up, NULL, NULL,
13454
- model.layers[il].ffn_gate, NULL, NULL,
13455
- model.layers[il].ffn_down, NULL, NULL,
13456
- NULL,
13457
- LLM_FFN_SILU, LLM_FFN_PAR, il);
13458
- cb(cur, "ffn_out", il);
15158
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
15159
+ cur = build_ffn(cur,
15160
+ model.layers[il].ffn_up, NULL, NULL,
15161
+ model.layers[il].ffn_gate, NULL, NULL,
15162
+ model.layers[il].ffn_down, NULL, NULL,
15163
+ NULL,
15164
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
15165
+ cb(cur, "ffn_out", il);
15166
+ } else {
15167
+ ggml_tensor * moe_out =
15168
+ build_moe_ffn(cur,
15169
+ model.layers[il].ffn_gate_inp,
15170
+ model.layers[il].ffn_up_exps,
15171
+ model.layers[il].ffn_gate_exps,
15172
+ model.layers[il].ffn_down_exps,
15173
+ model.layers[il].ffn_exp_probs_b,
15174
+ n_expert, n_expert_used,
15175
+ LLM_FFN_SILU, hparams.expert_weights_norm,
15176
+ true, hparams.expert_weights_scale,
15177
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
15178
+ il);
15179
+ cb(moe_out, "ffn_moe_out", il);
13459
15180
 
13460
- if (hparams.swin_norm) {
13461
- cur = build_norm(cur,
13462
- model.layers[il].ffn_norm, NULL,
13463
- LLM_NORM_RMS, il);
13464
- cb(cur, "ffn_norm", il);
15181
+ {
15182
+ ggml_tensor * ffn_shexp = build_ffn(cur,
15183
+ model.layers[il].ffn_up_shexp, NULL, NULL,
15184
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
15185
+ model.layers[il].ffn_down_shexp, NULL, NULL,
15186
+ NULL,
15187
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
15188
+ cb(ffn_shexp, "ffn_shexp", il);
15189
+
15190
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
15191
+ cb(cur, "ffn_out", il);
15192
+ }
13465
15193
  }
13466
15194
 
13467
15195
  cur = ggml_add(ctx0, cur, ffn_inp);
13468
- cb(cur, "ffn_out", il);
13469
15196
 
13470
15197
  cur = build_cvec(cur, il);
13471
15198
  cb(cur, "l_out", il);
@@ -13485,20 +15212,6 @@ struct llm_build_chameleon : public llm_graph_context {
13485
15212
 
13486
15213
  // lm_head
13487
15214
  cur = build_lora_mm(model.output, cur);
13488
- cb(cur, "result_output_with_img_logits", -1);
13489
-
13490
- // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
13491
- // Needs to be removed once image outputs are supported.
13492
- int img_token_end_idx = 8196;
13493
- int img_token_start_idx = 4;
13494
- int num_img_tokens = img_token_end_idx - img_token_start_idx;
13495
- // creates 1d tensor of size num_img_tokens and values -FLT_MAX,
13496
- // which ensures that text token values are always at least larger than image token values
13497
- ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
13498
- img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
13499
- cb(img_logits, "img_logits", -1);
13500
-
13501
- cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
13502
15215
 
13503
15216
  cb(cur, "result_output", -1);
13504
15217
  res->t_logits = cur;
@@ -13507,304 +15220,235 @@ struct llm_build_chameleon : public llm_graph_context {
13507
15220
  }
13508
15221
  };
13509
15222
 
13510
- struct llm_build_wavtokenizer_dec : public llm_graph_context {
13511
- llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15223
+ struct llm_build_ernie4_5 : public llm_graph_context {
15224
+ llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15225
+ const int64_t n_embd_head = hparams.n_embd_head_v;
15226
+
15227
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
15228
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
15229
+
13512
15230
  ggml_tensor * cur;
13513
15231
  ggml_tensor * inpL;
13514
15232
 
13515
15233
  inpL = build_inp_embd(model.tok_embd);
13516
15234
 
13517
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
13518
-
13519
- cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
13520
- cur = ggml_add(ctx0, cur, model.conv1d_b);
13521
-
13522
- // posnet
13523
- for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
13524
- const auto & layer = model.layers[il].posnet;
13525
-
13526
- inpL = cur;
13527
-
13528
- switch (il) {
13529
- case 0:
13530
- case 1:
13531
- case 3:
13532
- case 4:
13533
- {
13534
- cur = build_norm(cur,
13535
- layer.norm1,
13536
- layer.norm1_b,
13537
- LLM_NORM_GROUP, 0);
13538
-
13539
- cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
13540
-
13541
- cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
13542
- cur = ggml_add(ctx0, cur, layer.conv1_b);
13543
-
13544
- cur = build_norm(cur,
13545
- layer.norm2,
13546
- layer.norm2_b,
13547
- LLM_NORM_GROUP, 0);
13548
-
13549
- cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
13550
-
13551
- cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
13552
- cur = ggml_add(ctx0, cur, layer.conv2_b);
13553
-
13554
- cur = ggml_add(ctx0, cur, inpL);
13555
- } break;
13556
- case 2:
13557
- {
13558
- cur = build_norm(cur,
13559
- layer.attn_norm,
13560
- layer.attn_norm_b,
13561
- LLM_NORM_GROUP, 0);
13562
-
13563
- ggml_tensor * q;
13564
- ggml_tensor * k;
13565
- ggml_tensor * v;
13566
-
13567
- q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
13568
- k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
13569
- v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
13570
-
13571
- q = ggml_add(ctx0, q, layer.attn_q_b);
13572
- k = ggml_add(ctx0, k, layer.attn_k_b);
13573
- v = ggml_add(ctx0, v, layer.attn_v_b);
13574
-
13575
- q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
13576
- k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
15235
+ // inp_pos - contains the positions
15236
+ ggml_tensor * inp_pos = build_inp_pos();
13577
15237
 
13578
- ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
15238
+ auto * inp_attn = build_attn_inp_kv_unified();
13579
15239
 
13580
- kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
15240
+ for (int il = 0; il < n_layer; ++il) {
15241
+ ggml_tensor * inpSA = inpL;
13581
15242
 
13582
- cur = ggml_mul_mat(ctx0, kq, v);
15243
+ // norm
15244
+ {
15245
+ cur = build_norm(inpL,
15246
+ model.layers[il].attn_norm, NULL,
15247
+ LLM_NORM_RMS, il);
15248
+ cb(cur, "attn_norm", il);
15249
+ }
13583
15250
 
13584
- cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
13585
- cur = ggml_add(ctx0, cur, layer.attn_o_b);
15251
+ // self-attention
15252
+ {
15253
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
15254
+ cb(Qcur, "Qcur", il);
15255
+ if (model.layers[il].bq) {
15256
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
15257
+ cb(Qcur, "Qcur", il);
15258
+ }
13586
15259
 
13587
- cur = ggml_add(ctx0, cur, inpL);
13588
- } break;
13589
- case 5:
13590
- {
13591
- cur = build_norm(cur,
13592
- layer.norm,
13593
- layer.norm_b,
13594
- LLM_NORM_GROUP, 0);
13595
- } break;
13596
- default: GGML_ABORT("unknown posnet layer");
13597
- };
13598
- }
15260
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
15261
+ cb(Kcur, "Kcur", il);
15262
+ if (model.layers[il].bk) {
15263
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
15264
+ cb(Kcur, "Kcur", il);
15265
+ }
13599
15266
 
13600
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
15267
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
15268
+ cb(Vcur, "Vcur", il);
15269
+ if (model.layers[il].bv) {
15270
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
15271
+ cb(Vcur, "Vcur", il);
15272
+ }
13601
15273
 
13602
- cur = build_norm(cur,
13603
- model.tok_norm,
13604
- model.tok_norm_b,
13605
- LLM_NORM, -1);
15274
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
15275
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
15276
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13606
15277
 
13607
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
15278
+ Qcur = ggml_rope_ext(
15279
+ ctx0, Qcur, inp_pos, nullptr,
15280
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15281
+ ext_factor, attn_factor, beta_fast, beta_slow
15282
+ );
13608
15283
 
13609
- inpL = cur;
15284
+ Kcur = ggml_rope_ext(
15285
+ ctx0, Kcur, inp_pos, nullptr,
15286
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15287
+ ext_factor, attn_factor, beta_fast, beta_slow
15288
+ );
13610
15289
 
13611
- // convnext
13612
- for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
13613
- const auto & layer = model.layers[il].convnext;
15290
+ cb(Qcur, "Qcur", il);
15291
+ cb(Kcur, "Kcur", il);
15292
+ cb(Vcur, "Vcur", il);
13614
15293
 
13615
- cur = inpL;
15294
+ cur = build_attn(inp_attn, gf,
15295
+ model.layers[il].wo, NULL,
15296
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
15297
+ }
13616
15298
 
13617
- cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
13618
- cur = ggml_add(ctx0, cur, layer.dw_b);
15299
+ if (il == n_layer - 1) {
15300
+ // skip computing output for unused tokens
15301
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
15302
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
15303
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
15304
+ }
13619
15305
 
13620
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
15306
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
15307
+ cb(ffn_inp, "ffn_inp", il);
13621
15308
 
13622
- cur = build_norm(cur,
13623
- layer.norm,
13624
- layer.norm_b,
13625
- LLM_NORM, -1);
15309
+ // feed-forward network
15310
+ {
15311
+ cur = build_norm(ffn_inp,
15312
+ model.layers[il].ffn_norm, NULL,
15313
+ LLM_NORM_RMS, il);
15314
+ cb(cur, "ffn_norm", il);
13626
15315
 
13627
- cur = build_ffn(cur,
13628
- layer.pw1, layer.pw1_b, NULL,
13629
- NULL, NULL, NULL,
13630
- layer.pw2, layer.pw2_b, NULL,
13631
- NULL,
13632
- LLM_FFN_GELU, LLM_FFN_SEQ, il);
15316
+ cur = build_ffn(cur,
15317
+ model.layers[il].ffn_up, NULL, NULL,
15318
+ model.layers[il].ffn_gate, NULL, NULL,
15319
+ model.layers[il].ffn_down, NULL, NULL,
15320
+ NULL,
15321
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
15322
+ cb(cur, "ffn_out", il);
15323
+ }
13633
15324
 
13634
- cur = ggml_mul(ctx0, cur, layer.gamma);
15325
+ cur = ggml_add(ctx0, cur, ffn_inp);
13635
15326
 
13636
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
15327
+ cur = build_cvec(cur, il);
15328
+ cb(cur, "l_out", il);
13637
15329
 
13638
- inpL = ggml_add(ctx0, cur, inpL);
15330
+ // input for next layer
15331
+ inpL = cur;
13639
15332
  }
13640
15333
 
13641
15334
  cur = inpL;
13642
15335
 
13643
- cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
13644
-
13645
15336
  cur = build_norm(cur,
13646
- model.output_norm,
13647
- model.output_norm_b,
13648
- LLM_NORM, -1);
15337
+ model.output_norm, NULL,
15338
+ LLM_NORM_RMS, -1);
15339
+
15340
+ cb(cur, "result_norm", -1);
15341
+ res->t_embd = cur;
13649
15342
 
13650
15343
  // lm_head
13651
15344
  cur = build_lora_mm(model.output, cur);
13652
15345
 
13653
- cur = ggml_add(ctx0, cur, model.output_b);
13654
-
13655
- cb(cur, "result_embd", -1);
13656
- res->t_embd = cur;
15346
+ cb(cur, "result_output", -1);
15347
+ res->t_logits = cur;
13657
15348
 
13658
15349
  ggml_build_forward_expand(gf, cur);
13659
15350
  }
13660
15351
  };
13661
15352
 
13662
- struct llm_build_plm : public llm_graph_context {
13663
- llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
13664
- const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
13665
-
13666
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
13667
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
13668
- const uint32_t kv_lora_rank = hparams.n_lora_kv;
15353
+ struct llm_build_falcon_h1 : public llm_graph_context_mamba {
15354
+ llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
15355
+ const int64_t n_embd_head = hparams.n_embd_head_v;
13669
15356
 
13670
15357
  ggml_tensor * cur;
13671
15358
  ggml_tensor * inpL;
13672
15359
 
13673
- // {n_embd, n_tokens}
13674
15360
  inpL = build_inp_embd(model.tok_embd);
13675
15361
 
13676
15362
  // inp_pos - contains the positions
13677
15363
  ggml_tensor * inp_pos = build_inp_pos();
13678
15364
 
13679
- auto * inp_attn = build_attn_inp_kv_unified();
15365
+ // Build the inputs in the recurrent & kv cache
15366
+ auto * inp = build_inp_mem_hybrid();
15367
+
15368
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
13680
15369
 
13681
15370
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13682
15371
 
13683
15372
  for (int il = 0; il < n_layer; ++il) {
13684
15373
  ggml_tensor * inpSA = inpL;
13685
15374
 
13686
- // norm
13687
15375
  cur = build_norm(inpL,
13688
15376
  model.layers[il].attn_norm, NULL,
13689
15377
  LLM_NORM_RMS, il);
13690
15378
  cb(cur, "attn_norm", il);
13691
15379
 
13692
- // self_attention
13693
- {
13694
- ggml_tensor * q = NULL;
13695
- q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
13696
- cb(q, "q", il);
13697
-
13698
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
13699
- ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
13700
- ggml_row_size(q->type, hparams.n_embd_head_k),
13701
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
13702
- 0);
13703
- cb(q_nope, "q_nope", il);
13704
-
13705
- // and {n_head * n_embd_head_qk_rope, n_tokens}
13706
- ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
13707
- ggml_row_size(q->type, hparams.n_embd_head_k),
13708
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
13709
- ggml_row_size(q->type, n_embd_head_qk_nope));
13710
- cb(q_pe, "q_pe", il);
13711
-
13712
- // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
13713
- ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
13714
- cb(kv_pe_compresseed, "kv_pe_compresseed", il);
13715
-
13716
- // split into {kv_lora_rank, n_tokens}
13717
- ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
13718
- kv_pe_compresseed->nb[1],
13719
- 0);
13720
- cb(kv_compressed, "kv_compressed", il);
13721
-
13722
- // and {n_embd_head_qk_rope, n_tokens}
13723
- ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
13724
- kv_pe_compresseed->nb[1],
13725
- kv_pe_compresseed->nb[1],
13726
- ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
13727
- cb(k_pe, "k_pe", il);
15380
+ // self-attention
15381
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
15382
+ cb(Qcur, "Qcur", il);
13728
15383
 
13729
- kv_compressed = build_norm(kv_compressed,
13730
- model.layers[il].attn_kv_a_norm, NULL,
13731
- LLM_NORM_RMS, il);
13732
- cb(kv_compressed, "kv_compressed", il);
15384
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
15385
+ cb(Kcur, "Kcur", il);
13733
15386
 
13734
- // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
13735
- ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
13736
- cb(kv, "kv", il);
15387
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
15388
+ cb(Vcur, "Vcur", il);
13737
15389
 
13738
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
13739
- ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
13740
- ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
13741
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
13742
- 0);
13743
- cb(k_nope, "k_nope", il);
15390
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
15391
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13744
15392
 
13745
- // and {n_head * n_embd_head_v, n_tokens}
13746
- ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
13747
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
13748
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
13749
- ggml_row_size(kv->type, (n_embd_head_qk_nope)));
13750
- cb(v_states, "v_states", il);
15393
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13751
15394
 
13752
- v_states = ggml_cont(ctx0, v_states);
13753
- cb(v_states, "v_states", il);
15395
+ Qcur = ggml_rope_ext(
15396
+ ctx0, Qcur, inp_pos, nullptr,
15397
+ n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
15398
+ ext_factor, attn_factor, beta_fast, beta_slow);
13754
15399
 
13755
- v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
13756
- ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
13757
- 0);
13758
- cb(v_states, "v_states", il);
15400
+ Kcur = ggml_rope_ext(
15401
+ ctx0, Kcur, inp_pos, nullptr,
15402
+ n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
15403
+ ext_factor, attn_factor, beta_fast, beta_slow
15404
+ );
13759
15405
 
13760
- q_pe = ggml_rope_ext(
13761
- ctx0, q_pe, inp_pos, nullptr,
13762
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13763
- ext_factor, attn_factor, beta_fast, beta_slow
13764
- );
13765
- cb(q_pe, "q_pe", il);
15406
+ cb(Qcur, "Qcur-post-rope", il);
15407
+ cb(Kcur, "Kcur-post-rope", il);
15408
+ cb(Vcur, "Vcur-post-rope", il);
13766
15409
 
13767
- // shared RoPE key
13768
- k_pe = ggml_rope_ext(
13769
- ctx0, k_pe, inp_pos, nullptr,
13770
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13771
- ext_factor, attn_factor, beta_fast, beta_slow
13772
- );
13773
- cb(k_pe, "k_pe", il);
15410
+ ggml_tensor * attn_out = build_attn(inp->get_attn(), gf,
15411
+ model.layers[il].wo, NULL,
15412
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15413
+ cb(attn_out, "attn_out", il);
13774
15414
 
13775
- ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
13776
- cb(q_states, "q_states", il);
15415
+ cur = build_norm(inpL,
15416
+ model.layers[il].attn_norm, NULL,
15417
+ LLM_NORM_RMS, il);
15418
+ // Mamba2 layer
15419
+ cb(cur, "ssm_in", il);
13777
15420
 
13778
- ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
13779
- cb(k_states, "k_states", il);
15421
+ ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), gf, cur, model, ubatch, il);
15422
+ cb(ssm_out, "ssm_out", il);
13780
15423
 
13781
- cur = build_attn(inp_attn, gf,
13782
- model.layers[il].wo, NULL,
13783
- q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
13784
- }
15424
+ // // Aggregation
15425
+ cur = ggml_add(ctx0, attn_out, ssm_out);
15426
+ inpSA = ggml_add(ctx0, cur, inpSA);
15427
+ cb(cur, "layer_out", il);
13785
15428
 
13786
15429
  if (il == n_layer - 1 && inp_out_ids) {
13787
15430
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13788
15431
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13789
15432
  }
13790
15433
 
13791
- ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
15434
+ ggml_tensor * ffn_inp = inpSA;
13792
15435
  cb(ffn_inp, "ffn_inp", il);
13793
15436
 
15437
+ // feed-forward network
13794
15438
  cur = build_norm(ffn_inp,
13795
15439
  model.layers[il].ffn_norm, NULL,
13796
15440
  LLM_NORM_RMS, il);
13797
15441
  cb(cur, "ffn_norm", il);
13798
15442
 
13799
15443
  cur = build_ffn(cur,
13800
- model.layers[il].ffn_up, NULL, NULL,
13801
- NULL, NULL, NULL,
13802
- model.layers[il].ffn_down, NULL, NULL,
15444
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
15445
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
15446
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
13803
15447
  NULL,
13804
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
15448
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
13805
15449
  cb(cur, "ffn_out", il);
13806
15450
 
13807
- cur = ggml_add(ctx0, cur, ffn_inp);
15451
+ cur = ggml_add(ctx0, cur, inpSA);
13808
15452
 
13809
15453
  cur = build_cvec(cur, il);
13810
15454
  cb(cur, "l_out", il);
@@ -13822,6 +15466,7 @@ struct llm_build_plm : public llm_graph_context {
13822
15466
  cb(cur, "result_norm", -1);
13823
15467
  res->t_embd = cur;
13824
15468
 
15469
+ // lm_head
13825
15470
  cur = build_lora_mm(model.output, cur);
13826
15471
 
13827
15472
  cb(cur, "result_output", -1);
@@ -13831,8 +15476,13 @@ struct llm_build_plm : public llm_graph_context {
13831
15476
  }
13832
15477
  };
13833
15478
 
13834
- struct llm_build_bailingmoe : public llm_graph_context {
13835
- llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15479
+ struct llm_build_arcee : public llm_graph_context {
15480
+ llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15481
+ const int64_t n_embd_head = hparams.n_embd_head_v;
15482
+
15483
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
15484
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
15485
+
13836
15486
  ggml_tensor * cur;
13837
15487
  ggml_tensor * inpL;
13838
15488
 
@@ -13843,6 +15493,8 @@ struct llm_build_bailingmoe : public llm_graph_context {
13843
15493
 
13844
15494
  auto * inp_attn = build_attn_inp_kv_unified();
13845
15495
 
15496
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
15497
+
13846
15498
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13847
15499
 
13848
15500
  for (int il = 0; il < n_layer; ++il) {
@@ -13881,9 +15533,9 @@ struct llm_build_bailingmoe : public llm_graph_context {
13881
15533
  cb(Vcur, "Vcur", il);
13882
15534
  }
13883
15535
 
13884
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
13885
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
13886
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
15536
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
15537
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
15538
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13887
15539
 
13888
15540
  Qcur = ggml_rope_ext(
13889
15541
  ctx0, Qcur, inp_pos, rope_factors,
@@ -13903,7 +15555,8 @@ struct llm_build_bailingmoe : public llm_graph_context {
13903
15555
 
13904
15556
  cur = build_attn(inp_attn, gf,
13905
15557
  model.layers[il].wo, model.layers[il].bo,
13906
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
15558
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15559
+ cb(cur, "attn_out", il);
13907
15560
  }
13908
15561
 
13909
15562
  if (il == n_layer - 1 && inp_out_ids) {
@@ -13914,40 +15567,23 @@ struct llm_build_bailingmoe : public llm_graph_context {
13914
15567
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13915
15568
  cb(ffn_inp, "ffn_inp", il);
13916
15569
 
15570
+ // feed-forward network
15571
+ // ARCEE uses relu^2 instead of silu
13917
15572
  cur = build_norm(ffn_inp,
13918
15573
  model.layers[il].ffn_norm, NULL,
13919
15574
  LLM_NORM_RMS, il);
13920
15575
  cb(cur, "ffn_norm", il);
13921
15576
 
13922
- ggml_tensor * moe_out =
13923
- build_moe_ffn(cur,
13924
- model.layers[il].ffn_gate_inp,
13925
- model.layers[il].ffn_up_exps,
13926
- model.layers[il].ffn_gate_exps,
13927
- model.layers[il].ffn_down_exps,
13928
- nullptr,
13929
- n_expert, n_expert_used,
13930
- LLM_FFN_SILU, hparams.expert_weights_norm,
13931
- false, hparams.expert_weights_scale,
13932
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
13933
- il);
13934
- cb(moe_out, "ffn_moe_out", il);
13935
-
13936
- // FFN shared expert
13937
- {
13938
- ggml_tensor * ffn_shexp = build_ffn(cur,
13939
- model.layers[il].ffn_up_shexp, NULL, NULL,
13940
- model.layers[il].ffn_gate_shexp, NULL, NULL,
13941
- model.layers[il].ffn_down_shexp, NULL, NULL,
13942
- NULL,
13943
- LLM_FFN_SILU, LLM_FFN_PAR, il);
13944
- cb(ffn_shexp, "ffn_shexp", il);
13945
-
13946
- cur = ggml_add(ctx0, moe_out, ffn_shexp);
13947
- cb(cur, "ffn_out", il);
13948
- }
15577
+ cur = build_ffn(cur,
15578
+ model.layers[il].ffn_up, NULL, NULL,
15579
+ NULL, NULL, NULL,
15580
+ model.layers[il].ffn_down, NULL, NULL,
15581
+ NULL,
15582
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
15583
+ cb(cur, "ffn_out", il);
13949
15584
 
13950
15585
  cur = ggml_add(ctx0, cur, ffn_inp);
15586
+ cb(cur, "ffn_out", il);
13951
15587
 
13952
15588
  cur = build_cvec(cur, il);
13953
15589
  cb(cur, "l_out", il);
@@ -13975,8 +15611,8 @@ struct llm_build_bailingmoe : public llm_graph_context {
13975
15611
  }
13976
15612
  };
13977
15613
 
13978
- struct llm_build_dots1 : public llm_graph_context {
13979
- llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15614
+ struct llm_build_hunyuan_moe : public llm_graph_context {
15615
+ llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
13980
15616
  const int64_t n_embd_head = hparams.n_embd_head_v;
13981
15617
 
13982
15618
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -13992,6 +15628,8 @@ struct llm_build_dots1 : public llm_graph_context {
13992
15628
 
13993
15629
  auto * inp_attn = build_attn_inp_kv_unified();
13994
15630
 
15631
+ const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
15632
+
13995
15633
  ggml_tensor * inp_out_ids = build_inp_out_ids();
13996
15634
 
13997
15635
  for (int il = 0; il < n_layer; ++il) {
@@ -14003,47 +15641,67 @@ struct llm_build_dots1 : public llm_graph_context {
14003
15641
  LLM_NORM_RMS, il);
14004
15642
  cb(cur, "attn_norm", il);
14005
15643
 
14006
- // self_attention
15644
+ // self-attention
14007
15645
  {
15646
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
15647
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
15648
+
14008
15649
  // compute Q and K and RoPE them
14009
15650
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
14010
15651
  cb(Qcur, "Qcur", il);
15652
+ if (model.layers[il].bq) {
15653
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
15654
+ cb(Qcur, "Qcur", il);
15655
+ }
14011
15656
 
14012
15657
  ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
14013
15658
  cb(Kcur, "Kcur", il);
15659
+ if (model.layers[il].bk) {
15660
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
15661
+ cb(Kcur, "Kcur", il);
15662
+ }
14014
15663
 
14015
15664
  ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
14016
15665
  cb(Vcur, "Vcur", il);
15666
+ if (model.layers[il].bv) {
15667
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
15668
+ cb(Vcur, "Vcur", il);
15669
+ }
14017
15670
 
14018
15671
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
14019
15672
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
14020
15673
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
14021
15674
 
14022
- Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
14023
- cb(Qcur, "Qcur_normed", il);
14024
-
14025
15675
  Qcur = ggml_rope_ext(
14026
- ctx0, Qcur, inp_pos, nullptr,
15676
+ ctx0, Qcur, inp_pos, rope_factors,
14027
15677
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14028
15678
  ext_factor, attn_factor, beta_fast, beta_slow
14029
15679
  );
14030
15680
 
14031
- Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
14032
- cb(Kcur, "Kcur_normed", il);
15681
+ cb(Qcur, "Qcur", il);
15682
+ cb(Kcur, "Kcur", il);
15683
+ cb(Vcur, "Vcur", il);
14033
15684
 
14034
15685
  Kcur = ggml_rope_ext(
14035
- ctx0, Kcur, inp_pos, nullptr,
15686
+ ctx0, Kcur, inp_pos, rope_factors,
14036
15687
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14037
15688
  ext_factor, attn_factor, beta_fast, beta_slow
14038
15689
  );
14039
15690
 
14040
- cb(Qcur, "Qcur", il);
14041
- cb(Kcur, "Kcur", il);
14042
- cb(Vcur, "Vcur", il);
15691
+ Kcur = build_norm(Kcur,
15692
+ model.layers[il].attn_k_norm, nullptr,
15693
+ LLM_NORM_RMS, il);
15694
+ cb(Kcur, "Kcur_norm", il);
15695
+
15696
+ Qcur = build_norm(Qcur,
15697
+ model.layers[il].attn_q_norm, nullptr,
15698
+ LLM_NORM_RMS, il);
15699
+ cb(Qcur, "Qcur_norm", il);
14043
15700
 
14044
15701
  cur = build_attn(inp_attn, gf,
14045
15702
  model.layers[il].wo, model.layers[il].bo,
14046
- Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
15703
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
15704
+ cb(cur, "attn_out", il);
14047
15705
  }
14048
15706
 
14049
15707
  if (il == n_layer - 1 && inp_out_ids) {
@@ -14054,50 +15712,40 @@ struct llm_build_dots1 : public llm_graph_context {
14054
15712
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
14055
15713
  cb(ffn_inp, "ffn_inp", il);
14056
15714
 
14057
- // MoE branch
14058
15715
  cur = build_norm(ffn_inp,
14059
- model.layers[il].ffn_norm, NULL,
14060
- LLM_NORM_RMS, il);
15716
+ model.layers[il].ffn_norm, NULL,
15717
+ LLM_NORM_RMS, il);
14061
15718
  cb(cur, "ffn_norm", il);
14062
15719
 
14063
- if ((uint32_t) il < hparams.n_layer_dense_lead) {
14064
- cur = build_ffn(cur,
14065
- model.layers[il].ffn_up, NULL, NULL,
14066
- model.layers[il].ffn_gate, NULL, NULL,
14067
- model.layers[il].ffn_down, NULL, NULL,
14068
- NULL,
14069
- LLM_FFN_SILU, LLM_FFN_PAR, il);
14070
- cb(cur, "ffn_out", il);
14071
- } else {
14072
- ggml_tensor * moe_out =
14073
- build_moe_ffn(cur,
14074
- model.layers[il].ffn_gate_inp,
14075
- model.layers[il].ffn_up_exps,
14076
- model.layers[il].ffn_gate_exps,
14077
- model.layers[il].ffn_down_exps,
14078
- model.layers[il].ffn_exp_probs_b,
14079
- n_expert, n_expert_used,
14080
- LLM_FFN_SILU, hparams.expert_weights_norm,
14081
- true, hparams.expert_weights_scale,
14082
- (llama_expert_gating_func_type) hparams.expert_gating_func,
14083
- il);
14084
- cb(moe_out, "ffn_moe_out", il);
15720
+ // feed-forward network (non-MoE)
15721
+ ggml_tensor * cur_mlp = build_ffn(cur,
15722
+ model.layers[il].ffn_up_shexp, NULL, NULL,
15723
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
15724
+ model.layers[il].ffn_down_shexp, NULL, NULL,
15725
+ NULL,
15726
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
15727
+ cb(cur_mlp, "ffn_mlp", il);
14085
15728
 
14086
- {
14087
- ggml_tensor * ffn_shexp = build_ffn(cur,
14088
- model.layers[il].ffn_up_shexp, NULL, NULL,
14089
- model.layers[il].ffn_gate_shexp, NULL, NULL,
14090
- model.layers[il].ffn_down_shexp, NULL, NULL,
14091
- NULL,
14092
- LLM_FFN_SILU, LLM_FFN_PAR, il);
14093
- cb(ffn_shexp, "ffn_shexp", il);
15729
+ // MoE branch
15730
+ ggml_tensor * cur_moe = build_moe_ffn(cur,
15731
+ model.layers[il].ffn_gate_inp,
15732
+ model.layers[il].ffn_up_exps,
15733
+ model.layers[il].ffn_gate_exps,
15734
+ model.layers[il].ffn_down_exps,
15735
+ nullptr,
15736
+ n_expert, n_expert_used,
15737
+ LLM_FFN_SILU,
15738
+ true, // norm_topk_prob
15739
+ false,
15740
+ 0.0,
15741
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
15742
+ il);
15743
+ cb(cur_moe, "ffn_moe_out", il);
14094
15744
 
14095
- cur = ggml_add(ctx0, moe_out, ffn_shexp);
14096
- cb(cur, "ffn_out", il);
14097
- }
14098
- }
15745
+ ggml_tensor * ffn_out = ggml_add(ctx0, cur_moe, cur_mlp);
15746
+ cb(ffn_out, "ffn_out", il);
14099
15747
 
14100
- cur = ggml_add(ctx0, cur, ffn_inp);
15748
+ cur = ggml_add(ctx0, ffn_out, ffn_inp);
14101
15749
 
14102
15750
  cur = build_cvec(cur, il);
14103
15751
  cb(cur, "l_out", il);
@@ -14117,7 +15765,6 @@ struct llm_build_dots1 : public llm_graph_context {
14117
15765
 
14118
15766
  // lm_head
14119
15767
  cur = build_lora_mm(model.output, cur);
14120
-
14121
15768
  cb(cur, "result_output", -1);
14122
15769
  res->t_logits = cur;
14123
15770
 
@@ -14125,8 +15772,8 @@ struct llm_build_dots1 : public llm_graph_context {
14125
15772
  }
14126
15773
  };
14127
15774
 
14128
- struct llm_build_arcee : public llm_graph_context {
14129
- llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
15775
+ struct llm_build_smollm3 : public llm_graph_context {
15776
+ llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
14130
15777
  const int64_t n_embd_head = hparams.n_embd_head_v;
14131
15778
 
14132
15779
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -14149,6 +15796,8 @@ struct llm_build_arcee : public llm_graph_context {
14149
15796
  for (int il = 0; il < n_layer; ++il) {
14150
15797
  ggml_tensor * inpSA = inpL;
14151
15798
 
15799
+ const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
15800
+
14152
15801
  // norm
14153
15802
  cur = build_norm(inpL,
14154
15803
  model.layers[il].attn_norm, NULL,
@@ -14157,9 +15806,6 @@ struct llm_build_arcee : public llm_graph_context {
14157
15806
 
14158
15807
  // self-attention
14159
15808
  {
14160
- // rope freq factors for llama3; may return nullptr for llama2 and other models
14161
- ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
14162
-
14163
15809
  // compute Q and K and RoPE them
14164
15810
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
14165
15811
  cb(Qcur, "Qcur", il);
@@ -14186,17 +15832,19 @@ struct llm_build_arcee : public llm_graph_context {
14186
15832
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
14187
15833
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
14188
15834
 
14189
- Qcur = ggml_rope_ext(
14190
- ctx0, Qcur, inp_pos, rope_factors,
14191
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14192
- ext_factor, attn_factor, beta_fast, beta_slow
14193
- );
15835
+ if (use_rope) {
15836
+ Qcur = ggml_rope_ext(
15837
+ ctx0, Qcur, inp_pos, nullptr,
15838
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15839
+ ext_factor, attn_factor, beta_fast, beta_slow
15840
+ );
14194
15841
 
14195
- Kcur = ggml_rope_ext(
14196
- ctx0, Kcur, inp_pos, rope_factors,
14197
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14198
- ext_factor, attn_factor, beta_fast, beta_slow
14199
- );
15842
+ Kcur = ggml_rope_ext(
15843
+ ctx0, Kcur, inp_pos, nullptr,
15844
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15845
+ ext_factor, attn_factor, beta_fast, beta_slow
15846
+ );
15847
+ }
14200
15848
 
14201
15849
  cb(Qcur, "Qcur", il);
14202
15850
  cb(Kcur, "Kcur", il);
@@ -14217,19 +15865,20 @@ struct llm_build_arcee : public llm_graph_context {
14217
15865
  cb(ffn_inp, "ffn_inp", il);
14218
15866
 
14219
15867
  // feed-forward network
14220
- // ARCEE uses relu^2 instead of silu
14221
- cur = build_norm(ffn_inp,
14222
- model.layers[il].ffn_norm, NULL,
14223
- LLM_NORM_RMS, il);
14224
- cb(cur, "ffn_norm", il);
15868
+ {
15869
+ cur = build_norm(ffn_inp,
15870
+ model.layers[il].ffn_norm, NULL,
15871
+ LLM_NORM_RMS, il);
15872
+ cb(cur, "ffn_norm", il);
14225
15873
 
14226
- cur = build_ffn(cur,
14227
- model.layers[il].ffn_up, NULL, NULL,
14228
- NULL, NULL, NULL,
14229
- model.layers[il].ffn_down, NULL, NULL,
14230
- NULL,
14231
- LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
14232
- cb(cur, "ffn_out", il);
15874
+ cur = build_ffn(cur,
15875
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
15876
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
15877
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
15878
+ NULL,
15879
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
15880
+ cb(cur, "ffn_out", il);
15881
+ }
14233
15882
 
14234
15883
  cur = ggml_add(ctx0, cur, ffn_inp);
14235
15884
  cb(cur, "ffn_out", il);
@@ -14260,6 +15909,163 @@ struct llm_build_arcee : public llm_graph_context {
14260
15909
  }
14261
15910
  };
14262
15911
 
15912
+ struct llm_build_lfm2 : public llm_graph_context {
15913
+ const llama_model & model;
15914
+
15915
+ llm_build_lfm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
15916
+
15917
+ ggml_tensor * cur = build_inp_embd(model.tok_embd);
15918
+ cb(cur, "model.embed_tokens", -1);
15919
+
15920
+ ggml_tensor * inp_pos = build_inp_pos();
15921
+ auto * inp_hybrid = build_inp_mem_hybrid();
15922
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
15923
+
15924
+ for (int il = 0; il < n_layer; ++il) {
15925
+ auto * prev_cur = cur;
15926
+ cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
15927
+ cb(cur, "model.layers.{}.operator_norm", il);
15928
+
15929
+ cur = hparams.is_recurrent(il) ?
15930
+ build_shortconv_block(gf, cur, inp_hybrid->get_recr(), il) :
15931
+ build_attn_block(gf, cur, inp_pos, inp_hybrid->get_attn(), il) ;
15932
+
15933
+ if (il == n_layer - 1 && inp_out_ids) {
15934
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
15935
+ prev_cur = ggml_get_rows(ctx0, prev_cur, inp_out_ids);
15936
+ }
15937
+
15938
+ cur = ggml_add(ctx0, prev_cur, cur);
15939
+ cur = ggml_add(ctx0, cur, build_feed_forward(cur, il));
15940
+ }
15941
+
15942
+ cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
15943
+ cb(cur, "model.embedding_norm", -1);
15944
+ res->t_embd = cur;
15945
+
15946
+ // lm_head is tied with embeddings
15947
+ cur = build_lora_mm(model.tok_embd, cur);
15948
+ cb(cur, "lm_head", -1);
15949
+
15950
+ res->t_logits = cur;
15951
+
15952
+ ggml_build_forward_expand(gf, cur);
15953
+ }
15954
+
15955
+ ggml_tensor * build_feed_forward(ggml_tensor * cur,
15956
+ int il) const {
15957
+ cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
15958
+ cb(cur, "model.layers.{}.ffn_norm", il);
15959
+
15960
+ GGML_ASSERT(!model.layers[il].ffn_up_b);
15961
+ GGML_ASSERT(!model.layers[il].ffn_gate_b);
15962
+ GGML_ASSERT(!model.layers[il].ffn_down_b);
15963
+ cur = build_ffn(cur,
15964
+ model.layers[il].ffn_up, NULL, NULL,
15965
+ model.layers[il].ffn_gate, NULL, NULL,
15966
+ model.layers[il].ffn_down, NULL, NULL,
15967
+ NULL,
15968
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
15969
+ cb(cur, "model.layers.{}.feed_forward.w2", il);
15970
+
15971
+ return cur;
15972
+ }
15973
+
15974
+ ggml_tensor * build_attn_block(ggml_cgraph * gf,
15975
+ ggml_tensor * cur,
15976
+ ggml_tensor * inp_pos,
15977
+ llm_graph_input_attn_kv_unified * inp_attn,
15978
+ int il) const {
15979
+ GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
15980
+ auto const n_embd_head = hparams.n_embd_head_v;
15981
+ auto const n_head_kv = hparams.n_head_kv(il);
15982
+
15983
+ auto * q = build_lora_mm(model.layers[il].wq, cur);
15984
+ cb(q, "model.layers.{}.self_attn.q_proj", il);
15985
+ auto * k = build_lora_mm(model.layers[il].wk, cur);
15986
+ cb(k, "model.layers.{}.self_attn.k_proj", il);
15987
+ auto * v = build_lora_mm(model.layers[il].wv, cur);
15988
+ cb(v, "model.layers.{}.self_attn.v_proj", il);
15989
+
15990
+ q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
15991
+ k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
15992
+ v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
15993
+
15994
+ // qk norm
15995
+ q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
15996
+ cb(q, "model.layers.{}.self_attn.q_layernorm", il);
15997
+ k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
15998
+ cb(k, "model.layers.{}.self_attn.k_layernorm", il);
15999
+
16000
+ // RoPE
16001
+ q = ggml_rope_ext(
16002
+ ctx0, q, inp_pos, nullptr,
16003
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
16004
+ ext_factor, attn_factor, beta_fast, beta_slow
16005
+ );
16006
+ k = ggml_rope_ext(
16007
+ ctx0, k, inp_pos, nullptr,
16008
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
16009
+ ext_factor, attn_factor, beta_fast, beta_slow
16010
+ );
16011
+
16012
+ cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL,
16013
+ q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
16014
+
16015
+ cb(cur, "model.layers.{}.self_attn.out_proj", il);
16016
+
16017
+ return cur;
16018
+ }
16019
+
16020
+ ggml_tensor * build_shortconv_block(ggml_cgraph * gf,
16021
+ ggml_tensor * cur,
16022
+ llm_graph_input_rs * inp_recr,
16023
+ int il) {
16024
+ const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
16025
+
16026
+ auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
16027
+ cb(bcx, "model.layers.{}.conv.in_proj", il);
16028
+
16029
+ constexpr auto n_chunks = 3;
16030
+ GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
16031
+ auto const chunk_size = bcx->ne[0] / n_chunks;
16032
+ auto * b = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 0 * chunk_size * ggml_element_size(bcx));
16033
+ auto * c = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 1 * chunk_size * ggml_element_size(bcx));
16034
+ auto * x = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 2 * chunk_size * ggml_element_size(bcx));
16035
+
16036
+ auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
16037
+
16038
+ // read conv state directly, with build_rs generation is slower
16039
+ ggml_tensor * conv_state = mctx_cur->get_r_l(il);
16040
+ const int64_t n_seqs = ubatch.n_seqs;
16041
+ ggml_tensor * conv = build_rs(inp_recr, gf, conv_state, hparams.n_embd_r(), n_seqs);
16042
+ conv = ggml_reshape_3d(ctx0, conv_state, hparams.n_shortconv_l_cache - 1, hparams.n_embd, n_seqs);
16043
+
16044
+ bx = ggml_concat(ctx0, conv, bx, 0);
16045
+ GGML_ASSERT(bx->ne[0] > conv->ne[0]);
16046
+
16047
+ auto * new_conv = ggml_view_2d(ctx0, bx, conv->ne[0], bx->ne[1], bx->nb[1], (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx));
16048
+ GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
16049
+
16050
+ // write conv state
16051
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv, conv_state));
16052
+
16053
+ auto * conv_kernel = model.layers[il].shortconv.conv;
16054
+ GGML_ASSERT(hparams.n_shortconv_l_cache > 0);
16055
+
16056
+ // construct ssm_conv op
16057
+ ggml_tensor * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
16058
+ cb(conv_out, "model.layers.{}.conv.conv", il);
16059
+
16060
+ auto * y = ggml_mul(ctx0, c, conv_out);
16061
+
16062
+ y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
16063
+ cb(y, "model.layers.{}.conv.out_proj", il);
16064
+
16065
+ return y;
16066
+ }
16067
+ };
16068
+
14263
16069
  llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
14264
16070
  llama_memory_i * res;
14265
16071
 
@@ -14306,7 +16112,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
14306
16112
  /* recurrent_type_v */ GGML_TYPE_F32,
14307
16113
  /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
14308
16114
  /* n_seq_max */ cparams.n_seq_max,
14309
- /* offload */ cparams.offload_kqv);
16115
+ /* offload */ cparams.offload_kqv,
16116
+ /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
16117
+ /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
14310
16118
  } else {
14311
16119
  const auto padding = llama_kv_cache_unified::get_padding(cparams);
14312
16120
 
@@ -14495,9 +16303,14 @@ llm_graph_result_ptr llama_model::build_graph(
14495
16303
  llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
14496
16304
  } break;
14497
16305
  case LLM_ARCH_MAMBA:
16306
+ case LLM_ARCH_MAMBA2:
14498
16307
  {
14499
16308
  llm = std::make_unique<llm_build_mamba>(*this, params, gf);
14500
16309
  } break;
16310
+ case LLM_ARCH_JAMBA:
16311
+ {
16312
+ llm = std::make_unique<llm_build_jamba>(*this, params, gf);
16313
+ } break;
14501
16314
  case LLM_ARCH_XVERSE:
14502
16315
  {
14503
16316
  llm = std::make_unique<llm_build_xverse>(*this, params, gf);
@@ -14611,6 +16424,10 @@ llm_graph_result_ptr llama_model::build_graph(
14611
16424
  {
14612
16425
  llm = std::make_unique<llm_build_granite>(*this, params, gf);
14613
16426
  } break;
16427
+ case LLM_ARCH_GRANITE_HYBRID:
16428
+ {
16429
+ llm = std::make_unique<llm_build_granite_hybrid>(*this, params, gf);
16430
+ } break;
14614
16431
  case LLM_ARCH_CHAMELEON:
14615
16432
  {
14616
16433
  llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
@@ -14635,6 +16452,26 @@ llm_graph_result_ptr llama_model::build_graph(
14635
16452
  {
14636
16453
  llm = std::make_unique<llm_build_arcee>(*this, params, gf);
14637
16454
  } break;
16455
+ case LLM_ARCH_ERNIE4_5:
16456
+ {
16457
+ llm = std::make_unique<llm_build_ernie4_5>(*this, params, gf);
16458
+ } break;
16459
+ case LLM_ARCH_HUNYUAN_MOE:
16460
+ {
16461
+ llm = std::make_unique<llm_build_hunyuan_moe>(*this, params, gf);
16462
+ } break;
16463
+ case LLM_ARCH_SMOLLM3:
16464
+ {
16465
+ llm = std::make_unique<llm_build_smollm3>(*this, params, gf);
16466
+ } break;
16467
+ case LLM_ARCH_FALCON_H1:
16468
+ {
16469
+ llm = std::make_unique<llm_build_falcon_h1>(*this, params, gf);
16470
+ } break;
16471
+ case LLM_ARCH_LFM2:
16472
+ {
16473
+ llm = std::make_unique<llm_build_lfm2>(*this, params, gf);
16474
+ } break;
14638
16475
  default:
14639
16476
  GGML_ABORT("fatal error");
14640
16477
  }
@@ -14751,6 +16588,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
14751
16588
  case LLM_ARCH_REFACT:
14752
16589
  case LLM_ARCH_BLOOM:
14753
16590
  case LLM_ARCH_MAMBA:
16591
+ case LLM_ARCH_MAMBA2:
16592
+ case LLM_ARCH_JAMBA:
14754
16593
  case LLM_ARCH_JINA_BERT_V2:
14755
16594
  case LLM_ARCH_T5:
14756
16595
  case LLM_ARCH_T5ENCODER:
@@ -14782,14 +16621,18 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
14782
16621
  case LLM_ARCH_GLM4:
14783
16622
  case LLM_ARCH_GRANITE:
14784
16623
  case LLM_ARCH_GRANITE_MOE:
16624
+ case LLM_ARCH_GRANITE_HYBRID:
14785
16625
  case LLM_ARCH_CHAMELEON:
14786
16626
  case LLM_ARCH_BAILINGMOE:
14787
16627
  case LLM_ARCH_NEO_BERT:
16628
+ case LLM_ARCH_SMOLLM3:
14788
16629
  case LLM_ARCH_ARCEE:
16630
+ case LLM_ARCH_ERNIE4_5:
14789
16631
  return LLAMA_ROPE_TYPE_NORM;
14790
16632
 
14791
16633
  // the pairs of head values are offset by n_rot/2
14792
16634
  case LLM_ARCH_FALCON:
16635
+ case LLM_ARCH_FALCON_H1:
14793
16636
  case LLM_ARCH_GROK:
14794
16637
  case LLM_ARCH_DBRX:
14795
16638
  case LLM_ARCH_BERT:
@@ -14821,6 +16664,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
14821
16664
  case LLM_ARCH_EXAONE:
14822
16665
  case LLM_ARCH_MINICPM3:
14823
16666
  case LLM_ARCH_DOTS1:
16667
+ case LLM_ARCH_HUNYUAN_MOE:
16668
+ case LLM_ARCH_LFM2:
14824
16669
  return LLAMA_ROPE_TYPE_NEOX;
14825
16670
 
14826
16671
  case LLM_ARCH_QWEN2VL: