@novastera-oss/llamarn 0.2.5 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/RNLlamaCpp.podspec +3 -2
  2. package/android/CMakeLists.txt +6 -3
  3. package/android/src/main/cpp/include/llama.h +140 -38
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  12. package/cpp/LlamaCppModel.cpp +48 -67
  13. package/cpp/LlamaCppModel.h +8 -3
  14. package/cpp/PureCppImpl.cpp +1 -1
  15. package/cpp/PureCppImpl.h +2 -2
  16. package/cpp/build-info.cpp +2 -2
  17. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  18. package/cpp/llama.cpp/Makefile +2 -2
  19. package/cpp/llama.cpp/README.md +33 -13
  20. package/cpp/llama.cpp/common/CMakeLists.txt +15 -28
  21. package/cpp/llama.cpp/common/arg.cpp +38 -12
  22. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  23. package/cpp/llama.cpp/common/chat-parser.cpp +9 -3
  24. package/cpp/llama.cpp/common/chat-parser.h +4 -1
  25. package/cpp/llama.cpp/common/chat.cpp +16 -13
  26. package/cpp/llama.cpp/common/chat.h +1 -1
  27. package/cpp/llama.cpp/common/common.cpp +52 -40
  28. package/cpp/llama.cpp/common/common.h +5 -2
  29. package/cpp/llama.cpp/common/json-partial.cpp +5 -4
  30. package/cpp/llama.cpp/common/json-partial.h +2 -1
  31. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  32. package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
  33. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  34. package/cpp/llama.cpp/convert_hf_to_gguf.py +128 -84
  35. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
  36. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  37. package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
  38. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +49 -13
  39. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
  41. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  42. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  44. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +33 -2
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
  74. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +6 -8
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +25 -16
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  79. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  82. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
  83. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
  86. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  88. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  93. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  94. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -46
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  107. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
  108. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -248
  109. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  112. package/cpp/llama.cpp/ggml/src/ggml.c +9 -8
  113. package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
  114. package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
  115. package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
  116. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
  117. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
  118. package/cpp/llama.cpp/include/llama.h +140 -38
  119. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  120. package/cpp/llama.cpp/src/CMakeLists.txt +4 -1
  121. package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
  122. package/cpp/llama.cpp/src/llama-arch.h +7 -1
  123. package/cpp/llama.cpp/src/llama-batch.cpp +289 -31
  124. package/cpp/llama.cpp/src/llama-batch.h +47 -17
  125. package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
  126. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  127. package/cpp/llama.cpp/src/llama-context.cpp +488 -313
  128. package/cpp/llama.cpp/src/llama-context.h +38 -17
  129. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  130. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  131. package/cpp/llama.cpp/src/llama-graph.cpp +275 -152
  132. package/cpp/llama.cpp/src/llama-graph.h +109 -52
  133. package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
  134. package/cpp/llama.cpp/src/llama-hparams.h +8 -2
  135. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +281 -0
  136. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +133 -0
  137. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1835 -0
  138. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +308 -0
  139. package/cpp/llama.cpp/src/llama-kv-cells.h +53 -17
  140. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
  141. package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
  142. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +1116 -0
  143. package/cpp/llama.cpp/src/llama-memory-recurrent.h +188 -0
  144. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  145. package/cpp/llama.cpp/src/llama-memory.h +89 -4
  146. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  147. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  148. package/cpp/llama.cpp/src/llama-model.cpp +735 -143
  149. package/cpp/llama.cpp/src/llama-model.h +4 -0
  150. package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
  151. package/cpp/llama.cpp/src/llama-vocab.cpp +39 -25
  152. package/cpp/llama.cpp/src/llama.cpp +11 -7
  153. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  154. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
  155. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
  156. package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
  157. package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
  158. package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
  159. package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  160. package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
  161. package/cpp/rn-completion.cpp +65 -10
  162. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  163. package/cpp/{rn-utils.hpp → rn-utils.h} +8 -1
  164. package/ios/include/chat.h +1 -1
  165. package/ios/include/common/minja/chat-template.hpp +1 -1
  166. package/ios/include/common/minja/minja.hpp +1 -1
  167. package/ios/include/common.h +5 -2
  168. package/ios/include/json-schema-to-grammar.h +4 -4
  169. package/ios/include/llama.h +140 -38
  170. package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
  171. package/ios/libs/llama.xcframework/Info.plist +20 -20
  172. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  173. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4617
  174. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
  175. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +140 -38
  176. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  177. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  178. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
  179. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3557
  180. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  181. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
  182. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  183. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  184. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
  185. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3559
  186. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
  187. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +140 -38
  188. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
  189. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +140 -38
  190. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  191. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
  192. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +140 -38
  193. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  194. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  195. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  196. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4616
  197. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
  198. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +140 -38
  199. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  200. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  201. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4637
  202. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3556
  203. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  204. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
  205. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  206. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  207. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4653
  208. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
  209. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +140 -38
  210. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  211. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  212. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4674
  213. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3587
  214. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  215. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
  216. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  217. package/package.json +1 -2
  218. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  219. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  220. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  221. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2747
  222. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -502
  223. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  224. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  225. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -5,7 +5,11 @@
5
5
  #include "llama-batch.h"
6
6
  #include "llama-cparams.h"
7
7
  #include "llama-model-loader.h"
8
- #include "llama-kv-cache.h"
8
+
9
+ #include "llama-kv-cache-unified.h"
10
+ #include "llama-kv-cache-unified-iswa.h"
11
+ #include "llama-memory-hybrid.h"
12
+ #include "llama-memory-recurrent.h"
9
13
 
10
14
  #include "ggml-cpp.h"
11
15
 
@@ -77,6 +81,7 @@ const char * llm_type_name(llm_type type) {
77
81
  case LLM_TYPE_40B: return "40B";
78
82
  case LLM_TYPE_65B: return "65B";
79
83
  case LLM_TYPE_70B: return "70B";
84
+ case LLM_TYPE_142B: return "142B";
80
85
  case LLM_TYPE_236B: return "236B";
81
86
  case LLM_TYPE_290B: return "290B";
82
87
  case LLM_TYPE_314B: return "314B";
@@ -466,6 +471,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
466
471
  std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
467
472
  std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
468
473
  std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
474
+ std::fill(
475
+ hparams.recurrent_layer_arr.begin(),
476
+ hparams.recurrent_layer_arr.end(),
477
+ llm_arch_is_recurrent(ml.get_arch()));
469
478
 
470
479
  std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
471
480
 
@@ -540,6 +549,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
540
549
  uint32_t n_vocab = 0;
541
550
  ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
542
551
 
552
+ // for classifier models
553
+ ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
554
+ if (!classifier_labels.empty()) {
555
+ hparams.n_cls_out = classifier_labels.size();
556
+ }
557
+
543
558
  // arch-specific KVs
544
559
  switch (arch) {
545
560
  case LLM_ARCH_LLAMA:
@@ -589,6 +604,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
589
604
  hparams.use_kq_norm = false;
590
605
  }
591
606
  } break;
607
+ case LLM_ARCH_ARCEE:
608
+ {
609
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
610
+
611
+ // Arcee uses the same structure as Llama
612
+ switch (hparams.n_layer) {
613
+ case 36: type = LLM_TYPE_4B; break;
614
+ default: type = LLM_TYPE_UNKNOWN;
615
+ }
616
+ } break;
592
617
  case LLM_ARCH_DECI:
593
618
  {
594
619
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -683,7 +708,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
683
708
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
684
709
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
685
710
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
686
- ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false);
687
711
 
688
712
  switch (hparams.n_layer) {
689
713
  case 3:
@@ -730,6 +754,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
730
754
  }
731
755
  }
732
756
  } break;
757
+ case LLM_ARCH_NEO_BERT:
758
+ {
759
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
760
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
761
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
762
+
763
+ if (hparams.n_layer == 28) {
764
+ type = LLM_TYPE_250M;
765
+ }
766
+ } break;
733
767
  case LLM_ARCH_BLOOM:
734
768
  {
735
769
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -953,6 +987,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
953
987
  case 46: type = LLM_TYPE_27B; break;
954
988
  default: type = LLM_TYPE_UNKNOWN;
955
989
  }
990
+
991
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
992
+ hparams.f_attention_scale = type == LLM_TYPE_27B
993
+ ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
994
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
956
995
  } break;
957
996
  case LLM_ARCH_GEMMA3:
958
997
  {
@@ -973,6 +1012,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
973
1012
  default: type = LLM_TYPE_UNKNOWN;
974
1013
  }
975
1014
 
1015
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
976
1016
  hparams.f_attention_scale = type == LLM_TYPE_27B
977
1017
  ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
978
1018
  : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
@@ -1430,6 +1470,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1430
1470
  default: type = LLM_TYPE_UNKNOWN;
1431
1471
  }
1432
1472
  } break;
1473
+ case LLM_ARCH_DOTS1:
1474
+ {
1475
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1476
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
1477
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1478
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1479
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
1480
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1481
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1482
+ switch (hparams.n_layer) {
1483
+ case 62: type = LLM_TYPE_142B; break;
1484
+ default: type = LLM_TYPE_UNKNOWN;
1485
+ }
1486
+ } break;
1433
1487
  default: throw std::runtime_error("unsupported model architecture");
1434
1488
  }
1435
1489
 
@@ -2173,6 +2227,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2173
2227
  layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
2174
2228
  }
2175
2229
  } break;
2230
+ case LLM_ARCH_NEO_BERT:
2231
+ {
2232
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2233
+
2234
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
2235
+ cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
2236
+
2237
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
2238
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
2239
+
2240
+ output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
2241
+
2242
+ for (int i = 0; i < n_layer; ++i) {
2243
+ auto & layer = layers[i];
2244
+
2245
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2246
+
2247
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2248
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2249
+
2250
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2251
+
2252
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
2253
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2254
+ }
2255
+ } break;
2176
2256
  case LLM_ARCH_JINA_BERT_V2:
2177
2257
  {
2178
2258
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
@@ -2210,8 +2290,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2210
2290
  layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
2211
2291
  layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2212
2292
 
2213
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2214
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2293
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
2294
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
2215
2295
 
2216
2296
  layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2217
2297
  layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
@@ -4109,6 +4189,89 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4109
4189
  layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4110
4190
  }
4111
4191
  } break;
4192
+ case LLM_ARCH_DOTS1:
4193
+ {
4194
+ const int64_t n_ff_exp = hparams.n_ff_exp;
4195
+ const int64_t n_expert_shared = hparams.n_expert_shared;
4196
+
4197
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4198
+
4199
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4200
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4201
+
4202
+ for (int i = 0; i < n_layer; ++i) {
4203
+ auto & layer = layers[i];
4204
+
4205
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4206
+
4207
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4208
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4209
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4210
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4211
+
4212
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
4213
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
4214
+
4215
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4216
+
4217
+ if (i < (int) hparams.n_layer_dense_lead) {
4218
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4219
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4220
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4221
+ } else {
4222
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4223
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
4224
+
4225
+ if (n_expert == 0) {
4226
+ throw std::runtime_error("n_expert must be > 0");
4227
+ }
4228
+ if (n_expert_used == 0) {
4229
+ throw std::runtime_error("n_expert_used must be > 0");
4230
+ }
4231
+
4232
+ // MoE branch
4233
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4234
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
4235
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4236
+
4237
+ // Shared expert branch
4238
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4239
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
4240
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4241
+ }
4242
+ }
4243
+ } break;
4244
+ case LLM_ARCH_ARCEE:
4245
+ {
4246
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4247
+
4248
+ // output
4249
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4250
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4251
+
4252
+ // if output is NULL, init from the input tok embed
4253
+ if (output == NULL) {
4254
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4255
+ }
4256
+
4257
+ for (int i = 0; i < n_layer; ++i) {
4258
+ auto & layer = layers[i];
4259
+
4260
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4261
+
4262
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4263
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4264
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4265
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4266
+
4267
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4268
+
4269
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4270
+
4271
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4272
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4273
+ }
4274
+ } break;
4112
4275
  default:
4113
4276
  throw std::runtime_error("unknown architecture");
4114
4277
  }
@@ -4353,6 +4516,15 @@ void llama_model::print_info() const {
4353
4516
  LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
4354
4517
  LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
4355
4518
  LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
4519
+
4520
+ if (!classifier_labels.empty()) {
4521
+ LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
4522
+
4523
+ size_t i = 0;
4524
+ for (auto label : classifier_labels) {
4525
+ LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
4526
+ }
4527
+ }
4356
4528
  }
4357
4529
 
4358
4530
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
@@ -6020,7 +6192,7 @@ struct llm_build_bert : public llm_graph_context {
6020
6192
  model.layers[il].ffn_gate, NULL, NULL,
6021
6193
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
6022
6194
  NULL,
6023
- LLM_FFN_GELU, LLM_FFN_PAR, il);
6195
+ model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
6024
6196
  cb(cur, "ffn_out", il);
6025
6197
  } else {
6026
6198
  cur = build_ffn(cur,
@@ -6051,6 +6223,117 @@ struct llm_build_bert : public llm_graph_context {
6051
6223
  }
6052
6224
  };
6053
6225
 
6226
+ struct llm_build_neo_bert : public llm_graph_context {
6227
+ llm_build_neo_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6228
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6229
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6230
+
6231
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6232
+
6233
+ ggml_tensor * cur;
6234
+ ggml_tensor * inpL;
6235
+ ggml_tensor * inp_pos = build_inp_pos();
6236
+
6237
+ // construct input embeddings (token, type, position)
6238
+ inpL = build_inp_embd(model.tok_embd);
6239
+ cb(inpL, "inp_embd", -1);
6240
+
6241
+ auto * inp_attn = build_attn_inp_no_cache();
6242
+
6243
+ // iterate layers
6244
+ for (int il = 0; il < n_layer; ++il) {
6245
+ ggml_tensor * cur = inpL;
6246
+
6247
+ ggml_tensor * Qcur;
6248
+ ggml_tensor * Kcur;
6249
+ ggml_tensor * Vcur;
6250
+
6251
+ // pre-norm
6252
+ cur = build_norm(inpL,
6253
+ model.layers[il].attn_norm, NULL,
6254
+ LLM_NORM_RMS, il);
6255
+
6256
+ // self-attention
6257
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
6258
+ cb(cur, "wqkv", il);
6259
+
6260
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6261
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6262
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6263
+
6264
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6265
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6266
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6267
+
6268
+ // RoPE
6269
+ Qcur = ggml_rope_ext(
6270
+ ctx0, Qcur, inp_pos, nullptr,
6271
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6272
+ ext_factor, attn_factor, beta_fast, beta_slow
6273
+ );
6274
+
6275
+ Kcur = ggml_rope_ext(
6276
+ ctx0, Kcur, inp_pos, nullptr,
6277
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6278
+ ext_factor, attn_factor, beta_fast, beta_slow
6279
+ );
6280
+
6281
+ cb(Qcur, "Qcur", il);
6282
+ cb(Kcur, "Kcur", il);
6283
+ cb(Vcur, "Vcur", il);
6284
+
6285
+ cur = build_attn(inp_attn, gf,
6286
+ model.layers[il].wo, nullptr,
6287
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6288
+ cb(cur, "kqv_out", il);
6289
+
6290
+ if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
6291
+ // skip computing output for unused tokens
6292
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6293
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6294
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6295
+ }
6296
+
6297
+ // re-add the layer input
6298
+ cur = ggml_add(ctx0, cur, inpL);
6299
+
6300
+ ggml_tensor * ffn_inp = cur;
6301
+ cb(ffn_inp, "ffn_inp", il);
6302
+
6303
+ // pre-norm
6304
+ cur = build_norm(ffn_inp,
6305
+ model.layers[il].ffn_norm, NULL,
6306
+ LLM_NORM_RMS, il);
6307
+ cb(cur, "ffn_norm", il);
6308
+
6309
+ // feed-forward network
6310
+ cur = build_ffn(cur,
6311
+ model.layers[il].ffn_up,
6312
+ NULL, NULL, NULL, NULL, NULL,
6313
+ model.layers[il].ffn_down,
6314
+ NULL, NULL, NULL,
6315
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
6316
+
6317
+ // attentions bypass the intermediate layer
6318
+ cur = ggml_add(ctx0, cur, ffn_inp);
6319
+
6320
+ // input for next layer
6321
+ inpL = cur;
6322
+ }
6323
+
6324
+ cur = inpL;
6325
+
6326
+ cur = build_norm(cur,
6327
+ model.output_norm_enc, NULL,
6328
+ LLM_NORM_RMS, -1);
6329
+
6330
+ cb(cur, "result_embd", -1);
6331
+ res->t_embd = cur;
6332
+
6333
+ ggml_build_forward_expand(gf, cur);
6334
+ }
6335
+ };
6336
+
6054
6337
  struct llm_build_bloom : public llm_graph_context {
6055
6338
  llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6056
6339
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -8481,14 +8764,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
8481
8764
  cb(Kcur, "Kcur", il);
8482
8765
  cb(Vcur, "Vcur", il);
8483
8766
 
8484
- // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
8485
- switch (model.type) {
8486
- case LLM_TYPE_2B:
8487
- case LLM_TYPE_9B:
8488
- case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); break;
8489
- default: GGML_ABORT("fatal error");
8490
- };
8491
- cb(Qcur, "Qcur_scaled", il);
8767
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
8492
8768
 
8493
8769
  cur = build_attn(inp_attn, gf,
8494
8770
  model.layers[il].wo, NULL,
@@ -8629,9 +8905,12 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
8629
8905
  cb(Kcur, "Kcur", il);
8630
8906
  cb(Vcur, "Vcur", il);
8631
8907
 
8908
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
8909
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
8910
+
8632
8911
  cur = build_attn(inp_attn, gf,
8633
8912
  model.layers[il].wo, NULL,
8634
- Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
8913
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
8635
8914
  }
8636
8915
 
8637
8916
  cur = build_norm(cur,
@@ -8837,8 +9116,7 @@ struct llm_build_mamba : public llm_graph_context {
8837
9116
  // {n_embd, n_tokens}
8838
9117
  inpL = build_inp_embd(model.tok_embd);
8839
9118
 
8840
- ggml_tensor * state_copy = build_inp_s_copy();
8841
- ggml_tensor * state_mask = build_inp_s_mask();
9119
+ auto * rs_inp = build_rs_inp();
8842
9120
 
8843
9121
  for (int il = 0; il < n_layer; ++il) {
8844
9122
  // norm
@@ -8847,8 +9125,7 @@ struct llm_build_mamba : public llm_graph_context {
8847
9125
  LLM_NORM_RMS, il);
8848
9126
  cb(cur, "attn_norm", il);
8849
9127
 
8850
- //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il);
8851
- cur = build_mamba_layer(gf, cur, state_copy, state_mask, ubatch, il);
9128
+ cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
8852
9129
 
8853
9130
  if (il == n_layer - 1) {
8854
9131
  // skip computing output for unused tokens
@@ -8886,15 +9163,14 @@ struct llm_build_mamba : public llm_graph_context {
8886
9163
 
8887
9164
  // TODO: split
8888
9165
  ggml_tensor * build_mamba_layer(
8889
- ggml_cgraph * gf,
8890
- ggml_tensor * cur,
8891
- ggml_tensor * state_copy,
8892
- ggml_tensor * state_mask,
8893
- const llama_ubatch & ubatch,
8894
- int il) const {
8895
- const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
9166
+ llm_graph_input_rs * inp,
9167
+ ggml_cgraph * gf,
9168
+ ggml_tensor * cur,
9169
+ const llama_ubatch & ubatch,
9170
+ int il) const {
9171
+ const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
8896
9172
 
8897
- const auto kv_head = kv_self->head;
9173
+ const auto kv_head = kv_state->get_head();
8898
9174
 
8899
9175
  const int64_t d_conv = hparams.ssm_d_conv;
8900
9176
  const int64_t d_inner = hparams.ssm_d_inner;
@@ -8912,17 +9188,17 @@ struct llm_build_mamba : public llm_graph_context {
8912
9188
  GGML_ASSERT(ubatch.equal_seqs);
8913
9189
  GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
8914
9190
 
8915
- ggml_tensor * conv_states_all = kv_self->k_l[il];
8916
- ggml_tensor * ssm_states_all = kv_self->v_l[il];
9191
+ ggml_tensor * conv_states_all = kv_state->get_r_l(il);
9192
+ ggml_tensor * ssm_states_all = kv_state->get_s_l(il);
8917
9193
 
8918
9194
  // (ab)using the KV cache to store the states
8919
- ggml_tensor * conv = build_copy_mask_state(
8920
- gf, conv_states_all, state_copy, state_mask,
8921
- hparams.n_embd_k_s(), n_seqs);
9195
+ ggml_tensor * conv = build_rs(
9196
+ inp, gf, conv_states_all,
9197
+ hparams.n_embd_r(), n_seqs);
8922
9198
  conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
8923
- ggml_tensor * ssm = build_copy_mask_state(
8924
- gf, ssm_states_all, state_copy, state_mask,
8925
- hparams.n_embd_v_s(), n_seqs);
9199
+ ggml_tensor * ssm = build_rs(
9200
+ inp, gf, ssm_states_all,
9201
+ hparams.n_embd_s(), n_seqs);
8926
9202
  ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
8927
9203
 
8928
9204
  // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
@@ -11633,14 +11909,13 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11633
11909
  }
11634
11910
 
11635
11911
  ggml_tensor * build_rwkv6_time_mix(
11912
+ llm_graph_input_rs * inp,
11636
11913
  ggml_cgraph * gf,
11637
11914
  ggml_tensor * cur,
11638
11915
  ggml_tensor * x_prev,
11639
- ggml_tensor * state_copy,
11640
- ggml_tensor * state_mask,
11641
11916
  const llama_ubatch & ubatch,
11642
11917
  int il) const {
11643
- const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
11918
+ const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
11644
11919
 
11645
11920
  const auto n_tokens = ubatch.n_tokens;
11646
11921
  const auto n_seqs = ubatch.n_seqs;
@@ -11650,7 +11925,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11650
11925
  const auto n_head = n_embd / head_size;
11651
11926
  const auto n_head_kv = hparams.n_head_kv(il);
11652
11927
 
11653
- const auto kv_head = kv_self->head;
11928
+ const auto kv_head = kv_state->get_head();
11654
11929
 
11655
11930
  const auto & layer = model.layers[il];
11656
11931
 
@@ -11761,9 +12036,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11761
12036
  k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
11762
12037
  }
11763
12038
 
11764
- ggml_tensor * wkv_state = build_copy_mask_state(
11765
- gf, kv_self->v_l[il], state_copy, state_mask,
11766
- hparams.n_embd_v_s(), n_seqs);
12039
+ ggml_tensor * wkv_state = build_rs(
12040
+ inp, gf, kv_state->get_s_l(il),
12041
+ hparams.n_embd_s(), n_seqs);
11767
12042
 
11768
12043
  ggml_tensor * wkv_output;
11769
12044
  if (is_qrwkv) {
@@ -11781,9 +12056,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11781
12056
  wkv_state,
11782
12057
  ggml_view_1d(
11783
12058
  ctx0,
11784
- kv_self->v_l[il],
11785
- hparams.n_embd_v_s() * n_seqs,
11786
- hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il])
12059
+ kv_state->get_s_l(il),
12060
+ hparams.n_embd_s() * n_seqs,
12061
+ hparams.n_embd_s() * kv_head * ggml_element_size(kv_state->get_s_l(il))
11787
12062
  )
11788
12063
  )
11789
12064
  );
@@ -11817,8 +12092,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11817
12092
  inpL = build_inp_embd(model.tok_embd);
11818
12093
  inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
11819
12094
 
11820
- ggml_tensor * state_copy = build_inp_s_copy();
11821
- ggml_tensor * state_mask = build_inp_s_mask();
12095
+ auto * rs_inp = build_rs_inp();
11822
12096
 
11823
12097
  const auto n_embd = hparams.n_embd;
11824
12098
  const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -11828,9 +12102,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11828
12102
  const llama_layer * layer = &model.layers[il];
11829
12103
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
11830
12104
 
11831
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
11832
- gf, state_copy, state_mask, ubatch, il
11833
- );
12105
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
11834
12106
 
11835
12107
  ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
11836
12108
  ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
@@ -11845,7 +12117,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11845
12117
  1
11846
12118
  );
11847
12119
 
11848
- cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
12120
+ cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
11849
12121
 
11850
12122
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
11851
12123
  cb(ffn_inp, "ffn_inp", il);
@@ -11908,15 +12180,14 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11908
12180
  // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
11909
12181
  struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
11910
12182
  llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
11911
- GGML_ASSERT(n_embd == hparams.n_embd_k_s());
12183
+ GGML_ASSERT(n_embd == hparams.n_embd_r());
11912
12184
 
11913
12185
  ggml_tensor * cur;
11914
12186
  ggml_tensor * inpL;
11915
12187
 
11916
12188
  inpL = build_inp_embd(model.tok_embd);
11917
12189
 
11918
- ggml_tensor * state_copy = build_inp_s_copy();
11919
- ggml_tensor * state_mask = build_inp_s_mask();
12190
+ auto * rs_inp = build_rs_inp();
11920
12191
 
11921
12192
  const auto n_embd = hparams.n_embd;
11922
12193
  const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -11926,9 +12197,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
11926
12197
  const llama_layer * layer = &model.layers[il];
11927
12198
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
11928
12199
 
11929
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
11930
- gf, state_copy, state_mask, ubatch, il
11931
- );
12200
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
11932
12201
 
11933
12202
  ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
11934
12203
  cb(att_norm, "attn_norm", il);
@@ -11940,7 +12209,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
11940
12209
  1
11941
12210
  );
11942
12211
 
11943
- cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
12212
+ cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
11944
12213
 
11945
12214
  token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
11946
12215
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -12028,15 +12297,14 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12028
12297
  }
12029
12298
 
12030
12299
  ggml_tensor * build_rwkv7_time_mix(
12300
+ llm_graph_input_rs * inp,
12031
12301
  ggml_cgraph * gf,
12032
12302
  ggml_tensor * cur,
12033
12303
  ggml_tensor * x_prev,
12034
- ggml_tensor * state_copy,
12035
- ggml_tensor * state_mask,
12036
12304
  ggml_tensor *& first_layer_value,
12037
12305
  const llama_ubatch & ubatch,
12038
12306
  int il) const {
12039
- const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
12307
+ const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
12040
12308
 
12041
12309
  const auto n_tokens = ubatch.n_tokens;
12042
12310
  const auto n_seqs = ubatch.n_seqs;
@@ -12045,7 +12313,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12045
12313
  const auto head_count = n_embd / head_size;
12046
12314
  const auto n_seq_tokens = ubatch.n_seq_tokens;
12047
12315
 
12048
- const auto kv_head = kv_self->head;
12316
+ const auto kv_head = kv_state->get_head();
12049
12317
 
12050
12318
  const auto & layer = model.layers[il];
12051
12319
 
@@ -12115,9 +12383,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12115
12383
  v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
12116
12384
  a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
12117
12385
 
12118
- ggml_tensor * wkv_state = build_copy_mask_state(
12119
- gf, kv_self->v_l[il], state_copy, state_mask,
12120
- hparams.n_embd_v_s(), n_seqs);
12386
+ ggml_tensor * wkv_state = build_rs(
12387
+ inp, gf, kv_state->get_s_l(il),
12388
+ hparams.n_embd_s(), n_seqs);
12121
12389
 
12122
12390
  ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
12123
12391
  cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
@@ -12130,9 +12398,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12130
12398
  wkv_state,
12131
12399
  ggml_view_1d(
12132
12400
  ctx0,
12133
- kv_self->v_l[il],
12134
- hparams.n_embd_v_s() * n_seqs,
12135
- hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il])
12401
+ kv_state->get_s_l(il),
12402
+ hparams.n_embd_s() * n_seqs,
12403
+ hparams.n_embd_s() * kv_head * ggml_element_size(kv_state->get_s_l(il))
12136
12404
  )
12137
12405
  )
12138
12406
  );
@@ -12173,8 +12441,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12173
12441
  inpL = build_inp_embd(model.tok_embd);
12174
12442
  inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
12175
12443
 
12176
- ggml_tensor * state_copy = build_inp_s_copy();
12177
- ggml_tensor * state_mask = build_inp_s_mask();
12444
+ auto * rs_inp = build_rs_inp();
12178
12445
 
12179
12446
  const auto n_embd = hparams.n_embd;
12180
12447
  const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -12184,9 +12451,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12184
12451
  const llama_layer * layer = &model.layers[il];
12185
12452
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
12186
12453
 
12187
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
12188
- gf, state_copy, state_mask, ubatch, il
12189
- );
12454
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
12190
12455
 
12191
12456
  ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
12192
12457
  ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
@@ -12201,7 +12466,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12201
12466
  1
12202
12467
  );
12203
12468
 
12204
- cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
12469
+ cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
12205
12470
 
12206
12471
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
12207
12472
  cb(ffn_inp, "ffn_inp", il);
@@ -12259,7 +12524,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12259
12524
 
12260
12525
  struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12261
12526
  llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
12262
- GGML_ASSERT(n_embd == hparams.n_embd_k_s());
12527
+ GGML_ASSERT(n_embd == hparams.n_embd_r());
12263
12528
 
12264
12529
  ggml_tensor * cur;
12265
12530
  ggml_tensor * inpL;
@@ -12267,8 +12532,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12267
12532
 
12268
12533
  inpL = build_inp_embd(model.tok_embd);
12269
12534
 
12270
- ggml_tensor * state_copy = build_inp_s_copy();
12271
- ggml_tensor * state_mask = build_inp_s_mask();
12535
+ auto * rs_inp = build_rs_inp();
12272
12536
 
12273
12537
  const auto n_embd = hparams.n_embd;
12274
12538
  const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -12278,9 +12542,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12278
12542
  const llama_layer * layer = &model.layers[il];
12279
12543
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
12280
12544
 
12281
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
12282
- gf, state_copy, state_mask, ubatch, il
12283
- );
12545
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
12284
12546
 
12285
12547
  ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
12286
12548
  cb(att_norm, "attn_norm", il);
@@ -12292,7 +12554,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12292
12554
  1
12293
12555
  );
12294
12556
 
12295
- cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
12557
+ cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
12296
12558
 
12297
12559
  token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
12298
12560
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -13184,69 +13446,375 @@ struct llm_build_bailingmoe : public llm_graph_context {
13184
13446
  }
13185
13447
  };
13186
13448
 
13187
- llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
13188
- llama_memory_i * res;
13449
+ struct llm_build_dots1 : public llm_graph_context {
13450
+ llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
13451
+ const int64_t n_embd_head = hparams.n_embd_head_v;
13189
13452
 
13190
- switch (arch) {
13191
- case LLM_ARCH_BERT:
13192
- case LLM_ARCH_JINA_BERT_V2:
13193
- case LLM_ARCH_NOMIC_BERT:
13194
- case LLM_ARCH_NOMIC_BERT_MOE:
13195
- case LLM_ARCH_WAVTOKENIZER_DEC:
13196
- {
13197
- res = nullptr;
13198
- } break;
13199
- case LLM_ARCH_MAMBA:
13200
- case LLM_ARCH_RWKV6:
13201
- case LLM_ARCH_RWKV6QWEN2:
13202
- case LLM_ARCH_RWKV7:
13203
- case LLM_ARCH_ARWKV7:
13204
- {
13205
- res = new llama_kv_cache_recurrent(
13206
- *this,
13207
- GGML_TYPE_F32,
13208
- GGML_TYPE_F32,
13209
- cparams.offload_kqv,
13210
- std::max((uint32_t) 1, cparams.n_seq_max),
13211
- cparams.n_seq_max);
13212
- } break;
13213
- default:
13214
- {
13215
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
13453
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13454
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
13216
13455
 
13217
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
13456
+ ggml_tensor * cur;
13457
+ ggml_tensor * inpL;
13218
13458
 
13219
- LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
13459
+ inpL = build_inp_embd(model.tok_embd);
13220
13460
 
13221
- if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
13222
- GGML_ASSERT(hparams.is_swa_any());
13461
+ // inp_pos - contains the positions
13462
+ ggml_tensor * inp_pos = build_inp_pos();
13223
13463
 
13224
- res = new llama_kv_cache_unified_iswa(
13225
- *this,
13226
- params.type_k,
13227
- params.type_v,
13228
- !cparams.flash_attn,
13229
- cparams.offload_kqv,
13230
- params.swa_full,
13231
- cparams.n_ctx,
13232
- cparams.n_seq_max,
13233
- cparams.n_batch,
13234
- padding);
13235
- } else {
13236
- GGML_ASSERT(!hparams.is_swa_any());
13464
+ auto * inp_attn = build_attn_inp_kv_unified();
13465
+
13466
+ for (int il = 0; il < n_layer; ++il) {
13467
+ ggml_tensor * inpSA = inpL;
13468
+
13469
+ // norm
13470
+ cur = build_norm(inpL,
13471
+ model.layers[il].attn_norm, NULL,
13472
+ LLM_NORM_RMS, il);
13473
+ cb(cur, "attn_norm", il);
13474
+
13475
+ // self_attention
13476
+ {
13477
+ // compute Q and K and RoPE them
13478
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
13479
+ cb(Qcur, "Qcur", il);
13480
+
13481
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
13482
+ cb(Kcur, "Kcur", il);
13483
+
13484
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
13485
+ cb(Vcur, "Vcur", il);
13486
+
13487
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13488
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13489
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13490
+
13491
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
13492
+ cb(Qcur, "Qcur_normed", il);
13493
+
13494
+ Qcur = ggml_rope_ext(
13495
+ ctx0, Qcur, inp_pos, nullptr,
13496
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13497
+ ext_factor, attn_factor, beta_fast, beta_slow
13498
+ );
13499
+
13500
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
13501
+ cb(Kcur, "Kcur_normed", il);
13502
+
13503
+ Kcur = ggml_rope_ext(
13504
+ ctx0, Kcur, inp_pos, nullptr,
13505
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13506
+ ext_factor, attn_factor, beta_fast, beta_slow
13507
+ );
13508
+
13509
+ cb(Qcur, "Qcur", il);
13510
+ cb(Kcur, "Kcur", il);
13511
+ cb(Vcur, "Vcur", il);
13512
+
13513
+ cur = build_attn(inp_attn, gf,
13514
+ model.layers[il].wo, model.layers[il].bo,
13515
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13516
+ }
13517
+
13518
+ if (il == n_layer - 1) {
13519
+ // skip computing output for unused tokens
13520
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13521
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13522
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13523
+ }
13524
+
13525
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13526
+ cb(ffn_inp, "ffn_inp", il);
13527
+
13528
+ // MoE branch
13529
+ cur = build_norm(ffn_inp,
13530
+ model.layers[il].ffn_norm, NULL,
13531
+ LLM_NORM_RMS, il);
13532
+ cb(cur, "ffn_norm", il);
13533
+
13534
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
13535
+ cur = build_ffn(cur,
13536
+ model.layers[il].ffn_up, NULL, NULL,
13537
+ model.layers[il].ffn_gate, NULL, NULL,
13538
+ model.layers[il].ffn_down, NULL, NULL,
13539
+ NULL,
13540
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
13541
+ cb(cur, "ffn_out", il);
13542
+ } else {
13543
+ ggml_tensor * moe_out =
13544
+ build_moe_ffn(cur,
13545
+ model.layers[il].ffn_gate_inp,
13546
+ model.layers[il].ffn_up_exps,
13547
+ model.layers[il].ffn_gate_exps,
13548
+ model.layers[il].ffn_down_exps,
13549
+ model.layers[il].ffn_exp_probs_b,
13550
+ n_expert, n_expert_used,
13551
+ LLM_FFN_SILU, hparams.expert_weights_norm,
13552
+ true, hparams.expert_weights_scale,
13553
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
13554
+ il);
13555
+ cb(moe_out, "ffn_moe_out", il);
13556
+
13557
+ {
13558
+ ggml_tensor * ffn_shexp = build_ffn(cur,
13559
+ model.layers[il].ffn_up_shexp, NULL, NULL,
13560
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
13561
+ model.layers[il].ffn_down_shexp, NULL, NULL,
13562
+ NULL,
13563
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
13564
+ cb(ffn_shexp, "ffn_shexp", il);
13565
+
13566
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
13567
+ cb(cur, "ffn_out", il);
13568
+ }
13569
+ }
13237
13570
 
13238
- res = new llama_kv_cache_unified(
13571
+ cur = ggml_add(ctx0, cur, ffn_inp);
13572
+
13573
+ cur = build_cvec(cur, il);
13574
+ cb(cur, "l_out", il);
13575
+
13576
+ // input for next layer
13577
+ inpL = cur;
13578
+ }
13579
+
13580
+ cur = inpL;
13581
+
13582
+ cur = build_norm(cur,
13583
+ model.output_norm, NULL,
13584
+ LLM_NORM_RMS, -1);
13585
+
13586
+ cb(cur, "result_norm", -1);
13587
+ res->t_embd = cur;
13588
+
13589
+ // lm_head
13590
+ cur = build_lora_mm(model.output, cur);
13591
+
13592
+ cb(cur, "result_output", -1);
13593
+ res->t_logits = cur;
13594
+
13595
+ ggml_build_forward_expand(gf, cur);
13596
+ }
13597
+ };
13598
+
13599
+ struct llm_build_arcee : public llm_graph_context {
13600
+ llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
13601
+ const int64_t n_embd_head = hparams.n_embd_head_v;
13602
+
13603
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13604
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
13605
+
13606
+ ggml_tensor * cur;
13607
+ ggml_tensor * inpL;
13608
+
13609
+ inpL = build_inp_embd(model.tok_embd);
13610
+
13611
+ // inp_pos - contains the positions
13612
+ ggml_tensor * inp_pos = build_inp_pos();
13613
+
13614
+ auto * inp_attn = build_attn_inp_kv_unified();
13615
+
13616
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
13617
+
13618
+ for (int il = 0; il < n_layer; ++il) {
13619
+ ggml_tensor * inpSA = inpL;
13620
+
13621
+ // norm
13622
+ cur = build_norm(inpL,
13623
+ model.layers[il].attn_norm, NULL,
13624
+ LLM_NORM_RMS, il);
13625
+ cb(cur, "attn_norm", il);
13626
+
13627
+ // self-attention
13628
+ {
13629
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
13630
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
13631
+
13632
+ // compute Q and K and RoPE them
13633
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
13634
+ cb(Qcur, "Qcur", il);
13635
+ if (model.layers[il].bq) {
13636
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
13637
+ cb(Qcur, "Qcur", il);
13638
+ }
13639
+
13640
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
13641
+ cb(Kcur, "Kcur", il);
13642
+ if (model.layers[il].bk) {
13643
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
13644
+ cb(Kcur, "Kcur", il);
13645
+ }
13646
+
13647
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
13648
+ cb(Vcur, "Vcur", il);
13649
+ if (model.layers[il].bv) {
13650
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
13651
+ cb(Vcur, "Vcur", il);
13652
+ }
13653
+
13654
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13655
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13656
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13657
+
13658
+ Qcur = ggml_rope_ext(
13659
+ ctx0, Qcur, inp_pos, rope_factors,
13660
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13661
+ ext_factor, attn_factor, beta_fast, beta_slow
13662
+ );
13663
+
13664
+ Kcur = ggml_rope_ext(
13665
+ ctx0, Kcur, inp_pos, rope_factors,
13666
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13667
+ ext_factor, attn_factor, beta_fast, beta_slow
13668
+ );
13669
+
13670
+ cb(Qcur, "Qcur", il);
13671
+ cb(Kcur, "Kcur", il);
13672
+ cb(Vcur, "Vcur", il);
13673
+
13674
+ cur = build_attn(inp_attn, gf,
13675
+ model.layers[il].wo, model.layers[il].bo,
13676
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
13677
+ cb(cur, "attn_out", il);
13678
+ }
13679
+
13680
+ if (il == n_layer - 1) {
13681
+ // skip computing output for unused tokens
13682
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13683
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13684
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13685
+ }
13686
+
13687
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13688
+ cb(ffn_inp, "ffn_inp", il);
13689
+
13690
+ // feed-forward network
13691
+ // ARCEE uses relu^2 instead of silu
13692
+ cur = build_norm(ffn_inp,
13693
+ model.layers[il].ffn_norm, NULL,
13694
+ LLM_NORM_RMS, il);
13695
+ cb(cur, "ffn_norm", il);
13696
+
13697
+ cur = build_ffn(cur,
13698
+ model.layers[il].ffn_up, NULL, NULL,
13699
+ NULL, NULL, NULL,
13700
+ model.layers[il].ffn_down, NULL, NULL,
13701
+ NULL,
13702
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
13703
+ cb(cur, "ffn_out", il);
13704
+
13705
+ cur = ggml_add(ctx0, cur, ffn_inp);
13706
+ cb(cur, "ffn_out", il);
13707
+
13708
+ cur = build_cvec(cur, il);
13709
+ cb(cur, "l_out", il);
13710
+
13711
+ // input for next layer
13712
+ inpL = cur;
13713
+ }
13714
+
13715
+ cur = inpL;
13716
+
13717
+ cur = build_norm(cur,
13718
+ model.output_norm, NULL,
13719
+ LLM_NORM_RMS, -1);
13720
+
13721
+ cb(cur, "result_norm", -1);
13722
+ res->t_embd = cur;
13723
+
13724
+ // lm_head
13725
+ cur = build_lora_mm(model.output, cur);
13726
+
13727
+ cb(cur, "result_output", -1);
13728
+ res->t_logits = cur;
13729
+
13730
+ ggml_build_forward_expand(gf, cur);
13731
+ }
13732
+ };
13733
+
13734
+ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
13735
+ llama_memory_i * res;
13736
+
13737
+ switch (arch) {
13738
+ // Models that need specific instantiation should be handled in the
13739
+ // switch statement
13740
+ case LLM_ARCH_BERT:
13741
+ case LLM_ARCH_JINA_BERT_V2:
13742
+ case LLM_ARCH_NOMIC_BERT:
13743
+ case LLM_ARCH_NOMIC_BERT_MOE:
13744
+ case LLM_ARCH_NEO_BERT:
13745
+ case LLM_ARCH_WAVTOKENIZER_DEC:
13746
+ {
13747
+ res = nullptr;
13748
+ } break;
13749
+ // Models that need standard caching should rely on recurrent/hybrid
13750
+ // checks
13751
+ default:
13752
+ {
13753
+ if (llm_arch_is_recurrent(arch)) {
13754
+ res = new llama_memory_recurrent(
13239
13755
  *this,
13240
13756
  nullptr,
13241
- params.type_k,
13242
- params.type_v,
13243
- !cparams.flash_attn,
13757
+ GGML_TYPE_F32,
13758
+ GGML_TYPE_F32,
13244
13759
  cparams.offload_kqv,
13245
- cparams.n_ctx,
13246
- cparams.n_seq_max,
13247
- padding,
13248
- hparams.n_swa,
13249
- hparams.swa_type);
13760
+ std::max((uint32_t) 1, cparams.n_seq_max),
13761
+ cparams.n_seq_max);
13762
+ } else if (llm_arch_is_hybrid(arch)) {
13763
+ const auto padding = llama_kv_cache_unified::get_padding(cparams);
13764
+
13765
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
13766
+
13767
+ res = new llama_memory_hybrid(
13768
+ /* model */ *this,
13769
+ /* attn_type_k */ params.type_k,
13770
+ /* attn_type_v */ params.type_v,
13771
+ /* attn_v_trans */ !cparams.flash_attn,
13772
+ /* attn_kv_size */ cparams.n_ctx,
13773
+ /* attn_n_pad */ padding,
13774
+ /* attn_n_swa */ hparams.n_swa,
13775
+ /* attn_swa_type */ hparams.swa_type,
13776
+ /* recurrent_type_k */ GGML_TYPE_F32,
13777
+ /* recurrent_type_v */ GGML_TYPE_F32,
13778
+ /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
13779
+ /* n_seq_max */ cparams.n_seq_max,
13780
+ /* offload */ cparams.offload_kqv);
13781
+ } else {
13782
+ const auto padding = llama_kv_cache_unified::get_padding(cparams);
13783
+
13784
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
13785
+
13786
+ LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
13787
+
13788
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
13789
+ GGML_ASSERT(hparams.is_swa_any());
13790
+
13791
+ res = new llama_kv_cache_unified_iswa(
13792
+ *this,
13793
+ params.type_k,
13794
+ params.type_v,
13795
+ !cparams.flash_attn,
13796
+ cparams.offload_kqv,
13797
+ params.swa_full,
13798
+ cparams.n_ctx,
13799
+ cparams.n_seq_max,
13800
+ cparams.n_ubatch,
13801
+ padding);
13802
+ } else {
13803
+ GGML_ASSERT(!hparams.is_swa_any());
13804
+
13805
+ res = new llama_kv_cache_unified(
13806
+ *this,
13807
+ nullptr,
13808
+ params.type_k,
13809
+ params.type_v,
13810
+ !cparams.flash_attn,
13811
+ cparams.offload_kqv,
13812
+ cparams.n_ctx,
13813
+ cparams.n_seq_max,
13814
+ padding,
13815
+ hparams.n_swa,
13816
+ hparams.swa_type);
13817
+ }
13250
13818
  }
13251
13819
  }
13252
13820
  }
@@ -13300,6 +13868,10 @@ llm_graph_result_ptr llama_model::build_graph(
13300
13868
  {
13301
13869
  llm = std::make_unique<llm_build_bert>(*this, params, gf);
13302
13870
  } break;
13871
+ case LLM_ARCH_NEO_BERT:
13872
+ {
13873
+ llm = std::make_unique<llm_build_neo_bert>(*this, params, gf);
13874
+ } break;
13303
13875
  case LLM_ARCH_BLOOM:
13304
13876
  {
13305
13877
  llm = std::make_unique<llm_build_bloom>(*this, params, gf);
@@ -13522,6 +14094,14 @@ llm_graph_result_ptr llama_model::build_graph(
13522
14094
  {
13523
14095
  llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
13524
14096
  } break;
14097
+ case LLM_ARCH_DOTS1:
14098
+ {
14099
+ llm = std::make_unique<llm_build_dots1>(*this, params, gf);
14100
+ } break;
14101
+ case LLM_ARCH_ARCEE:
14102
+ {
14103
+ llm = std::make_unique<llm_build_arcee>(*this, params, gf);
14104
+ } break;
13525
14105
  default:
13526
14106
  GGML_ABORT("fatal error");
13527
14107
  }
@@ -13593,6 +14173,22 @@ int32_t llama_model_n_head_kv(const llama_model * model) {
13593
14173
  return model->hparams.n_head_kv();
13594
14174
  }
13595
14175
 
14176
+ int32_t llama_model_n_swa(const llama_model * model) {
14177
+ return model->hparams.n_swa;
14178
+ }
14179
+
14180
+ uint32_t llama_model_n_cls_out(const struct llama_model * model) {
14181
+ return model->hparams.n_cls_out;
14182
+ }
14183
+
14184
+ const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
14185
+ if (i < model->classifier_labels.size()) {
14186
+ return model->classifier_labels[i].c_str();
14187
+ }
14188
+
14189
+ return nullptr;
14190
+ }
14191
+
13596
14192
  // deprecated
13597
14193
  int32_t llama_n_ctx_train(const llama_model * model) {
13598
14194
  return llama_model_n_ctx_train(model);
@@ -13655,6 +14251,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
13655
14251
  case LLM_ARCH_GRANITE_MOE:
13656
14252
  case LLM_ARCH_CHAMELEON:
13657
14253
  case LLM_ARCH_BAILINGMOE:
14254
+ case LLM_ARCH_NEO_BERT:
14255
+ case LLM_ARCH_ARCEE:
13658
14256
  return LLAMA_ROPE_TYPE_NORM;
13659
14257
 
13660
14258
  // the pairs of head values are offset by n_rot/2
@@ -13688,6 +14286,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
13688
14286
  case LLM_ARCH_NEMOTRON:
13689
14287
  case LLM_ARCH_EXAONE:
13690
14288
  case LLM_ARCH_MINICPM3:
14289
+ case LLM_ARCH_DOTS1:
13691
14290
  return LLAMA_ROPE_TYPE_NEOX;
13692
14291
 
13693
14292
  case LLM_ARCH_QWEN2VL:
@@ -13753,7 +14352,7 @@ uint64_t llama_model_size(const llama_model * model) {
13753
14352
  }
13754
14353
 
13755
14354
  const char * llama_model_chat_template(const llama_model * model, const char * name) {
13756
- const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N)
14355
+ const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
13757
14356
  : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
13758
14357
  const auto & it = model->gguf_kv.find(key);
13759
14358
  if (it == model->gguf_kv.end()) {
@@ -13795,14 +14394,7 @@ llama_token llama_model_decoder_start_token(const llama_model * model) {
13795
14394
  }
13796
14395
 
13797
14396
  bool llama_model_is_recurrent(const llama_model * model) {
13798
- switch (model->arch) {
13799
- case LLM_ARCH_MAMBA: return true;
13800
- case LLM_ARCH_RWKV6: return true;
13801
- case LLM_ARCH_RWKV6QWEN2: return true;
13802
- case LLM_ARCH_RWKV7: return true;
13803
- case LLM_ARCH_ARWKV7: return true;
13804
- default: return false;
13805
- }
14397
+ return llm_arch_is_recurrent(model->arch);
13806
14398
  }
13807
14399
 
13808
14400
  const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {