@novastera-oss/llamarn 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192) hide show
  1. package/android/src/main/cpp/include/llama.h +134 -36
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +2 -2
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +30 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +50 -40
  26. package/cpp/llama.cpp/common/common.h +5 -2
  27. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  28. package/cpp/llama.cpp/convert_hf_to_gguf.py +97 -56
  29. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
  30. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  31. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +47 -13
  32. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  33. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  34. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  35. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -2
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +5 -8
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +6 -8
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  70. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  72. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
  74. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
  76. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  84. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -38
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  96. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
  97. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +431 -247
  98. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
  99. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  101. package/cpp/llama.cpp/ggml/src/ggml.c +0 -6
  102. package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
  103. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
  104. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
  105. package/cpp/llama.cpp/include/llama.h +134 -36
  106. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  107. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  108. package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
  109. package/cpp/llama.cpp/src/llama-arch.h +7 -1
  110. package/cpp/llama.cpp/src/llama-batch.cpp +270 -19
  111. package/cpp/llama.cpp/src/llama-batch.h +36 -11
  112. package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
  113. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  114. package/cpp/llama.cpp/src/llama-context.cpp +313 -213
  115. package/cpp/llama.cpp/src/llama-context.h +16 -12
  116. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  117. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  118. package/cpp/llama.cpp/src/llama-graph.cpp +249 -129
  119. package/cpp/llama.cpp/src/llama-graph.h +90 -34
  120. package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
  121. package/cpp/llama.cpp/src/llama-hparams.h +8 -2
  122. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +82 -50
  123. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
  124. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +292 -174
  125. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +68 -38
  126. package/cpp/llama.cpp/src/llama-kv-cells.h +18 -13
  127. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
  128. package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
  129. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +266 -282
  130. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +54 -57
  131. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  132. package/cpp/llama.cpp/src/llama-memory.h +64 -23
  133. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  134. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  135. package/cpp/llama.cpp/src/llama-model.cpp +726 -141
  136. package/cpp/llama.cpp/src/llama-model.h +4 -0
  137. package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
  138. package/cpp/llama.cpp/src/llama-vocab.cpp +32 -23
  139. package/cpp/llama.cpp/src/llama.cpp +11 -7
  140. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  141. package/cpp/rn-completion.cpp +2 -2
  142. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  143. package/ios/include/chat.h +1 -1
  144. package/ios/include/common.h +5 -2
  145. package/ios/include/llama.h +134 -36
  146. package/ios/libs/llama.xcframework/Info.plist +18 -18
  147. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  148. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
  149. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +134 -36
  150. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  151. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  152. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  153. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
  154. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  155. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3624
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +134 -36
  160. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +134 -36
  161. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  162. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +134 -36
  163. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  164. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  165. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
  167. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +134 -36
  168. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  173. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  175. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4725
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +134 -36
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  178. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4746
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3652
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  183. package/package.json +1 -2
  184. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  185. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  186. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  187. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  188. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  189. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  190. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  191. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  192. /package/cpp/{rn-utils.hpp → rn-utils.h} +0 -0
@@ -8,7 +8,8 @@
8
8
 
9
9
  #include "llama-kv-cache-unified.h"
10
10
  #include "llama-kv-cache-unified-iswa.h"
11
- #include "llama-kv-cache-recurrent.h"
11
+ #include "llama-memory-hybrid.h"
12
+ #include "llama-memory-recurrent.h"
12
13
 
13
14
  #include "ggml-cpp.h"
14
15
 
@@ -80,6 +81,7 @@ const char * llm_type_name(llm_type type) {
80
81
  case LLM_TYPE_40B: return "40B";
81
82
  case LLM_TYPE_65B: return "65B";
82
83
  case LLM_TYPE_70B: return "70B";
84
+ case LLM_TYPE_142B: return "142B";
83
85
  case LLM_TYPE_236B: return "236B";
84
86
  case LLM_TYPE_290B: return "290B";
85
87
  case LLM_TYPE_314B: return "314B";
@@ -469,6 +471,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
469
471
  std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
470
472
  std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
471
473
  std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
474
+ std::fill(
475
+ hparams.recurrent_layer_arr.begin(),
476
+ hparams.recurrent_layer_arr.end(),
477
+ llm_arch_is_recurrent(ml.get_arch()));
472
478
 
473
479
  std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
474
480
 
@@ -543,6 +549,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
543
549
  uint32_t n_vocab = 0;
544
550
  ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
545
551
 
552
+ // for classifier models
553
+ ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
554
+ if (!classifier_labels.empty()) {
555
+ hparams.n_cls_out = classifier_labels.size();
556
+ }
557
+
546
558
  // arch-specific KVs
547
559
  switch (arch) {
548
560
  case LLM_ARCH_LLAMA:
@@ -592,6 +604,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
592
604
  hparams.use_kq_norm = false;
593
605
  }
594
606
  } break;
607
+ case LLM_ARCH_ARCEE:
608
+ {
609
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
610
+
611
+ // Arcee uses the same structure as Llama
612
+ switch (hparams.n_layer) {
613
+ case 36: type = LLM_TYPE_4B; break;
614
+ default: type = LLM_TYPE_UNKNOWN;
615
+ }
616
+ } break;
595
617
  case LLM_ARCH_DECI:
596
618
  {
597
619
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -686,7 +708,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
686
708
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
687
709
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
688
710
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
689
- ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false);
690
711
 
691
712
  switch (hparams.n_layer) {
692
713
  case 3:
@@ -733,6 +754,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
733
754
  }
734
755
  }
735
756
  } break;
757
+ case LLM_ARCH_NEO_BERT:
758
+ {
759
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
760
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
761
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
762
+
763
+ if (hparams.n_layer == 28) {
764
+ type = LLM_TYPE_250M;
765
+ }
766
+ } break;
736
767
  case LLM_ARCH_BLOOM:
737
768
  {
738
769
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -956,6 +987,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
956
987
  case 46: type = LLM_TYPE_27B; break;
957
988
  default: type = LLM_TYPE_UNKNOWN;
958
989
  }
990
+
991
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
992
+ hparams.f_attention_scale = type == LLM_TYPE_27B
993
+ ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
994
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
959
995
  } break;
960
996
  case LLM_ARCH_GEMMA3:
961
997
  {
@@ -976,6 +1012,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
976
1012
  default: type = LLM_TYPE_UNKNOWN;
977
1013
  }
978
1014
 
1015
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
979
1016
  hparams.f_attention_scale = type == LLM_TYPE_27B
980
1017
  ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
981
1018
  : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
@@ -1433,6 +1470,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1433
1470
  default: type = LLM_TYPE_UNKNOWN;
1434
1471
  }
1435
1472
  } break;
1473
+ case LLM_ARCH_DOTS1:
1474
+ {
1475
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1476
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
1477
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
1478
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1479
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
1480
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
1481
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
1482
+ switch (hparams.n_layer) {
1483
+ case 62: type = LLM_TYPE_142B; break;
1484
+ default: type = LLM_TYPE_UNKNOWN;
1485
+ }
1486
+ } break;
1436
1487
  default: throw std::runtime_error("unsupported model architecture");
1437
1488
  }
1438
1489
 
@@ -2176,6 +2227,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2176
2227
  layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
2177
2228
  }
2178
2229
  } break;
2230
+ case LLM_ARCH_NEO_BERT:
2231
+ {
2232
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2233
+
2234
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
2235
+ cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
2236
+
2237
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
2238
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
2239
+
2240
+ output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
2241
+
2242
+ for (int i = 0; i < n_layer; ++i) {
2243
+ auto & layer = layers[i];
2244
+
2245
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
2246
+
2247
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2248
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
2249
+
2250
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
2251
+
2252
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
2253
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2254
+ }
2255
+ } break;
2179
2256
  case LLM_ARCH_JINA_BERT_V2:
2180
2257
  {
2181
2258
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
@@ -2213,8 +2290,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2213
2290
  layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
2214
2291
  layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
2215
2292
 
2216
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
2217
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
2293
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
2294
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
2218
2295
 
2219
2296
  layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
2220
2297
  layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
@@ -4112,6 +4189,89 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
4112
4189
  layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4113
4190
  }
4114
4191
  } break;
4192
+ case LLM_ARCH_DOTS1:
4193
+ {
4194
+ const int64_t n_ff_exp = hparams.n_ff_exp;
4195
+ const int64_t n_expert_shared = hparams.n_expert_shared;
4196
+
4197
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4198
+
4199
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4200
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4201
+
4202
+ for (int i = 0; i < n_layer; ++i) {
4203
+ auto & layer = layers[i];
4204
+
4205
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4206
+
4207
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4208
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4209
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4210
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4211
+
4212
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
4213
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
4214
+
4215
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4216
+
4217
+ if (i < (int) hparams.n_layer_dense_lead) {
4218
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
4219
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4220
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4221
+ } else {
4222
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
4223
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
4224
+
4225
+ if (n_expert == 0) {
4226
+ throw std::runtime_error("n_expert must be > 0");
4227
+ }
4228
+ if (n_expert_used == 0) {
4229
+ throw std::runtime_error("n_expert_used must be > 0");
4230
+ }
4231
+
4232
+ // MoE branch
4233
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4234
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
4235
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
4236
+
4237
+ // Shared expert branch
4238
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4239
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
4240
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
4241
+ }
4242
+ }
4243
+ } break;
4244
+ case LLM_ARCH_ARCEE:
4245
+ {
4246
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
4247
+
4248
+ // output
4249
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4250
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4251
+
4252
+ // if output is NULL, init from the input tok embed
4253
+ if (output == NULL) {
4254
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4255
+ }
4256
+
4257
+ for (int i = 0; i < n_layer; ++i) {
4258
+ auto & layer = layers[i];
4259
+
4260
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
4261
+
4262
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
4263
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
4264
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
4265
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
4266
+
4267
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
4268
+
4269
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
4270
+
4271
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
4272
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
4273
+ }
4274
+ } break;
4115
4275
  default:
4116
4276
  throw std::runtime_error("unknown architecture");
4117
4277
  }
@@ -4356,6 +4516,15 @@ void llama_model::print_info() const {
4356
4516
  LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
4357
4517
  LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
4358
4518
  LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
4519
+
4520
+ if (!classifier_labels.empty()) {
4521
+ LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
4522
+
4523
+ size_t i = 0;
4524
+ for (auto label : classifier_labels) {
4525
+ LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
4526
+ }
4527
+ }
4359
4528
  }
4360
4529
 
4361
4530
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
@@ -6023,7 +6192,7 @@ struct llm_build_bert : public llm_graph_context {
6023
6192
  model.layers[il].ffn_gate, NULL, NULL,
6024
6193
  model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
6025
6194
  NULL,
6026
- LLM_FFN_GELU, LLM_FFN_PAR, il);
6195
+ model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
6027
6196
  cb(cur, "ffn_out", il);
6028
6197
  } else {
6029
6198
  cur = build_ffn(cur,
@@ -6054,6 +6223,117 @@ struct llm_build_bert : public llm_graph_context {
6054
6223
  }
6055
6224
  };
6056
6225
 
6226
+ struct llm_build_neo_bert : public llm_graph_context {
6227
+ llm_build_neo_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6228
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6229
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
6230
+
6231
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
6232
+
6233
+ ggml_tensor * cur;
6234
+ ggml_tensor * inpL;
6235
+ ggml_tensor * inp_pos = build_inp_pos();
6236
+
6237
+ // construct input embeddings (token, type, position)
6238
+ inpL = build_inp_embd(model.tok_embd);
6239
+ cb(inpL, "inp_embd", -1);
6240
+
6241
+ auto * inp_attn = build_attn_inp_no_cache();
6242
+
6243
+ // iterate layers
6244
+ for (int il = 0; il < n_layer; ++il) {
6245
+ ggml_tensor * cur = inpL;
6246
+
6247
+ ggml_tensor * Qcur;
6248
+ ggml_tensor * Kcur;
6249
+ ggml_tensor * Vcur;
6250
+
6251
+ // pre-norm
6252
+ cur = build_norm(inpL,
6253
+ model.layers[il].attn_norm, NULL,
6254
+ LLM_NORM_RMS, il);
6255
+
6256
+ // self-attention
6257
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
6258
+ cb(cur, "wqkv", il);
6259
+
6260
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
6261
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
6262
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6263
+
6264
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6265
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6266
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6267
+
6268
+ // RoPE
6269
+ Qcur = ggml_rope_ext(
6270
+ ctx0, Qcur, inp_pos, nullptr,
6271
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6272
+ ext_factor, attn_factor, beta_fast, beta_slow
6273
+ );
6274
+
6275
+ Kcur = ggml_rope_ext(
6276
+ ctx0, Kcur, inp_pos, nullptr,
6277
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
6278
+ ext_factor, attn_factor, beta_fast, beta_slow
6279
+ );
6280
+
6281
+ cb(Qcur, "Qcur", il);
6282
+ cb(Kcur, "Kcur", il);
6283
+ cb(Vcur, "Vcur", il);
6284
+
6285
+ cur = build_attn(inp_attn, gf,
6286
+ model.layers[il].wo, nullptr,
6287
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
6288
+ cb(cur, "kqv_out", il);
6289
+
6290
+ if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
6291
+ // skip computing output for unused tokens
6292
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
6293
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
6294
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
6295
+ }
6296
+
6297
+ // re-add the layer input
6298
+ cur = ggml_add(ctx0, cur, inpL);
6299
+
6300
+ ggml_tensor * ffn_inp = cur;
6301
+ cb(ffn_inp, "ffn_inp", il);
6302
+
6303
+ // pre-norm
6304
+ cur = build_norm(ffn_inp,
6305
+ model.layers[il].ffn_norm, NULL,
6306
+ LLM_NORM_RMS, il);
6307
+ cb(cur, "ffn_norm", il);
6308
+
6309
+ // feed-forward network
6310
+ cur = build_ffn(cur,
6311
+ model.layers[il].ffn_up,
6312
+ NULL, NULL, NULL, NULL, NULL,
6313
+ model.layers[il].ffn_down,
6314
+ NULL, NULL, NULL,
6315
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
6316
+
6317
+ // attentions bypass the intermediate layer
6318
+ cur = ggml_add(ctx0, cur, ffn_inp);
6319
+
6320
+ // input for next layer
6321
+ inpL = cur;
6322
+ }
6323
+
6324
+ cur = inpL;
6325
+
6326
+ cur = build_norm(cur,
6327
+ model.output_norm_enc, NULL,
6328
+ LLM_NORM_RMS, -1);
6329
+
6330
+ cb(cur, "result_embd", -1);
6331
+ res->t_embd = cur;
6332
+
6333
+ ggml_build_forward_expand(gf, cur);
6334
+ }
6335
+ };
6336
+
6057
6337
  struct llm_build_bloom : public llm_graph_context {
6058
6338
  llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
6059
6339
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -8484,14 +8764,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
8484
8764
  cb(Kcur, "Kcur", il);
8485
8765
  cb(Vcur, "Vcur", il);
8486
8766
 
8487
- // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
8488
- switch (model.type) {
8489
- case LLM_TYPE_2B:
8490
- case LLM_TYPE_9B:
8491
- case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); break;
8492
- default: GGML_ABORT("fatal error");
8493
- };
8494
- cb(Qcur, "Qcur_scaled", il);
8767
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
8495
8768
 
8496
8769
  cur = build_attn(inp_attn, gf,
8497
8770
  model.layers[il].wo, NULL,
@@ -8632,9 +8905,12 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
8632
8905
  cb(Kcur, "Kcur", il);
8633
8906
  cb(Vcur, "Vcur", il);
8634
8907
 
8908
+ // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
8909
+ Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
8910
+
8635
8911
  cur = build_attn(inp_attn, gf,
8636
8912
  model.layers[il].wo, NULL,
8637
- Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
8913
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
8638
8914
  }
8639
8915
 
8640
8916
  cur = build_norm(cur,
@@ -8840,8 +9116,7 @@ struct llm_build_mamba : public llm_graph_context {
8840
9116
  // {n_embd, n_tokens}
8841
9117
  inpL = build_inp_embd(model.tok_embd);
8842
9118
 
8843
- ggml_tensor * state_copy = build_inp_s_copy();
8844
- ggml_tensor * state_mask = build_inp_s_mask();
9119
+ auto * rs_inp = build_rs_inp();
8845
9120
 
8846
9121
  for (int il = 0; il < n_layer; ++il) {
8847
9122
  // norm
@@ -8850,8 +9125,7 @@ struct llm_build_mamba : public llm_graph_context {
8850
9125
  LLM_NORM_RMS, il);
8851
9126
  cb(cur, "attn_norm", il);
8852
9127
 
8853
- //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il);
8854
- cur = build_mamba_layer(gf, cur, state_copy, state_mask, ubatch, il);
9128
+ cur = build_mamba_layer(rs_inp, gf, cur, ubatch, il);
8855
9129
 
8856
9130
  if (il == n_layer - 1) {
8857
9131
  // skip computing output for unused tokens
@@ -8889,13 +9163,12 @@ struct llm_build_mamba : public llm_graph_context {
8889
9163
 
8890
9164
  // TODO: split
8891
9165
  ggml_tensor * build_mamba_layer(
8892
- ggml_cgraph * gf,
8893
- ggml_tensor * cur,
8894
- ggml_tensor * state_copy,
8895
- ggml_tensor * state_mask,
8896
- const llama_ubatch & ubatch,
8897
- int il) const {
8898
- const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
9166
+ llm_graph_input_rs * inp,
9167
+ ggml_cgraph * gf,
9168
+ ggml_tensor * cur,
9169
+ const llama_ubatch & ubatch,
9170
+ int il) const {
9171
+ const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
8899
9172
 
8900
9173
  const auto kv_head = kv_state->get_head();
8901
9174
 
@@ -8915,17 +9188,17 @@ struct llm_build_mamba : public llm_graph_context {
8915
9188
  GGML_ASSERT(ubatch.equal_seqs);
8916
9189
  GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
8917
9190
 
8918
- ggml_tensor * conv_states_all = kv_state->get_k_l(il);
8919
- ggml_tensor * ssm_states_all = kv_state->get_v_l(il);
9191
+ ggml_tensor * conv_states_all = kv_state->get_r_l(il);
9192
+ ggml_tensor * ssm_states_all = kv_state->get_s_l(il);
8920
9193
 
8921
9194
  // (ab)using the KV cache to store the states
8922
- ggml_tensor * conv = build_copy_mask_state(
8923
- gf, conv_states_all, state_copy, state_mask,
8924
- hparams.n_embd_k_s(), n_seqs);
9195
+ ggml_tensor * conv = build_rs(
9196
+ inp, gf, conv_states_all,
9197
+ hparams.n_embd_r(), n_seqs);
8925
9198
  conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
8926
- ggml_tensor * ssm = build_copy_mask_state(
8927
- gf, ssm_states_all, state_copy, state_mask,
8928
- hparams.n_embd_v_s(), n_seqs);
9199
+ ggml_tensor * ssm = build_rs(
9200
+ inp, gf, ssm_states_all,
9201
+ hparams.n_embd_s(), n_seqs);
8929
9202
  ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
8930
9203
 
8931
9204
  // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
@@ -11636,14 +11909,13 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11636
11909
  }
11637
11910
 
11638
11911
  ggml_tensor * build_rwkv6_time_mix(
11912
+ llm_graph_input_rs * inp,
11639
11913
  ggml_cgraph * gf,
11640
11914
  ggml_tensor * cur,
11641
11915
  ggml_tensor * x_prev,
11642
- ggml_tensor * state_copy,
11643
- ggml_tensor * state_mask,
11644
11916
  const llama_ubatch & ubatch,
11645
11917
  int il) const {
11646
- const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
11918
+ const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
11647
11919
 
11648
11920
  const auto n_tokens = ubatch.n_tokens;
11649
11921
  const auto n_seqs = ubatch.n_seqs;
@@ -11764,9 +12036,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11764
12036
  k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
11765
12037
  }
11766
12038
 
11767
- ggml_tensor * wkv_state = build_copy_mask_state(
11768
- gf, kv_state->get_v_l(il), state_copy, state_mask,
11769
- hparams.n_embd_v_s(), n_seqs);
12039
+ ggml_tensor * wkv_state = build_rs(
12040
+ inp, gf, kv_state->get_s_l(il),
12041
+ hparams.n_embd_s(), n_seqs);
11770
12042
 
11771
12043
  ggml_tensor * wkv_output;
11772
12044
  if (is_qrwkv) {
@@ -11784,9 +12056,9 @@ struct llm_build_rwkv6_base : public llm_graph_context {
11784
12056
  wkv_state,
11785
12057
  ggml_view_1d(
11786
12058
  ctx0,
11787
- kv_state->get_v_l(il),
11788
- hparams.n_embd_v_s() * n_seqs,
11789
- hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_state->get_v_l(il))
12059
+ kv_state->get_s_l(il),
12060
+ hparams.n_embd_s() * n_seqs,
12061
+ hparams.n_embd_s() * kv_head * ggml_element_size(kv_state->get_s_l(il))
11790
12062
  )
11791
12063
  )
11792
12064
  );
@@ -11820,8 +12092,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11820
12092
  inpL = build_inp_embd(model.tok_embd);
11821
12093
  inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
11822
12094
 
11823
- ggml_tensor * state_copy = build_inp_s_copy();
11824
- ggml_tensor * state_mask = build_inp_s_mask();
12095
+ auto * rs_inp = build_rs_inp();
11825
12096
 
11826
12097
  const auto n_embd = hparams.n_embd;
11827
12098
  const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -11831,9 +12102,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11831
12102
  const llama_layer * layer = &model.layers[il];
11832
12103
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
11833
12104
 
11834
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
11835
- gf, state_copy, state_mask, ubatch, il
11836
- );
12105
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
11837
12106
 
11838
12107
  ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
11839
12108
  ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
@@ -11848,7 +12117,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11848
12117
  1
11849
12118
  );
11850
12119
 
11851
- cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
12120
+ cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
11852
12121
 
11853
12122
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
11854
12123
  cb(ffn_inp, "ffn_inp", il);
@@ -11911,15 +12180,14 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
11911
12180
  // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
11912
12181
  struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
11913
12182
  llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
11914
- GGML_ASSERT(n_embd == hparams.n_embd_k_s());
12183
+ GGML_ASSERT(n_embd == hparams.n_embd_r());
11915
12184
 
11916
12185
  ggml_tensor * cur;
11917
12186
  ggml_tensor * inpL;
11918
12187
 
11919
12188
  inpL = build_inp_embd(model.tok_embd);
11920
12189
 
11921
- ggml_tensor * state_copy = build_inp_s_copy();
11922
- ggml_tensor * state_mask = build_inp_s_mask();
12190
+ auto * rs_inp = build_rs_inp();
11923
12191
 
11924
12192
  const auto n_embd = hparams.n_embd;
11925
12193
  const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -11929,9 +12197,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
11929
12197
  const llama_layer * layer = &model.layers[il];
11930
12198
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
11931
12199
 
11932
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
11933
- gf, state_copy, state_mask, ubatch, il
11934
- );
12200
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
11935
12201
 
11936
12202
  ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
11937
12203
  cb(att_norm, "attn_norm", il);
@@ -11943,7 +12209,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
11943
12209
  1
11944
12210
  );
11945
12211
 
11946
- cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
12212
+ cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il);
11947
12213
 
11948
12214
  token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
11949
12215
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -12031,15 +12297,14 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12031
12297
  }
12032
12298
 
12033
12299
  ggml_tensor * build_rwkv7_time_mix(
12300
+ llm_graph_input_rs * inp,
12034
12301
  ggml_cgraph * gf,
12035
12302
  ggml_tensor * cur,
12036
12303
  ggml_tensor * x_prev,
12037
- ggml_tensor * state_copy,
12038
- ggml_tensor * state_mask,
12039
12304
  ggml_tensor *& first_layer_value,
12040
12305
  const llama_ubatch & ubatch,
12041
12306
  int il) const {
12042
- const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
12307
+ const auto * kv_state = static_cast<const llama_memory_recurrent_state *>(mstate);
12043
12308
 
12044
12309
  const auto n_tokens = ubatch.n_tokens;
12045
12310
  const auto n_seqs = ubatch.n_seqs;
@@ -12118,9 +12383,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12118
12383
  v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
12119
12384
  a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
12120
12385
 
12121
- ggml_tensor * wkv_state = build_copy_mask_state(
12122
- gf, kv_state->get_v_l(il), state_copy, state_mask,
12123
- hparams.n_embd_v_s(), n_seqs);
12386
+ ggml_tensor * wkv_state = build_rs(
12387
+ inp, gf, kv_state->get_s_l(il),
12388
+ hparams.n_embd_s(), n_seqs);
12124
12389
 
12125
12390
  ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
12126
12391
  cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
@@ -12133,9 +12398,9 @@ struct llm_build_rwkv7_base : public llm_graph_context {
12133
12398
  wkv_state,
12134
12399
  ggml_view_1d(
12135
12400
  ctx0,
12136
- kv_state->get_v_l(il),
12137
- hparams.n_embd_v_s() * n_seqs,
12138
- hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_state->get_v_l(il))
12401
+ kv_state->get_s_l(il),
12402
+ hparams.n_embd_s() * n_seqs,
12403
+ hparams.n_embd_s() * kv_head * ggml_element_size(kv_state->get_s_l(il))
12139
12404
  )
12140
12405
  )
12141
12406
  );
@@ -12176,8 +12441,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12176
12441
  inpL = build_inp_embd(model.tok_embd);
12177
12442
  inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
12178
12443
 
12179
- ggml_tensor * state_copy = build_inp_s_copy();
12180
- ggml_tensor * state_mask = build_inp_s_mask();
12444
+ auto * rs_inp = build_rs_inp();
12181
12445
 
12182
12446
  const auto n_embd = hparams.n_embd;
12183
12447
  const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -12187,9 +12451,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12187
12451
  const llama_layer * layer = &model.layers[il];
12188
12452
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
12189
12453
 
12190
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
12191
- gf, state_copy, state_mask, ubatch, il
12192
- );
12454
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
12193
12455
 
12194
12456
  ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
12195
12457
  ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
@@ -12204,7 +12466,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12204
12466
  1
12205
12467
  );
12206
12468
 
12207
- cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
12469
+ cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
12208
12470
 
12209
12471
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
12210
12472
  cb(ffn_inp, "ffn_inp", il);
@@ -12262,7 +12524,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
12262
12524
 
12263
12525
  struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12264
12526
  llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
12265
- GGML_ASSERT(n_embd == hparams.n_embd_k_s());
12527
+ GGML_ASSERT(n_embd == hparams.n_embd_r());
12266
12528
 
12267
12529
  ggml_tensor * cur;
12268
12530
  ggml_tensor * inpL;
@@ -12270,8 +12532,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12270
12532
 
12271
12533
  inpL = build_inp_embd(model.tok_embd);
12272
12534
 
12273
- ggml_tensor * state_copy = build_inp_s_copy();
12274
- ggml_tensor * state_mask = build_inp_s_mask();
12535
+ auto * rs_inp = build_rs_inp();
12275
12536
 
12276
12537
  const auto n_embd = hparams.n_embd;
12277
12538
  const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -12281,9 +12542,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12281
12542
  const llama_layer * layer = &model.layers[il];
12282
12543
  inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
12283
12544
 
12284
- ggml_tensor * token_shift = build_rwkv_token_shift_load(
12285
- gf, state_copy, state_mask, ubatch, il
12286
- );
12545
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il);
12287
12546
 
12288
12547
  ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
12289
12548
  cb(att_norm, "attn_norm", il);
@@ -12295,7 +12554,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
12295
12554
  1
12296
12555
  );
12297
12556
 
12298
- cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
12557
+ cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il);
12299
12558
 
12300
12559
  token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
12301
12560
  ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
@@ -13187,69 +13446,375 @@ struct llm_build_bailingmoe : public llm_graph_context {
13187
13446
  }
13188
13447
  };
13189
13448
 
13190
- llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
13191
- llama_memory_i * res;
13192
-
13193
- switch (arch) {
13194
- case LLM_ARCH_BERT:
13195
- case LLM_ARCH_JINA_BERT_V2:
13196
- case LLM_ARCH_NOMIC_BERT:
13197
- case LLM_ARCH_NOMIC_BERT_MOE:
13198
- case LLM_ARCH_WAVTOKENIZER_DEC:
13199
- {
13200
- res = nullptr;
13201
- } break;
13202
- case LLM_ARCH_MAMBA:
13203
- case LLM_ARCH_RWKV6:
13204
- case LLM_ARCH_RWKV6QWEN2:
13205
- case LLM_ARCH_RWKV7:
13206
- case LLM_ARCH_ARWKV7:
13207
- {
13208
- res = new llama_kv_cache_recurrent(
13209
- *this,
13210
- GGML_TYPE_F32,
13211
- GGML_TYPE_F32,
13212
- cparams.offload_kqv,
13213
- std::max((uint32_t) 1, cparams.n_seq_max),
13214
- cparams.n_seq_max);
13215
- } break;
13216
- default:
13217
- {
13218
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
13449
+ struct llm_build_dots1 : public llm_graph_context {
13450
+ llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
13451
+ const int64_t n_embd_head = hparams.n_embd_head_v;
13219
13452
 
13220
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
13453
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13454
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
13221
13455
 
13222
- LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
13456
+ ggml_tensor * cur;
13457
+ ggml_tensor * inpL;
13223
13458
 
13224
- if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
13225
- GGML_ASSERT(hparams.is_swa_any());
13459
+ inpL = build_inp_embd(model.tok_embd);
13226
13460
 
13227
- res = new llama_kv_cache_unified_iswa(
13228
- *this,
13229
- params.type_k,
13230
- params.type_v,
13231
- !cparams.flash_attn,
13232
- cparams.offload_kqv,
13233
- params.swa_full,
13234
- cparams.n_ctx,
13235
- cparams.n_seq_max,
13236
- cparams.n_ubatch,
13237
- padding);
13238
- } else {
13239
- GGML_ASSERT(!hparams.is_swa_any());
13461
+ // inp_pos - contains the positions
13462
+ ggml_tensor * inp_pos = build_inp_pos();
13463
+
13464
+ auto * inp_attn = build_attn_inp_kv_unified();
13465
+
13466
+ for (int il = 0; il < n_layer; ++il) {
13467
+ ggml_tensor * inpSA = inpL;
13468
+
13469
+ // norm
13470
+ cur = build_norm(inpL,
13471
+ model.layers[il].attn_norm, NULL,
13472
+ LLM_NORM_RMS, il);
13473
+ cb(cur, "attn_norm", il);
13474
+
13475
+ // self_attention
13476
+ {
13477
+ // compute Q and K and RoPE them
13478
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
13479
+ cb(Qcur, "Qcur", il);
13480
+
13481
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
13482
+ cb(Kcur, "Kcur", il);
13483
+
13484
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
13485
+ cb(Vcur, "Vcur", il);
13486
+
13487
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13488
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13489
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13490
+
13491
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
13492
+ cb(Qcur, "Qcur_normed", il);
13493
+
13494
+ Qcur = ggml_rope_ext(
13495
+ ctx0, Qcur, inp_pos, nullptr,
13496
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13497
+ ext_factor, attn_factor, beta_fast, beta_slow
13498
+ );
13499
+
13500
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
13501
+ cb(Kcur, "Kcur_normed", il);
13502
+
13503
+ Kcur = ggml_rope_ext(
13504
+ ctx0, Kcur, inp_pos, nullptr,
13505
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13506
+ ext_factor, attn_factor, beta_fast, beta_slow
13507
+ );
13508
+
13509
+ cb(Qcur, "Qcur", il);
13510
+ cb(Kcur, "Kcur", il);
13511
+ cb(Vcur, "Vcur", il);
13512
+
13513
+ cur = build_attn(inp_attn, gf,
13514
+ model.layers[il].wo, model.layers[il].bo,
13515
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
13516
+ }
13517
+
13518
+ if (il == n_layer - 1) {
13519
+ // skip computing output for unused tokens
13520
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13521
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13522
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13523
+ }
13524
+
13525
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13526
+ cb(ffn_inp, "ffn_inp", il);
13527
+
13528
+ // MoE branch
13529
+ cur = build_norm(ffn_inp,
13530
+ model.layers[il].ffn_norm, NULL,
13531
+ LLM_NORM_RMS, il);
13532
+ cb(cur, "ffn_norm", il);
13533
+
13534
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
13535
+ cur = build_ffn(cur,
13536
+ model.layers[il].ffn_up, NULL, NULL,
13537
+ model.layers[il].ffn_gate, NULL, NULL,
13538
+ model.layers[il].ffn_down, NULL, NULL,
13539
+ NULL,
13540
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
13541
+ cb(cur, "ffn_out", il);
13542
+ } else {
13543
+ ggml_tensor * moe_out =
13544
+ build_moe_ffn(cur,
13545
+ model.layers[il].ffn_gate_inp,
13546
+ model.layers[il].ffn_up_exps,
13547
+ model.layers[il].ffn_gate_exps,
13548
+ model.layers[il].ffn_down_exps,
13549
+ model.layers[il].ffn_exp_probs_b,
13550
+ n_expert, n_expert_used,
13551
+ LLM_FFN_SILU, hparams.expert_weights_norm,
13552
+ true, hparams.expert_weights_scale,
13553
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
13554
+ il);
13555
+ cb(moe_out, "ffn_moe_out", il);
13556
+
13557
+ {
13558
+ ggml_tensor * ffn_shexp = build_ffn(cur,
13559
+ model.layers[il].ffn_up_shexp, NULL, NULL,
13560
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
13561
+ model.layers[il].ffn_down_shexp, NULL, NULL,
13562
+ NULL,
13563
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
13564
+ cb(ffn_shexp, "ffn_shexp", il);
13565
+
13566
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
13567
+ cb(cur, "ffn_out", il);
13568
+ }
13569
+ }
13570
+
13571
+ cur = ggml_add(ctx0, cur, ffn_inp);
13572
+
13573
+ cur = build_cvec(cur, il);
13574
+ cb(cur, "l_out", il);
13575
+
13576
+ // input for next layer
13577
+ inpL = cur;
13578
+ }
13579
+
13580
+ cur = inpL;
13581
+
13582
+ cur = build_norm(cur,
13583
+ model.output_norm, NULL,
13584
+ LLM_NORM_RMS, -1);
13585
+
13586
+ cb(cur, "result_norm", -1);
13587
+ res->t_embd = cur;
13588
+
13589
+ // lm_head
13590
+ cur = build_lora_mm(model.output, cur);
13591
+
13592
+ cb(cur, "result_output", -1);
13593
+ res->t_logits = cur;
13594
+
13595
+ ggml_build_forward_expand(gf, cur);
13596
+ }
13597
+ };
13598
+
13599
+ struct llm_build_arcee : public llm_graph_context {
13600
+ llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
13601
+ const int64_t n_embd_head = hparams.n_embd_head_v;
13602
+
13603
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13604
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
13605
+
13606
+ ggml_tensor * cur;
13607
+ ggml_tensor * inpL;
13608
+
13609
+ inpL = build_inp_embd(model.tok_embd);
13610
+
13611
+ // inp_pos - contains the positions
13612
+ ggml_tensor * inp_pos = build_inp_pos();
13613
+
13614
+ auto * inp_attn = build_attn_inp_kv_unified();
13615
+
13616
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
13617
+
13618
+ for (int il = 0; il < n_layer; ++il) {
13619
+ ggml_tensor * inpSA = inpL;
13620
+
13621
+ // norm
13622
+ cur = build_norm(inpL,
13623
+ model.layers[il].attn_norm, NULL,
13624
+ LLM_NORM_RMS, il);
13625
+ cb(cur, "attn_norm", il);
13626
+
13627
+ // self-attention
13628
+ {
13629
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
13630
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
13631
+
13632
+ // compute Q and K and RoPE them
13633
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
13634
+ cb(Qcur, "Qcur", il);
13635
+ if (model.layers[il].bq) {
13636
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
13637
+ cb(Qcur, "Qcur", il);
13638
+ }
13639
+
13640
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
13641
+ cb(Kcur, "Kcur", il);
13642
+ if (model.layers[il].bk) {
13643
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
13644
+ cb(Kcur, "Kcur", il);
13645
+ }
13646
+
13647
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
13648
+ cb(Vcur, "Vcur", il);
13649
+ if (model.layers[il].bv) {
13650
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
13651
+ cb(Vcur, "Vcur", il);
13652
+ }
13653
+
13654
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13655
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13656
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
13657
+
13658
+ Qcur = ggml_rope_ext(
13659
+ ctx0, Qcur, inp_pos, rope_factors,
13660
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13661
+ ext_factor, attn_factor, beta_fast, beta_slow
13662
+ );
13240
13663
 
13241
- res = new llama_kv_cache_unified(
13664
+ Kcur = ggml_rope_ext(
13665
+ ctx0, Kcur, inp_pos, rope_factors,
13666
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13667
+ ext_factor, attn_factor, beta_fast, beta_slow
13668
+ );
13669
+
13670
+ cb(Qcur, "Qcur", il);
13671
+ cb(Kcur, "Kcur", il);
13672
+ cb(Vcur, "Vcur", il);
13673
+
13674
+ cur = build_attn(inp_attn, gf,
13675
+ model.layers[il].wo, model.layers[il].bo,
13676
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
13677
+ cb(cur, "attn_out", il);
13678
+ }
13679
+
13680
+ if (il == n_layer - 1) {
13681
+ // skip computing output for unused tokens
13682
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
13683
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13684
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13685
+ }
13686
+
13687
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13688
+ cb(ffn_inp, "ffn_inp", il);
13689
+
13690
+ // feed-forward network
13691
+ // ARCEE uses relu^2 instead of silu
13692
+ cur = build_norm(ffn_inp,
13693
+ model.layers[il].ffn_norm, NULL,
13694
+ LLM_NORM_RMS, il);
13695
+ cb(cur, "ffn_norm", il);
13696
+
13697
+ cur = build_ffn(cur,
13698
+ model.layers[il].ffn_up, NULL, NULL,
13699
+ NULL, NULL, NULL,
13700
+ model.layers[il].ffn_down, NULL, NULL,
13701
+ NULL,
13702
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
13703
+ cb(cur, "ffn_out", il);
13704
+
13705
+ cur = ggml_add(ctx0, cur, ffn_inp);
13706
+ cb(cur, "ffn_out", il);
13707
+
13708
+ cur = build_cvec(cur, il);
13709
+ cb(cur, "l_out", il);
13710
+
13711
+ // input for next layer
13712
+ inpL = cur;
13713
+ }
13714
+
13715
+ cur = inpL;
13716
+
13717
+ cur = build_norm(cur,
13718
+ model.output_norm, NULL,
13719
+ LLM_NORM_RMS, -1);
13720
+
13721
+ cb(cur, "result_norm", -1);
13722
+ res->t_embd = cur;
13723
+
13724
+ // lm_head
13725
+ cur = build_lora_mm(model.output, cur);
13726
+
13727
+ cb(cur, "result_output", -1);
13728
+ res->t_logits = cur;
13729
+
13730
+ ggml_build_forward_expand(gf, cur);
13731
+ }
13732
+ };
13733
+
13734
+ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
13735
+ llama_memory_i * res;
13736
+
13737
+ switch (arch) {
13738
+ // Models that need specific instantiation should be handled in the
13739
+ // switch statement
13740
+ case LLM_ARCH_BERT:
13741
+ case LLM_ARCH_JINA_BERT_V2:
13742
+ case LLM_ARCH_NOMIC_BERT:
13743
+ case LLM_ARCH_NOMIC_BERT_MOE:
13744
+ case LLM_ARCH_NEO_BERT:
13745
+ case LLM_ARCH_WAVTOKENIZER_DEC:
13746
+ {
13747
+ res = nullptr;
13748
+ } break;
13749
+ // Models that need standard caching should rely on recurrent/hybrid
13750
+ // checks
13751
+ default:
13752
+ {
13753
+ if (llm_arch_is_recurrent(arch)) {
13754
+ res = new llama_memory_recurrent(
13242
13755
  *this,
13243
13756
  nullptr,
13244
- params.type_k,
13245
- params.type_v,
13246
- !cparams.flash_attn,
13757
+ GGML_TYPE_F32,
13758
+ GGML_TYPE_F32,
13247
13759
  cparams.offload_kqv,
13248
- cparams.n_ctx,
13249
- cparams.n_seq_max,
13250
- padding,
13251
- hparams.n_swa,
13252
- hparams.swa_type);
13760
+ std::max((uint32_t) 1, cparams.n_seq_max),
13761
+ cparams.n_seq_max);
13762
+ } else if (llm_arch_is_hybrid(arch)) {
13763
+ const auto padding = llama_kv_cache_unified::get_padding(cparams);
13764
+
13765
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
13766
+
13767
+ res = new llama_memory_hybrid(
13768
+ /* model */ *this,
13769
+ /* attn_type_k */ params.type_k,
13770
+ /* attn_type_v */ params.type_v,
13771
+ /* attn_v_trans */ !cparams.flash_attn,
13772
+ /* attn_kv_size */ cparams.n_ctx,
13773
+ /* attn_n_pad */ padding,
13774
+ /* attn_n_swa */ hparams.n_swa,
13775
+ /* attn_swa_type */ hparams.swa_type,
13776
+ /* recurrent_type_k */ GGML_TYPE_F32,
13777
+ /* recurrent_type_v */ GGML_TYPE_F32,
13778
+ /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
13779
+ /* n_seq_max */ cparams.n_seq_max,
13780
+ /* offload */ cparams.offload_kqv);
13781
+ } else {
13782
+ const auto padding = llama_kv_cache_unified::get_padding(cparams);
13783
+
13784
+ cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
13785
+
13786
+ LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
13787
+
13788
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
13789
+ GGML_ASSERT(hparams.is_swa_any());
13790
+
13791
+ res = new llama_kv_cache_unified_iswa(
13792
+ *this,
13793
+ params.type_k,
13794
+ params.type_v,
13795
+ !cparams.flash_attn,
13796
+ cparams.offload_kqv,
13797
+ params.swa_full,
13798
+ cparams.n_ctx,
13799
+ cparams.n_seq_max,
13800
+ cparams.n_ubatch,
13801
+ padding);
13802
+ } else {
13803
+ GGML_ASSERT(!hparams.is_swa_any());
13804
+
13805
+ res = new llama_kv_cache_unified(
13806
+ *this,
13807
+ nullptr,
13808
+ params.type_k,
13809
+ params.type_v,
13810
+ !cparams.flash_attn,
13811
+ cparams.offload_kqv,
13812
+ cparams.n_ctx,
13813
+ cparams.n_seq_max,
13814
+ padding,
13815
+ hparams.n_swa,
13816
+ hparams.swa_type);
13817
+ }
13253
13818
  }
13254
13819
  }
13255
13820
  }
@@ -13303,6 +13868,10 @@ llm_graph_result_ptr llama_model::build_graph(
13303
13868
  {
13304
13869
  llm = std::make_unique<llm_build_bert>(*this, params, gf);
13305
13870
  } break;
13871
+ case LLM_ARCH_NEO_BERT:
13872
+ {
13873
+ llm = std::make_unique<llm_build_neo_bert>(*this, params, gf);
13874
+ } break;
13306
13875
  case LLM_ARCH_BLOOM:
13307
13876
  {
13308
13877
  llm = std::make_unique<llm_build_bloom>(*this, params, gf);
@@ -13525,6 +14094,14 @@ llm_graph_result_ptr llama_model::build_graph(
13525
14094
  {
13526
14095
  llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
13527
14096
  } break;
14097
+ case LLM_ARCH_DOTS1:
14098
+ {
14099
+ llm = std::make_unique<llm_build_dots1>(*this, params, gf);
14100
+ } break;
14101
+ case LLM_ARCH_ARCEE:
14102
+ {
14103
+ llm = std::make_unique<llm_build_arcee>(*this, params, gf);
14104
+ } break;
13528
14105
  default:
13529
14106
  GGML_ABORT("fatal error");
13530
14107
  }
@@ -13600,6 +14177,18 @@ int32_t llama_model_n_swa(const llama_model * model) {
13600
14177
  return model->hparams.n_swa;
13601
14178
  }
13602
14179
 
14180
+ uint32_t llama_model_n_cls_out(const struct llama_model * model) {
14181
+ return model->hparams.n_cls_out;
14182
+ }
14183
+
14184
+ const char * llama_model_cls_label(const struct llama_model * model, uint32_t i) {
14185
+ if (i < model->classifier_labels.size()) {
14186
+ return model->classifier_labels[i].c_str();
14187
+ }
14188
+
14189
+ return nullptr;
14190
+ }
14191
+
13603
14192
  // deprecated
13604
14193
  int32_t llama_n_ctx_train(const llama_model * model) {
13605
14194
  return llama_model_n_ctx_train(model);
@@ -13662,6 +14251,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
13662
14251
  case LLM_ARCH_GRANITE_MOE:
13663
14252
  case LLM_ARCH_CHAMELEON:
13664
14253
  case LLM_ARCH_BAILINGMOE:
14254
+ case LLM_ARCH_NEO_BERT:
14255
+ case LLM_ARCH_ARCEE:
13665
14256
  return LLAMA_ROPE_TYPE_NORM;
13666
14257
 
13667
14258
  // the pairs of head values are offset by n_rot/2
@@ -13695,6 +14286,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
13695
14286
  case LLM_ARCH_NEMOTRON:
13696
14287
  case LLM_ARCH_EXAONE:
13697
14288
  case LLM_ARCH_MINICPM3:
14289
+ case LLM_ARCH_DOTS1:
13698
14290
  return LLAMA_ROPE_TYPE_NEOX;
13699
14291
 
13700
14292
  case LLM_ARCH_QWEN2VL:
@@ -13760,7 +14352,7 @@ uint64_t llama_model_size(const llama_model * model) {
13760
14352
  }
13761
14353
 
13762
14354
  const char * llama_model_chat_template(const llama_model * model, const char * name) {
13763
- const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N)
14355
+ const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
13764
14356
  : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
13765
14357
  const auto & it = model->gguf_kv.find(key);
13766
14358
  if (it == model->gguf_kv.end()) {
@@ -13802,14 +14394,7 @@ llama_token llama_model_decoder_start_token(const llama_model * model) {
13802
14394
  }
13803
14395
 
13804
14396
  bool llama_model_is_recurrent(const llama_model * model) {
13805
- switch (model->arch) {
13806
- case LLM_ARCH_MAMBA: return true;
13807
- case LLM_ARCH_RWKV6: return true;
13808
- case LLM_ARCH_RWKV6QWEN2: return true;
13809
- case LLM_ARCH_RWKV7: return true;
13810
- case LLM_ARCH_ARWKV7: return true;
13811
- default: return false;
13812
- }
14397
+ return llm_arch_is_recurrent(model->arch);
13813
14398
  }
13814
14399
 
13815
14400
  const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model) {