cui-llama.rn 1.6.1 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. package/android/src/main/CMakeLists.txt +6 -0
  2. package/android/src/main/java/com/rnllama/LlamaContext.java +38 -5
  3. package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
  4. package/android/src/main/jni.cpp +153 -14
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  13. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
  14. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
  15. package/cpp/chat.cpp +128 -106
  16. package/cpp/chat.h +2 -0
  17. package/cpp/common.cpp +41 -76
  18. package/cpp/common.h +23 -19
  19. package/cpp/ggml-backend.cpp +9 -5
  20. package/cpp/ggml-backend.h +4 -4
  21. package/cpp/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  22. package/cpp/ggml-cpu/ggml-cpu-quants.c +306 -6
  23. package/cpp/ggml-cpu/ggml-cpu.c +5 -13
  24. package/cpp/ggml-cpu/ggml-cpu.cpp +29 -16
  25. package/cpp/ggml-cpu/ops.cpp +107 -13
  26. package/cpp/ggml-cpu/vec.cpp +0 -6
  27. package/cpp/ggml-cpu/vec.h +16 -0
  28. package/cpp/ggml-llama-sim.metallib +0 -0
  29. package/cpp/ggml-llama.metallib +0 -0
  30. package/cpp/ggml-metal-impl.h +36 -11
  31. package/cpp/ggml-metal.m +321 -132
  32. package/cpp/ggml-opt.cpp +373 -190
  33. package/cpp/ggml-opt.h +49 -28
  34. package/cpp/ggml-quants.c +0 -6
  35. package/cpp/ggml.c +93 -38
  36. package/cpp/ggml.h +21 -7
  37. package/cpp/gguf.cpp +33 -33
  38. package/cpp/llama-adapter.cpp +6 -0
  39. package/cpp/llama-arch.cpp +3 -0
  40. package/cpp/llama-batch.cpp +3 -1
  41. package/cpp/llama-chat.cpp +8 -6
  42. package/cpp/llama-chat.h +1 -0
  43. package/cpp/llama-context.cpp +349 -135
  44. package/cpp/llama-context.h +30 -3
  45. package/cpp/llama-cparams.h +1 -0
  46. package/cpp/llama-graph.cpp +150 -234
  47. package/cpp/llama-graph.h +52 -7
  48. package/cpp/llama-hparams.cpp +17 -1
  49. package/cpp/llama-hparams.h +34 -5
  50. package/cpp/llama-kv-cache.cpp +662 -321
  51. package/cpp/llama-kv-cache.h +203 -93
  52. package/cpp/llama-memory.h +3 -2
  53. package/cpp/llama-model-loader.cpp +24 -15
  54. package/cpp/llama-model-saver.cpp +281 -0
  55. package/cpp/llama-model-saver.h +37 -0
  56. package/cpp/llama-model.cpp +536 -132
  57. package/cpp/llama-model.h +7 -1
  58. package/cpp/llama-sampling.cpp +18 -6
  59. package/cpp/llama-vocab.cpp +46 -8
  60. package/cpp/llama-vocab.h +6 -0
  61. package/cpp/llama.cpp +14 -0
  62. package/cpp/llama.h +72 -131
  63. package/cpp/minja/chat-template.hpp +9 -5
  64. package/cpp/minja/minja.hpp +69 -36
  65. package/cpp/rn-llama.cpp +611 -47
  66. package/cpp/rn-llama.h +33 -3
  67. package/cpp/sampling.cpp +57 -50
  68. package/cpp/tools/mtmd/clip-impl.h +462 -0
  69. package/cpp/tools/mtmd/clip.cpp +4024 -0
  70. package/cpp/tools/mtmd/clip.h +101 -0
  71. package/cpp/tools/mtmd/miniaudio.h +93468 -0
  72. package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  73. package/cpp/tools/mtmd/mtmd-audio.h +62 -0
  74. package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
  75. package/cpp/tools/mtmd/mtmd.cpp +942 -0
  76. package/cpp/tools/mtmd/mtmd.h +362 -0
  77. package/cpp/tools/mtmd/stb_image.h +7988 -0
  78. package/ios/CMakeLists.txt +7 -0
  79. package/ios/RNLlama.mm +77 -3
  80. package/ios/RNLlamaContext.h +5 -1
  81. package/ios/RNLlamaContext.mm +105 -10
  82. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
  83. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +23 -19
  84. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  85. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  86. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  87. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +21 -7
  88. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  89. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +30 -3
  90. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  91. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +52 -7
  92. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +34 -5
  93. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +203 -93
  94. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +3 -2
  95. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  96. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +7 -1
  97. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  98. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +72 -131
  99. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  100. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  101. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +33 -3
  102. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  103. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  104. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  105. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  106. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +23 -19
  107. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  108. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  109. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  110. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +21 -7
  111. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  112. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +30 -3
  113. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  114. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +52 -7
  115. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +34 -5
  116. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +203 -93
  117. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +3 -2
  118. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  119. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +7 -1
  120. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  121. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +72 -131
  122. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  123. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  124. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +33 -3
  125. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  126. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  127. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  128. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  129. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
  130. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +23 -19
  131. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  132. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  133. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  134. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +21 -7
  135. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  136. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +30 -3
  137. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  138. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +52 -7
  139. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +34 -5
  140. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +203 -93
  141. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +3 -2
  142. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  143. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +7 -1
  144. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  145. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +72 -131
  146. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  147. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  148. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +33 -3
  149. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  150. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  151. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  152. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  153. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +23 -19
  154. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  155. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  156. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  157. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +21 -7
  158. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  159. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +30 -3
  160. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  161. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +52 -7
  162. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +34 -5
  163. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +203 -93
  164. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +3 -2
  165. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  166. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +7 -1
  167. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  168. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +72 -131
  169. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  170. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  171. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +33 -3
  172. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  173. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  174. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  175. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  176. package/jest/mock.js +33 -7
  177. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  178. package/lib/commonjs/index.js +153 -21
  179. package/lib/commonjs/index.js.map +1 -1
  180. package/lib/module/NativeRNLlama.js.map +1 -1
  181. package/lib/module/index.js +152 -20
  182. package/lib/module/index.js.map +1 -1
  183. package/lib/typescript/NativeRNLlama.d.ts +50 -4
  184. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  185. package/lib/typescript/index.d.ts +72 -6
  186. package/lib/typescript/index.d.ts.map +1 -1
  187. package/package.json +1 -1
  188. package/src/NativeRNLlama.ts +67 -4
  189. package/src/index.ts +212 -38
  190. package/lib/commonjs/chat.js +0 -37
  191. package/lib/commonjs/chat.js.map +0 -1
  192. package/lib/module/chat.js +0 -33
  193. package/lib/module/chat.js.map +0 -1
  194. package/lib/typescript/chat.d.ts +0 -10
  195. package/lib/typescript/chat.d.ts.map +0 -1
  196. package/src/chat.ts +0 -44
@@ -80,6 +80,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_236B: return "236B";
  case LLM_TYPE_290B: return "290B";
  case LLM_TYPE_314B: return "314B";
+ case LLM_TYPE_405B: return "405B";
  case LLM_TYPE_671B: return "671B";
  case LLM_TYPE_SMALL: return "0.1B";
  case LLM_TYPE_MEDIUM: return "0.4B";
@@ -116,6 +117,10 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
  { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
  };

+ std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
+ return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
+ }
+
  static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
  for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
  if (kv.second == name) {
@@ -298,6 +303,10 @@ static buft_list_t make_cpu_buft_list(const std::vector<lm_ggml_backend_dev_t> &
  // add extra buffer types, only if no GPU device is present
  // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
  auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (cpu_dev == nullptr) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
+
  auto * cpu_reg = lm_ggml_backend_dev_backend_reg(cpu_dev);
  auto lm_ggml_backend_dev_get_extra_bufts_fn = (lm_ggml_backend_dev_get_extra_bufts_t)
  lm_ggml_backend_reg_get_proc_address(cpu_reg, "lm_ggml_backend_dev_get_extra_bufts");
@@ -454,11 +463,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  LM_GGML_ASSERT(hparams.n_expert_used == 0);
  }

- // zero-out the array hparams
  std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
  std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
  std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);

+ std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
+
+ std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
+
  ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
  ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);

@@ -562,9 +574,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
- hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
- hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
- hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
+
+ hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
+ hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
+ hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full

  switch (hparams.n_expert) {
  case 16: type = LLM_TYPE_17B_16E; break;
@@ -582,6 +595,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  switch (hparams.n_layer) {
  case 32: type = LLM_TYPE_7B; break;
  case 80: type = LLM_TYPE_70B; break;
+ case 162: type = LLM_TYPE_405B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
@@ -842,22 +856,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }

- // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
- if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
- // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
- hparams.n_swa = 2047;
- } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
- // default value for Phi-3-mini-128k-instruct
- // note: this seems incorrect because the window is bigger than the train context?
- hparams.n_swa = 262144;
- } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
- // default value for Phi-3-medium-128k-instruct
- // note: this seems incorrect because the window is equal to the train context?
- hparams.n_swa = 131072;
- }
- bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
- if (!found_swa && hparams.n_swa == 0) {
- throw std::runtime_error("invalid value for sliding_window");
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+ if (found_swa && hparams.n_swa > 0) {
+ LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
+
+ // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+ hparams.n_swa = 0;
+ hparams.set_swa_pattern(1);
  }
  } break;
  case LLM_ARCH_PHIMOE:
@@ -927,8 +936,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_GEMMA2:
  {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.n_swa = 4096; // default value of gemma 2
- hparams.n_swa_pattern = 2;
+ hparams.set_swa_pattern(2);
  hparams.attn_soft_cap = true;

  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -945,7 +955,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_GEMMA3:
  {
- hparams.n_swa_pattern = 6;
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(6);

  hparams.rope_freq_base_train_swa = 10000.0f;
  hparams.rope_freq_scale_train_swa = 1.0f;
@@ -1029,7 +1040,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_COHERE2:
  {
- hparams.n_swa_pattern = 4;
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(4);

  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -1379,6 +1391,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  // Add additional layer/vocab/etc checks here for other model sizes
  default: type = LLM_TYPE_UNKNOWN;
  }
+
+ // For Granite MoE Shared
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
  } break;
  case LLM_ARCH_CHAMELEON:
  {
@@ -1482,6 +1497,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }

  lm_ggml_backend_dev_t cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (cpu_dev == nullptr) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
  const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
  const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
  auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
@@ -1649,8 +1667,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
  std::regex pattern(overrides->pattern);
  if (std::regex_search(tensor_name, pattern)) {
- LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), lm_ggml_backend_buft_name(overrides->buft));
  buft = overrides->buft;
+ LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
+ tensor_name.c_str(),
+ lm_ggml_nbytes(t_meta) / 1024 / 1024, lm_ggml_type_name(t_meta->type),
+ lm_ggml_backend_buft_name(buft));
  break;
  }
  }
@@ -1667,6 +1688,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  auto * buft_dev = lm_ggml_backend_buft_get_device(buft);
  if (ml.use_mmap && buft_dev && buft == lm_ggml_backend_dev_host_buffer_type(buft_dev)) {
  auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (!cpu_dev) {
+ throw std::runtime_error("no CPU backend found");
+ }
  buft = lm_ggml_backend_dev_buffer_type(cpu_dev);
  }

@@ -1753,6 +1777,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+ // For Granite MoE Shared
+ if (hparams.n_ff_shexp > 0) {
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+ }
  }
  }
  } break;
@@ -1848,7 +1879,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ if (n_ff > 0) {
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ }

  if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
@@ -1858,9 +1891,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  }

- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ if (n_ff > 0) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }

  // optional MLP bias
  layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
@@ -3504,7 +3539,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  // output
  output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }

  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];
@@ -4109,6 +4148,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  if (!dev) {
  // FIXME: workaround for CPU backend buft having a NULL device
  dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (!dev) {
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
+ }
  }
  lm_ggml_backend_dev_props props;
  lm_ggml_backend_dev_get_props(dev, &props);
@@ -4238,7 +4280,7 @@ uint64_t llama_model::n_elements() const {
  }

  void llama_model::print_info() const {
- const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
+ const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);

  auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
  bool is_var = false;
@@ -4281,7 +4323,7 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
  LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
  LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
- LLAMA_LOG_INFO("%s: n_swa_pattern = %u\n", __func__, hparams.n_swa_pattern);
+ LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
  LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
  LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
@@ -4299,7 +4341,7 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
  LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
  LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
- LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
+ LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
@@ -4355,10 +4397,13 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  }

- if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
+ if (arch == LLM_ARCH_MINICPM ||
+ arch == LLM_ARCH_GRANITE ||
+ arch == LLM_ARCH_GRANITE_MOE) {
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  }

  if (arch == LLM_ARCH_BAILINGMOE) {
@@ -4446,7 +4491,17 @@ const lm_ggml_tensor * llama_model::get_tensor(const char * name) const {
  return it->second;
  }

- lm_ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
+ float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
+ return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
+ }
+
+ float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
+ return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+ }
+
+ lm_ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
+ const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+
  // choose long/short freq factors based on the context size
  if (layers[il].rope_freqs != nullptr) {
  return layers[il].rope_freqs;
@@ -4474,21 +4529,174 @@ struct llm_build_llama : public llm_graph_context {
  // inp_pos - contains the positions
  lm_ggml_tensor * inp_pos = build_inp_pos();

+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+ for (int il = 0; il < n_layer; ++il) {
+ lm_ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+ // compute Q and K and RoPE them
+ lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ lm_ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ lm_ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = lm_ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = lm_ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = lm_ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network (non-MoE)
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+ }
+
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ lm_ggml_build_forward_expand(gf, cur);
+ }
+ };
+
+ struct llm_build_llama_iswa : public llm_graph_context {
+ llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ lm_ggml_tensor * cur;
+ lm_ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ lm_ggml_tensor * inp_pos = build_inp_pos();
+
  // temperature tuning
  lm_ggml_tensor * inp_attn_scale = nullptr;
- if (arch == LLM_ARCH_LLAMA4) {
- inp_attn_scale = build_inp_attn_scale();
- }
+ inp_attn_scale = build_inp_attn_scale();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();

  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
  for (int il = 0; il < n_layer; ++il) {
  lm_ggml_tensor * inpSA = inpL;

- bool use_rope = arch == LLM_ARCH_LLAMA4
- ? (il + 1) % hparams.n_no_rope_layer_step != 0
- : true;
+ const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;

  // norm
  cur = build_norm(inpL,
@@ -4499,7 +4707,7 @@ struct llm_build_llama : public llm_graph_context {
  // self-attention
  {
  // rope freq factors for llama3; may return nullptr for llama2 and other models
- lm_ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

  // compute Q and K and RoPE them
  lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4547,7 +4755,7 @@ struct llm_build_llama : public llm_graph_context {
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
+ if (use_rope && hparams.use_kq_norm) {
  // Llama4TextL2Norm
  Qcur = lm_ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
  Kcur = lm_ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
@@ -4568,17 +4776,11 @@ struct llm_build_llama : public llm_graph_context {
  inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
  }

- // For Granite architecture
- if (hparams.f_residual_scale) {
- cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
- }
-
  lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
  cb(ffn_inp, "ffn_inp", il);

  // feed-forward network (non-MoE)
  if (model.layers[il].ffn_gate_inp == nullptr) {
-
  cur = build_norm(ffn_inp,
  model.layers[il].ffn_norm, NULL,
  LLM_NORM_RMS, il);
@@ -4591,9 +4793,7 @@ struct llm_build_llama : public llm_graph_context {
  NULL,
  LLM_FFN_SILU, LLM_FFN_PAR, il);
  cb(cur, "ffn_out", il);
-
- } else if (arch == LLM_ARCH_LLAMA4) {
- // llama4 MoE
+ } else {
  lm_ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
  model.layers[il].ffn_norm, NULL,
  LLM_NORM_RMS, il);
@@ -4622,31 +4822,6 @@ struct llm_build_llama : public llm_graph_context {

  cur = lm_ggml_add(ctx0, moe_out, shexp_out);
  cb(cur, "ffn_moe_out_merged", il);
-
- } else {
- // MoE branch
- cur = build_norm(ffn_inp,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "ffn_norm", il);
-
- cur = build_moe_ffn(cur,
- model.layers[il].ffn_gate_inp,
- model.layers[il].ffn_up_exps,
- model.layers[il].ffn_gate_exps,
- model.layers[il].ffn_down_exps,
- nullptr,
- n_expert, n_expert_used,
- LLM_FFN_SILU, true,
- false, 0.0,
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
- il);
- cb(cur, "ffn_moe_out", il);
- }
-
- // For Granite architecture
- if (hparams.f_residual_scale) {
- cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
  }

  cur = lm_ggml_add(ctx0, cur, ffn_inp);
@@ -4671,11 +4846,6 @@ struct llm_build_llama : public llm_graph_context {
  // lm_head
  cur = build_lora_mm(model.output, cur);

- // For Granite architecture
- if (hparams.f_logit_scale) {
- cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
- }
-
  cb(cur, "result_output", -1);
  res->t_logits = cur;

@@ -4705,6 +4875,7 @@ struct llm_build_deci : public llm_graph_context {
  lm_ggml_tensor * inpSA = inpL;
  const int64_t n_head_kv = hparams.n_head_kv(il);
  const int64_t n_head = hparams.n_head(il);
+ const int64_t n_ff = hparams.n_ff(il);

  if (n_head == 0) {
  // attention-free layer of Llama-3_1-Nemotron-51B
@@ -4724,7 +4895,7 @@ struct llm_build_deci : public llm_graph_context {
  } else if (n_head > 0) {
  // self-attention
  // rope freq factors for llama3; may return nullptr for llama2 and other models
- lm_ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

  // compute Q and K and RoPE them
  lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4780,9 +4951,9 @@ struct llm_build_deci : public llm_graph_context {
  inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
  }

- // For Granite architecture
- if (hparams.f_residual_scale) {
- cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
+ // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
+ if (n_ff == 0) {
+ continue;
  }

  // modified to support attention-free layer of Llama-3_1-Nemotron-51B
@@ -4808,11 +4979,6 @@ struct llm_build_deci : public llm_graph_context {
  cb(cur, "ffn_out", il);
  }

- // For Granite architecture
- if (hparams.f_residual_scale) {
- cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
- }
-
  cur = lm_ggml_add(ctx0, cur, ffn_inp);
  cb(cur, "ffn_out", il);

@@ -4835,11 +5001,6 @@ struct llm_build_deci : public llm_graph_context {
  // lm_head
  cur = build_lora_mm(model.output, cur);

- // For Granite architecture
- if (hparams.f_logit_scale) {
- cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
- }
-
  cb(cur, "result_output", -1);
  res->t_logits = cur;

@@ -7183,6 +7344,7 @@ struct llm_build_phi2 : public llm_graph_context {
  }
  };

+ template<bool iswa>
  struct llm_build_phi3 : public llm_graph_context {
  llm_build_phi3(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -7198,7 +7360,14 @@ struct llm_build_phi3 : public llm_graph_context {
  // inp_pos - contains the positions
  lm_ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+ inp_attn_type * inp_attn = nullptr;
+
+ if constexpr (iswa) {
+ inp_attn = build_attn_inp_kv_unified_iswa();
+ } else {
+ inp_attn = build_attn_inp_kv_unified();
+ }

  for (int il = 0; il < n_layer; ++il) {
  auto * residual = inpL;
@@ -7206,7 +7375,7 @@ struct llm_build_phi3 : public llm_graph_context {
  // self-attention
  {
  // rope freq factors for 128k context
- lm_ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

  lm_ggml_tensor* attn_norm_output = build_norm(inpL,
  model.layers[il].attn_norm,
@@ -7958,7 +8127,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
  for (int il = 0; il < n_layer; ++il) {
  lm_ggml_tensor * inpSA = inpL;

- lm_ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

  // norm
  cur = build_norm(inpL,
@@ -8258,8 +8427,8 @@ struct llm_build_gemma : public llm_graph_context {
  }
  };

- struct llm_build_gemma2 : public llm_graph_context {
- llm_build_gemma2(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
+ struct llm_build_gemma2_iswa : public llm_graph_context {
+ llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_k;

  lm_ggml_tensor * cur;
@@ -8273,7 +8442,7 @@ struct llm_build_gemma2 : public llm_graph_context {
  // inp_pos - contains the positions
  lm_ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();

  for (int il = 0; il < n_layer; ++il) {
  // norm
@@ -8395,8 +8564,8 @@ struct llm_build_gemma2 : public llm_graph_context {
  }
  };

- struct llm_build_gemma3 : public llm_graph_context {
- llm_build_gemma3(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
+ struct llm_build_gemma3_iswa : public llm_graph_context {
+ llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_k;

  lm_ggml_tensor * cur;
@@ -8414,13 +8583,11 @@ struct llm_build_gemma3 : public llm_graph_context {
  lm_ggml_tensor * inp_pos = build_inp_pos();

  // TODO: is causal == true correct? might need some changes
- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();

  for (int il = 0; il < n_layer; ++il) {
- const bool is_swa = hparams.is_swa(il);
-
- const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
- const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

  // norm
  cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
@@ -8997,8 +9164,8 @@ struct llm_build_command_r : public llm_graph_context {
  }
  };

- struct llm_build_cohere2 : public llm_graph_context {
- llm_build_cohere2(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
+ struct llm_build_cohere2_iswa : public llm_graph_context {
+ llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, lm_ggml_cgraph * gf) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;

  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9013,7 +9180,7 @@ struct llm_build_cohere2 : public llm_graph_context {
  // inp_pos - contains the positions
  lm_ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();

  for (int il = 0; il < n_layer; ++il) {
  const bool is_swa = hparams.is_swa(il);
@@ -9026,7 +9193,7 @@ struct llm_build_cohere2 : public llm_graph_context {
  // self-attention
  {
  // rope freq factors for 128k context
- lm_ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

  // compute Q and K and RoPE them
  lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9964,7 +10131,7 @@ struct llm_build_deepseek : public llm_graph_context {
  // self-attention
  {
  // rope freq factors for llama3; may return nullptr for llama2 and other models
- lm_ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

  // compute Q and K and RoPE them
  lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11328,7 +11495,7 @@ struct llm_build_exaone : public llm_graph_context {
  // self-attention
  {
  // rope freq factors for llama3; may return nullptr for llama2 and other models
- lm_ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

  // compute Q and K and RoPE them
  lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12178,6 +12345,194 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
  }
  };

+
+ struct llm_build_granite : public llm_graph_context {
+ llm_build_granite(
+ const llama_model & model,
+ const llm_graph_params & params,
+ lm_ggml_cgraph * gf,
+ const bool use_rope = true)
+ : llm_graph_context(params) {
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ lm_ggml_tensor * cur;
+ lm_ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - built only if rope enabled
+ lm_ggml_tensor * inp_pos = nullptr;
+ if (use_rope) {
+ inp_pos = build_inp_pos();
+ }
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+ for (int il = 0; il < n_layer; ++il) {
+ lm_ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // compute Q and K and (optionally) RoPE them
+ lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ lm_ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ lm_ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = lm_ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ if (use_rope) {
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+ Qcur = lm_ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = lm_ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ }
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // For Granite architectures - scale residual
+ cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
+ lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network (non-MoE)
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+
+ } else {
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ lm_ggml_tensor * moe_out = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // For Granite MoE Shared
+ if (hparams.n_ff_shexp > 0) {
+ lm_ggml_tensor * ffn_shexp = build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = lm_ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ } else {
+ cur = moe_out;
+ }
+ }
+
+ // For Granite architectures - scale residual
+ cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ // For Granite architectures - scale logits
+ cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ lm_ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  // ref: https://github.com/facebookresearch/chameleon
  // based on the original build_llama() function, changes:
  // * qk-norm
@@ -12709,7 +13064,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
  // self-attention
  {
  // rope freq factors for llama3; may return nullptr for llama2 and other models
- lm_ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+ lm_ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

  // compute Q and K and RoPE them
  lm_ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12833,6 +13188,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  llama_memory_i * res;

  switch (arch) {
+ case LLM_ARCH_BERT:
+ case LLM_ARCH_JINA_BERT_V2:
+ case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
+ case LLM_ARCH_WAVTOKENIZER_DEC:
+ {
+ res = nullptr;
+ } break;
  case LLM_ARCH_MAMBA:
  case LLM_ARCH_RWKV6:
  case LLM_ARCH_RWKV6QWEN2:
@@ -12844,7 +13207,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  LM_GGML_TYPE_F32,
  LM_GGML_TYPE_F32,
  cparams.offload_kqv,
- std::max((uint32_t) 1, cparams.n_seq_max));
+ std::max((uint32_t) 1, cparams.n_seq_max),
+ cparams.n_seq_max);
  } break;
  default:
  {
@@ -12854,14 +13218,36 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,

  LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);

- res = new llama_kv_cache_unified(
- *this,
- params.type_k,
- params.type_v,
- !cparams.flash_attn,
- cparams.offload_kqv,
- cparams.n_ctx,
- padding);
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ LM_GGML_ASSERT(hparams.is_swa_any());
+
+ res = new llama_kv_cache_unified_iswa(
+ *this,
+ params.type_k,
+ params.type_v,
+ !cparams.flash_attn,
+ cparams.offload_kqv,
+ params.swa_full,
+ cparams.n_ctx,
+ cparams.n_seq_max,
+ cparams.n_batch,
+ padding);
+ } else {
+ LM_GGML_ASSERT(!hparams.is_swa_any());
+
+ res = new llama_kv_cache_unified(
+ *this,
+ nullptr,
+ params.type_k,
+ params.type_v,
+ !cparams.flash_attn,
+ cparams.offload_kqv,
+ cparams.n_ctx,
+ cparams.n_seq_max,
+ padding,
+ hparams.n_swa,
+ hparams.swa_type);
+ }
  }
  }

@@ -12876,13 +13262,14 @@ llm_graph_result_ptr llama_model::build_graph(

  switch (arch) {
  case LLM_ARCH_LLAMA:
- case LLM_ARCH_LLAMA4:
  case LLM_ARCH_MINICPM:
- case LLM_ARCH_GRANITE:
- case LLM_ARCH_GRANITE_MOE:
  {
  llm = std::make_unique<llm_build_llama>(*this, params, gf);
  } break;
+ case LLM_ARCH_LLAMA4:
+ {
+ llm = std::make_unique<llm_build_llama_iswa>(*this, params, gf);
+ } break;
  case LLM_ARCH_DECI:
  {
  llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -12957,7 +13344,11 @@ llm_graph_result_ptr llama_model::build_graph(
  case LLM_ARCH_PHI3:
  case LLM_ARCH_PHIMOE:
  {
- llm = std::make_unique<llm_build_phi3>(*this, params, gf);
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ llm = std::make_unique<llm_build_phi3<true>> (*this, params, gf);
+ } else {
+ llm = std::make_unique<llm_build_phi3<false>>(*this, params, gf);
+ }
  } break;
  case LLM_ARCH_PLAMO:
  {
@@ -12989,11 +13380,11 @@ llm_graph_result_ptr llama_model::build_graph(
  } break;
  case LLM_ARCH_GEMMA2:
  {
- llm = std::make_unique<llm_build_gemma2>(*this, params, gf);
+ llm = std::make_unique<llm_build_gemma2_iswa>(*this, params, gf);
  } break;
  case LLM_ARCH_GEMMA3:
  {
- llm = std::make_unique<llm_build_gemma3>(*this, params, gf);
+ llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
  } break;
  case LLM_ARCH_STARCODER2:
  {
@@ -13013,7 +13404,7 @@ llm_graph_result_ptr llama_model::build_graph(
  } break;
  case LLM_ARCH_COHERE2:
  {
- llm = std::make_unique<llm_build_cohere2>(*this, params, gf);
+ llm = std::make_unique<llm_build_cohere2_iswa>(*this, params, gf);
  } break;
  case LLM_ARCH_DBRX:
  {
@@ -13110,6 +13501,11 @@ llm_graph_result_ptr llama_model::build_graph(
  {
  llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
  } break;
+ case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
+ {
+ llm = std::make_unique<llm_build_granite>(*this, params, gf);
+ } break;
  case LLM_ARCH_CHAMELEON:
  {
  llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
@@ -13361,6 +13757,14 @@ const char * llama_model_chat_template(const llama_model * model, const char * name) {
  : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
  const auto & it = model->lm_gguf_kv.find(key);
  if (it == model->lm_gguf_kv.end()) {
+ // one-off fix for very popular models (so we are not flooded with issues)
+ // do not extend this list unless absolutely necessary
+ // Mistral-Small-2503 does not have built-in chat template
+ llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
+ if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
+ return "mistral-v7-tekken";
+ }
+
  return nullptr;
  }