@fugood/llama.node 0.3.17 → 0.4.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
Files changed (193)
  1. package/CMakeLists.txt +3 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +39 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +366 -19
  24. package/src/LlamaCompletionWorker.h +30 -10
  25. package/src/LlamaContext.cpp +213 -5
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
  29. package/src/llama.cpp/.github/workflows/build.yml +41 -762
  30. package/src/llama.cpp/.github/workflows/docker.yml +5 -2
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +12 -12
  33. package/src/llama.cpp/CMakeLists.txt +5 -17
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +31 -3
  37. package/src/llama.cpp/common/arg.cpp +48 -29
  38. package/src/llama.cpp/common/chat.cpp +128 -106
  39. package/src/llama.cpp/common/chat.h +2 -0
  40. package/src/llama.cpp/common/common.cpp +37 -1
  41. package/src/llama.cpp/common/common.h +18 -9
  42. package/src/llama.cpp/common/llguidance.cpp +1 -0
  43. package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
  44. package/src/llama.cpp/common/minja/minja.hpp +69 -36
  45. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  46. package/src/llama.cpp/common/regex-partial.h +56 -0
  47. package/src/llama.cpp/common/sampling.cpp +57 -50
  48. package/src/llama.cpp/examples/CMakeLists.txt +2 -23
  49. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
  50. package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
  51. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  52. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  53. package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
  54. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  55. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  56. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  57. package/src/llama.cpp/ggml/include/ggml.h +10 -7
  58. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  60. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  61. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
  62. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  63. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
  64. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
  65. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
  66. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  67. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  68. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  69. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
  71. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  73. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
  74. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
  75. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  76. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  77. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
  78. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
  80. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
  81. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
  82. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  83. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  84. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  85. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  86. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
  87. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  88. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
  89. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  90. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  91. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  92. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
  93. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
  94. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
  95. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
  96. package/src/llama.cpp/ggml/src/ggml.c +29 -20
  97. package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
  98. package/src/llama.cpp/include/llama.h +52 -11
  99. package/src/llama.cpp/requirements/requirements-all.txt +3 -3
  100. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  101. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  102. package/src/llama.cpp/src/llama-adapter.cpp +6 -0
  103. package/src/llama.cpp/src/llama-arch.cpp +3 -0
  104. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  105. package/src/llama.cpp/src/llama-batch.h +2 -1
  106. package/src/llama.cpp/src/llama-chat.cpp +17 -7
  107. package/src/llama.cpp/src/llama-chat.h +1 -0
  108. package/src/llama.cpp/src/llama-context.cpp +389 -501
  109. package/src/llama.cpp/src/llama-context.h +44 -32
  110. package/src/llama.cpp/src/llama-cparams.h +1 -0
  111. package/src/llama.cpp/src/llama-graph.cpp +20 -38
  112. package/src/llama.cpp/src/llama-graph.h +12 -8
  113. package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
  114. package/src/llama.cpp/src/llama-kv-cache.h +271 -85
  115. package/src/llama.cpp/src/llama-memory.h +11 -1
  116. package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
  117. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  118. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  119. package/src/llama.cpp/src/llama-model.cpp +316 -69
  120. package/src/llama.cpp/src/llama-model.h +8 -1
  121. package/src/llama.cpp/src/llama-quant.cpp +15 -13
  122. package/src/llama.cpp/src/llama-sampling.cpp +18 -6
  123. package/src/llama.cpp/src/llama-vocab.cpp +42 -4
  124. package/src/llama.cpp/src/llama-vocab.h +6 -0
  125. package/src/llama.cpp/src/llama.cpp +14 -0
  126. package/src/llama.cpp/tests/CMakeLists.txt +10 -2
  127. package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
  128. package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
  129. package/src/llama.cpp/tests/test-chat.cpp +3 -1
  130. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  131. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  132. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  133. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  134. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  135. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
  136. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  137. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
  138. package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
  139. package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
  140. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
  141. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
  142. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  143. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
  144. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  145. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
  146. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  147. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
  148. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
  149. package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
  150. package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
  151. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  152. package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
  153. package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
  154. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  155. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  156. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  157. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  158. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  159. package/src/llama.cpp/examples/llava/clip.h +0 -135
  160. package/src/llama.cpp/examples/llava/llava.cpp +0 -586
  161. package/src/llama.cpp/examples/llava/llava.h +0 -49
  162. package/src/llama.cpp/examples/llava/mtmd.h +0 -168
  163. package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
  164. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  165. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  166. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  167. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  168. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  169. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  170. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  171. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  172. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  173. /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
  174. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  175. /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
  176. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  177. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  178. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  179. /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
  180. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  181. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  182. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  183. /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
  184. /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
  185. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  186. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  187. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  188. /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
  189. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  190. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  191. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  192. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
  193. /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
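The hunks below are from package/src/llama.cpp/src/llama-model.cpp (entry 119 above, +316 -69):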
@@ -40,6 +40,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_335M: return "335M";
         case LLM_TYPE_410M: return "410M";
         case LLM_TYPE_450M: return "450M";
+        case LLM_TYPE_475M: return "475M";
         case LLM_TYPE_770M: return "770M";
         case LLM_TYPE_780M: return "780M";
         case LLM_TYPE_0_5B: return "0.5B";
@@ -79,6 +80,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_236B: return "236B";
         case LLM_TYPE_290B: return "290B";
         case LLM_TYPE_314B: return "314B";
+        case LLM_TYPE_405B: return "405B";
         case LLM_TYPE_671B: return "671B";
         case LLM_TYPE_SMALL: return "0.1B";
         case LLM_TYPE_MEDIUM: return "0.4B";
@@ -115,6 +117,10 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
     { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
 };
 
+std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
+    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
+}
+
 static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
     for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
         if (kv.second == name) {
@@ -297,6 +303,10 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
     // add extra buffer types, only if no GPU device is present
     // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
+
     auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
     auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
         ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
@@ -581,6 +591,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 switch (hparams.n_layer) {
                     case 32: type = LLM_TYPE_7B; break;
                     case 80: type = LLM_TYPE_70B; break;
+                    case 162: type = LLM_TYPE_405B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -707,7 +718,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
 
                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
-                    type = LLM_TYPE_137M;
+                    if (arch == LLM_ARCH_NOMIC_BERT) {
+                        type = LLM_TYPE_137M;
+                    } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) {
+                        type = LLM_TYPE_475M;
+                    }
                 }
             } break;
         case LLM_ARCH_BLOOM:
@@ -768,6 +783,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             // fall through
         case LLM_ARCH_QWEN2:
             {
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
@@ -1373,6 +1389,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // Add additional layer/vocab/etc checks here for other model sizes
                     default: type = LLM_TYPE_UNKNOWN;
                 }
+
+                // For Granite MoE Shared
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
             } break;
         case LLM_ARCH_CHAMELEON:
             {
@@ -1476,6 +1495,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
     const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
@@ -1643,8 +1665,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
                std::regex pattern(overrides->pattern);
                if (std::regex_search(tensor_name, pattern)) {
-                   LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
                    buft = overrides->buft;
+                   LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
+                           tensor_name.c_str(),
+                           ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
+                           ggml_backend_buft_name(buft));
                    break;
                }
            }
@@ -1661,6 +1686,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        auto * buft_dev = ggml_backend_buft_get_device(buft);
        if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
            auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+           if (!cpu_dev) {
+               throw std::runtime_error("no CPU backend found");
+           }
            buft = ggml_backend_dev_buffer_type(cpu_dev);
        }
 
@@ -1747,6 +1775,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                   // For Granite MoE Shared
+                   if (hparams.n_ff_shexp > 0) {
+                       layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                       layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                       layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+                   }
                }
            }
        } break;
@@ -1842,7 +1877,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                    layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
 
-                   layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                   if (n_ff > 0) {
+                       layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                   }
 
                    if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                        layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
@@ -1852,9 +1889,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                    }
 
-                   layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                   layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                   layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                   if (n_ff > 0) {
+                       layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                       layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                       layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                   }
 
                    // optional MLP bias
                    layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
@@ -3498,7 +3537,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                   output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                   output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                   // if output is NULL, init from the input tok embed
+                   if (output == NULL) {
+                       output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                   }
 
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];
@@ -4103,6 +4146,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        if (!dev) {
            // FIXME: workaround for CPU backend buft having a NULL device
            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+           if (!dev) {
+               throw std::runtime_error(format("%s: no CPU backend found", __func__));
+           }
        }
        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
@@ -4232,7 +4278,7 @@ uint64_t llama_model::n_elements() const {
 }
 
 void llama_model::print_info() const {
-    const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
+    const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
 
     auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
         bool is_var = false;
@@ -4293,7 +4339,7 @@ void llama_model::print_info() const {
     LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
     LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
     LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
-    LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
+    LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
     LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
     LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
@@ -4349,10 +4395,13 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
     }
 
-    if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
+    if (arch == LLM_ARCH_MINICPM ||
+        arch == LLM_ARCH_GRANITE ||
+        arch == LLM_ARCH_GRANITE_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
 
     if (arch == LLM_ARCH_BAILINGMOE) {
@@ -4440,6 +4489,19 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
     return it->second;
 }
 
+ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
+    // choose long/short freq factors based on the context size
+    if (layers[il].rope_freqs != nullptr) {
+        return layers[il].rope_freqs;
+    }
+
+    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+        return layers[il].rope_long;
+    }
+
+    return layers[il].rope_short;
+}
+
 struct llm_build_llama : public llm_graph_context {
     llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -4480,7 +4542,7 @@ struct llm_build_llama : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4549,11 +4611,6 @@
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -4625,11 +4682,6 @@
                 cb(cur, "ffn_moe_out", il);
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
@@ -4652,11 +4704,6 @@
         // lm_head
         cur = build_lora_mm(model.output, cur);
 
-        // For Granite architecture
-        if (hparams.f_logit_scale) {
-            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        }
-
         cb(cur, "result_output", -1);
         res->t_logits = cur;
 
@@ -4686,6 +4733,7 @@ struct llm_build_deci : public llm_graph_context {
             ggml_tensor * inpSA = inpL;
             const int64_t n_head_kv = hparams.n_head_kv(il);
             const int64_t n_head = hparams.n_head(il);
+            const int64_t n_ff = hparams.n_ff(il);
 
             if (n_head == 0) {
                 // attention-free layer of Llama-3_1-Nemotron-51B
@@ -4705,7 +4753,7 @@
             } else if (n_head > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4761,9 +4809,9 @@
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
+            if (n_ff == 0) {
+                continue;
             }
 
             // modified to support attention-free layer of Llama-3_1-Nemotron-51B
@@ -4789,11 +4837,6 @@
                 cb(cur, "ffn_out", il);
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
@@ -4816,11 +4859,6 @@
         // lm_head
         cur = build_lora_mm(model.output, cur);
 
-        // For Granite architecture
-        if (hparams.f_logit_scale) {
-            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        }
-
         cb(cur, "result_output", -1);
         res->t_logits = cur;
 
@@ -7187,7 +7225,7 @@ struct llm_build_phi3 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 ggml_tensor* attn_norm_output = build_norm(inpL,
                     model.layers[il].attn_norm,
@@ -7939,7 +7977,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
-            ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+            ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
             // norm
             cur = build_norm(inpL,
@@ -8706,7 +8744,7 @@ struct llm_build_mamba : public llm_graph_context {
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
             int il) const {
-        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
         const auto kv_head = kv_self->head;
 
@@ -9007,7 +9045,7 @@ struct llm_build_cohere2 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9945,7 +9983,7 @@ struct llm_build_deepseek : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11309,7 +11347,7 @@ struct llm_build_exaone : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11454,7 +11492,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
             int il) const {
-        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
         const auto n_tokens = ubatch.n_tokens;
         const auto n_seqs = ubatch.n_seqs;
@@ -11850,7 +11888,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
             ggml_tensor *& first_layer_value,
             const llama_ubatch & ubatch,
             int il) const {
-        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
         const auto n_tokens = ubatch.n_tokens;
         const auto n_seqs = ubatch.n_seqs;
@@ -12159,6 +12197,194 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
     }
 };
 
+
+struct llm_build_granite : public llm_graph_context {
+    llm_build_granite(
+        const llama_model & model,
+        const llm_graph_params & params,
+        ggml_cgraph * gf,
+        const bool use_rope = true)
+        : llm_graph_context(params) {
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - built only if rope enabled
+        ggml_tensor * inp_pos = nullptr;
+        if (use_rope) {
+            inp_pos = build_inp_pos();
+        }
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and (optionally) RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                if (use_rope) {
+                    ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                    Qcur = ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+
+                    Kcur = ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network (non-MoE)
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+            } else {
+                // MoE branch
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                ggml_tensor * moe_out = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // For Granite MoE Shared
+                if (hparams.n_ff_shexp > 0) {
+                    ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp, NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                } else {
+                    cur = moe_out;
+                }
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        // For Granite architectures - scale logits
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 // ref: https://github.com/facebookresearch/chameleon
 // based on the original build_llama() function, changes:
 // * qk-norm
@@ -12690,7 +12916,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12810,36 +13036,46 @@ struct llm_build_bailingmoe : public llm_graph_context {
     }
 };
 
-llama_memory_i * llama_model::create_memory() const {
+llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;
 
     switch (arch) {
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
+            {
+                res = nullptr;
+            } break;
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
         case LLM_ARCH_RWKV7:
         case LLM_ARCH_ARWKV7:
             {
-                res = new llama_kv_cache_unified(hparams, {
-                    /*.get_rope_factors =*/ nullptr
-                });
+                res = new llama_kv_cache_recurrent(
+                        *this,
+                        GGML_TYPE_F32,
+                        GGML_TYPE_F32,
+                        cparams.offload_kqv,
+                        std::max((uint32_t) 1, cparams.n_seq_max));
             } break;
         default:
             {
-                res = new llama_kv_cache_unified(hparams, {
-                    /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
-                        // choose long/short freq factors based on the context size
-                        if (layers[il].rope_freqs != nullptr) {
-                            return layers[il].rope_freqs;
-                        }
+                const auto padding = llama_kv_cache_unified::get_padding(cparams);
 
-                        if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
-                            return layers[il].rope_long;
-                        }
+                cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
 
-                        return layers[il].rope_short;
-                    }
-                });
+                LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
+
+                res = new llama_kv_cache_unified(
+                        *this,
+                        params.type_k,
+                        params.type_v,
+                        !cparams.flash_attn,
+                        cparams.offload_kqv,
+                        cparams.n_ctx,
+                        padding);
             }
     }
 
@@ -12856,8 +13092,6 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_LLAMA4:
         case LLM_ARCH_MINICPM:
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
             {
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
@@ -13088,6 +13322,11 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
             } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            {
+                llm = std::make_unique<llm_build_granite>(*this, params, gf);
+            } break;
         case LLM_ARCH_CHAMELEON:
             {
                 llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
@@ -13221,8 +13460,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DECI:
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
-        case LLM_ARCH_PLAMO:
-        case LLM_ARCH_ORION:
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_XVERSE:
@@ -13260,6 +13497,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
         case LLM_ARCH_PHIMOE:
+        case LLM_ARCH_PLAMO:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
         case LLM_ARCH_GEMMA3:
@@ -13267,6 +13505,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
         case LLM_ARCH_CODESHELL:
+        case LLM_ARCH_ORION:
         case LLM_ARCH_NEMOTRON:
         case LLM_ARCH_EXAONE:
         case LLM_ARCH_MINICPM3:
@@ -13339,6 +13578,14 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
         : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
     const auto & it = model->gguf_kv.find(key);
     if (it == model->gguf_kv.end()) {
+        // one-off fix for very popular models (so we are not flooded with issues)
+        // do not extend this list unless absolutely necessary
+        // Mistral-Small-2503 does not have built-in chat template
+        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
+        if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
+            return "mistral-v7-tekken";
+        }
+
         return nullptr;
     }