@novastera-oss/llamarn 0.2.1 → 0.2.2

This diff shows the published contents of the two package versions as they appear in their public registry, and is provided for informational purposes only.
Files changed (266)
  1. package/README.md +80 -14
  2. package/RNLlamaCpp.podspec +10 -3
  3. package/android/CMakeLists.txt +8 -0
  4. package/android/src/main/cpp/include/llama.h +62 -125
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  13. package/cpp/build-info.cpp +2 -2
  14. package/cpp/llama.cpp/README.md +11 -3
  15. package/cpp/llama.cpp/build-xcframework.sh +1 -0
  16. package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
  17. package/cpp/llama.cpp/common/arg.cpp +153 -113
  18. package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
  19. package/cpp/llama.cpp/common/chat-parser.h +117 -0
  20. package/cpp/llama.cpp/common/chat.cpp +847 -699
  21. package/cpp/llama.cpp/common/chat.h +73 -6
  22. package/cpp/llama.cpp/common/common.cpp +50 -82
  23. package/cpp/llama.cpp/common/common.h +21 -17
  24. package/cpp/llama.cpp/common/json-partial.cpp +255 -0
  25. package/cpp/llama.cpp/common/json-partial.h +37 -0
  26. package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
  27. package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
  28. package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
  29. package/cpp/llama.cpp/common/regex-partial.h +56 -0
  30. package/cpp/llama.cpp/common/sampling.cpp +7 -8
  31. package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
  32. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
  33. package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
  34. package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
  35. package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
  36. package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
  37. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
  38. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
  41. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
  74. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
  112. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  117. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
  118. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
  119. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
  120. package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
  121. package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
  122. package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
  123. package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
  124. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
  125. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
  126. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
  127. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
  128. package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
  129. package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
  130. package/cpp/llama.cpp/include/llama.h +62 -125
  131. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
  132. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
  133. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
  134. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
  135. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
  136. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
  137. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
  138. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
  139. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
  140. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
  141. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
  142. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
  143. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
  144. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
  145. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
  146. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
  147. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
  148. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
  149. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  150. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
  151. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
  152. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
  153. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
  154. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
  155. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
  156. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
  157. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
  158. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
  159. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
  160. package/cpp/llama.cpp/models/templates/README.md +2 -0
  161. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  162. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  163. package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  164. package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
  165. package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
  166. package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
  167. package/cpp/llama.cpp/src/llama-arch.h +2 -0
  168. package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
  169. package/cpp/llama.cpp/src/llama-context.cpp +340 -123
  170. package/cpp/llama.cpp/src/llama-context.h +30 -0
  171. package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
  172. package/cpp/llama.cpp/src/llama-cparams.h +2 -0
  173. package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
  174. package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
  175. package/cpp/llama.cpp/src/llama-graph.h +52 -7
  176. package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
  177. package/cpp/llama.cpp/src/llama-hparams.h +37 -5
  178. package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
  179. package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
  180. package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
  181. package/cpp/llama.cpp/src/llama-memory.h +4 -3
  182. package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
  183. package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
  184. package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
  185. package/cpp/llama.cpp/src/llama-model.cpp +529 -172
  186. package/cpp/llama.cpp/src/llama-model.h +6 -1
  187. package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
  188. package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
  189. package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
  190. package/cpp/llama.cpp/src/llama-vocab.h +6 -0
  191. package/cpp/llama.cpp/src/llama.cpp +14 -0
  192. package/cpp/rn-completion.cpp +4 -2
  193. package/ios/include/chat.h +73 -6
  194. package/ios/include/common/minja/chat-template.hpp +9 -5
  195. package/ios/include/common/minja/minja.hpp +69 -36
  196. package/ios/include/common.h +21 -17
  197. package/ios/include/llama.h +62 -125
  198. package/ios/libs/llama.xcframework/Info.plist +19 -19
  199. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  200. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
  201. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  202. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
  203. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
  204. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  205. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  206. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  207. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
  208. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  209. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  210. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  211. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
  219. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
  220. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
  221. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  222. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
  223. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
  224. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
  225. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  226. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  227. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  228. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
  229. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
  231. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
  232. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  233. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  234. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
  235. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
  236. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  237. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  238. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  239. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  240. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  241. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
  242. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  243. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
  244. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
  245. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  246. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  247. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
  248. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
  249. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  250. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  251. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  252. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  253. package/package.json +1 -1
  254. package/cpp/llama.cpp/common/stb_image.h +0 -7988
  255. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  256. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  257. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  258. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  259. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  260. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  261. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  262. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  263. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  264. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  265. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  266. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
@@ -117,6 +117,10 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
117
117
  { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
118
118
  };
119
119
 
120
+ std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
121
+ return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
122
+ }
123
+
120
124
  static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
121
125
  for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
122
126
  if (kv.second == name) {
@@ -459,11 +463,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
459
463
  GGML_ASSERT(hparams.n_expert_used == 0);
460
464
  }
461
465
 
462
- // zero-out the array hparams
463
466
  std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
464
467
  std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
465
468
  std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
466
469
 
470
+ std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
471
+
472
+ std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
473
+
467
474
  ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
468
475
  ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
469
476
 
@@ -567,9 +574,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
567
574
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
568
575
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
569
576
  ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
570
- hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
571
- hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
572
- hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
577
+
578
+ hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
579
+ hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
580
+ hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
573
581
 
574
582
  switch (hparams.n_expert) {
575
583
  case 16: type = LLM_TYPE_17B_16E; break;
@@ -675,6 +683,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
675
683
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
676
684
  ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
677
685
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
686
+ ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false);
678
687
 
679
688
  switch (hparams.n_layer) {
680
689
  case 3:
@@ -848,22 +857,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
848
857
  default: type = LLM_TYPE_UNKNOWN;
849
858
  }
850
859
 
851
- // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
852
- if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
853
- // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
854
- hparams.n_swa = 2047;
855
- } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
856
- // default value for Phi-3-mini-128k-instruct
857
- // note: this seems incorrect because the window is bigger than the train context?
858
- hparams.n_swa = 262144;
859
- } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
860
- // default value for Phi-3-medium-128k-instruct
861
- // note: this seems incorrect because the window is equal to the train context?
862
- hparams.n_swa = 131072;
863
- }
864
- bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
865
- if (!found_swa && hparams.n_swa == 0) {
866
- throw std::runtime_error("invalid value for sliding_window");
860
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
861
+
862
+ if (found_swa && hparams.n_swa > 0) {
863
+ LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
864
+ __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
865
+
866
+ // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
867
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
868
+
869
+ hparams.n_swa = 0;
870
+ hparams.set_swa_pattern(1);
867
871
  }
868
872
  } break;
869
873
  case LLM_ARCH_PHIMOE:
@@ -933,8 +937,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
933
937
  } break;
934
938
  case LLM_ARCH_GEMMA2:
935
939
  {
940
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
936
941
  hparams.n_swa = 4096; // default value of gemma 2
937
- hparams.n_swa_pattern = 2;
942
+ hparams.set_swa_pattern(2);
938
943
  hparams.attn_soft_cap = true;
939
944
 
940
945
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -951,7 +956,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
951
956
  } break;
952
957
  case LLM_ARCH_GEMMA3:
953
958
  {
954
- hparams.n_swa_pattern = 6;
959
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
960
+ hparams.set_swa_pattern(6);
955
961
 
956
962
  hparams.rope_freq_base_train_swa = 10000.0f;
957
963
  hparams.rope_freq_scale_train_swa = 1.0f;
@@ -1035,7 +1041,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1035
1041
  } break;
1036
1042
  case LLM_ARCH_COHERE2:
1037
1043
  {
1038
- hparams.n_swa_pattern = 4;
1044
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
1045
+ hparams.set_swa_pattern(4);
1039
1046
 
1040
1047
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
1041
1048
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -1385,6 +1392,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1385
1392
  // Add additional layer/vocab/etc checks here for other model sizes
1386
1393
  default: type = LLM_TYPE_UNKNOWN;
1387
1394
  }
1395
+
1396
+ // For Granite MoE Shared
1397
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
1388
1398
  } break;
1389
1399
  case LLM_ARCH_CHAMELEON:
1390
1400
  {
@@ -1768,6 +1778,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
1768
1778
  layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
1769
1779
  layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
1770
1780
  layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
1781
+
1782
+ // For Granite MoE Shared
1783
+ if (hparams.n_ff_shexp > 0) {
1784
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
1785
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
1786
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
1787
+ }
1771
1788
  }
1772
1789
  }
1773
1790
  } break;
@@ -2097,7 +2114,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2097
2114
  case LLM_ARCH_NOMIC_BERT_MOE:
2098
2115
  {
2099
2116
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
2100
- type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
2117
+ type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
2101
2118
 
2102
2119
  if (arch == LLM_ARCH_BERT) {
2103
2120
  pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
@@ -2105,8 +2122,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2105
2122
  cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
2106
2123
  cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
2107
2124
 
2108
- cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
2109
- cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, TENSOR_NOT_REQUIRED);
2125
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
2126
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
2110
2127
  }
2111
2128
 
2112
2129
  tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
@@ -2115,7 +2132,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2115
2132
  for (int i = 0; i < n_layer; ++i) {
2116
2133
  auto & layer = layers[i];
2117
2134
 
2118
- if (arch == LLM_ARCH_BERT) {
2135
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
2136
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
2137
+
2138
+ if (!layer.wqkv) {
2119
2139
  layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
2120
2140
  layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
2121
2141
 
@@ -2124,12 +2144,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2124
2144
 
2125
2145
  layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
2126
2146
  layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
2127
- } else {
2128
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
2129
- }
2130
-
2131
- if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
2132
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
2133
2147
  }
2134
2148
 
2135
2149
  layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
@@ -2473,7 +2487,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2473
2487
 
2474
2488
  // output
2475
2489
  output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
2476
- output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
2490
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
2491
+ // if output is NULL, init from the input tok embed
2492
+ if (output == NULL) {
2493
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
2494
+ }
2477
2495
 
2478
2496
  for (int i = 0; i < n_layer; ++i) {
2479
2497
  auto & layer = layers[i];
@@ -4264,7 +4282,7 @@ uint64_t llama_model::n_elements() const {
4264
4282
  }
4265
4283
 
4266
4284
  void llama_model::print_info() const {
4267
- const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
4285
+ const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
4268
4286
 
4269
4287
  auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
4270
4288
  bool is_var = false;
@@ -4307,7 +4325,7 @@ void llama_model::print_info() const {
4307
4325
  LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
4308
4326
  LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
4309
4327
  LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
4310
- LLAMA_LOG_INFO("%s: n_swa_pattern = %u\n", __func__, hparams.n_swa_pattern);
4328
+ LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
4311
4329
  LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
4312
4330
  LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
4313
4331
  LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
@@ -4325,7 +4343,7 @@ void llama_model::print_info() const {
4325
4343
  LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
4326
4344
  LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
4327
4345
  LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
4328
- LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
4346
+ LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
4329
4347
  LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
4330
4348
  LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
4331
4349
  LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
@@ -4381,10 +4399,13 @@ void llama_model::print_info() const {
4381
4399
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
4382
4400
  }
4383
4401
 
4384
- if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
4402
+ if (arch == LLM_ARCH_MINICPM ||
4403
+ arch == LLM_ARCH_GRANITE ||
4404
+ arch == LLM_ARCH_GRANITE_MOE) {
4385
4405
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
4386
4406
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
4387
4407
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
4408
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
4388
4409
  }
4389
4410
 
4390
4411
  if (arch == LLM_ARCH_BAILINGMOE) {
@@ -4472,7 +4493,17 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
4472
4493
  return it->second;
4473
4494
  }
4474
4495
 
4475
- ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
4496
+ float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const {
4497
+ return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
4498
+ }
4499
+
4500
+ float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) const {
4501
+ return hparams.is_swa(il) ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
4502
+ }
4503
+
4504
+ ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
4505
+ const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
4506
+
4476
4507
  // choose long/short freq factors based on the context size
4477
4508
  if (layers[il].rope_freqs != nullptr) {
4478
4509
  return layers[il].rope_freqs;
@@ -4500,21 +4531,174 @@ struct llm_build_llama : public llm_graph_context {
4500
4531
  // inp_pos - contains the positions
4501
4532
  ggml_tensor * inp_pos = build_inp_pos();
4502
4533
 
4534
+ auto * inp_attn = build_attn_inp_kv_unified();
4535
+
4536
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
4537
+
4538
+ for (int il = 0; il < n_layer; ++il) {
4539
+ ggml_tensor * inpSA = inpL;
4540
+
4541
+ // norm
4542
+ cur = build_norm(inpL,
4543
+ model.layers[il].attn_norm, NULL,
4544
+ LLM_NORM_RMS, il);
4545
+ cb(cur, "attn_norm", il);
4546
+
4547
+ // self-attention
4548
+ {
4549
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
4550
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
4551
+
4552
+ // compute Q and K and RoPE them
4553
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
4554
+ cb(Qcur, "Qcur", il);
4555
+ if (model.layers[il].bq) {
4556
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
4557
+ cb(Qcur, "Qcur", il);
4558
+ }
4559
+
4560
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
4561
+ cb(Kcur, "Kcur", il);
4562
+ if (model.layers[il].bk) {
4563
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
4564
+ cb(Kcur, "Kcur", il);
4565
+ }
4566
+
4567
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
4568
+ cb(Vcur, "Vcur", il);
4569
+ if (model.layers[il].bv) {
4570
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
4571
+ cb(Vcur, "Vcur", il);
4572
+ }
4573
+
4574
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
4575
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
4576
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
4577
+
4578
+ Qcur = ggml_rope_ext(
4579
+ ctx0, Qcur, inp_pos, rope_factors,
4580
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4581
+ ext_factor, attn_factor, beta_fast, beta_slow
4582
+ );
4583
+
4584
+ Kcur = ggml_rope_ext(
4585
+ ctx0, Kcur, inp_pos, rope_factors,
4586
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
4587
+ ext_factor, attn_factor, beta_fast, beta_slow
4588
+ );
4589
+
4590
+ cb(Qcur, "Qcur", il);
4591
+ cb(Kcur, "Kcur", il);
4592
+ cb(Vcur, "Vcur", il);
4593
+
4594
+ cur = build_attn(inp_attn, gf,
4595
+ model.layers[il].wo, model.layers[il].bo,
4596
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
4597
+ cb(cur, "attn_out", il);
4598
+ }
4599
+
4600
+ if (il == n_layer - 1) {
4601
+ // skip computing output for unused tokens
4602
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
4603
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
4604
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
4605
+ }
4606
+
4607
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
4608
+ cb(ffn_inp, "ffn_inp", il);
4609
+
4610
+ // feed-forward network (non-MoE)
4611
+ if (model.layers[il].ffn_gate_inp == nullptr) {
4612
+
4613
+ cur = build_norm(ffn_inp,
4614
+ model.layers[il].ffn_norm, NULL,
4615
+ LLM_NORM_RMS, il);
4616
+ cb(cur, "ffn_norm", il);
4617
+
4618
+ cur = build_ffn(cur,
4619
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
4620
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
4621
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
4622
+ NULL,
4623
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
4624
+ cb(cur, "ffn_out", il);
4625
+ } else {
4626
+ // MoE branch
4627
+ cur = build_norm(ffn_inp,
4628
+ model.layers[il].ffn_norm, NULL,
4629
+ LLM_NORM_RMS, il);
4630
+ cb(cur, "ffn_norm", il);
4631
+
4632
+ cur = build_moe_ffn(cur,
4633
+ model.layers[il].ffn_gate_inp,
4634
+ model.layers[il].ffn_up_exps,
4635
+ model.layers[il].ffn_gate_exps,
4636
+ model.layers[il].ffn_down_exps,
4637
+ nullptr,
4638
+ n_expert, n_expert_used,
4639
+ LLM_FFN_SILU, true,
4640
+ false, 0.0,
4641
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
4642
+ il);
4643
+ cb(cur, "ffn_moe_out", il);
4644
+ }
4645
+
4646
+ cur = ggml_add(ctx0, cur, ffn_inp);
4647
+ cb(cur, "ffn_out", il);
4648
+
4649
+ cur = build_cvec(cur, il);
4650
+ cb(cur, "l_out", il);
4651
+
4652
+ // input for next layer
4653
+ inpL = cur;
4654
+ }
4655
+
4656
+ cur = inpL;
4657
+
4658
+ cur = build_norm(cur,
4659
+ model.output_norm, NULL,
4660
+ LLM_NORM_RMS, -1);
4661
+
4662
+ cb(cur, "result_norm", -1);
4663
+ res->t_embd = cur;
4664
+
4665
+ // lm_head
4666
+ cur = build_lora_mm(model.output, cur);
4667
+
4668
+ cb(cur, "result_output", -1);
4669
+ res->t_logits = cur;
4670
+
4671
+ ggml_build_forward_expand(gf, cur);
4672
+ }
4673
+ };
4674
+
4675
+ struct llm_build_llama_iswa : public llm_graph_context {
4676
+ llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
4677
+ const int64_t n_embd_head = hparams.n_embd_head_v;
4678
+
4679
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
4680
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
4681
+
4682
+ ggml_tensor * cur;
4683
+ ggml_tensor * inpL;
4684
+
4685
+ inpL = build_inp_embd(model.tok_embd);
4686
+
4687
+ // inp_pos - contains the positions
4688
+ ggml_tensor * inp_pos = build_inp_pos();
4689
+
4503
4690
  // temperature tuning
4504
4691
  ggml_tensor * inp_attn_scale = nullptr;
4505
- if (arch == LLM_ARCH_LLAMA4) {
4506
- inp_attn_scale = build_inp_attn_scale();
4507
- }
4692
+ inp_attn_scale = build_inp_attn_scale();
4508
4693
 
4509
- auto * inp_attn = build_attn_inp_kv_unified();
4694
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
4510
4695
 
4511
4696
  const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
4697
+
4512
4698
  for (int il = 0; il < n_layer; ++il) {
4513
4699
  ggml_tensor * inpSA = inpL;
4514
4700
 
4515
- bool use_rope = arch == LLM_ARCH_LLAMA4
4516
- ? (il + 1) % hparams.n_no_rope_layer_step != 0
4517
- : true;
4701
+ const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
4518
4702
 
4519
4703
  // norm
4520
4704
  cur = build_norm(inpL,
@@ -4525,7 +4709,7 @@ struct llm_build_llama : public llm_graph_context {
4525
4709
  // self-attention
4526
4710
  {
4527
4711
  // rope freq factors for llama3; may return nullptr for llama2 and other models
4528
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
4712
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
4529
4713
 
4530
4714
  // compute Q and K and RoPE them
4531
4715
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4573,7 +4757,7 @@ struct llm_build_llama : public llm_graph_context {
4573
4757
  cb(Kcur, "Kcur", il);
4574
4758
  cb(Vcur, "Vcur", il);
4575
4759
 
4576
- if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
4760
+ if (use_rope && hparams.use_kq_norm) {
4577
4761
  // Llama4TextL2Norm
4578
4762
  Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
4579
4763
  Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
@@ -4594,17 +4778,11 @@ struct llm_build_llama : public llm_graph_context {
4594
4778
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
4595
4779
  }
4596
4780
 
4597
- // For Granite architecture
4598
- if (hparams.f_residual_scale) {
4599
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
4600
- }
4601
-
4602
4781
  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
4603
4782
  cb(ffn_inp, "ffn_inp", il);
4604
4783
 
4605
4784
  // feed-forward network (non-MoE)
4606
4785
  if (model.layers[il].ffn_gate_inp == nullptr) {
4607
-
4608
4786
  cur = build_norm(ffn_inp,
4609
4787
  model.layers[il].ffn_norm, NULL,
4610
4788
  LLM_NORM_RMS, il);
@@ -4617,9 +4795,7 @@ struct llm_build_llama : public llm_graph_context {
4617
4795
  NULL,
4618
4796
  LLM_FFN_SILU, LLM_FFN_PAR, il);
4619
4797
  cb(cur, "ffn_out", il);
4620
-
4621
- } else if (arch == LLM_ARCH_LLAMA4) {
4622
- // llama4 MoE
4798
+ } else {
4623
4799
  ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
4624
4800
  model.layers[il].ffn_norm, NULL,
4625
4801
  LLM_NORM_RMS, il);
@@ -4648,31 +4824,6 @@ struct llm_build_llama : public llm_graph_context {
4648
4824
 
4649
4825
  cur = ggml_add(ctx0, moe_out, shexp_out);
4650
4826
  cb(cur, "ffn_moe_out_merged", il);
4651
-
4652
- } else {
4653
- // MoE branch
4654
- cur = build_norm(ffn_inp,
4655
- model.layers[il].ffn_norm, NULL,
4656
- LLM_NORM_RMS, il);
4657
- cb(cur, "ffn_norm", il);
4658
-
4659
- cur = build_moe_ffn(cur,
4660
- model.layers[il].ffn_gate_inp,
4661
- model.layers[il].ffn_up_exps,
4662
- model.layers[il].ffn_gate_exps,
4663
- model.layers[il].ffn_down_exps,
4664
- nullptr,
4665
- n_expert, n_expert_used,
4666
- LLM_FFN_SILU, true,
4667
- false, 0.0,
4668
- LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
4669
- il);
4670
- cb(cur, "ffn_moe_out", il);
4671
- }
4672
-
4673
- // For Granite architecture
4674
- if (hparams.f_residual_scale) {
4675
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
4676
4827
  }
4677
4828
 
4678
4829
  cur = ggml_add(ctx0, cur, ffn_inp);
@@ -4697,11 +4848,6 @@ struct llm_build_llama : public llm_graph_context {
4697
4848
  // lm_head
4698
4849
  cur = build_lora_mm(model.output, cur);
4699
4850
 
4700
- // For Granite architecture
4701
- if (hparams.f_logit_scale) {
4702
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
4703
- }
4704
-
4705
4851
  cb(cur, "result_output", -1);
4706
4852
  res->t_logits = cur;
4707
4853
 
@@ -4751,7 +4897,7 @@ struct llm_build_deci : public llm_graph_context {
4751
4897
  } else if (n_head > 0) {
4752
4898
  // self-attention
4753
4899
  // rope freq factors for llama3; may return nullptr for llama2 and other models
4754
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
4900
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
4755
4901
 
4756
4902
  // compute Q and K and RoPE them
4757
4903
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4812,11 +4958,6 @@ struct llm_build_deci : public llm_graph_context {
4812
4958
  continue;
4813
4959
  }
4814
4960
 
4815
- // For Granite architecture
4816
- if (hparams.f_residual_scale) {
4817
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
4818
- }
4819
-
4820
4961
  // modified to support attention-free layer of Llama-3_1-Nemotron-51B
4821
4962
  ggml_tensor * ffn_inp = cur;
4822
4963
  if (n_head > 0) {
@@ -4840,11 +4981,6 @@ struct llm_build_deci : public llm_graph_context {
4840
4981
  cb(cur, "ffn_out", il);
4841
4982
  }
4842
4983
 
4843
- // For Granite architecture
4844
- if (hparams.f_residual_scale) {
4845
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
4846
- }
4847
-
4848
4984
  cur = ggml_add(ctx0, cur, ffn_inp);
4849
4985
  cb(cur, "ffn_out", il);
4850
4986
 
@@ -4867,11 +5003,6 @@ struct llm_build_deci : public llm_graph_context {
4867
5003
  // lm_head
4868
5004
  cur = build_lora_mm(model.output, cur);
4869
5005
 
4870
- // For Granite architecture
4871
- if (hparams.f_logit_scale) {
4872
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
4873
- }
4874
-
4875
5006
  cb(cur, "result_output", -1);
4876
5007
  res->t_logits = cur;
4877
5008
 
@@ -5754,8 +5885,10 @@ struct llm_build_bert : public llm_graph_context {
5754
5885
  inpL = build_inp_embd(model.tok_embd);
5755
5886
 
5756
5887
  // token types are hardcoded to zero ("Sentence A")
5757
- ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
5758
- inpL = ggml_add(ctx0, inpL, type_row0);
5888
+ if (model.type_embd) {
5889
+ ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
5890
+ inpL = ggml_add(ctx0, inpL, type_row0);
5891
+ }
5759
5892
  if (model.arch == LLM_ARCH_BERT) {
5760
5893
  inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
5761
5894
  }
@@ -5776,36 +5909,11 @@ struct llm_build_bert : public llm_graph_context {
5776
5909
  ggml_tensor * Vcur;
5777
5910
 
5778
5911
  // self-attention
5779
- if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
5780
- Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
5781
-
5782
- if (model.layers[il].attn_q_norm) {
5783
- Qcur = build_norm(Qcur,
5784
- model.layers[il].attn_q_norm,
5785
- model.layers[il].attn_q_norm_b,
5786
- LLM_NORM, il);
5787
- }
5788
-
5789
- Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
5790
-
5791
- if (model.layers[il].attn_k_norm) {
5792
- Kcur = build_norm(Kcur,
5793
- model.layers[il].attn_k_norm,
5794
- model.layers[il].attn_k_norm_b,
5795
- LLM_NORM, il);
5796
- }
5797
-
5798
- Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
5799
-
5800
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5801
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5802
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
5803
- } else {
5804
- // compute Q and K and RoPE them
5912
+ if (model.layers[il].wqkv) {
5805
5913
  cur = build_lora_mm(model.layers[il].wqkv, cur);
5806
5914
  cb(cur, "wqkv", il);
5807
5915
 
5808
- if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
5916
+ if (model.layers[il].bqkv) {
5809
5917
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5810
5918
  cb(cur, "bqkv", il);
5811
5919
  }
@@ -5813,11 +5921,32 @@ struct llm_build_bert : public llm_graph_context {
5813
5921
  Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5814
5922
  Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5815
5923
  Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5924
+ } else {
5925
+ Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
5926
+ Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
5927
+ Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
5928
+ }
5816
5929
 
5817
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5818
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5819
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
5930
+ if (model.layers[il].attn_q_norm) {
5931
+ Qcur = build_norm(Qcur,
5932
+ model.layers[il].attn_q_norm,
5933
+ model.layers[il].attn_q_norm_b,
5934
+ LLM_NORM, il);
5935
+ }
5936
+
5937
+ if (model.layers[il].attn_k_norm) {
5938
+ Kcur = build_norm(Kcur,
5939
+ model.layers[il].attn_k_norm,
5940
+ model.layers[il].attn_k_norm_b,
5941
+ LLM_NORM, il);
5942
+ }
5820
5943
 
5944
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5945
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5946
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
5947
+
5948
+ // RoPE
5949
+ if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
5821
5950
  Qcur = ggml_rope_ext(
5822
5951
  ctx0, Qcur, inp_pos, nullptr,
5823
5952
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -7215,6 +7344,7 @@ struct llm_build_phi2 : public llm_graph_context {
7215
7344
  }
7216
7345
  };
7217
7346
 
7347
+ template<bool iswa>
7218
7348
  struct llm_build_phi3 : public llm_graph_context {
7219
7349
  llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
7220
7350
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -7230,7 +7360,14 @@ struct llm_build_phi3 : public llm_graph_context {
7230
7360
  // inp_pos - contains the positions
7231
7361
  ggml_tensor * inp_pos = build_inp_pos();
7232
7362
 
7233
- auto * inp_attn = build_attn_inp_kv_unified();
7363
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
7364
+ inp_attn_type * inp_attn = nullptr;
7365
+
7366
+ if constexpr (iswa) {
7367
+ inp_attn = build_attn_inp_kv_unified_iswa();
7368
+ } else {
7369
+ inp_attn = build_attn_inp_kv_unified();
7370
+ }
7234
7371
 
7235
7372
  for (int il = 0; il < n_layer; ++il) {
7236
7373
  auto * residual = inpL;
@@ -7238,7 +7375,7 @@ struct llm_build_phi3 : public llm_graph_context {
7238
7375
  // self-attention
7239
7376
  {
7240
7377
  // rope freq factors for 128k context
7241
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
7378
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
7242
7379
 
7243
7380
  ggml_tensor* attn_norm_output = build_norm(inpL,
7244
7381
  model.layers[il].attn_norm,
@@ -7990,7 +8127,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
7990
8127
  for (int il = 0; il < n_layer; ++il) {
7991
8128
  ggml_tensor * inpSA = inpL;
7992
8129
 
7993
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
8130
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
7994
8131
 
7995
8132
  // norm
7996
8133
  cur = build_norm(inpL,
@@ -8290,8 +8427,8 @@ struct llm_build_gemma : public llm_graph_context {
8290
8427
  }
8291
8428
  };
8292
8429
 
8293
- struct llm_build_gemma2 : public llm_graph_context {
8294
- llm_build_gemma2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8430
+ struct llm_build_gemma2_iswa : public llm_graph_context {
8431
+ llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8295
8432
  const int64_t n_embd_head = hparams.n_embd_head_k;
8296
8433
 
8297
8434
  ggml_tensor * cur;
@@ -8305,7 +8442,7 @@ struct llm_build_gemma2 : public llm_graph_context {
8305
8442
  // inp_pos - contains the positions
8306
8443
  ggml_tensor * inp_pos = build_inp_pos();
8307
8444
 
8308
- auto * inp_attn = build_attn_inp_kv_unified();
8445
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
8309
8446
 
8310
8447
  for (int il = 0; il < n_layer; ++il) {
8311
8448
  // norm
@@ -8427,8 +8564,8 @@ struct llm_build_gemma2 : public llm_graph_context {
8427
8564
  }
8428
8565
  };
8429
8566
 
8430
- struct llm_build_gemma3 : public llm_graph_context {
8431
- llm_build_gemma3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8567
+ struct llm_build_gemma3_iswa : public llm_graph_context {
8568
+ llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
8432
8569
  const int64_t n_embd_head = hparams.n_embd_head_k;
8433
8570
 
8434
8571
  ggml_tensor * cur;
@@ -8446,13 +8583,11 @@ struct llm_build_gemma3 : public llm_graph_context {
8446
8583
  ggml_tensor * inp_pos = build_inp_pos();
8447
8584
 
8448
8585
  // TODO: is causal == true correct? might need some changes
8449
- auto * inp_attn = build_attn_inp_kv_unified();
8586
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
8450
8587
 
8451
8588
  for (int il = 0; il < n_layer; ++il) {
8452
- const bool is_swa = hparams.is_swa(il);
8453
-
8454
- const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
8455
- const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
8589
+ const float freq_base_l = model.get_rope_freq_base (cparams, il);
8590
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
8456
8591
 
8457
8592
  // norm
8458
8593
  cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
@@ -9029,8 +9164,8 @@ struct llm_build_command_r : public llm_graph_context {
9029
9164
  }
9030
9165
  };
9031
9166
 
9032
- struct llm_build_cohere2 : public llm_graph_context {
9033
- llm_build_cohere2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
9167
+ struct llm_build_cohere2_iswa : public llm_graph_context {
9168
+ llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
9034
9169
  const int64_t n_embd_head = hparams.n_embd_head_v;
9035
9170
 
9036
9171
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9045,7 +9180,7 @@ struct llm_build_cohere2 : public llm_graph_context {
9045
9180
  // inp_pos - contains the positions
9046
9181
  ggml_tensor * inp_pos = build_inp_pos();
9047
9182
 
9048
- auto * inp_attn = build_attn_inp_kv_unified();
9183
+ auto * inp_attn = build_attn_inp_kv_unified_iswa();
9049
9184
 
9050
9185
  for (int il = 0; il < n_layer; ++il) {
9051
9186
  const bool is_swa = hparams.is_swa(il);
@@ -9058,7 +9193,7 @@ struct llm_build_cohere2 : public llm_graph_context {
9058
9193
  // self-attention
9059
9194
  {
9060
9195
  // rope freq factors for 128k context
9061
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
9196
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
9062
9197
 
9063
9198
  // compute Q and K and RoPE them
9064
9199
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9996,7 +10131,7 @@ struct llm_build_deepseek : public llm_graph_context {
9996
10131
  // self-attention
9997
10132
  {
9998
10133
  // rope freq factors for llama3; may return nullptr for llama2 and other models
9999
- ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
10134
+ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
10000
10135
 
10001
10136
  // compute Q and K and RoPE them
10002
10137
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11360,7 +11495,7 @@ struct llm_build_exaone : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12210,6 +12345,194 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
     }
 };

+
+struct llm_build_granite : public llm_graph_context {
+    llm_build_granite(
+        const llama_model & model,
+        const llm_graph_params & params,
+        ggml_cgraph * gf,
+        const bool use_rope = true)
+        : llm_graph_context(params) {
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - built only if rope enabled
+        ggml_tensor * inp_pos = nullptr;
+        if (use_rope) {
+            inp_pos = build_inp_pos();
+        }
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and (optionally) RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                if (use_rope) {
+                    ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+                    Qcur = ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+
+                    Kcur = ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network (non-MoE)
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+            } else {
+                // MoE branch
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                ggml_tensor * moe_out = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // For Granite MoE Shared
+                if (hparams.n_ff_shexp > 0) {
+                    ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp,   NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                } else {
+                    cur = moe_out;
+                }
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        // For Granite architectures - scale logits
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 // ref: https://github.com/facebookresearch/chameleon
 // based on the original build_llama() function, changes:
 // * qk-norm
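The `llm_build_granite` graph added above differs from the plain LLaMA builder mainly in three scaling knobs: `f_attention_scale` (with a `1/sqrt(n_embd_head)` fallback when zero), `f_residual_scale` applied to both residual branches, and `1/f_logit_scale` applied to the final logits. A tiny worked example with hypothetical values (real ones come from the model's GGUF metadata):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // hypothetical hparams; real values are read from the model file
    const float n_embd_head       = 128.0f;
    const float f_attention_scale = 0.0f;   // 0 means: fall back to 1/sqrt(d)
    const float f_residual_scale  = 0.22f;  // multiplies each residual branch
    const float f_logit_scale     = 16.0f;  // logits are divided by this

    const float kq_scale = f_attention_scale == 0.0f
        ? 1.0f / std::sqrt(n_embd_head)
        : f_attention_scale;

    std::printf("kq_scale = %g, residual x%g, logits x%g\n",
                kq_scale, f_residual_scale, 1.0f / f_logit_scale);
    return 0;
}
```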
@@ -12741,7 +13064,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12869,6 +13192,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
+        case LLM_ARCH_WAVTOKENIZER_DEC:
             {
                 res = nullptr;
             } break;
@@ -12883,7 +13207,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         GGML_TYPE_F32,
                         GGML_TYPE_F32,
                         cparams.offload_kqv,
-                        std::max((uint32_t) 1, cparams.n_seq_max));
+                        std::max((uint32_t) 1, cparams.n_seq_max),
+                        cparams.n_seq_max);
             } break;
         default:
             {
@@ -12893,14 +13218,36 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,

                 LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);

-                res = new llama_kv_cache_unified(
-                        *this,
-                        params.type_k,
-                        params.type_v,
-                        !cparams.flash_attn,
-                        cparams.offload_kqv,
-                        cparams.n_ctx,
-                        padding);
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    GGML_ASSERT(hparams.is_swa_any());
+
+                    res = new llama_kv_cache_unified_iswa(
+                            *this,
+                            params.type_k,
+                            params.type_v,
+                            !cparams.flash_attn,
+                            cparams.offload_kqv,
+                            params.swa_full,
+                            cparams.n_ctx,
+                            cparams.n_seq_max,
+                            cparams.n_batch,
+                            padding);
+                } else {
+                    GGML_ASSERT(!hparams.is_swa_any());
+
+                    res = new llama_kv_cache_unified(
+                            *this,
+                            nullptr,
+                            params.type_k,
+                            params.type_v,
+                            !cparams.flash_attn,
+                            cparams.offload_kqv,
+                            cparams.n_ctx,
+                            cparams.n_seq_max,
+                            padding,
+                            hparams.n_swa,
+                            hparams.swa_type);
+                }
             }
     }

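`create_memory` now branches on `hparams.swa_type`: models with any sliding-window layers get the new `llama_kv_cache_unified_iswa`, everything else keeps the unified cache (whose constructor gained `n_seq_max` and SWA parameters). The point of the iSWA variant is memory: SWA layers only ever attend to the last `n_swa` positions, so their half of the cache can stay small. A rough sizing sketch; the capacity formula is an assumption, not the library's exact padding logic:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Rough sketch of why an iSWA cache is cheaper; the exact rounding/padding
// inside llama_kv_cache_unified_iswa may differ.
uint32_t swa_cache_size(uint32_t n_swa, uint32_t n_seq_max, uint32_t n_batch, uint32_t n_ctx) {
    // SWA layers need the window per sequence plus the batch in flight,
    // and never more than the full context.
    return std::min(n_ctx, n_swa * n_seq_max + n_batch);
}

int main() {
    // hypothetical numbers in the spirit of Gemma-style interleaved SWA
    const uint32_t n_ctx = 32768, n_swa = 1024, n_seq_max = 1, n_batch = 512;
    std::printf("full-attn layers cache %u cells, SWA layers only %u\n",
                n_ctx, swa_cache_size(n_swa, n_seq_max, n_batch, n_ctx));
    return 0;
}
```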
@@ -12915,13 +13262,13 @@ llm_graph_result_ptr llama_model::build_graph(

    switch (arch) {
        case LLM_ARCH_LLAMA:
-        case LLM_ARCH_LLAMA4:
-        case LLM_ARCH_MINICPM:
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
            {
                llm = std::make_unique<llm_build_llama>(*this, params, gf);
            } break;
+        case LLM_ARCH_LLAMA4:
+            {
+                llm = std::make_unique<llm_build_llama_iswa>(*this, params, gf);
+            } break;
        case LLM_ARCH_DECI:
            {
                llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -12996,7 +13343,11 @@ llm_graph_result_ptr llama_model::build_graph(
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
            {
-                llm = std::make_unique<llm_build_phi3>(*this, params, gf);
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_phi3<true>> (*this, params, gf);
+                } else {
+                    llm = std::make_unique<llm_build_phi3<false>>(*this, params, gf);
+                }
            } break;
        case LLM_ARCH_PLAMO:
            {
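`llm_build_phi3` is now templated on a bool chosen once from `hparams.swa_type`, so the SWA/non-SWA decision is made at graph-construction time rather than per layer. A minimal, self-contained illustration of the pattern (placeholder type names, not the library's):

```cpp
#include <cstdio>
#include <type_traits>

struct inp_kv_unified      { static constexpr const char * kind = "kv_unified"; };
struct inp_kv_unified_iswa { static constexpr const char * kind = "kv_unified_iswa"; };

// the bool parameter fixes the attention-input type at compile time,
// so the per-layer loop carries no runtime SWA branch
template <bool iswa>
struct build_phi3_like {
    using inp_t = std::conditional_t<iswa, inp_kv_unified_iswa, inp_kv_unified>;
};

int main() {
    const bool has_swa = true; // stands in for hparams.swa_type != LLAMA_SWA_TYPE_NONE
    if (has_swa) {
        std::printf("%s\n", build_phi3_like<true>::inp_t::kind);
    } else {
        std::printf("%s\n", build_phi3_like<false>::inp_t::kind);
    }
    return 0;
}
```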
@@ -13028,11 +13379,11 @@ llm_graph_result_ptr llama_model::build_graph(
            } break;
        case LLM_ARCH_GEMMA2:
            {
-                llm = std::make_unique<llm_build_gemma2>(*this, params, gf);
+                llm = std::make_unique<llm_build_gemma2_iswa>(*this, params, gf);
            } break;
        case LLM_ARCH_GEMMA3:
            {
-                llm = std::make_unique<llm_build_gemma3>(*this, params, gf);
+                llm = std::make_unique<llm_build_gemma3_iswa>(*this, params, gf);
            } break;
        case LLM_ARCH_STARCODER2:
            {
@@ -13052,7 +13403,7 @@ llm_graph_result_ptr llama_model::build_graph(
            } break;
        case LLM_ARCH_COHERE2:
            {
-                llm = std::make_unique<llm_build_cohere2>(*this, params, gf);
+                llm = std::make_unique<llm_build_cohere2_iswa>(*this, params, gf);
            } break;
        case LLM_ARCH_DBRX:
            {
@@ -13149,6 +13500,12 @@ llm_graph_result_ptr llama_model::build_graph(
            {
                llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
            } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_MINICPM:
+            {
+                llm = std::make_unique<llm_build_granite>(*this, params, gf);
+            } break;
        case LLM_ARCH_CHAMELEON:
            {
                llm = std::make_unique<llm_build_chameleon>(*this, params, gf);