@fugood/llama.node 0.3.3 → 0.3.5

Files changed (225)
  1. package/CMakeLists.txt +5 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +29 -1
  17. package/package.json +1 -1
  18. package/src/EmbeddingWorker.cpp +15 -5
  19. package/src/EmbeddingWorker.h +2 -1
  20. package/src/LlamaCompletionWorker.cpp +17 -1
  21. package/src/LlamaContext.cpp +86 -18
  22. package/src/LlamaContext.h +2 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +197 -159
  24. package/src/llama.cpp/.github/workflows/docker.yml +5 -8
  25. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  27. package/src/llama.cpp/CMakeLists.txt +11 -6
  28. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  29. package/src/llama.cpp/cmake/common.cmake +33 -0
  30. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  31. package/src/llama.cpp/common/CMakeLists.txt +6 -2
  32. package/src/llama.cpp/common/arg.cpp +426 -245
  33. package/src/llama.cpp/common/common.cpp +143 -80
  34. package/src/llama.cpp/common/common.h +81 -24
  35. package/src/llama.cpp/common/sampling.cpp +53 -19
  36. package/src/llama.cpp/common/sampling.h +22 -1
  37. package/src/llama.cpp/common/speculative.cpp +274 -0
  38. package/src/llama.cpp/common/speculative.h +28 -0
  39. package/src/llama.cpp/docs/build.md +101 -148
  40. package/src/llama.cpp/examples/CMakeLists.txt +32 -13
  41. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +5 -4
  43. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  47. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  48. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  49. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  50. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  52. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  55. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  57. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  59. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
  61. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/infill/infill.cpp +1 -1
  63. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  64. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
  65. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  66. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  67. package/src/llama.cpp/examples/llava/clip.cpp +262 -66
  68. package/src/llama.cpp/examples/llava/clip.h +8 -2
  69. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  70. package/src/llama.cpp/examples/llava/llava.cpp +46 -19
  71. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
  72. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  73. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  75. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  76. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
  77. package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
  78. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/main/main.cpp +9 -5
  80. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  83. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  84. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  87. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  88. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
  90. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  91. package/src/llama.cpp/examples/run/run.cpp +911 -0
  92. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
  94. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
  95. package/src/llama.cpp/examples/server/server.cpp +1758 -886
  96. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  97. package/src/llama.cpp/examples/server/utils.hpp +94 -304
  98. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  99. package/src/llama.cpp/examples/simple/simple.cpp +4 -0
  100. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
  101. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
  102. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
  104. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  106. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
  108. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  109. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  110. package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
  111. package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
  112. package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
  113. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  114. package/src/llama.cpp/ggml/include/ggml.h +106 -24
  115. package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
  123. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  124. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  125. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
  126. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  127. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  128. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  129. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  130. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  131. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  132. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  133. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  134. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  135. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
  136. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  137. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  138. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
  139. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
  140. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
  141. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  142. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
  143. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
  151. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
  152. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
  153. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  155. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
  156. package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
  157. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
  158. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
  159. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
  160. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
  161. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
  162. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  163. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  164. package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
  165. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
  167. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
  169. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
  172. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  173. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  174. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
  175. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
  176. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  177. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
  178. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
  182. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
  183. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  184. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  185. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
  187. package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
  188. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
  189. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
  190. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
  191. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
  192. package/src/llama.cpp/ggml/src/ggml.c +367 -207
  193. package/src/llama.cpp/include/llama-cpp.h +25 -0
  194. package/src/llama.cpp/include/llama.h +26 -19
  195. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  196. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  197. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  198. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  199. package/src/llama.cpp/src/CMakeLists.txt +2 -7
  200. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  201. package/src/llama.cpp/src/llama-grammar.h +2 -5
  202. package/src/llama.cpp/src/llama-sampling.cpp +35 -90
  203. package/src/llama.cpp/src/llama-vocab.cpp +6 -1
  204. package/src/llama.cpp/src/llama.cpp +1748 -640
  205. package/src/llama.cpp/src/unicode.cpp +62 -51
  206. package/src/llama.cpp/src/unicode.h +9 -10
  207. package/src/llama.cpp/tests/CMakeLists.txt +48 -37
  208. package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
  209. package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
  210. package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
  211. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  212. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  213. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  214. package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
  215. package/src/llama.cpp/tests/test-rope.cpp +61 -20
  216. package/src/llama.cpp/tests/test-sampling.cpp +2 -2
  217. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  218. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  219. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  220. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  221. package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
  222. package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
  223. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
  224. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  225. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
@@ -163,6 +163,7 @@ enum llm_arch {
  LLM_ARCH_QWEN,
  LLM_ARCH_QWEN2,
  LLM_ARCH_QWEN2MOE,
+ LLM_ARCH_QWEN2VL,
  LLM_ARCH_PHI2,
  LLM_ARCH_PHI3,
  LLM_ARCH_PLAMO,
@@ -179,9 +180,11 @@ enum llm_arch {
  LLM_ARCH_COMMAND_R,
  LLM_ARCH_DBRX,
  LLM_ARCH_OLMO,
+ LLM_ARCH_OLMO2,
  LLM_ARCH_OLMOE,
  LLM_ARCH_OPENELM,
  LLM_ARCH_ARCTIC,
+ LLM_ARCH_DEEPSEEK,
  LLM_ARCH_DEEPSEEK2,
  LLM_ARCH_CHATGLM,
  LLM_ARCH_BITNET,
@@ -194,60 +197,65 @@ enum llm_arch {
  LLM_ARCH_GRANITE,
  LLM_ARCH_GRANITE_MOE,
  LLM_ARCH_CHAMELEON,
+ LLM_ARCH_WAVTOKENIZER_DEC,
  LLM_ARCH_UNKNOWN,
  };

  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
- { LLM_ARCH_LLAMA, "llama" },
- { LLM_ARCH_FALCON, "falcon" },
- { LLM_ARCH_GROK, "grok" },
- { LLM_ARCH_GPT2, "gpt2" },
- { LLM_ARCH_GPTJ, "gptj" },
- { LLM_ARCH_GPTNEOX, "gptneox" },
- { LLM_ARCH_MPT, "mpt" },
- { LLM_ARCH_BAICHUAN, "baichuan" },
- { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_REFACT, "refact" },
- { LLM_ARCH_BERT, "bert" },
- { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
- { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
- { LLM_ARCH_BLOOM, "bloom" },
- { LLM_ARCH_STABLELM, "stablelm" },
- { LLM_ARCH_QWEN, "qwen" },
- { LLM_ARCH_QWEN2, "qwen2" },
- { LLM_ARCH_QWEN2MOE, "qwen2moe" },
- { LLM_ARCH_PHI2, "phi2" },
- { LLM_ARCH_PHI3, "phi3" },
- { LLM_ARCH_PLAMO, "plamo" },
- { LLM_ARCH_CODESHELL, "codeshell" },
- { LLM_ARCH_ORION, "orion" },
- { LLM_ARCH_INTERNLM2, "internlm2" },
- { LLM_ARCH_MINICPM, "minicpm" },
- { LLM_ARCH_MINICPM3, "minicpm3" },
- { LLM_ARCH_GEMMA, "gemma" },
- { LLM_ARCH_GEMMA2, "gemma2" },
- { LLM_ARCH_STARCODER2, "starcoder2" },
- { LLM_ARCH_MAMBA, "mamba" },
- { LLM_ARCH_XVERSE, "xverse" },
- { LLM_ARCH_COMMAND_R, "command-r" },
- { LLM_ARCH_DBRX, "dbrx" },
- { LLM_ARCH_OLMO, "olmo" },
- { LLM_ARCH_OLMOE, "olmoe" },
- { LLM_ARCH_OPENELM, "openelm" },
- { LLM_ARCH_ARCTIC, "arctic" },
- { LLM_ARCH_DEEPSEEK2, "deepseek2" },
- { LLM_ARCH_CHATGLM, "chatglm" },
- { LLM_ARCH_BITNET, "bitnet" },
- { LLM_ARCH_T5, "t5" },
- { LLM_ARCH_T5ENCODER, "t5encoder" },
- { LLM_ARCH_JAIS, "jais" },
- { LLM_ARCH_NEMOTRON, "nemotron" },
- { LLM_ARCH_EXAONE, "exaone" },
- { LLM_ARCH_RWKV6, "rwkv6" },
- { LLM_ARCH_GRANITE, "granite" },
- { LLM_ARCH_GRANITE_MOE, "granitemoe" },
- { LLM_ARCH_CHAMELEON, "chameleon" },
- { LLM_ARCH_UNKNOWN, "(unknown)" },
+ { LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GROK, "grok" },
+ { LLM_ARCH_GPT2, "gpt2" },
+ { LLM_ARCH_GPTJ, "gptj" },
+ { LLM_ARCH_GPTNEOX, "gptneox" },
+ { LLM_ARCH_MPT, "mpt" },
+ { LLM_ARCH_BAICHUAN, "baichuan" },
+ { LLM_ARCH_STARCODER, "starcoder" },
+ { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_BERT, "bert" },
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+ { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+ { LLM_ARCH_BLOOM, "bloom" },
+ { LLM_ARCH_STABLELM, "stablelm" },
+ { LLM_ARCH_QWEN, "qwen" },
+ { LLM_ARCH_QWEN2, "qwen2" },
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
+ { LLM_ARCH_QWEN2VL, "qwen2vl" },
+ { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PHI3, "phi3" },
+ { LLM_ARCH_PLAMO, "plamo" },
+ { LLM_ARCH_CODESHELL, "codeshell" },
+ { LLM_ARCH_ORION, "orion" },
+ { LLM_ARCH_INTERNLM2, "internlm2" },
+ { LLM_ARCH_MINICPM, "minicpm" },
+ { LLM_ARCH_MINICPM3, "minicpm3" },
+ { LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_GEMMA2, "gemma2" },
+ { LLM_ARCH_STARCODER2, "starcoder2" },
+ { LLM_ARCH_MAMBA, "mamba" },
+ { LLM_ARCH_XVERSE, "xverse" },
+ { LLM_ARCH_COMMAND_R, "command-r" },
+ { LLM_ARCH_DBRX, "dbrx" },
+ { LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_OLMO2, "olmo2" },
+ { LLM_ARCH_OLMOE, "olmoe" },
+ { LLM_ARCH_OPENELM, "openelm" },
+ { LLM_ARCH_ARCTIC, "arctic" },
+ { LLM_ARCH_DEEPSEEK, "deepseek" },
+ { LLM_ARCH_DEEPSEEK2, "deepseek2" },
+ { LLM_ARCH_CHATGLM, "chatglm" },
+ { LLM_ARCH_BITNET, "bitnet" },
+ { LLM_ARCH_T5, "t5" },
+ { LLM_ARCH_T5ENCODER, "t5encoder" },
+ { LLM_ARCH_JAIS, "jais" },
+ { LLM_ARCH_NEMOTRON, "nemotron" },
+ { LLM_ARCH_EXAONE, "exaone" },
+ { LLM_ARCH_RWKV6, "rwkv6" },
+ { LLM_ARCH_GRANITE, "granite" },
+ { LLM_ARCH_GRANITE_MOE, "granitemoe" },
+ { LLM_ARCH_CHAMELEON, "chameleon" },
+ { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+ { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

  enum llm_kv {
@@ -267,6 +275,7 @@ enum llm_kv {
  LLM_KV_VOCAB_SIZE,
  LLM_KV_CONTEXT_LENGTH,
  LLM_KV_EMBEDDING_LENGTH,
+ LLM_KV_FEATURES_LENGTH,
  LLM_KV_BLOCK_COUNT,
  LLM_KV_LEADING_DENSE_BLOCK_COUNT,
  LLM_KV_FEED_FORWARD_LENGTH,
@@ -298,6 +307,8 @@ enum llm_kv {
  LLM_KV_ATTENTION_VALUE_LENGTH,
  LLM_KV_ATTENTION_LAYERNORM_EPS,
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+ LLM_KV_ATTENTION_GROUPNORM_EPS,
+ LLM_KV_ATTENTION_GROUPNORM_GROUPS,
  LLM_KV_ATTENTION_CAUSAL,
  LLM_KV_ATTENTION_Q_LORA_RANK,
  LLM_KV_ATTENTION_KV_LORA_RANK,
@@ -306,6 +317,7 @@ enum llm_kv {
  LLM_KV_ATTENTION_SCALE,

  LLM_KV_ROPE_DIMENSION_COUNT,
+ LLM_KV_ROPE_DIMENSION_SECTIONS,
  LLM_KV_ROPE_FREQ_BASE,
  LLM_KV_ROPE_SCALE_LINEAR,
  LLM_KV_ROPE_SCALING_TYPE,
@@ -360,6 +372,12 @@ enum llm_kv {
  LLM_KV_ADAPTER_TYPE,
  LLM_KV_ADAPTER_LORA_ALPHA,

+ LLM_KV_POSNET_EMBEDDING_LENGTH,
+ LLM_KV_POSNET_BLOCK_COUNT,
+
+ LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
+ LLM_KV_CONVNEXT_BLOCK_COUNT,
+
  // deprecated:
  LLM_KV_TOKENIZER_PREFIX_ID,
  LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -383,6 +401,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
  { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
  { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+ { LLM_KV_FEATURES_LENGTH, "%s.features_length" },
  { LLM_KV_BLOCK_COUNT, "%s.block_count" },
  { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
  { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
@@ -414,6 +433,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
  { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
  { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
  { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
@@ -422,6 +443,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },

  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+ { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
  { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
  { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
@@ -443,6 +465,12 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

  { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },

+ { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" },
+ { LLM_KV_POSNET_BLOCK_COUNT, "%s.posnet.block_count" },
+
+ { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
+ { LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" },
+
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
  { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -601,6 +629,22 @@ enum llm_tensor {
  LLM_TENSOR_ENC_OUTPUT_NORM,
  LLM_TENSOR_CLS,
  LLM_TENSOR_CLS_OUT,
+ LLM_TENSOR_CONV1D,
+ LLM_TENSOR_CONVNEXT_DW,
+ LLM_TENSOR_CONVNEXT_NORM,
+ LLM_TENSOR_CONVNEXT_PW1,
+ LLM_TENSOR_CONVNEXT_PW2,
+ LLM_TENSOR_CONVNEXT_GAMMA,
+ LLM_TENSOR_POS_NET_CONV1,
+ LLM_TENSOR_POS_NET_CONV2,
+ LLM_TENSOR_POS_NET_NORM,
+ LLM_TENSOR_POS_NET_NORM1,
+ LLM_TENSOR_POS_NET_NORM2,
+ LLM_TENSOR_POS_NET_ATTN_NORM,
+ LLM_TENSOR_POS_NET_ATTN_Q,
+ LLM_TENSOR_POS_NET_ATTN_K,
+ LLM_TENSOR_POS_NET_ATTN_V,
+ LLM_TENSOR_POS_NET_ATTN_OUT,
  };

  static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -896,6 +940,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_QWEN2VL,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_QWEN2MOE,
  {
@@ -1034,6 +1095,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  { LLM_TENSOR_OUTPUT, "output" },
  { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+ { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
@@ -1207,6 +1270,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_OLMO2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_OLMOE,
  {
@@ -1265,6 +1347,33 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  },
  },
+ {
+ LLM_ARCH_DEEPSEEK,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ },
+ },
  {
  LLM_ARCH_DEEPSEEK2,
  {
@@ -1520,6 +1629,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  },
  },
+ {
+ LLM_ARCH_WAVTOKENIZER_DEC,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_CONV1D, "conv1d" },
+ { LLM_TENSOR_CONVNEXT_DW, "convnext.%d.dw" },
+ { LLM_TENSOR_CONVNEXT_NORM, "convnext.%d.norm" },
+ { LLM_TENSOR_CONVNEXT_PW1, "convnext.%d.pw1" },
+ { LLM_TENSOR_CONVNEXT_PW2, "convnext.%d.pw2" },
+ { LLM_TENSOR_CONVNEXT_GAMMA, "convnext.%d.gamma" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_POS_NET_CONV1, "posnet.%d.conv1" },
+ { LLM_TENSOR_POS_NET_CONV2, "posnet.%d.conv2" },
+ { LLM_TENSOR_POS_NET_NORM, "posnet.%d.norm" },
+ { LLM_TENSOR_POS_NET_NORM1, "posnet.%d.norm1" },
+ { LLM_TENSOR_POS_NET_NORM2, "posnet.%d.norm2" },
+ { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" },
+ { LLM_TENSOR_POS_NET_ATTN_Q, "posnet.%d.attn_q" },
+ { LLM_TENSOR_POS_NET_ATTN_K, "posnet.%d.attn_k" },
+ { LLM_TENSOR_POS_NET_ATTN_V, "posnet.%d.attn_v" },
+ { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1528,6 +1662,69 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  },
  };

+ enum llm_chat_template {
+ LLM_CHAT_TEMPLATE_CHATML,
+ LLM_CHAT_TEMPLATE_LLAMA_2,
+ LLM_CHAT_TEMPLATE_LLAMA_2_SYS,
+ LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS,
+ LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP,
+ LLM_CHAT_TEMPLATE_MISTRAL_V1,
+ LLM_CHAT_TEMPLATE_MISTRAL_V3,
+ LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
+ LLM_CHAT_TEMPLATE_MISTRAL_V7,
+ LLM_CHAT_TEMPLATE_PHI_3,
+ LLM_CHAT_TEMPLATE_ZEPHYR,
+ LLM_CHAT_TEMPLATE_MONARCH,
+ LLM_CHAT_TEMPLATE_GEMMA,
+ LLM_CHAT_TEMPLATE_ORION,
+ LLM_CHAT_TEMPLATE_OPENCHAT,
+ LLM_CHAT_TEMPLATE_VICUNA,
+ LLM_CHAT_TEMPLATE_VICUNA_ORCA,
+ LLM_CHAT_TEMPLATE_DEEPSEEK,
+ LLM_CHAT_TEMPLATE_DEEPSEEK_2,
+ LLM_CHAT_TEMPLATE_COMMAND_R,
+ LLM_CHAT_TEMPLATE_LLAMA_3,
+ LLM_CHAT_TEMPLATE_CHATGML_3,
+ LLM_CHAT_TEMPLATE_CHATGML_4,
+ LLM_CHAT_TEMPLATE_MINICPM,
+ LLM_CHAT_TEMPLATE_EXAONE_3,
+ LLM_CHAT_TEMPLATE_RWKV_WORLD,
+ LLM_CHAT_TEMPLATE_GRANITE,
+ LLM_CHAT_TEMPLATE_GIGACHAT,
+ LLM_CHAT_TEMPLATE_UNKNOWN,
+ };
+
+ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
+ { "chatml", LLM_CHAT_TEMPLATE_CHATML },
+ { "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 },
+ { "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS },
+ { "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS },
+ { "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
+ { "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 },
+ { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
+ { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
+ { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
+ { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
+ { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
+ { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
+ { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
+ { "orion", LLM_CHAT_TEMPLATE_ORION },
+ { "openchat", LLM_CHAT_TEMPLATE_OPENCHAT },
+ { "vicuna", LLM_CHAT_TEMPLATE_VICUNA },
+ { "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
+ { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
+ { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
+ { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
+ { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
+ { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
+ { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
+ { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
+ { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
+ { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
+ { "granite", LLM_CHAT_TEMPLATE_GRANITE },
+ { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
+ };
+
  static llm_arch llm_arch_from_string(const std::string & name) {
  for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
  if (kv.second == name) {
@@ -1601,9 +1798,10 @@ struct LLM_TN {
  //

  static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
- { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
- { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
- { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
+ { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
+ { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
+ { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
+ { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
  };

  static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
@@ -1709,7 +1907,7 @@ private:
  DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
  NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
  if (!bufLen) {
- ret = format("Win32 error code: %s", error_code);
+ ret = format("Win32 error code: %lx", error_code);
  } else {
  ret = lpMsgBuf;
  LocalFree(lpMsgBuf);
@@ -2047,7 +2245,7 @@ struct llama_mmap {
  HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");

  // may fail on pre-Windows 8 systems
- pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
+ pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");

  if (pPrefetchVirtualMemory) {
  // advise the kernel to preload the mapped memory
@@ -2320,6 +2518,7 @@ enum e_model {
  MODEL_16B,
  MODEL_20B,
  MODEL_30B,
+ MODEL_32B,
  MODEL_34B,
  MODEL_35B,
  MODEL_40B,
@@ -2345,15 +2544,26 @@ static const size_t kiB = 1024;
  static const size_t MiB = 1024*kiB;
  static const size_t GiB = 1024*MiB;

+ struct llama_hparams_posnet {
+ uint32_t n_embd;
+ uint32_t n_layer;
+ };
+
+ struct llama_hparams_convnext {
+ uint32_t n_embd;
+ uint32_t n_layer;
+ };
+
  struct llama_hparams {
  bool vocab_only;
  bool rope_finetuned;
  bool use_par_res;
  bool swin_norm;

- uint32_t n_vocab;
+ uint32_t n_vocab = 0;
  uint32_t n_ctx_train; // context size the model was trained on
  uint32_t n_embd;
+ uint32_t n_embd_features = 0;
  uint32_t n_layer;
  uint32_t n_rot;
  uint32_t n_swa = 0; // sliding window attention (SWA)
@@ -2364,6 +2574,10 @@ struct llama_hparams {
  uint32_t n_vocab_type = 0; // for BERT-style token types
  uint32_t n_rel_attn_bkts = 0;

+ // for WavTokenizer
+ struct llama_hparams_posnet posnet;
+ struct llama_hparams_convnext convnext;
+
  std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
  std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
  std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -2378,6 +2592,9 @@ struct llama_hparams {

  float f_norm_eps;
  float f_norm_rms_eps;
+ float f_norm_group_eps;
+
+ uint32_t n_norm_groups;

  float f_attn_logit_softcapping = 50.0f;
  float f_final_logit_softcapping = 30.0f;
@@ -2388,11 +2605,12 @@ struct llama_hparams {
  uint32_t time_decay_extra_dim = 0;
  uint32_t wkv_head_size = 0;

- float rope_attn_factor = 1.0f;
- float rope_freq_base_train;
- float rope_freq_scale_train;
- uint32_t n_ctx_orig_yarn;
- float rope_yarn_log_mul;
+ float rope_attn_factor = 1.0f;
+ float rope_freq_base_train;
+ float rope_freq_scale_train;
+ uint32_t n_ctx_orig_yarn;
+ float rope_yarn_log_mul;
+ int rope_sections[4];

  // for State Space Models
  uint32_t ssm_d_conv = 0;
@@ -2422,63 +2640,6 @@ struct llama_hparams {
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
  enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;

- bool operator!=(const llama_hparams & other) const {
- if (this->vocab_only != other.vocab_only) return true;
- if (this->n_vocab != other.n_vocab) return true;
- if (this->n_ctx_train != other.n_ctx_train) return true;
- if (this->n_embd != other.n_embd) return true;
- if (this->n_layer != other.n_layer) return true;
- if (this->n_rot != other.n_rot) return true;
- if (this->n_swa != other.n_swa) return true;
- if (this->n_embd_head_k != other.n_embd_head_k) return true;
- if (this->n_embd_head_v != other.n_embd_head_v) return true;
- if (this->n_expert != other.n_expert) return true;
- if (this->n_expert_used != other.n_expert_used) return true;
-
- if (this->n_head_arr != other.n_head_arr) return true;
- if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
- if (this->n_ff_arr != other.n_ff_arr) return true;
-
- if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
- if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
- if (this->n_lora_q != other.n_lora_q) return true;
- if (this->n_lora_kv != other.n_lora_kv) return true;
- if (this->n_ff_exp != other.n_ff_exp) return true;
- if (this->n_ff_shexp != other.n_ff_shexp) return true;
- if (this->n_expert_shared != other.n_expert_shared) return true;
-
- if (this->rope_finetuned != other.rope_finetuned) return true;
- if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
-
- if (this->ssm_d_conv != other.ssm_d_conv) return true;
- if (this->ssm_d_inner != other.ssm_d_inner) return true;
- if (this->ssm_d_state != other.ssm_d_state) return true;
- if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
- if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
-
- if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
- if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true;
- if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
- if (this->wkv_head_size != other.wkv_head_size) return true;
-
- if (this->dec_start_token_id != other.dec_start_token_id) return true;
-
- const float EPSILON = 1e-9f;
-
- if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
- if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
- if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
- if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
- if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
- if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
- if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
- if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
- if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
- if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;
-
- return false;
- }
-
  uint32_t n_head(uint32_t il = 0) const {
  if (il < n_layer) {
  return n_head_arr[il];
@@ -2531,21 +2692,21 @@ struct llama_hparams {
  if (wkv_head_size != 0) {
  // for RWKV models
  return 2 * n_embd;
- } else {
- // TODO: maybe support other convolution strides than 1
- // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
- return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
  }
+
+ // TODO: maybe support other convolution strides than 1
+ // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+ return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
  }

  uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
  if (wkv_head_size != 0) {
  // corresponds to RWKV's wkv_states size
  return n_embd * wkv_head_size;
- } else {
- // corresponds to Mamba's ssm_states size
- return ssm_d_state * ssm_d_inner;
  }
+
+ // corresponds to Mamba's ssm_states size
+ return ssm_d_state * ssm_d_inner;
  }
  };

@@ -2583,142 +2744,187 @@ struct llama_cparams {
  void * cb_eval_user_data;
  };

- // TODO: separate into "llama_layer_enc" and "llama_layer_dec"
- struct llama_layer {
- llama_layer() {
- // initialize all pointers to NULL
- std::memset(this, 0, sizeof(*this));
- }
+ struct llama_layer_posnet {
+ // resnet
+ struct ggml_tensor * norm1 = nullptr;
+ struct ggml_tensor * norm1_b = nullptr;
+
+ struct ggml_tensor * conv1 = nullptr;
+ struct ggml_tensor * conv1_b = nullptr;
+
+ struct ggml_tensor * norm2 = nullptr;
+ struct ggml_tensor * norm2_b = nullptr;
+
+ struct ggml_tensor * conv2 = nullptr;
+ struct ggml_tensor * conv2_b = nullptr;

+ // attention
+ struct ggml_tensor * attn_norm = nullptr;
+ struct ggml_tensor * attn_norm_b = nullptr;
+
+ struct ggml_tensor * attn_q = nullptr;
+ struct ggml_tensor * attn_q_b = nullptr;
+
+ struct ggml_tensor * attn_k = nullptr;
+ struct ggml_tensor * attn_k_b = nullptr;
+
+ struct ggml_tensor * attn_v = nullptr;
+ struct ggml_tensor * attn_v_b = nullptr;
+
+ struct ggml_tensor * attn_o = nullptr;
+ struct ggml_tensor * attn_o_b = nullptr;
+
+ // normalize
+ struct ggml_tensor * norm = nullptr;
+ struct ggml_tensor * norm_b = nullptr;
+ };
+
+ struct llama_layer_convnext {
+ struct ggml_tensor * dw = nullptr;
+ struct ggml_tensor * dw_b = nullptr;
+
+ struct ggml_tensor * norm = nullptr;
+ struct ggml_tensor * norm_b = nullptr;
+
+ struct ggml_tensor * pw1 = nullptr;
+ struct ggml_tensor * pw1_b = nullptr;
+
+ struct ggml_tensor * pw2 = nullptr;
+ struct ggml_tensor * pw2_b = nullptr;
+
+ struct ggml_tensor * gamma = nullptr;
+ };
+
+ struct llama_layer {
  // normalization
- struct ggml_tensor * attn_norm;
- struct ggml_tensor * attn_norm_b;
- struct ggml_tensor * attn_norm_2;
- struct ggml_tensor * attn_norm_2_b;
- struct ggml_tensor * attn_q_norm;
- struct ggml_tensor * attn_q_norm_b;
- struct ggml_tensor * attn_k_norm;
- struct ggml_tensor * attn_k_norm_b;
- struct ggml_tensor * attn_out_norm;
- struct ggml_tensor * attn_out_norm_b;
- struct ggml_tensor * attn_q_a_norm;
- struct ggml_tensor * attn_kv_a_norm;
- struct ggml_tensor * attn_sub_norm;
- struct ggml_tensor * attn_post_norm;
- struct ggml_tensor * ffn_sub_norm;
- struct ggml_tensor * attn_norm_cross;
- struct ggml_tensor * attn_norm_enc;
+ struct ggml_tensor * attn_norm = nullptr;
+ struct ggml_tensor * attn_norm_b = nullptr;
+ struct ggml_tensor * attn_norm_2 = nullptr;
+ struct ggml_tensor * attn_norm_2_b = nullptr;
+ struct ggml_tensor * attn_q_norm = nullptr;
+ struct ggml_tensor * attn_q_norm_b = nullptr;
+ struct ggml_tensor * attn_k_norm = nullptr;
+ struct ggml_tensor * attn_k_norm_b = nullptr;
+ struct ggml_tensor * attn_out_norm = nullptr;
+ struct ggml_tensor * attn_out_norm_b = nullptr;
+ struct ggml_tensor * attn_q_a_norm = nullptr;
+ struct ggml_tensor * attn_kv_a_norm = nullptr;
+ struct ggml_tensor * attn_sub_norm = nullptr;
+ struct ggml_tensor * attn_post_norm = nullptr;
+ struct ggml_tensor * ffn_sub_norm = nullptr;
+ struct ggml_tensor * attn_norm_cross = nullptr;
+ struct ggml_tensor * attn_norm_enc = nullptr;

  // attention
- struct ggml_tensor * wq;
- struct ggml_tensor * wk;
- struct ggml_tensor * wv;
- struct ggml_tensor * wo;
- struct ggml_tensor * wqkv;
- struct ggml_tensor * wq_a;
- struct ggml_tensor * wq_b;
- struct ggml_tensor * wkv_a_mqa;
- struct ggml_tensor * wkv_b;
- struct ggml_tensor * wq_cross;
- struct ggml_tensor * wk_cross;
- struct ggml_tensor * wv_cross;
- struct ggml_tensor * wo_cross;
- struct ggml_tensor * wq_enc;
- struct ggml_tensor * wk_enc;
- struct ggml_tensor * wv_enc;
- struct ggml_tensor * wo_enc;
+ struct ggml_tensor * wq = nullptr;
+ struct ggml_tensor * wk = nullptr;
+ struct ggml_tensor * wv = nullptr;
+ struct ggml_tensor * wo = nullptr;
+ struct ggml_tensor * wqkv = nullptr;
+ struct ggml_tensor * wq_a = nullptr;
+ struct ggml_tensor * wq_b = nullptr;
+ struct ggml_tensor * wkv_a_mqa = nullptr;
+ struct ggml_tensor * wkv_b = nullptr;
+ struct ggml_tensor * wq_cross = nullptr;
+ struct ggml_tensor * wk_cross = nullptr;
+ struct ggml_tensor * wv_cross = nullptr;
+ struct ggml_tensor * wo_cross = nullptr;
+ struct ggml_tensor * wq_enc = nullptr;
+ struct ggml_tensor * wk_enc = nullptr;
+ struct ggml_tensor * wv_enc = nullptr;
+ struct ggml_tensor * wo_enc = nullptr;

  // attention bias
- struct ggml_tensor * bq;
- struct ggml_tensor * bk;
- struct ggml_tensor * bv;
- struct ggml_tensor * bo;
- struct ggml_tensor * bqkv;
+ struct ggml_tensor * bq = nullptr;
+ struct ggml_tensor * bk = nullptr;
+ struct ggml_tensor * bv = nullptr;
+ struct ggml_tensor * bo = nullptr;
+ struct ggml_tensor * bqkv = nullptr;

  // relative position bias
- struct ggml_tensor * attn_rel_b;
- struct ggml_tensor * attn_rel_b_enc;
- struct ggml_tensor * attn_rel_b_cross;
+ struct ggml_tensor * attn_rel_b = nullptr;
+ struct ggml_tensor * attn_rel_b_enc = nullptr;
+ struct ggml_tensor * attn_rel_b_cross = nullptr;

  // normalization
- struct ggml_tensor * ffn_norm;
- struct ggml_tensor * ffn_norm_b;
- struct ggml_tensor * ffn_post_norm;
- struct ggml_tensor * layer_out_norm;
- struct ggml_tensor * layer_out_norm_b;
- struct ggml_tensor * ffn_norm_exps;
- struct ggml_tensor * ffn_norm_enc;
+ struct ggml_tensor * ffn_norm = nullptr;
+ struct ggml_tensor * ffn_norm_b = nullptr;
+ struct ggml_tensor * ffn_post_norm = nullptr;
+ struct ggml_tensor * layer_out_norm = nullptr;
+ struct ggml_tensor * layer_out_norm_b = nullptr;
+ struct ggml_tensor * ffn_norm_exps = nullptr;
+ struct ggml_tensor * ffn_norm_enc = nullptr;

  // ff
- struct ggml_tensor * ffn_gate; // w1
- struct ggml_tensor * ffn_down; // w2
- struct ggml_tensor * ffn_up; // w3
- struct ggml_tensor * ffn_gate_enc;
- struct ggml_tensor * ffn_down_enc;
- struct ggml_tensor * ffn_up_enc;
+ struct ggml_tensor * ffn_gate = nullptr; // w1
+ struct ggml_tensor * ffn_down = nullptr; // w2
+ struct ggml_tensor * ffn_up = nullptr; // w3
+ struct ggml_tensor * ffn_gate_enc = nullptr;
+ struct ggml_tensor * ffn_down_enc = nullptr;
+ struct ggml_tensor * ffn_up_enc = nullptr;

  // ff MoE
- struct ggml_tensor * ffn_gate_inp;
- struct ggml_tensor * ffn_gate_exps;
- struct ggml_tensor * ffn_down_exps;
- struct ggml_tensor * ffn_up_exps ;
+ struct ggml_tensor * ffn_gate_inp = nullptr;
+ struct ggml_tensor * ffn_gate_exps = nullptr;
+ struct ggml_tensor * ffn_down_exps = nullptr;
+ struct ggml_tensor * ffn_up_exps = nullptr;

  // ff shared expert (shexp)
- struct ggml_tensor * ffn_gate_inp_shexp;
- struct ggml_tensor * ffn_gate_shexp;
- struct ggml_tensor * ffn_down_shexp;
- struct ggml_tensor * ffn_up_shexp;
+ struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
+ struct ggml_tensor * ffn_gate_shexp = nullptr;
+ struct ggml_tensor * ffn_down_shexp = nullptr;
+ struct ggml_tensor * ffn_up_shexp = nullptr;

  // ff bias
- struct ggml_tensor * ffn_gate_b;
- struct ggml_tensor * ffn_down_b; // b2
- struct ggml_tensor * ffn_up_b; // b3
- struct ggml_tensor * ffn_act;
+ struct ggml_tensor * ffn_gate_b = nullptr;
+ struct ggml_tensor * ffn_down_b = nullptr; // b2
+ struct ggml_tensor * ffn_up_b = nullptr; // b3
+ struct ggml_tensor * ffn_act = nullptr;

  // mamba proj
- struct ggml_tensor * ssm_in;
- struct ggml_tensor * ssm_x;
- struct ggml_tensor * ssm_dt;
- struct ggml_tensor * ssm_out;
+ struct ggml_tensor * ssm_in = nullptr;
+ struct ggml_tensor * ssm_x = nullptr;
+ struct ggml_tensor * ssm_dt = nullptr;
+ struct ggml_tensor * ssm_out = nullptr;

  // mamba
- struct ggml_tensor * ssm_conv1d;
- struct ggml_tensor * ssm_a;
- struct ggml_tensor * ssm_d;
+ struct ggml_tensor * ssm_conv1d = nullptr;
+ struct ggml_tensor * ssm_a = nullptr;
+ struct ggml_tensor * ssm_d = nullptr;

  // mamba bias
- struct ggml_tensor * ssm_conv1d_b;
- struct ggml_tensor * ssm_dt_b;
+ struct ggml_tensor * ssm_conv1d_b = nullptr;
+ struct ggml_tensor * ssm_dt_b = nullptr;

  // rwkv
- struct ggml_tensor * time_mix_w1;
- struct ggml_tensor * time_mix_w2;
- struct ggml_tensor * time_mix_lerp_x;
- struct ggml_tensor * time_mix_lerp_w;
- struct ggml_tensor * time_mix_lerp_k;
- struct ggml_tensor * time_mix_lerp_v;
- struct ggml_tensor * time_mix_lerp_r;
- struct ggml_tensor * time_mix_lerp_g;
-
- struct ggml_tensor * time_mix_first;
- struct ggml_tensor * time_mix_decay;
- struct ggml_tensor * time_mix_decay_w1;
- struct ggml_tensor * time_mix_decay_w2;
- struct ggml_tensor * time_mix_key;
- struct ggml_tensor * time_mix_value;
- struct ggml_tensor * time_mix_receptance;
- struct ggml_tensor * time_mix_gate;
-
- struct ggml_tensor * time_mix_ln;
- struct ggml_tensor * time_mix_ln_b;
- struct ggml_tensor * time_mix_output;
-
- struct ggml_tensor * channel_mix_lerp_k;
- struct ggml_tensor * channel_mix_lerp_r;
-
- struct ggml_tensor * channel_mix_key;
- struct ggml_tensor * channel_mix_receptance;
- struct ggml_tensor * channel_mix_value;
+ struct ggml_tensor * time_mix_w1 = nullptr;
+ struct ggml_tensor * time_mix_w2 = nullptr;
+ struct ggml_tensor * time_mix_lerp_x = nullptr;
+ struct ggml_tensor * time_mix_lerp_w = nullptr;
+ struct ggml_tensor * time_mix_lerp_k = nullptr;
+ struct ggml_tensor * time_mix_lerp_v = nullptr;
+ struct ggml_tensor * time_mix_lerp_r = nullptr;
+ struct ggml_tensor * time_mix_lerp_g = nullptr;
+
+ struct ggml_tensor * time_mix_first = nullptr;
+ struct ggml_tensor * time_mix_decay = nullptr;
+ struct ggml_tensor * time_mix_decay_w1 = nullptr;
+ struct ggml_tensor * time_mix_decay_w2 = nullptr;
+ struct ggml_tensor * time_mix_key = nullptr;
+ struct ggml_tensor * time_mix_value = nullptr;
+ struct ggml_tensor * time_mix_receptance = nullptr;
+ struct ggml_tensor * time_mix_gate = nullptr;
+
+ struct ggml_tensor * time_mix_ln = nullptr;
+ struct ggml_tensor * time_mix_ln_b = nullptr;
+ struct ggml_tensor * time_mix_output = nullptr;
+
+ struct ggml_tensor * channel_mix_lerp_k = nullptr;
+ struct ggml_tensor * channel_mix_lerp_r = nullptr;
+
+ struct ggml_tensor * channel_mix_key = nullptr;
+ struct ggml_tensor * channel_mix_receptance = nullptr;
+ struct ggml_tensor * channel_mix_value = nullptr;

  // long rope factors
  struct ggml_tensor * rope_long = nullptr;
@@ -2726,13 +2932,17 @@ struct llama_layer {
  struct ggml_tensor * rope_freqs = nullptr;

  // bitnet scale
- struct ggml_tensor * wq_scale;
- struct ggml_tensor * wk_scale;
- struct ggml_tensor * wv_scale;
- struct ggml_tensor * wo_scale;
- struct ggml_tensor * ffn_gate_scale;
- struct ggml_tensor * ffn_up_scale;
- struct ggml_tensor * ffn_down_scale;
+ struct ggml_tensor * wq_scale = nullptr;
+ struct ggml_tensor * wk_scale = nullptr;
+ struct ggml_tensor * wv_scale = nullptr;
+ struct ggml_tensor * wo_scale = nullptr;
+ struct ggml_tensor * ffn_gate_scale = nullptr;
+ struct ggml_tensor * ffn_up_scale = nullptr;
+ struct ggml_tensor * ffn_down_scale = nullptr;
+
+ struct llama_layer_posnet posnet;
+
+ struct llama_layer_convnext convnext;
  };

  // very similar to llama_batch,
@@ -2863,6 +3073,9 @@ struct llama_model {
  struct ggml_tensor * cls_out = nullptr;
  struct ggml_tensor * cls_out_b = nullptr;

+ struct ggml_tensor * conv1d = nullptr;
+ struct ggml_tensor * conv1d_b = nullptr;
+
  std::vector<llama_layer> layers;

  // gguf metadata
@@ -2947,6 +3160,7 @@ struct llama_sbatch {
  // batch indices of the output
  std::vector<size_t> out_ids;
  std::vector<llama_sbatch_seq> seq;
+
  const llama_batch * batch = nullptr;

  // buffers for the ubatch
@@ -3292,6 +3506,11 @@ struct llama_context {
  // whether we are computing encoder output or decoder output
  bool is_encoding = false;

+ // TODO: find a better way to accommodate mutli-dimension position encoding methods
+ // number of position id each token get, 1 for each token in most cases.
+ // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
+ int n_pos_per_token = 1;
+
  // output of the encoder part of the encoder-decoder models
  std::vector<float> embd_enc;
  std::vector<std::set<llama_seq_id>> seq_ids_enc;
@@ -3362,6 +3581,17 @@ static int llama_get_device_count(const llama_model & model) {
  return (int) model.devices.size();
  }

+ static struct ggml_tensor * llama_get_model_tensor(const struct llama_model * model, const char * name) {
+ auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
+ [name](const std::pair<std::string, struct ggml_tensor *> & it) {
+ return it.first == name;
+ });
+ if (it == model->tensors_by_name.end()) {
+ return nullptr;
+ }
+ return it->second;
+ }
+
  template<typename F>
  static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
  ggml_init_params params = {
@@ -3415,7 +3645,9 @@ static bool llama_kv_cache_init(

  const struct llama_hparams & hparams = model.hparams;

- const int64_t n_layer = hparams.n_layer;
+ const int32_t n_layer = hparams.n_layer;
+
+ LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);

  cache.has_shift = false;

@@ -3456,10 +3688,12 @@ static bool llama_kv_cache_init(
  cache.k_l.reserve(n_layer);
  cache.v_l.reserve(n_layer);

- for (int i = 0; i < (int) n_layer; i++) {
+ for (int i = 0; i < n_layer; i++) {
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();

+ LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
+
  ggml_backend_buffer_type_t buft;
  if (offload) {
  auto * dev = model.dev_layer.at(i).dev;
@@ -4492,9 +4726,6 @@ struct llama_model_loader {
4492
4726
  case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
4493
4727
  case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
4494
4728
  case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
4495
- case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
4496
- case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
4497
- case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
4498
4729
  default:
4499
4730
  {
4500
4731
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -4845,7 +5076,9 @@ struct llama_model_loader {
4845
5076
  mappings.reserve(files.size());
4846
5077
  mmaps_used.reserve(files.size());
4847
5078
  for (const auto & file : files) {
4848
- std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
5079
+ auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
5080
+ auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
5081
+ std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
4849
5082
  mmaps_used.emplace_back(mapping->size, 0);
4850
5083
  if (mlock_mmaps) {
4851
5084
  std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
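The mmap change above stops calling ggml_is_numa() directly and instead resolves the CPU backend's NUMA query through the backend registry. A minimal sketch of that lookup pattern, using only the registry calls that appear in this diff; the null checks are defensive additions, not part of the original code.

#include "ggml-backend.h"

// Sketch: resolve an optional backend entry point by name, mirroring the
// "ggml_backend_cpu_is_numa" lookup used in the mmap path above.
static bool cpu_backend_is_numa(void) {
    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (dev == nullptr) {
        return false;
    }
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
    auto * is_numa_fn = (bool (*)(void)) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
    return is_numa_fn != nullptr && is_numa_fn();
}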
@@ -5256,9 +5489,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
5256
5489
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
5257
5490
  case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
5258
5491
  case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
5259
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
5260
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
5261
- case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
5262
5492
 
5263
5493
  default: return "unknown, may not work";
5264
5494
  }
@@ -5307,6 +5537,7 @@ static const char * llama_model_type_name(e_model type) {
5307
5537
  case MODEL_16B: return "16B";
5308
5538
  case MODEL_20B: return "20B";
5309
5539
  case MODEL_30B: return "30B";
5540
+ case MODEL_32B: return "32B";
5310
5541
  case MODEL_34B: return "34B";
5311
5542
  case MODEL_35B: return "35B";
5312
5543
  case MODEL_40B: return "40B";
@@ -5375,7 +5606,7 @@ static void llm_load_hparams(
5375
5606
  ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
5376
5607
 
5377
5608
  // get hparams kv
5378
- ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
5609
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
5379
5610
 
5380
5611
  // everything past this point is not vocab-related
5381
5612
  if (hparams.vocab_only) {
@@ -5388,6 +5619,16 @@ static void llm_load_hparams(
5388
5619
  ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
5389
5620
  ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
5390
5621
 
5622
+ if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) {
5623
+ ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
5624
+
5625
+ ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
5626
+ ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
5627
+
5628
+ ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
5629
+ ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
5630
+ }
5631
+
5391
5632
  GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
5392
5633
  GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
5393
5634
  if (hparams.n_expert > 0) {
@@ -5396,13 +5637,13 @@ static void llm_load_hparams(
5396
5637
  GGML_ASSERT(hparams.n_expert_used == 0);
5397
5638
  }
5398
5639
 
5399
- // zero-out the per-layer hparams
5640
+ // zero-out the array hparams
5400
5641
  std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
5401
5642
  std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
5402
5643
  std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
5403
5644
 
5404
- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
5405
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
5645
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
5646
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
5406
5647
 
5407
5648
  // n_head_kv is optional, default to n_head
5408
5649
  hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -5494,8 +5735,12 @@ static void llm_load_hparams(
5494
5735
  case LLM_ARCH_MINICPM:
5495
5736
  {
5496
5737
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
5738
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
5739
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
5740
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
5497
5741
 
5498
5742
  switch (hparams.n_layer) {
5743
+ case 52: model.type = e_model::MODEL_1B; break;
5499
5744
  case 40: model.type = e_model::MODEL_2B; break;
5500
5745
  default: model.type = e_model::MODEL_UNKNOWN;
5501
5746
  }
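MiniCPM now reads its embedding, residual, and logit scales from GGUF metadata (and logs them alongside Granite further down in this diff) instead of relying on the constants hard-coded in the build_minicpm() graph that is removed below. For comparison, the legacy constants are evaluated here for the 40-layer variant; the n_embd value is an assumption used for illustration only.

#include <cmath>
#include <cstdio>

// Sketch: legacy MiniCPM scaling from the removed build_minicpm() graph,
// evaluated for the 40-layer model. Equivalent values are now expected to
// arrive via LLM_KV_EMBEDDING_SCALE / LLM_KV_RESIDUAL_SCALE / LLM_KV_LOGIT_SCALE.
int main(void) {
    const int   n_layer     = 40;
    const int   n_embd_base = 256;      // hard-coded in the old graph
    const int   n_embd      = 2304;     // assumption: MiniCPM-2B embedding width
    const float scale_depth = 1.4f;

    const float scale_res    = scale_depth / sqrtf((float) n_layer);    // ~0.2214, applied to each residual branch
    const float scale_lmhead = (float) n_embd_base / (float) n_embd;    // ~0.1111, applied before the lm_head

    printf("scale_res = %.4f, scale_lmhead = %.4f\n", scale_res, scale_lmhead);
    return 0;
}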
@@ -5660,6 +5905,13 @@ static void llm_load_hparams(
5660
5905
  default: model.type = e_model::MODEL_UNKNOWN;
5661
5906
  }
5662
5907
  } break;
5908
+ case LLM_ARCH_QWEN2VL:
5909
+ {
5910
+ std::array<int, 4> section_dims;
5911
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
5912
+ std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
5913
+ }
5914
+ // fall through
5663
5915
  case LLM_ARCH_QWEN2:
5664
5916
  {
5665
5917
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5667,7 +5919,10 @@ static void llm_load_hparams(
5667
5919
  case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
5668
5920
  case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
5669
5921
  case 32: model.type = e_model::MODEL_7B; break;
5922
+ case 36: model.type = e_model::MODEL_3B; break;
5670
5923
  case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
5924
+ case 48: model.type = e_model::MODEL_14B; break;
5925
+ case 64: model.type = e_model::MODEL_32B; break;
5671
5926
  case 80: model.type = e_model::MODEL_70B; break;
5672
5927
  default: model.type = e_model::MODEL_UNKNOWN;
5673
5928
  }
@@ -5877,6 +6132,17 @@ static void llm_load_hparams(
5877
6132
  default: model.type = e_model::MODEL_UNKNOWN;
5878
6133
  }
5879
6134
  } break;
6135
+ case LLM_ARCH_OLMO2:
6136
+ {
6137
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
6138
+
6139
+ switch (hparams.n_layer) {
6140
+ case 16: model.type = e_model::MODEL_1B; break;
6141
+ case 32: model.type = e_model::MODEL_7B; break;
6142
+ case 40: model.type = e_model::MODEL_13B; break;
6143
+ default: model.type = e_model::MODEL_UNKNOWN;
6144
+ }
6145
+ } break;
5880
6146
  case LLM_ARCH_OLMOE:
5881
6147
  {
5882
6148
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5956,6 +6222,19 @@ static void llm_load_hparams(
5956
6222
  model.type = e_model::MODEL_UNKNOWN;
5957
6223
  }
5958
6224
  } break;
6225
+ case LLM_ARCH_DEEPSEEK:
6226
+ {
6227
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
6228
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
6229
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
6230
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
6231
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
6232
+
6233
+ switch (hparams.n_layer) {
6234
+ case 28: model.type = e_model::MODEL_20B; break;
6235
+ default: model.type = e_model::MODEL_UNKNOWN;
6236
+ }
6237
+ } break;
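For orientation: the DeepSeek hparams read above drive a split layer layout. The first n_layer_dense_lead layers keep a dense FFN, while the remaining layers use the routed-MoE branch plus a shared-expert FFN whose output is added on top (see the tensor creation and build_deepseek() sections later in this diff). A tiny sketch of that split, with illustrative names only:

#include <cstdint>

// Sketch of the FFN kind per layer implied by the DeepSeek hparams above.
enum class deepseek_ffn_kind { DENSE, MOE_PLUS_SHARED_EXPERT };

deepseek_ffn_kind ffn_kind_for_layer(uint32_t il, uint32_t n_layer_dense_lead) {
    return il < n_layer_dense_lead ? deepseek_ffn_kind::DENSE
                                   : deepseek_ffn_kind::MOE_PLUS_SHARED_EXPERT;
}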
5959
6238
  case LLM_ARCH_DEEPSEEK2:
5960
6239
  {
5961
6240
  bool is_lite = (hparams.n_layer == 27);
@@ -6109,6 +6388,13 @@ static void llm_load_hparams(
6109
6388
  default: model.type = e_model::MODEL_UNKNOWN;
6110
6389
  }
6111
6390
  } break;
6391
+ case LLM_ARCH_WAVTOKENIZER_DEC:
6392
+ {
6393
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
6394
+ ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
6395
+ ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
6396
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
6397
+ } break;
6112
6398
  default: (void)0;
6113
6399
  }
6114
6400
 
@@ -6138,7 +6424,7 @@ static void llm_load_vocab(
6138
6424
  ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
6139
6425
  ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
6140
6426
 
6141
- if (tokenizer_model == "no_vocab") {
6427
+ if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
6142
6428
  vocab.type = LLAMA_VOCAB_TYPE_NONE;
6143
6429
 
6144
6430
  // default special tokens
@@ -6302,10 +6588,12 @@ static void llm_load_vocab(
6302
6588
  tokenizer_pre == "phi-2" ||
6303
6589
  tokenizer_pre == "jina-es" ||
6304
6590
  tokenizer_pre == "jina-de" ||
6591
+ tokenizer_pre == "gigachat" ||
6305
6592
  tokenizer_pre == "jina-v1-en" ||
6306
6593
  tokenizer_pre == "jina-v2-es" ||
6307
6594
  tokenizer_pre == "jina-v2-de" ||
6308
- tokenizer_pre == "jina-v2-code") {
6595
+ tokenizer_pre == "jina-v2-code" ||
6596
+ tokenizer_pre == "roberta-bpe") {
6309
6597
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
6310
6598
  } else if (
6311
6599
  tokenizer_pre == "refact") {
@@ -6372,6 +6660,9 @@ static void llm_load_vocab(
6372
6660
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
6373
6661
  vocab.tokenizer_add_bos = true;
6374
6662
  vocab.tokenizer_clean_spaces = false;
6663
+ } else if (
6664
+ tokenizer_pre == "minerva-7b") {
6665
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
6375
6666
  } else {
6376
6667
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
6377
6668
  }
@@ -6950,6 +7241,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
6950
7241
 
6951
7242
  LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
6952
7243
 
7244
+ if (model.arch == LLM_ARCH_DEEPSEEK) {
7245
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
7246
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
7247
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
7248
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
7249
+ }
7250
+
6953
7251
  if (model.arch == LLM_ARCH_DEEPSEEK2) {
6954
7252
  LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
6955
7253
  LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -6965,7 +7263,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
6965
7263
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
6966
7264
  }
6967
7265
 
6968
- if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
7266
+ if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
6969
7267
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
6970
7268
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
6971
7269
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7106,6 +7404,22 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
7106
7404
  {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
7107
7405
  // this tensor is loaded for T5, but never used
7108
7406
  {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
7407
+ {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
7408
+ {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
7409
+ {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
7410
+ {LLM_TENSOR_POS_NET_NORM2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
7411
+ {LLM_TENSOR_POS_NET_CONV1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
7412
+ {LLM_TENSOR_POS_NET_CONV2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
7413
+ {LLM_TENSOR_POS_NET_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
7414
+ {LLM_TENSOR_POS_NET_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
7415
+ {LLM_TENSOR_POS_NET_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
7416
+ {LLM_TENSOR_POS_NET_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
7417
+ {LLM_TENSOR_POS_NET_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
7418
+ {LLM_TENSOR_CONVNEXT_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
7419
+ {LLM_TENSOR_CONVNEXT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
7420
+ {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
7421
+ {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
7422
+ {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
7109
7423
  };
7110
7424
 
7111
7425
  // checks if the weight tensor can be used with the specified buffer type and device
@@ -7149,12 +7463,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
7149
7463
  } break;
7150
7464
  case GGML_OP_ADD:
7151
7465
  {
7152
- ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512);
7466
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
7153
7467
  op_tensor = ggml_add(ctx, a, w);
7154
7468
  } break;
7155
7469
  case GGML_OP_MUL:
7156
7470
  {
7157
- ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512);
7471
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
7158
7472
  op_tensor = ggml_mul(ctx, a, w);
7159
7473
  } break;
7160
7474
  case GGML_OP_DIV:
@@ -7210,6 +7524,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
7210
7524
  ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
7211
7525
  op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
7212
7526
  } break;
7527
+ case GGML_OP_IM2COL:
7528
+ {
7529
+ const int n_embd = hparams.n_embd;
7530
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
7531
+ op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
7532
+ } break;
7213
7533
  default:
7214
7534
  GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
7215
7535
  }
@@ -7340,7 +7660,8 @@ static bool llm_load_tensors(
7340
7660
  model.main_gpu = main_gpu;
7341
7661
  model.n_gpu_layers = n_gpu_layers;
7342
7662
 
7343
- const int n_layer = hparams.n_layer;
7663
+ const int n_layer = hparams.n_layer;
7664
+
7344
7665
  bool use_mmap_buffer = true;
7345
7666
 
7346
7667
  // build a list of buffer types for the CPU and GPU devices
@@ -7590,7 +7911,13 @@ static bool llm_load_tensors(
7590
7911
 
7591
7912
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
7592
7913
 
7593
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
7914
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
7915
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
7916
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
7917
+ }
7918
+ else {
7919
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
7920
+ }
7594
7921
 
7595
7922
  if (n_expert == 0) {
7596
7923
  layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
@@ -8057,6 +8384,7 @@ static bool llm_load_tensors(
8057
8384
  }
8058
8385
  } break;
8059
8386
  case LLM_ARCH_QWEN2:
8387
+ case LLM_ARCH_QWEN2VL:
8060
8388
  {
8061
8389
  model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
8062
8390
 
@@ -8559,6 +8887,31 @@ static bool llm_load_tensors(
8559
8887
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
8560
8888
  }
8561
8889
  } break;
8890
+ case LLM_ARCH_OLMO2:
8891
+ {
8892
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
8893
+
8894
+ // output
8895
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
8896
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
8897
+
8898
+ for (int i = 0; i < n_layer; ++i) {
8899
+ auto & layer = model.layers[i];
8900
+
8901
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
8902
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
8903
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
8904
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
8905
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
8906
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
8907
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
8908
+
8909
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
8910
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
8911
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
8912
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
8913
+ }
8914
+ } break;
8562
8915
  case LLM_ARCH_OLMOE:
8563
8916
  {
8564
8917
  model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -8692,6 +9045,55 @@ static bool llm_load_tensors(
8692
9045
  layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
8693
9046
  }
8694
9047
  } break;
9048
+ case LLM_ARCH_DEEPSEEK:
9049
+ {
9050
+
9051
+ const int64_t n_ff_exp = hparams.n_ff_exp;
9052
+ const int64_t n_expert_shared = hparams.n_expert_shared;
9053
+
9054
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
9055
+
9056
+ // output
9057
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
9058
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
9059
+
9060
+ for (int i = 0; i < n_layer; ++i) {
9061
+ auto & layer = model.layers[i];
9062
+
9063
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
9064
+
9065
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
9066
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
9067
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
9068
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
9069
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
9070
+
9071
+ if (i < (int) hparams.n_layer_dense_lead) {
9072
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
9073
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
9074
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
9075
+ } else {
9076
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
9077
+
9078
+ if (n_expert == 0) {
9079
+ throw std::runtime_error("n_expert must be > 0");
9080
+ }
9081
+ if (n_expert_used == 0) {
9082
+ throw std::runtime_error("n_expert_used must be > 0");
9083
+ }
9084
+
9085
+ // MoE branch
9086
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
9087
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
9088
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
9089
+
9090
+ // Shared expert branch
9091
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
9092
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
9093
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
9094
+ }
9095
+ }
9096
+ } break;
8695
9097
  case LLM_ARCH_DEEPSEEK2:
8696
9098
  {
8697
9099
  const bool is_lite = (hparams.n_layer == 27);
@@ -9062,9 +9464,9 @@ static bool llm_load_tensors(
9062
9464
  } break;
9063
9465
  case LLM_ARCH_CHAMELEON:
9064
9466
  {
9065
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
9467
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
9066
9468
 
9067
- // output
9469
+ // output
9068
9470
  model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
9069
9471
  model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
9070
9472
  // if output is NULL, init from the input tok embed
@@ -9093,13 +9495,116 @@ static bool llm_load_tensors(
9093
9495
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
9094
9496
  }
9095
9497
  } break;
9096
- default:
9097
- throw std::runtime_error("unknown architecture");
9098
- }
9498
+ case LLM_ARCH_WAVTOKENIZER_DEC:
9499
+ {
9500
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
9099
9501
 
9100
- if (n_moved_tensors > 0) {
9101
- LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
9102
- __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
9502
+ model.conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
9503
+ model.conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
9504
+
9505
+ // posnet
9506
+ {
9507
+ const int64_t n_embd = hparams.posnet.n_embd;
9508
+
9509
+ for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
9510
+ auto & layer = model.layers[i].posnet;
9511
+
9512
+ // posnet:
9513
+ //
9514
+ // - resnet
9515
+ // - resnet
9516
+ // - attn
9517
+ // - resnet
9518
+ // - resnet
9519
+ // - norm
9520
+ //
9521
+ switch (i) {
9522
+ case 0:
9523
+ case 1:
9524
+ case 3:
9525
+ case 4:
9526
+ {
9527
+ layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
9528
+ layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
9529
+
9530
+ layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
9531
+ layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
9532
+
9533
+ layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
9534
+ layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
9535
+
9536
+ layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
9537
+ layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
9538
+ } break;
9539
+ case 2:
9540
+ {
9541
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
9542
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
9543
+
9544
+ layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
9545
+ layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
9546
+
9547
+ layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
9548
+ layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
9549
+
9550
+ layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
9551
+ layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
9552
+
9553
+ layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
9554
+ layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
9555
+ } break;
9556
+ case 5:
9557
+ {
9558
+ layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
9559
+ layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
9560
+ } break;
9561
+ default: GGML_ABORT("unknown posnet layer");
9562
+ };
9563
+ }
9564
+ }
9565
+
9566
+ GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
9567
+
9568
+ model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
9569
+ model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
9570
+
9571
+ // convnext
9572
+ {
9573
+ const int64_t n_embd = hparams.convnext.n_embd;
9574
+
9575
+ for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
9576
+ auto & layer = model.layers[i].convnext;
9577
+
9578
+ layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
9579
+ layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
9580
+
9581
+ layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
9582
+ layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
9583
+
9584
+ layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
9585
+ layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
9586
+
9587
+ layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
9588
+ layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
9589
+
9590
+ layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
9591
+ }
9592
+
9593
+ // output
9594
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
9595
+ model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
9596
+ }
9597
+
9598
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
9599
+ model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
9600
+ } break;
9601
+ default:
9602
+ throw std::runtime_error("unknown architecture");
9603
+ }
9604
+
9605
+ if (n_moved_tensors > 0) {
9606
+ LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
9607
+ __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
9103
9608
  ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
9104
9609
  }
9105
9610
  }
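The WavTokenizer tensor block above fixes the posnet layer schedule in code: as the inline comment there notes, layers 0, 1, 3 and 4 are resnet blocks, layer 2 is the attention block, and layer 5 is the final norm. A standalone sketch of that mapping:

#include <cassert>
#include <cstdint>

// Sketch of the fixed posnet layer schedule used when creating the
// WavTokenizer tensors above: resnet, resnet, attn, resnet, resnet, norm.
enum class posnet_block { RESNET, ATTN, NORM };

posnet_block posnet_block_kind(uint32_t i) {
    switch (i) {
        case 0: case 1: case 3: case 4: return posnet_block::RESNET;
        case 2:                         return posnet_block::ATTN;
        case 5:                         return posnet_block::NORM;
        default: assert(!"unknown posnet layer"); return posnet_block::NORM;
    }
}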
@@ -9133,7 +9638,7 @@ static bool llm_load_tensors(
9133
9638
  ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
9134
9639
  if (!dev) {
9135
9640
  // FIXME: workaround for CPU backend buft having a NULL device
9136
- dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0);
9641
+ dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
9137
9642
  }
9138
9643
  ggml_backend_dev_props props;
9139
9644
  ggml_backend_dev_get_props(dev, &props);
@@ -9312,6 +9817,7 @@ enum llm_ffn_gate_type {
9312
9817
  enum llm_norm_type {
9313
9818
  LLM_NORM,
9314
9819
  LLM_NORM_RMS,
9820
+ LLM_NORM_GROUP,
9315
9821
  };
9316
9822
 
9317
9823
  static struct ggml_tensor * llm_build_inp_embd(
@@ -9332,7 +9838,7 @@ static struct ggml_tensor * llm_build_inp_embd(
9332
9838
 
9333
9839
  inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
9334
9840
  } else {
9335
- lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
9841
+ lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
9336
9842
  inpL = lctx.inp_embd;
9337
9843
  ggml_set_input(lctx.inp_embd);
9338
9844
  }
@@ -9453,8 +9959,14 @@ static struct ggml_tensor * llm_build_norm(
9453
9959
  const llm_build_cb & cb,
9454
9960
  int il) {
9455
9961
  switch (type) {
9456
- case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break;
9457
- case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break;
9962
+ case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break;
9963
+ case LLM_NORM_RMS: cur = ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break;
9964
+ case LLM_NORM_GROUP:
9965
+ {
9966
+ cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
9967
+ cur = ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps);
9968
+ cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
9969
+ } break;
9458
9970
  }
9459
9971
 
9460
9972
  if (mw || mb) {
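The new LLM_NORM_GROUP case above runs ggml_group_norm on a 2D activation by temporarily giving it a third dimension and flattening it back afterwards. A minimal helper-style sketch of the same round trip, kept separate here purely for illustration; it mirrors the diff rather than adding behavior.

#include "ggml.h"

// Sketch: [n_embd, n_tokens] -> [n_embd, 1, n_tokens] -> group_norm -> [n_embd, n_tokens]
static struct ggml_tensor * group_norm_2d(struct ggml_context * ctx, struct ggml_tensor * cur,
                                          int n_groups, float eps) {
    cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
    cur = ggml_group_norm(ctx, cur, n_groups, eps);
    cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
    return cur;
}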
@@ -12421,6 +12933,124 @@ struct llm_build_context {
12421
12933
  return gf;
12422
12934
  }
12423
12935
 
12936
+ struct ggml_cgraph * build_qwen2vl() {
12937
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
12938
+ const int64_t n_embd_head = hparams.n_embd_head_v;
12939
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
12940
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
12941
+
12942
+ struct ggml_tensor * cur;
12943
+ struct ggml_tensor * inpL;
12944
+
12945
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
12946
+
12947
+ // inp_pos - contains the positions
12948
+ lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
12949
+ cb(lctx.inp_pos, "inp_pos", -1);
12950
+ ggml_set_input(lctx.inp_pos);
12951
+ struct ggml_tensor * inp_pos = lctx.inp_pos;
12952
+
12953
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
12954
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
12955
+ int sections[4];
12956
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
12957
+
12958
+ for (int il = 0; il < n_layer; ++il) {
12959
+ struct ggml_tensor * inpSA = inpL;
12960
+
12961
+ // norm
12962
+ cur = llm_build_norm(ctx0, inpL, hparams,
12963
+ model.layers[il].attn_norm, NULL,
12964
+ LLM_NORM_RMS, cb, il);
12965
+ cb(cur, "attn_norm", il);
12966
+
12967
+ // self-attention
12968
+ {
12969
+ // compute Q and K and RoPE them
12970
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
12971
+ cb(Qcur, "Qcur", il);
12972
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
12973
+ cb(Qcur, "Qcur", il);
12974
+
12975
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
12976
+ cb(Kcur, "Kcur", il);
12977
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
12978
+ cb(Kcur, "Kcur", il);
12979
+
12980
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
12981
+ cb(Vcur, "Vcur", il);
12982
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
12983
+ cb(Vcur, "Vcur", il);
12984
+
12985
+ Qcur = ggml_rope_multi(
12986
+ ctx0,
12987
+ ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
12988
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
12989
+ ext_factor, attn_factor, beta_fast, beta_slow
12990
+ );
12991
+ cb(Qcur, "Qcur", il);
12992
+
12993
+ Kcur = ggml_rope_multi(
12994
+ ctx0,
12995
+ ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
12996
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
12997
+ ext_factor, attn_factor, beta_fast, beta_slow
12998
+ );
12999
+ cb(Kcur, "Kcur", il);
13000
+
13001
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
13002
+ model.layers[il].wo, model.layers[il].bo,
13003
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
13004
+ }
13005
+
13006
+ if (il == n_layer - 1) {
13007
+ // skip computing output for unused tokens
13008
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
13009
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13010
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13011
+ }
13012
+
13013
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13014
+ cb(ffn_inp, "ffn_inp", il);
13015
+
13016
+ // feed-forward network
13017
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
13018
+ model.layers[il].ffn_norm, NULL,
13019
+ LLM_NORM_RMS, cb, il);
13020
+ cb(cur, "ffn_norm", il);
13021
+
13022
+ cur = llm_build_ffn(ctx0, lctx, cur,
13023
+ model.layers[il].ffn_up, NULL, NULL,
13024
+ model.layers[il].ffn_gate, NULL, NULL,
13025
+ model.layers[il].ffn_down, NULL, NULL,
13026
+ NULL,
13027
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
13028
+ cb(cur, "ffn_out", il);
13029
+
13030
+ cur = ggml_add(ctx0, cur, ffn_inp);
13031
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
13032
+ cb(cur, "l_out", il);
13033
+
13034
+ // input for next layer
13035
+ inpL = cur;
13036
+ }
13037
+
13038
+ cur = inpL;
13039
+
13040
+ cur = llm_build_norm(ctx0, cur, hparams,
13041
+ model.output_norm, NULL,
13042
+ LLM_NORM_RMS, cb, -1);
13043
+ cb(cur, "result_norm", -1);
13044
+
13045
+ // lm_head
13046
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
13047
+ cb(cur, "result_output", -1);
13048
+
13049
+ ggml_build_forward_expand(gf, cur);
13050
+
13051
+ return gf;
13052
+ }
13053
+
12424
13054
  struct ggml_cgraph * build_qwen2moe() {
12425
13055
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
12426
13056
 
@@ -12704,7 +13334,13 @@ struct llm_build_context {
12704
13334
  struct ggml_tensor * inp_pos = build_inp_pos();
12705
13335
 
12706
13336
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
12707
- struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
13337
+ struct ggml_tensor * KQ_mask = nullptr;
13338
+ if (hparams.n_swa == 0) {
13339
+ // Phi-4 doesn't use sliding window attention
13340
+ KQ_mask = build_inp_KQ_mask();
13341
+ } else {
13342
+ KQ_mask = build_inp_KQ_mask_swa();
13343
+ }
12708
13344
 
12709
13345
  for (int il = 0; il < n_layer; ++il) {
12710
13346
  auto residual = inpL;
@@ -12762,7 +13398,7 @@ struct llm_build_context {
12762
13398
 
12763
13399
  cur = llm_build_kv(ctx0, lctx, kv_self, gf,
12764
13400
  model.layers[il].wo, model.layers[il].bo,
12765
- Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
13401
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
12766
13402
  }
12767
13403
 
12768
13404
  if (il == n_layer - 1) {
@@ -13372,21 +14008,18 @@ struct llm_build_context {
13372
14008
  return gf;
13373
14009
  }
13374
14010
 
13375
- // ref: https://arxiv.org/abs/2203.03466
13376
- // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
13377
- // based on the original build_llama() function
13378
- struct ggml_cgraph * build_minicpm() {
14011
+ struct ggml_cgraph * build_minicpm3() {
13379
14012
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
13380
14013
 
13381
- const int64_t n_embd_head = hparams.n_embd_head_v;
13382
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13383
- GGML_ASSERT(n_embd_head == hparams.n_rot);
13384
-
13385
- const int64_t n_embd = hparams.n_embd;
13386
14014
  //TODO: if the model varies, these parameters need to be read from the model
13387
14015
  const int64_t n_embd_base = 256;
13388
14016
  const float scale_embd = 12.0f;
13389
14017
  const float scale_depth = 1.4f;
14018
+ const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
14019
+
14020
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
14021
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
14022
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
13390
14023
 
13391
14024
  struct ggml_tensor * cur;
13392
14025
  struct ggml_tensor * inpL;
@@ -13406,209 +14039,65 @@ struct llm_build_context {
13406
14039
  for (int il = 0; il < n_layer; ++il) {
13407
14040
  struct ggml_tensor * inpSA = inpL;
13408
14041
 
14042
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
13409
14043
  // norm
13410
14044
  cur = llm_build_norm(ctx0, inpL, hparams,
13411
14045
  model.layers[il].attn_norm, NULL,
13412
14046
  LLM_NORM_RMS, cb, il);
13413
14047
  cb(cur, "attn_norm", il);
13414
14048
 
13415
- // self-attention
14049
+ // self_attention
13416
14050
  {
13417
- // compute Q and K and RoPE them
13418
- struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
13419
- cb(Qcur, "Qcur", il);
13420
- if (model.layers[il].bq) {
13421
- Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
13422
- cb(Qcur, "Qcur", il);
13423
- }
13424
-
13425
- struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
13426
- cb(Kcur, "Kcur", il);
13427
- if (model.layers[il].bk) {
13428
- Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
13429
- cb(Kcur, "Kcur", il);
13430
- }
14051
+ struct ggml_tensor * q = NULL;
14052
+ // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
14053
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
14054
+ cb(q, "q", il);
13431
14055
 
13432
- struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
13433
- cb(Vcur, "Vcur", il);
13434
- if (model.layers[il].bv) {
13435
- Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
13436
- cb(Vcur, "Vcur", il);
13437
- }
14056
+ q = llm_build_norm(ctx0, q, hparams,
14057
+ model.layers[il].attn_q_a_norm, NULL,
14058
+ LLM_NORM_RMS, cb, il);
14059
+ cb(q, "q", il);
13438
14060
 
13439
- Qcur = ggml_rope_ext(
13440
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
13441
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13442
- ext_factor, attn_factor, beta_fast, beta_slow
13443
- );
13444
- cb(Qcur, "Qcur", il);
14061
+ // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
14062
+ q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
14063
+ cb(q, "q", il);
13445
14064
 
13446
- Kcur = ggml_rope_ext(
13447
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
13448
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13449
- ext_factor, attn_factor, beta_fast, beta_slow
13450
- );
13451
- cb(Kcur, "Kcur", il);
14065
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
14066
+ struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
14067
+ ggml_row_size(q->type, hparams.n_embd_head_k),
14068
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
14069
+ 0);
14070
+ cb(q_nope, "q_nope", il);
13452
14071
 
13453
- cur = llm_build_kv(ctx0, lctx, kv_self, gf,
13454
- model.layers[il].wo, model.layers[il].bo,
13455
- Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
13456
- }
14072
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
14073
+ struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
14074
+ ggml_row_size(q->type, hparams.n_embd_head_k),
14075
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
14076
+ ggml_row_size(q->type, n_embd_head_qk_nope));
14077
+ cb(q_pe, "q_pe", il);
13457
14078
 
13458
- if (il == n_layer - 1) {
13459
- // skip computing output for unused tokens
13460
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
13461
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
13462
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
13463
- }
14079
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
14080
+ struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
14081
+ cb(kv_pe_compresseed, "kv_pe_compresseed", il);
13464
14082
 
13465
- // scale_res - scale the hidden states for residual connection
13466
- const float scale_res = scale_depth/sqrtf(float(n_layer));
13467
- cur = ggml_scale(ctx0, cur, scale_res);
13468
- cb(cur, "hidden_scaled", -1);
14083
+ // split into {kv_lora_rank, n_tokens}
14084
+ struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
14085
+ kv_pe_compresseed->nb[1],
14086
+ 0);
14087
+ cb(kv_compressed, "kv_compressed", il);
13469
14088
 
13470
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
13471
- cb(ffn_inp, "ffn_inp", il);
14089
+ // and {n_embd_head_qk_rope, n_tokens}
14090
+ struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
14091
+ kv_pe_compresseed->nb[1],
14092
+ kv_pe_compresseed->nb[1],
14093
+ ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
14094
+ cb(k_pe, "k_pe", il);
13472
14095
 
13473
- // feed-forward network
13474
- {
13475
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
13476
- model.layers[il].ffn_norm, NULL,
14096
+ kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
14097
+ kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
14098
+ model.layers[il].attn_kv_a_norm, NULL,
13477
14099
  LLM_NORM_RMS, cb, il);
13478
- cb(cur, "ffn_norm", il);
13479
-
13480
- cur = llm_build_ffn(ctx0, lctx, cur,
13481
- model.layers[il].ffn_up, NULL, NULL,
13482
- model.layers[il].ffn_gate, NULL, NULL,
13483
- model.layers[il].ffn_down, NULL, NULL,
13484
- NULL,
13485
- LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
13486
- cb(cur, "ffn_out", il);
13487
- }
13488
-
13489
- // scale the hidden states for residual connection
13490
- cur = ggml_scale(ctx0, cur, scale_res);
13491
- cb(cur, "hidden_scaled_ffn", -1);
13492
-
13493
- cur = ggml_add(ctx0, cur, ffn_inp);
13494
- cur = lctx.cvec.apply_to(ctx0, cur, il);
13495
- cb(cur, "l_out", il);
13496
-
13497
- // input for next layer
13498
- inpL = cur;
13499
- }
13500
-
13501
- cur = inpL;
13502
-
13503
- cur = llm_build_norm(ctx0, cur, hparams,
13504
- model.output_norm, NULL,
13505
- LLM_NORM_RMS, cb, -1);
13506
- cb(cur, "result_norm", -1);
13507
-
13508
- // lm_head scaling
13509
- const float scale_lmhead = float(n_embd_base)/float(n_embd);
13510
- cur = ggml_scale(ctx0, cur, scale_lmhead);
13511
- cb(cur, "lmhead_scaling", -1);
13512
-
13513
- // lm_head
13514
- cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
13515
- cb(cur, "result_output", -1);
13516
-
13517
- ggml_build_forward_expand(gf, cur);
13518
-
13519
- return gf;
13520
- }
13521
-
13522
- struct ggml_cgraph * build_minicpm3() {
13523
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
13524
-
13525
- //TODO: if the model varies, these parameters need to be read from the model
13526
- const int64_t n_embd_base = 256;
13527
- const float scale_embd = 12.0f;
13528
- const float scale_depth = 1.4f;
13529
- const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
13530
-
13531
- const uint32_t n_embd_head_qk_rope = hparams.n_rot;
13532
- const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
13533
- const uint32_t kv_lora_rank = hparams.n_lora_kv;
13534
-
13535
- struct ggml_tensor * cur;
13536
- struct ggml_tensor * inpL;
13537
-
13538
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
13539
-
13540
- // scale the input embeddings
13541
- inpL = ggml_scale(ctx0, inpL, scale_embd);
13542
- cb(inpL, "inp_scaled", -1);
13543
-
13544
- // inp_pos - contains the positions
13545
- struct ggml_tensor * inp_pos = build_inp_pos();
13546
-
13547
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
13548
- struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
13549
-
13550
- for (int il = 0; il < n_layer; ++il) {
13551
- struct ggml_tensor * inpSA = inpL;
13552
-
13553
- struct ggml_tensor * rope_factors = build_rope_factors(il);
13554
- // norm
13555
- cur = llm_build_norm(ctx0, inpL, hparams,
13556
- model.layers[il].attn_norm, NULL,
13557
- LLM_NORM_RMS, cb, il);
13558
- cb(cur, "attn_norm", il);
13559
-
13560
- // self_attention
13561
- {
13562
- struct ggml_tensor * q = NULL;
13563
- // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
13564
- q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
13565
- cb(q, "q", il);
13566
-
13567
- q = llm_build_norm(ctx0, q, hparams,
13568
- model.layers[il].attn_q_a_norm, NULL,
13569
- LLM_NORM_RMS, cb, il);
13570
- cb(q, "q", il);
13571
-
13572
- // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
13573
- q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
13574
- cb(q, "q", il);
13575
-
13576
- // split into {n_head * n_embd_head_qk_nope, n_tokens}
13577
- struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
13578
- ggml_row_size(q->type, hparams.n_embd_head_k),
13579
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
13580
- 0);
13581
- cb(q_nope, "q_nope", il);
13582
-
13583
- // and {n_head * n_embd_head_qk_rope, n_tokens}
13584
- struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
13585
- ggml_row_size(q->type, hparams.n_embd_head_k),
13586
- ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
13587
- ggml_row_size(q->type, n_embd_head_qk_nope));
13588
- cb(q_pe, "q_pe", il);
13589
-
13590
- // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
13591
- struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
13592
- cb(kv_pe_compresseed, "kv_pe_compresseed", il);
13593
-
13594
- // split into {kv_lora_rank, n_tokens}
13595
- struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
13596
- kv_pe_compresseed->nb[1],
13597
- 0);
13598
- cb(kv_compressed, "kv_compressed", il);
13599
-
13600
- // and {n_embd_head_qk_rope, n_tokens}
13601
- struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
13602
- kv_pe_compresseed->nb[1],
13603
- kv_pe_compresseed->nb[1],
13604
- ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
13605
- cb(k_pe, "k_pe", il);
13606
-
13607
- kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
13608
- kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
13609
- model.layers[il].attn_kv_a_norm, NULL,
13610
- LLM_NORM_RMS, cb, il);
13611
- cb(kv_compressed, "kv_compressed", il);
14100
+ cb(kv_compressed, "kv_compressed", il);
13612
14101
 
13613
14102
  // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
13614
14103
  struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
@@ -14424,6 +14913,130 @@ struct llm_build_context {
14424
14913
  return gf;
14425
14914
  }
14426
14915
 
14916
+ struct ggml_cgraph * build_olmo2() {
14917
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
14918
+
14919
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
14920
+ int32_t n_tokens = this->n_tokens;
14921
+
14922
+ const int64_t n_embd_head = hparams.n_embd_head_v;
14923
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
14924
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
14925
+
14926
+ struct ggml_tensor * cur;
14927
+ struct ggml_tensor * inpL;
14928
+
14929
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
14930
+
14931
+ // inp_pos - contains the positions
14932
+ struct ggml_tensor * inp_pos = build_inp_pos();
14933
+
14934
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
14935
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
14936
+
14937
+ for (int il = 0; il < n_layer; ++il) {
14938
+ struct ggml_tensor * inpSA = inpL;
14939
+
14940
+ cur = inpL;
14941
+
14942
+ // self_attention
14943
+ {
14944
+ // compute Q and K and RoPE them
14945
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
14946
+ cb(Qcur, "Qcur", il);
14947
+
14948
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
14949
+ cb(Kcur, "Kcur", il);
14950
+
14951
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
14952
+ cb(Vcur, "Vcur", il);
14953
+
14954
+ Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
14955
+ LLM_NORM_RMS, cb, il);
14956
+ cb(Qcur, "Qcur_normed", il);
14957
+
14958
+ Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
14959
+ LLM_NORM_RMS, cb, il);
14960
+ cb(Kcur, "Kcur_normed", il);
14961
+
14962
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
14963
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
14964
+
14965
+ Qcur = ggml_rope_ext(
14966
+ ctx0, Qcur, inp_pos, nullptr,
14967
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14968
+ ext_factor, attn_factor, beta_fast, beta_slow
14969
+ );
14970
+ cb(Qcur, "Qcur_rope", il);
14971
+
14972
+ Kcur = ggml_rope_ext(
14973
+ ctx0, Kcur, inp_pos, nullptr,
14974
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
14975
+ ext_factor, attn_factor, beta_fast, beta_slow
14976
+ );
14977
+ cb(Kcur, "Kcur_rope", il);
14978
+
14979
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
14980
+ model.layers[il].wo, NULL,
14981
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
14982
+ }
14983
+
14984
+ cur = llm_build_norm(ctx0, cur, hparams,
14985
+ model.layers[il].attn_post_norm, NULL,
14986
+ LLM_NORM_RMS, cb, il);
14987
+ cb(cur, "attn_post_norm", il);
14988
+
14989
+ if (il == n_layer - 1) {
14990
+ // skip computing output for unused tokens
14991
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
14992
+ n_tokens = n_outputs;
14993
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
14994
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
14995
+ }
14996
+
14997
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
14998
+ cb(ffn_inp, "ffn_inp", il);
14999
+
15000
+ // feed-forward network
15001
+ cur = llm_build_ffn(ctx0, lctx, ffn_inp,
15002
+ model.layers[il].ffn_up, NULL, NULL,
15003
+ model.layers[il].ffn_gate, NULL, NULL,
15004
+ model.layers[il].ffn_down, NULL, NULL,
15005
+ NULL,
15006
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
15007
+ cb(cur, "ffn_out", il);
15008
+
15009
+ cur = llm_build_norm(ctx0, cur, hparams,
15010
+ model.layers[il].ffn_post_norm, NULL,
15011
+ LLM_NORM_RMS, cb, -1);
15012
+ cb(cur, "ffn_post_norm", -1);
15013
+
15014
+ cur = ggml_add(ctx0, cur, ffn_inp);
15015
+ cb(cur, "ffn_out", il);
15016
+
15017
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
15018
+ cb(cur, "l_out", il);
15019
+
15020
+ // input for next layer
15021
+ inpL = cur;
15022
+ }
15023
+
15024
+ cur = inpL;
15025
+
15026
+ cur = llm_build_norm(ctx0, cur, hparams,
15027
+ model.output_norm, NULL,
15028
+ LLM_NORM_RMS, cb, -1);
15029
+ cb(cur, "result_norm", -1);
15030
+
15031
+ // lm_head
15032
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
15033
+ cb(cur, "result_output", -1);
15034
+
15035
+ ggml_build_forward_expand(gf, cur);
15036
+
15037
+ return gf;
15038
+ }
15039
+
14427
15040
  // based on the build_qwen2moe() function, changes:
14428
15041
  // * removed shared experts
14429
15042
  // * removed bias
@@ -14905,29 +15518,183 @@ struct llm_build_context {
14905
15518
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
14906
15519
  cb(cur, "ffn_out", il);
14907
15520
 
14908
- struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
14909
- cb(ffn_out, "ffn_out", il);
15521
+ struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
15522
+ cb(ffn_out, "ffn_out", il);
15523
+
15524
+ // MoE
15525
+ cur = llm_build_norm(ctx0, inpSA, hparams,
15526
+ model.layers[il].ffn_norm_exps, NULL,
15527
+ LLM_NORM_RMS, cb, il);
15528
+ cb(cur, "ffn_norm_exps", il);
15529
+
15530
+ cur = llm_build_moe_ffn(ctx0, lctx, cur,
15531
+ model.layers[il].ffn_gate_inp,
15532
+ model.layers[il].ffn_up_exps,
15533
+ model.layers[il].ffn_gate_exps,
15534
+ model.layers[il].ffn_down_exps,
15535
+ n_expert, n_expert_used,
15536
+ LLM_FFN_SILU, true,
15537
+ false, 0.0,
15538
+ cb, il);
15539
+ cb(cur, "ffn_moe_out", il);
15540
+
15541
+ cur = ggml_add(ctx0, cur, ffn_out);
15542
+ cb(cur, "ffn_out", il);
15543
+
15544
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
15545
+ cb(cur, "l_out", il);
15546
+
15547
+ // input for next layer
15548
+ inpL = cur;
15549
+ }
15550
+
15551
+ cur = inpL;
15552
+
15553
+ cur = llm_build_norm(ctx0, cur, hparams,
15554
+ model.output_norm, NULL,
15555
+ LLM_NORM_RMS, cb, -1);
15556
+ cb(cur, "result_norm", -1);
15557
+
15558
+ // lm_head
15559
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
15560
+ cb(cur, "result_output", -1);
15561
+
15562
+ ggml_build_forward_expand(gf, cur);
15563
+
15564
+ return gf;
15565
+ }
15566
+
15567
+ struct ggml_cgraph * build_deepseek() {
15568
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
15569
+
15570
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
15571
+ int32_t n_tokens = this->n_tokens;
15572
+
15573
+ const int64_t n_embd_head = hparams.n_embd_head_v;
15574
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
15575
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
15576
+
15577
+ struct ggml_tensor * cur;
15578
+ struct ggml_tensor * inpL;
15579
+
15580
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
15581
+
15582
+ // inp_pos - contains the positions
15583
+ struct ggml_tensor * inp_pos = build_inp_pos();
15584
+
15585
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
15586
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
15587
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
15588
+ for (int il = 0; il < n_layer; ++il) {
15589
+ struct ggml_tensor * inpSA = inpL;
15590
+
15591
+ // norm
15592
+ cur = llm_build_norm(ctx0, inpL, hparams,
15593
+ model.layers[il].attn_norm, NULL,
15594
+ LLM_NORM_RMS, cb, il);
15595
+ cb(cur, "attn_norm", il);
15596
+
15597
+ // self-attention
15598
+ {
15599
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
15600
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
15601
+
15602
+ // compute Q and K and RoPE them
15603
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
15604
+ cb(Qcur, "Qcur", il);
15605
+ if (model.layers[il].bq) {
15606
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
15607
+ cb(Qcur, "Qcur", il);
15608
+ }
15609
+
15610
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
15611
+ cb(Kcur, "Kcur", il);
15612
+ if (model.layers[il].bk) {
15613
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
15614
+ cb(Kcur, "Kcur", il);
15615
+ }
15616
+
15617
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
15618
+ cb(Vcur, "Vcur", il);
15619
+ if (model.layers[il].bv) {
15620
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
15621
+ cb(Vcur, "Vcur", il);
15622
+ }
15623
+
15624
+ Qcur = ggml_rope_ext(
15625
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
15626
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15627
+ ext_factor, attn_factor, beta_fast, beta_slow
15628
+ );
15629
+ cb(Qcur, "Qcur", il);
15630
+
15631
+ Kcur = ggml_rope_ext(
15632
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
15633
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15634
+ ext_factor, attn_factor, beta_fast, beta_slow
15635
+ );
15636
+ cb(Kcur, "Kcur", il);
15637
+
15638
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
15639
+ model.layers[il].wo, model.layers[il].bo,
15640
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
15641
+ }
15642
+
15643
+ if (il == n_layer - 1) {
15644
+ // skip computing output for unused tokens
15645
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
15646
+ n_tokens = n_outputs;
15647
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
15648
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
15649
+ }
15650
+
14910
15651
 
14911
- // MoE
14912
- cur = llm_build_norm(ctx0, inpSA, hparams,
14913
- model.layers[il].ffn_norm_exps, NULL,
15652
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
15653
+ cb(ffn_inp, "ffn_inp", il);
15654
+
15655
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
15656
+ model.layers[il].ffn_norm, NULL,
14914
15657
  LLM_NORM_RMS, cb, il);
14915
- cb(cur, "ffn_norm_exps", il);
15658
+ cb(cur, "ffn_norm", il);
14916
15659
 
14917
- cur = llm_build_moe_ffn(ctx0, lctx, cur,
14918
- model.layers[il].ffn_gate_inp,
14919
- model.layers[il].ffn_up_exps,
14920
- model.layers[il].ffn_gate_exps,
14921
- model.layers[il].ffn_down_exps,
14922
- n_expert, n_expert_used,
14923
- LLM_FFN_SILU, true,
14924
- false, 0.0,
14925
- cb, il);
14926
- cb(cur, "ffn_moe_out", il);
15660
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
15661
+ cur = llm_build_ffn(ctx0, lctx, cur,
15662
+ model.layers[il].ffn_up, NULL, NULL,
15663
+ model.layers[il].ffn_gate, NULL, NULL,
15664
+ model.layers[il].ffn_down, NULL, NULL,
15665
+ NULL,
15666
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
15667
+ cb(cur, "ffn_out", il);
15668
+ } else {
15669
+ // MoE branch
15670
+ ggml_tensor * moe_out =
15671
+ llm_build_moe_ffn(ctx0, lctx, cur,
15672
+ model.layers[il].ffn_gate_inp,
15673
+ model.layers[il].ffn_up_exps,
15674
+ model.layers[il].ffn_gate_exps,
15675
+ model.layers[il].ffn_down_exps,
15676
+ n_expert, n_expert_used,
15677
+ LLM_FFN_SILU, false,
15678
+ false, hparams.expert_weights_scale,
15679
+ cb, il);
15680
+ cb(moe_out, "ffn_moe_out", il);
14927
15681
 
14928
- cur = ggml_add(ctx0, cur, ffn_out);
14929
- cb(cur, "ffn_out", il);
15682
+ // FFN shared expert
15683
+ {
15684
+ ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
15685
+ model.layers[il].ffn_up_shexp, NULL, NULL,
15686
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
15687
+ model.layers[il].ffn_down_shexp, NULL, NULL,
15688
+ NULL,
15689
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
15690
+ cb(ffn_shexp, "ffn_shexp", il);
15691
+
15692
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
15693
+ cb(cur, "ffn_out", il);
15694
+ }
15695
+ }
14930
15696
 
15697
+ cur = ggml_add(ctx0, cur, ffn_inp);
14931
15698
  cur = lctx.cvec.apply_to(ctx0, cur, il);
14932
15699
  cb(cur, "l_out", il);
14933
15700
 
@@ -14944,6 +15711,7 @@ struct llm_build_context {
14944
15711
 
14945
15712
  // lm_head
14946
15713
  cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
15714
+
14947
15715
  cb(cur, "result_output", -1);
14948
15716
 
14949
15717
  ggml_build_forward_expand(gf, cur);
@@ -15330,7 +16098,7 @@ struct llm_build_context {
15330
16098
  return gf;
15331
16099
  }
15332
16100
 
15333
- struct ggml_cgraph * build_t5_encoder() {
16101
+ struct ggml_cgraph * build_t5_enc() {
15334
16102
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
15335
16103
 
15336
16104
  // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -15462,7 +16230,7 @@ struct llm_build_context {
15462
16230
  return gf;
15463
16231
  }
15464
16232
 
15465
- struct ggml_cgraph * build_t5_decoder() {
16233
+ struct ggml_cgraph * build_t5_dec() {
15466
16234
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
15467
16235
 
15468
16236
  // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -16411,6 +17179,158 @@ struct llm_build_context {
16411
17179
 
16412
17180
  return gf;
16413
17181
  }
17182
+
17183
+ struct ggml_cgraph * build_wavtokenizer_dec() {
17184
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
17185
+
17186
+ struct ggml_tensor * cur;
17187
+ struct ggml_tensor * inpL;
17188
+
17189
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
17190
+
17191
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
17192
+
17193
+ cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
17194
+ cur = ggml_add(ctx0, cur, model.conv1d_b);
17195
+
17196
+ // posnet
17197
+ for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
17198
+ const auto & layer = model.layers[il].posnet;
17199
+
17200
+ inpL = cur;
17201
+
17202
+ switch (il) {
17203
+ case 0:
17204
+ case 1:
17205
+ case 3:
17206
+ case 4:
17207
+ {
17208
+ cur = llm_build_norm(ctx0, cur, hparams,
17209
+ layer.norm1,
17210
+ layer.norm1_b,
17211
+ LLM_NORM_GROUP, cb, 0);
17212
+
17213
+ cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
17214
+
17215
+ cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
17216
+ cur = ggml_add(ctx0, cur, layer.conv1_b);
17217
+
17218
+ cur = llm_build_norm(ctx0, cur, hparams,
17219
+ layer.norm2,
17220
+ layer.norm2_b,
17221
+ LLM_NORM_GROUP, cb, 0);
17222
+
17223
+ cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
17224
+
17225
+ cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
17226
+ cur = ggml_add(ctx0, cur, layer.conv2_b);
17227
+
17228
+ cur = ggml_add(ctx0, cur, inpL);
17229
+ } break;
17230
+ case 2:
17231
+ {
17232
+ cur = llm_build_norm(ctx0, cur, hparams,
17233
+ layer.attn_norm,
17234
+ layer.attn_norm_b,
17235
+ LLM_NORM_GROUP, cb, 0);
17236
+
17237
+ struct ggml_tensor * q;
17238
+ struct ggml_tensor * k;
17239
+ struct ggml_tensor * v;
17240
+
17241
+ q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
17242
+ k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
17243
+ v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
17244
+
17245
+ q = ggml_add(ctx0, q, layer.attn_q_b);
17246
+ k = ggml_add(ctx0, k, layer.attn_k_b);
17247
+ v = ggml_add(ctx0, v, layer.attn_v_b);
17248
+
17249
+ q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
17250
+ k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
17251
+
17252
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
17253
+
17254
+ kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
17255
+
17256
+ cur = ggml_mul_mat(ctx0, kq, v);
17257
+
17258
+ cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
17259
+ cur = ggml_add(ctx0, cur, layer.attn_o_b);
17260
+
17261
+ cur = ggml_add(ctx0, cur, inpL);
17262
+ } break;
17263
+ case 5:
17264
+ {
17265
+ cur = llm_build_norm(ctx0, cur, hparams,
17266
+ layer.norm,
17267
+ layer.norm_b,
17268
+ LLM_NORM_GROUP, cb, 0);
17269
+ } break;
17270
+ default: GGML_ABORT("unknown posnet layer");
17271
+ };
17272
+ }
17273
+
17274
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
17275
+
17276
+ cur = llm_build_norm(ctx0, cur, hparams,
17277
+ model.tok_norm,
17278
+ model.tok_norm_b,
17279
+ LLM_NORM, cb, -1);
17280
+
17281
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
17282
+
17283
+ inpL = cur;
17284
+
17285
+ // convnext
17286
+ for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
17287
+ const auto & layer = model.layers[il].convnext;
17288
+
17289
+ cur = inpL;
17290
+
17291
+ cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
17292
+ cur = ggml_add(ctx0, cur, layer.dw_b);
17293
+
17294
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
17295
+
17296
+ cur = llm_build_norm(ctx0, cur, hparams,
17297
+ layer.norm,
17298
+ layer.norm_b,
17299
+ LLM_NORM, cb, -1);
17300
+
17301
+ cur = llm_build_ffn(ctx0, lctx, cur,
17302
+ layer.pw1, layer.pw1_b, NULL,
17303
+ NULL, NULL, NULL,
17304
+ layer.pw2, layer.pw2_b, NULL,
17305
+ NULL,
17306
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
17307
+
17308
+ cur = ggml_mul(ctx0, cur, layer.gamma);
17309
+
17310
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
17311
+
17312
+ inpL = ggml_add(ctx0, cur, inpL);
17313
+ }
17314
+
17315
+ cur = inpL;
17316
+
17317
+ cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
17318
+
17319
+ cur = llm_build_norm(ctx0, cur, hparams,
17320
+ model.output_norm,
17321
+ model.output_norm_b,
17322
+ LLM_NORM, cb, -1);
17323
+
17324
+ // lm_head
17325
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
17326
+
17327
+ cur = ggml_add(ctx0, cur, model.output_b);
17328
+ cb(cur, "result_embd", -1);
17329
+
17330
+ ggml_build_forward_expand(gf, cur);
17331
+
17332
+ return gf;
17333
+ }
16414
17334
  };
16415
17335
 
16416
17336
  static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -16493,6 +17413,7 @@ static struct ggml_cgraph * llama_build_graph(
16493
17413
 
16494
17414
  switch (model.arch) {
16495
17415
  case LLM_ARCH_LLAMA:
17416
+ case LLM_ARCH_MINICPM:
16496
17417
  case LLM_ARCH_GRANITE:
16497
17418
  case LLM_ARCH_GRANITE_MOE:
16498
17419
  {
@@ -16544,6 +17465,11 @@ static struct ggml_cgraph * llama_build_graph(
16544
17465
  {
16545
17466
  result = llm.build_qwen2();
16546
17467
  } break;
17468
+ case LLM_ARCH_QWEN2VL:
17469
+ {
17470
+ lctx.n_pos_per_token = 4;
17471
+ result = llm.build_qwen2vl();
17472
+ } break;
16547
17473
  case LLM_ARCH_QWEN2MOE:
16548
17474
  {
16549
17475
  result = llm.build_qwen2moe();
@@ -16576,10 +17502,6 @@ static struct ggml_cgraph * llama_build_graph(
16576
17502
  {
16577
17503
  result = llm.build_internlm2();
16578
17504
  } break;
16579
- case LLM_ARCH_MINICPM:
16580
- {
16581
- result = llm.build_minicpm();
16582
- } break;
16583
17505
  case LLM_ARCH_MINICPM3:
16584
17506
  {
16585
17507
  result = llm.build_minicpm3();
@@ -16616,6 +17538,10 @@ static struct ggml_cgraph * llama_build_graph(
16616
17538
  {
16617
17539
  result = llm.build_olmo();
16618
17540
  } break;
17541
+ case LLM_ARCH_OLMO2:
17542
+ {
17543
+ result = llm.build_olmo2();
17544
+ } break;
16619
17545
  case LLM_ARCH_OLMOE:
16620
17546
  {
16621
17547
  result = llm.build_olmoe();
@@ -16632,6 +17558,10 @@ static struct ggml_cgraph * llama_build_graph(
16632
17558
  {
16633
17559
  result = llm.build_arctic();
16634
17560
  } break;
17561
+ case LLM_ARCH_DEEPSEEK:
17562
+ {
17563
+ result = llm.build_deepseek();
17564
+ } break;
16635
17565
  case LLM_ARCH_DEEPSEEK2:
16636
17566
  {
16637
17567
  result = llm.build_deepseek2();
@@ -16647,14 +17577,14 @@ static struct ggml_cgraph * llama_build_graph(
16647
17577
  case LLM_ARCH_T5:
16648
17578
  {
16649
17579
  if (lctx.is_encoding) {
16650
- result = llm.build_t5_encoder();
17580
+ result = llm.build_t5_enc();
16651
17581
  } else {
16652
- result = llm.build_t5_decoder();
17582
+ result = llm.build_t5_dec();
16653
17583
  }
16654
17584
  } break;
16655
17585
  case LLM_ARCH_T5ENCODER:
16656
17586
  {
16657
- result = llm.build_t5_encoder();
17587
+ result = llm.build_t5_enc();
16658
17588
  } break;
16659
17589
  case LLM_ARCH_JAIS:
16660
17590
  {
@@ -16676,6 +17606,10 @@ static struct ggml_cgraph * llama_build_graph(
16676
17606
  {
16677
17607
  result = llm.build_chameleon();
16678
17608
  } break;
17609
+ case LLM_ARCH_WAVTOKENIZER_DEC:
17610
+ {
17611
+ result = llm.build_wavtokenizer_dec();
17612
+ } break;
16679
17613
  default:
16680
17614
  GGML_ABORT("fatal error");
16681
17615
  }
@@ -16762,35 +17696,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
16762
17696
 
16763
17697
  if (ubatch.pos && lctx.inp_pos) {
16764
17698
  const int64_t n_tokens = ubatch.n_tokens;
16765
-
16766
- ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
17699
+ auto n_pos = lctx.n_pos_per_token;
17700
+ ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
16767
17701
  }
16768
17702
 
16769
17703
  if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
16770
- GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
16771
- const int64_t n_tokens = ubatch.n_tokens;
17704
+ //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
17705
+
17706
+ if (!lctx.inp_out_ids) {
17707
+ LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__);
17708
+ } else {
17709
+ const int64_t n_tokens = ubatch.n_tokens;
16772
17710
 
16773
- GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
16774
- int32_t * data = (int32_t *) lctx.inp_out_ids->data;
17711
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
17712
+ int32_t * data = (int32_t *) lctx.inp_out_ids->data;
16775
17713
 
16776
- if (lctx.n_outputs == n_tokens) {
16777
- for (int i = 0; i < n_tokens; ++i) {
16778
- data[i] = i;
16779
- }
16780
- } else if (ubatch.output) {
16781
- int32_t n_outputs = 0;
16782
- for (int i = 0; i < n_tokens; ++i) {
16783
- if (ubatch.output[i]) {
16784
- data[n_outputs++] = i;
17714
+ if (lctx.n_outputs == n_tokens) {
17715
+ for (int i = 0; i < n_tokens; ++i) {
17716
+ data[i] = i;
16785
17717
  }
17718
+ } else if (ubatch.output) {
17719
+ int32_t n_outputs = 0;
17720
+ for (int i = 0; i < n_tokens; ++i) {
17721
+ if (ubatch.output[i]) {
17722
+ data[n_outputs++] = i;
17723
+ }
17724
+ }
17725
+ // the graph needs to have been passed the correct number of outputs
17726
+ GGML_ASSERT(lctx.n_outputs == n_outputs);
17727
+ } else if (lctx.n_outputs == 1) {
17728
+ // only keep last output
17729
+ data[0] = n_tokens - 1;
17730
+ } else {
17731
+ GGML_ASSERT(lctx.n_outputs == 0);
16786
17732
  }
16787
- // the graph needs to have been passed the correct number of outputs
16788
- GGML_ASSERT(lctx.n_outputs == n_outputs);
16789
- } else if (lctx.n_outputs == 1) {
16790
- // only keep last output
16791
- data[0] = n_tokens - 1;
16792
- } else {
16793
- GGML_ASSERT(lctx.n_outputs == 0);
16794
17733
  }
16795
17734
  }
16796
17735
 
@@ -17258,8 +18197,9 @@ static enum ggml_status llama_graph_compute(
17258
18197
  int n_threads,
17259
18198
  ggml_threadpool * threadpool) {
17260
18199
  if (lctx.backend_cpu != nullptr) {
17261
- ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
17262
- ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
18200
+ auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
18201
+ auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
18202
+ set_threadpool_fn(lctx.backend_cpu, threadpool);
17263
18203
  }
17264
18204
 
17265
18205
  // set the number of threads for all the backends
@@ -17460,6 +18400,7 @@ static int llama_decode_internal(
17460
18400
  embd = nullptr; // do not extract embeddings when not needed
17461
18401
  GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
17462
18402
  }
18403
+
17463
18404
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
17464
18405
 
17465
18406
  ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
@@ -18026,13 +18967,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
18026
18967
  static void llama_kv_cache_update_internal(struct llama_context & lctx) {
18027
18968
  bool need_reserve = false;
18028
18969
 
18029
- // apply K-shift if needed
18030
- if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
18031
- if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
18032
- GGML_ABORT("Deepseek2 does not support K-shift");
18970
+ if (lctx.kv_self.has_shift) {
18971
+ if (!llama_kv_cache_can_shift(&lctx)) {
18972
+ GGML_ABORT("The current context does not support K-shift");
18033
18973
  }
18034
18974
 
18035
- {
18975
+ // apply K-shift if needed
18976
+ if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
18036
18977
  ggml_backend_sched_reset(lctx.sched.get());
18037
18978
 
18038
18979
  ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
@@ -18247,10 +19188,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18247
19188
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
18248
19189
  new_type = GGML_TYPE_IQ3_S;
18249
19190
  }
18250
- else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
18251
- new_type == GGML_TYPE_Q4_0_8_8) {
18252
- new_type = GGML_TYPE_Q4_0;
18253
- }
18254
19191
  else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
18255
19192
  new_type = GGML_TYPE_Q4_K;
18256
19193
  }
@@ -18573,9 +19510,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
18573
19510
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
18574
19511
  case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
18575
19512
  case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
18576
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
18577
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
18578
- case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
18579
19513
 
18580
19514
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
18581
19515
  }
@@ -18914,14 +19848,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
18914
19848
  f32_data = (float *) f32_conv_buf.data();
18915
19849
  }
18916
19850
 
18917
- int chunk_size_multiplier = 1;
18918
- if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
18919
- if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
18920
- else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
18921
- if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
18922
- else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
18923
- }
18924
-
18925
19851
  LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
18926
19852
  fflush(stdout);
18927
19853
 
@@ -18934,8 +19860,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
18934
19860
  const int64_t nrows = tensor->ne[1];
18935
19861
 
18936
19862
  static const int64_t min_chunk_size = 32 * 512;
18937
- const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
18938
- chunk_size_multiplier;
19863
+ const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
18939
19864
 
18940
19865
  const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
18941
19866
  const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
@@ -19176,6 +20101,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
19176
20101
  //
19177
20102
  struct llama_model_params llama_model_default_params() {
19178
20103
  struct llama_model_params result = {
20104
+ /*.devices =*/ nullptr,
19179
20105
  /*.n_gpu_layers =*/ 0,
19180
20106
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
19181
20107
  /*.main_gpu =*/ 0,
@@ -19293,7 +20219,11 @@ void llama_backend_init(void) {
19293
20219
 
19294
20220
  void llama_numa_init(enum ggml_numa_strategy numa) {
19295
20221
  if (numa != GGML_NUMA_STRATEGY_DISABLED) {
19296
- ggml_numa_init(numa);
20222
+ auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
20223
+ GGML_ASSERT(dev && "CPU backend is not loaded");
20224
+ auto * reg = ggml_backend_dev_backend_reg(dev);
20225
+ auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
20226
+ numa_init_fn(numa);
19297
20227
  }
19298
20228
  }
19299
20229
 
@@ -19384,19 +20314,24 @@ struct llama_model * llama_load_model_from_file(
19384
20314
  }
19385
20315
 
19386
20316
  // create list of devices to use with this model
19387
- // currently, we use all available devices
19388
- // TODO: rework API to give user more control over device selection
19389
- for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
19390
- ggml_backend_dev_t dev = ggml_backend_dev_get(i);
19391
- switch (ggml_backend_dev_type(dev)) {
19392
- case GGML_BACKEND_DEVICE_TYPE_CPU:
19393
- case GGML_BACKEND_DEVICE_TYPE_ACCEL:
19394
- // skip CPU backends since they are handled separately
19395
- break;
20317
+ if (params.devices) {
20318
+ for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
20319
+ model->devices.push_back(*dev);
20320
+ }
20321
+ } else {
20322
+ // use all available devices
20323
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
20324
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
20325
+ switch (ggml_backend_dev_type(dev)) {
20326
+ case GGML_BACKEND_DEVICE_TYPE_CPU:
20327
+ case GGML_BACKEND_DEVICE_TYPE_ACCEL:
20328
+ // skip CPU backends since they are handled separately
20329
+ break;
19396
20330
 
19397
- case GGML_BACKEND_DEVICE_TYPE_GPU:
19398
- model->devices.push_back(dev);
19399
- break;
20331
+ case GGML_BACKEND_DEVICE_TYPE_GPU:
20332
+ model->devices.push_back(dev);
20333
+ break;
20334
+ }
19400
20335
  }
19401
20336
  }
19402
20337
 
@@ -19567,9 +20502,6 @@ struct llama_context * llama_new_context_with_model(
19567
20502
  __func__, n_ctx_per_seq, hparams.n_ctx_train);
19568
20503
  }
19569
20504
 
19570
- ctx->abort_callback = params.abort_callback;
19571
- ctx->abort_callback_data = params.abort_callback_data;
19572
-
19573
20505
  ctx->logits_all = params.logits_all;
19574
20506
 
19575
20507
  // build worst-case graph for encoder if a model contains encoder
@@ -19618,7 +20550,7 @@ struct llama_context * llama_new_context_with_model(
19618
20550
  }
19619
20551
 
19620
20552
  // add CPU backend
19621
- ctx->backend_cpu = ggml_backend_cpu_init();
20553
+ ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
19622
20554
  if (ctx->backend_cpu == nullptr) {
19623
20555
  LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
19624
20556
  llama_free(ctx);
@@ -19638,6 +20570,8 @@ struct llama_context * llama_new_context_with_model(
19638
20570
  }
19639
20571
  }
19640
20572
 
20573
+ llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
20574
+
19641
20575
  if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
19642
20576
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
19643
20577
  llama_free(ctx);
@@ -19683,7 +20617,8 @@ struct llama_context * llama_new_context_with_model(
19683
20617
  std::vector<ggml_backend_t> backend_ptrs;
19684
20618
  for (auto & backend : ctx->backends) {
19685
20619
  auto * buft = ggml_backend_get_default_buffer_type(backend.get());
19686
- if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) {
20620
+ auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
20621
+ if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
19687
20622
  // use the host buffer of the first device CPU for faster transfer of the intermediate state
19688
20623
  auto * dev = model->devices[0];
19689
20624
  auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
@@ -19711,7 +20646,8 @@ struct llama_context * llama_new_context_with_model(
19711
20646
  // pipeline parallelism requires support for async compute and events in all devices
19712
20647
  if (pipeline_parallel) {
19713
20648
  for (auto & backend : ctx->backends) {
19714
- if (ggml_backend_is_cpu(backend.get())) {
20649
+ auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
20650
+ if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
19715
20651
  // ignore CPU backend
19716
20652
  continue;
19717
20653
  }
@@ -19853,6 +20789,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
19853
20789
  case LLM_ARCH_T5ENCODER:
19854
20790
  case LLM_ARCH_JAIS:
19855
20791
  case LLM_ARCH_RWKV6:
20792
+ case LLM_ARCH_WAVTOKENIZER_DEC:
19856
20793
  return LLAMA_ROPE_TYPE_NONE;
19857
20794
 
19858
20795
  // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -19867,6 +20804,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
19867
20804
  case LLM_ARCH_COMMAND_R:
19868
20805
  case LLM_ARCH_OLMO:
19869
20806
  case LLM_ARCH_ARCTIC:
20807
+ case LLM_ARCH_DEEPSEEK:
19870
20808
  case LLM_ARCH_DEEPSEEK2:
19871
20809
  case LLM_ARCH_CHATGLM:
19872
20810
  case LLM_ARCH_GRANITE:
@@ -19885,6 +20823,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
19885
20823
  case LLM_ARCH_QWEN:
19886
20824
  case LLM_ARCH_QWEN2:
19887
20825
  case LLM_ARCH_QWEN2MOE:
20826
+ case LLM_ARCH_OLMO2:
19888
20827
  case LLM_ARCH_OLMOE:
19889
20828
  case LLM_ARCH_PHI2:
19890
20829
  case LLM_ARCH_PHI3:
@@ -19899,6 +20838,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
19899
20838
  case LLM_ARCH_MINICPM3:
19900
20839
  return LLAMA_ROPE_TYPE_NEOX;
19901
20840
 
20841
+ case LLM_ARCH_QWEN2VL:
20842
+ return LLAMA_ROPE_TYPE_MROPE;
20843
+
19902
20844
  // all model arches should be listed explicitly here
19903
20845
  case LLM_ARCH_UNKNOWN:
19904
20846
  GGML_ABORT("unknown architecture");
@@ -19965,17 +20907,6 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
19965
20907
  return model->n_elements;
19966
20908
  }
19967
20909
 
19968
- struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
19969
- auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
19970
- [name](const std::pair<std::string, struct ggml_tensor *> & it) {
19971
- return it.first == name;
19972
- });
19973
- if (it == model->tensors_by_name.end()) {
19974
- return nullptr;
19975
- }
19976
- return it->second;
19977
- }
19978
-
19979
20910
  bool llama_model_has_encoder(const struct llama_model * model) {
19980
20911
  switch (model->arch) {
19981
20912
  case LLM_ARCH_T5: return true;
@@ -20276,6 +21207,10 @@ void llama_kv_cache_update(struct llama_context * ctx) {
20276
21207
  llama_kv_cache_update_internal(*ctx);
20277
21208
  }
20278
21209
 
21210
+ bool llama_kv_cache_can_shift(struct llama_context * ctx) {
21211
+ return !ctx->kv_self.recurrent && ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
21212
+ }
21213
+
20279
21214
  // deprecated
20280
21215
  size_t llama_get_state_size(struct llama_context * ctx) {
20281
21216
  return llama_state_get_size(ctx);
@@ -21260,6 +22195,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
21260
22195
  void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
21261
22196
  ctx->abort_callback = abort_callback;
21262
22197
  ctx->abort_callback_data = abort_callback_data;
22198
+
22199
+ for (auto & backend : ctx->backends) {
22200
+ auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
22201
+ auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
22202
+ if (set_abort_callback_fn) {
22203
+ set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
22204
+ }
22205
+ }
21263
22206
  }
21264
22207
 
21265
22208
  void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
@@ -21455,7 +22398,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
21455
22398
  throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
21456
22399
  }
21457
22400
  } else if ((size_t) i >= ctx->output_ids.size()) {
21458
- throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
22401
+ throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
21459
22402
  } else {
21460
22403
  j = ctx->output_ids[i];
21461
22404
  }
@@ -21626,18 +22569,111 @@ int32_t llama_detokenize(
21626
22569
  // chat templates
21627
22570
  //
21628
22571
 
22572
+ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
22573
+ if (LLM_CHAT_TEMPLATES.find(tmpl) != LLM_CHAT_TEMPLATES.end()) {
22574
+ return LLM_CHAT_TEMPLATES.at(tmpl);
22575
+ }
22576
+ auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
22577
+ return tmpl.find(haystack) != std::string::npos;
22578
+ };
22579
+ if (tmpl_contains("<|im_start|>")) {
22580
+ return LLM_CHAT_TEMPLATE_CHATML;
22581
+ } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
22582
+ if (tmpl_contains("[SYSTEM_PROMPT]")) {
22583
+ return LLM_CHAT_TEMPLATE_MISTRAL_V7;
22584
+ } else if (
22585
+ // catches official 'v1' template
22586
+ tmpl_contains("' [INST] ' + system_message")
22587
+ // catches official 'v3' and 'v3-tekken' templates
22588
+ || tmpl_contains("[AVAILABLE_TOOLS]")
22589
+ ) {
22590
+ // Official mistral 'v1', 'v3' and 'v3-tekken' templates
22591
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
22592
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
22593
+ if (tmpl_contains(" [INST]")) {
22594
+ return LLM_CHAT_TEMPLATE_MISTRAL_V1;
22595
+ } else if (tmpl_contains("\"[INST]\"")) {
22596
+ return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
22597
+ }
22598
+ return LLM_CHAT_TEMPLATE_MISTRAL_V3;
22599
+ } else {
22600
+ // llama2 template and its variants
22601
+ // [variant] support system message
22602
+ // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
22603
+ bool support_system_message = tmpl_contains("<<SYS>>");
22604
+ bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
22605
+ bool strip_message = tmpl_contains("content.strip()");
22606
+ if (strip_message) {
22607
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
22608
+ } else if (add_bos_inside_history) {
22609
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
22610
+ } else if (support_system_message) {
22611
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
22612
+ } else {
22613
+ return LLM_CHAT_TEMPLATE_LLAMA_2;
22614
+ }
22615
+ }
22616
+ } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
22617
+ return LLM_CHAT_TEMPLATE_PHI_3;
22618
+ } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
22619
+ return LLM_CHAT_TEMPLATE_ZEPHYR;
22620
+ } else if (tmpl_contains("bos_token + message['role']")) {
22621
+ return LLM_CHAT_TEMPLATE_MONARCH;
22622
+ } else if (tmpl_contains("<start_of_turn>")) {
22623
+ return LLM_CHAT_TEMPLATE_GEMMA;
22624
+ } else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
22625
+ // OrionStarAI/Orion-14B-Chat
22626
+ return LLM_CHAT_TEMPLATE_ORION;
22627
+ } else if (tmpl_contains("GPT4 Correct ")) {
22628
+ // openchat/openchat-3.5-0106
22629
+ return LLM_CHAT_TEMPLATE_OPENCHAT;
22630
+ } else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
22631
+ // eachadea/vicuna-13b-1.1 (and Orca variant)
22632
+ if (tmpl_contains("SYSTEM: ")) {
22633
+ return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
22634
+ }
22635
+ return LLM_CHAT_TEMPLATE_VICUNA;
22636
+ } else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
22637
+ // deepseek-ai/deepseek-coder-33b-instruct
22638
+ return LLM_CHAT_TEMPLATE_DEEPSEEK;
22639
+ } else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
22640
+ // CohereForAI/c4ai-command-r-plus
22641
+ return LLM_CHAT_TEMPLATE_COMMAND_R;
22642
+ } else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
22643
+ return LLM_CHAT_TEMPLATE_LLAMA_3;
22644
+ } else if (tmpl_contains("[gMASK]sop")) {
22645
+ // chatglm3-6b
22646
+ return LLM_CHAT_TEMPLATE_CHATGML_3;
22647
+ } else if (tmpl_contains("[gMASK]<sop>")) {
22648
+ return LLM_CHAT_TEMPLATE_CHATGML_4;
22649
+ } else if (tmpl_contains(LU8("<用户>"))) {
22650
+ // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
22651
+ return LLM_CHAT_TEMPLATE_MINICPM;
22652
+ } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
22653
+ return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
22654
+ } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
22655
+ // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
22656
+ // EXAONE-3.0-7.8B-Instruct
22657
+ return LLM_CHAT_TEMPLATE_EXAONE_3;
22658
+ } else if (tmpl_contains("rwkv-world")) {
22659
+ return LLM_CHAT_TEMPLATE_RWKV_WORLD;
22660
+ } else if (tmpl_contains("<|start_of_role|>")) {
22661
+ return LLM_CHAT_TEMPLATE_GRANITE;
22662
+ } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
22663
+ return LLM_CHAT_TEMPLATE_GIGACHAT;
22664
+ }
22665
+ return LLM_CHAT_TEMPLATE_UNKNOWN;
22666
+ }
22667
+
21629
22668
  // Simple version of "llama_apply_chat_template" that only works with strings
21630
22669
  // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
21631
22670
  static int32_t llama_chat_apply_template_internal(
21632
- const std::string & tmpl,
22671
+ const llm_chat_template tmpl,
21633
22672
  const std::vector<const llama_chat_message *> & chat,
21634
22673
  std::string & dest, bool add_ass) {
21635
22674
  // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
21636
22675
  std::stringstream ss;
21637
- auto tmpl_contains = [&tmpl](std::string haystack) -> bool {
21638
- return tmpl.find(haystack) != std::string::npos;
21639
- };
21640
- if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
22676
+ if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
21641
22677
  // chatml template
21642
22678
  for (auto message : chat) {
21643
22679
  ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -21645,16 +22681,59 @@ static int32_t llama_chat_apply_template_internal(
21645
22681
  if (add_ass) {
21646
22682
  ss << "<|im_start|>assistant\n";
21647
22683
  }
21648
- } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) {
22684
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
22685
+ // Official mistral 'v7' template
22686
+ // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
22687
+ for (auto message : chat) {
22688
+ std::string role(message->role);
22689
+ std::string content(message->content);
22690
+ if (role == "system") {
22691
+ ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
22692
+ } else if (role == "user") {
22693
+ ss << "[INST] " << content << "[/INST]";
22694
+ }
22695
+ else {
22696
+ ss << " " << content << "</s>";
22697
+ }
22698
+ }
22699
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
22700
+ || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
22701
+ || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
22702
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
22703
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
22704
+ std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
22705
+ std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
22706
+ bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
22707
+ bool is_inside_turn = false;
22708
+ for (auto message : chat) {
22709
+ if (!is_inside_turn) {
22710
+ ss << leading_space << "[INST]" << trailing_space;
22711
+ is_inside_turn = true;
22712
+ }
22713
+ std::string role(message->role);
22714
+ std::string content(message->content);
22715
+ if (role == "system") {
22716
+ ss << content << "\n\n";
22717
+ } else if (role == "user") {
22718
+ ss << content << leading_space << "[/INST]";
22719
+ } else {
22720
+ ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
22721
+ is_inside_turn = false;
22722
+ }
22723
+ }
22724
+ } else if (
22725
+ tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
22726
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
22727
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
22728
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
21649
22729
  // llama2 template and its variants
21650
22730
  // [variant] support system message
21651
- bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "mistral";
21652
- // [variant] space before + after response
21653
- bool space_around_response = tmpl_contains("' ' + eos_token");
22731
+ // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
22732
+ bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
21654
22733
  // [variant] add BOS inside history
21655
- bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
22734
+ bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
21656
22735
  // [variant] trim spaces from the input message
21657
- bool strip_message = tmpl_contains("content.strip()");
22736
+ bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
21658
22737
  // construct the prompt
21659
22738
  bool is_inside_turn = true; // skip BOS at the beginning
21660
22739
  ss << "[INST] ";
@@ -21675,12 +22754,11 @@ static int32_t llama_chat_apply_template_internal(
21675
22754
  } else if (role == "user") {
21676
22755
  ss << content << " [/INST]";
21677
22756
  } else {
21678
- ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
22757
+ ss << content << "</s>";
21679
22758
  is_inside_turn = false;
21680
22759
  }
21681
22760
  }
21682
- // llama2 templates seem to not care about "add_generation_prompt"
21683
- } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
22761
+ } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
21684
22762
  // Phi 3
21685
22763
  for (auto message : chat) {
21686
22764
  std::string role(message->role);
@@ -21689,7 +22767,7 @@ static int32_t llama_chat_apply_template_internal(
21689
22767
  if (add_ass) {
21690
22768
  ss << "<|assistant|>\n";
21691
22769
  }
21692
- } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) {
22770
+ } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
21693
22771
  // zephyr template
21694
22772
  for (auto message : chat) {
21695
22773
  ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -21697,7 +22775,7 @@ static int32_t llama_chat_apply_template_internal(
21697
22775
  if (add_ass) {
21698
22776
  ss << "<|assistant|>\n";
21699
22777
  }
21700
- } else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) {
22778
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
21701
22779
  // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
21702
22780
  for (auto message : chat) {
21703
22781
  std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -21706,7 +22784,7 @@ static int32_t llama_chat_apply_template_internal(
21706
22784
  if (add_ass) {
21707
22785
  ss << "<s>assistant\n";
21708
22786
  }
21709
- } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("<start_of_turn>")) {
22787
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
21710
22788
  // google/gemma-7b-it
21711
22789
  std::string system_prompt = "";
21712
22790
  for (auto message : chat) {
@@ -21728,7 +22806,7 @@ static int32_t llama_chat_apply_template_internal(
21728
22806
  if (add_ass) {
21729
22807
  ss << "<start_of_turn>model\n";
21730
22808
  }
21731
- } else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
22809
+ } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
21732
22810
  // OrionStarAI/Orion-14B-Chat
21733
22811
  std::string system_prompt = "";
21734
22812
  for (auto message : chat) {
@@ -21748,7 +22826,7 @@ static int32_t llama_chat_apply_template_internal(
21748
22826
  ss << message->content << "</s>";
21749
22827
  }
21750
22828
  }
21751
- } else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) {
22829
+ } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
21752
22830
  // openchat/openchat-3.5-0106,
21753
22831
  for (auto message : chat) {
21754
22832
  std::string role(message->role);
@@ -21762,13 +22840,13 @@ static int32_t llama_chat_apply_template_internal(
21762
22840
  if (add_ass) {
21763
22841
  ss << "GPT4 Correct Assistant:";
21764
22842
  }
21765
- } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) {
22843
+ } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
21766
22844
  // eachadea/vicuna-13b-1.1 (and Orca variant)
21767
22845
  for (auto message : chat) {
21768
22846
  std::string role(message->role);
21769
22847
  if (role == "system") {
21770
22848
  // Orca-Vicuna variant uses a system prefix
21771
- if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) {
22849
+ if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
21772
22850
  ss << "SYSTEM: " << message->content << "\n";
21773
22851
  } else {
21774
22852
  ss << message->content << "\n\n";
@@ -21782,7 +22860,7 @@ static int32_t llama_chat_apply_template_internal(
21782
22860
  if (add_ass) {
21783
22861
  ss << "ASSISTANT:";
21784
22862
  }
21785
- } else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) {
22863
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
21786
22864
  // deepseek-ai/deepseek-coder-33b-instruct
21787
22865
  for (auto message : chat) {
21788
22866
  std::string role(message->role);
@@ -21797,7 +22875,7 @@ static int32_t llama_chat_apply_template_internal(
21797
22875
  if (add_ass) {
21798
22876
  ss << "### Response:\n";
21799
22877
  }
21800
- } else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) {
22878
+ } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
21801
22879
  // CohereForAI/c4ai-command-r-plus
21802
22880
  for (auto message : chat) {
21803
22881
  std::string role(message->role);
@@ -21812,7 +22890,7 @@ static int32_t llama_chat_apply_template_internal(
21812
22890
  if (add_ass) {
21813
22891
  ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
21814
22892
  }
21815
- } else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) {
22893
+ } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
21816
22894
  // Llama 3
21817
22895
  for (auto message : chat) {
21818
22896
  std::string role(message->role);
@@ -21821,7 +22899,7 @@ static int32_t llama_chat_apply_template_internal(
21821
22899
  if (add_ass) {
21822
22900
  ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
21823
22901
  }
21824
- } else if (tmpl == "chatglm3" || tmpl_contains("[gMASK]sop")) {
22902
+ } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
21825
22903
  // chatglm3-6b
21826
22904
  ss << "[gMASK]" << "sop";
21827
22905
  for (auto message : chat) {
@@ -21831,7 +22909,7 @@ static int32_t llama_chat_apply_template_internal(
21831
22909
  if (add_ass) {
21832
22910
  ss << "<|assistant|>";
21833
22911
  }
21834
- } else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
22912
+ } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
21835
22913
  ss << "[gMASK]" << "<sop>";
21836
22914
  for (auto message : chat) {
21837
22915
  std::string role(message->role);
@@ -21840,7 +22918,7 @@ static int32_t llama_chat_apply_template_internal(
21840
22918
  if (add_ass) {
21841
22919
  ss << "<|assistant|>";
21842
22920
  }
21843
- } else if (tmpl == "minicpm" || tmpl_contains(LU8("<用户>"))) {
22921
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
21844
22922
  // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
21845
22923
  for (auto message : chat) {
21846
22924
  std::string role(message->role);
@@ -21852,7 +22930,7 @@ static int32_t llama_chat_apply_template_internal(
21852
22930
  ss << trim(message->content);
21853
22931
  }
21854
22932
  }
21855
- } else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
22933
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
21856
22934
  // DeepSeek-V2
21857
22935
  for (auto message : chat) {
21858
22936
  std::string role(message->role);
@@ -21867,7 +22945,7 @@ static int32_t llama_chat_apply_template_internal(
21867
22945
  if (add_ass) {
21868
22946
  ss << "Assistant:";
21869
22947
  }
21870
- } else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) {
22948
+ } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
21871
22949
  // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
21872
22950
  // EXAONE-3.0-7.8B-Instruct
21873
22951
  for (auto message : chat) {
@@ -21883,7 +22961,7 @@ static int32_t llama_chat_apply_template_internal(
21883
22961
  if (add_ass) {
21884
22962
  ss << "[|assistant|]";
21885
22963
  }
21886
- } else if (tmpl == "rwkv-world" || tmpl_contains("rwkv-world")) {
22964
+ } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
21887
22965
  // this template requires the model to have "\n\n" as EOT token
21888
22966
  for (auto message : chat) {
21889
22967
  std::string role(message->role);
@@ -21893,7 +22971,7 @@ static int32_t llama_chat_apply_template_internal(
21893
22971
  ss << message->content << "\n\n";
21894
22972
  }
21895
22973
  }
21896
- } else if (tmpl == "granite" || tmpl_contains("<|start_of_role|>")) {
22974
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
21897
22975
  // IBM Granite template
21898
22976
  for (const auto & message : chat) {
21899
22977
  std::string role(message->role);
@@ -21906,6 +22984,32 @@ static int32_t llama_chat_apply_template_internal(
21906
22984
  if (add_ass) {
21907
22985
  ss << "<|start_of_role|>assistant<|end_of_role|>\n";
21908
22986
  }
22987
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
22988
+ // GigaChat template
22989
+ bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
22990
+
22991
+ // Handle system message if present
22992
+ if (has_system) {
22993
+ ss << "<s>" << chat[0]->content << "<|message_sep|>";
22994
+ } else {
22995
+ ss << "<s>";
22996
+ }
22997
+
22998
+ // Process remaining messages
22999
+ for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
23000
+ std::string role(chat[i]->role);
23001
+ if (role == "user") {
23002
+ ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
23003
+ << "available functions<|role_sep|>[]<|message_sep|>";
23004
+ } else if (role == "assistant") {
23005
+ ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
23006
+ }
23007
+ }
23008
+
23009
+ // Add generation prompt if needed
23010
+ if (add_ass) {
23011
+ ss << "assistant<|role_sep|>";
23012
+ }
21909
23013
  } else {
21910
23014
  // template not supported
21911
23015
  return -1;
@@ -21925,15 +23029,15 @@ int32_t llama_chat_apply_template(
21925
23029
  std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
21926
23030
  if (tmpl == nullptr) {
21927
23031
  GGML_ASSERT(model != nullptr);
21928
- // load template from model
21929
- std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
21930
- std::string template_key = "tokenizer.chat_template";
21931
- int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
21932
- if (res < 0) {
23032
+
23033
+ // load template from model, if available
23034
+ const auto & it = model->gguf_kv.find("tokenizer.chat_template");
23035
+ if (it != model->gguf_kv.end() && it->second.size() > 0) {
23036
+ curr_tmpl = it->second;
23037
+ }
23038
+ else {
21933
23039
  // worst case: there is no information about template, we will use chatml by default
21934
- curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
21935
- } else {
21936
- curr_tmpl = std::string(model_template.data(), model_template.size());
23040
+ curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
21937
23041
  }
21938
23042
  }
21939
23043
 
@@ -21945,7 +23049,11 @@ int32_t llama_chat_apply_template(
21945
23049
  }
21946
23050
 
21947
23051
  std::string formatted_chat;
21948
- int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
23052
+ llm_chat_template detected_tmpl = llama_chat_detect_template(curr_tmpl);
23053
+ if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
23054
+ return -1;
23055
+ }
23056
+ int32_t res = llama_chat_apply_template_internal(detected_tmpl, chat_vec, formatted_chat, add_ass);
21949
23057
  if (res < 0) {
21950
23058
  return res;
21951
23059
  }
@@ -21955,6 +23063,15 @@ int32_t llama_chat_apply_template(
21955
23063
  return res;
21956
23064
  }
21957
23065
 
23066
+ int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
23067
+ auto it = LLM_CHAT_TEMPLATES.begin();
23068
+ for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
23069
+ output[i] = it->first.c_str();
23070
+ std::advance(it, 1);
23071
+ }
23072
+ return (int32_t) LLM_CHAT_TEMPLATES.size();
23073
+ }
23074
+
21958
23075
  //
21959
23076
  // sampling
21960
23077
  //
@@ -22001,32 +23118,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
22001
23118
  }
22002
23119
 
22003
23120
  const char * llama_print_system_info(void) {
22004
- ggml_cpu_init(); // some ARM features are detected at runtime
22005
-
22006
23121
  static std::string s;
22007
23122
 
22008
- s = "";
22009
- s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
22010
- s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
22011
- s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
22012
- s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
22013
- s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
22014
- s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
22015
- s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
22016
- s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | ";
22017
- s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
22018
- s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
22019
- s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
22020
- s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
22021
- s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
22022
- s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
22023
- s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
22024
- s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
22025
- s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
22026
- s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
22027
- s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
22028
- s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
22029
- s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | ";
23123
+ for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
23124
+ auto * reg = ggml_backend_reg_get(i);
23125
+ auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
23126
+ if (get_features_fn) {
23127
+ ggml_backend_feature * features = get_features_fn(reg);
23128
+ s += ggml_backend_reg_name(reg);
23129
+ s += " : ";
23130
+ for (; features->name; features++) {
23131
+ s += features->name;
23132
+ s += " = ";
23133
+ s += features->value;
23134
+ s += " | ";
23135
+ }
23136
+ }
23137
+ }
22030
23138
 
22031
23139
  return s.c_str();
22032
23140
  }