@fugood/llama.node 0.3.15 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +243 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +14 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -8
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2413 -228
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1004 -13516
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +127 -33
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +29 -293
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +210 -286
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  136. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  138. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +692 -126
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +21 -10
  143. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  144. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  145. package/src/llama.cpp/include/llama.h +30 -11
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  147. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  149. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  150. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  151. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  152. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  153. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  154. package/src/llama.cpp/src/llama-arch.cpp +161 -17
  155. package/src/llama.cpp/src/llama-arch.h +16 -0
  156. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  157. package/src/llama.cpp/src/llama-chat.h +6 -2
  158. package/src/llama.cpp/src/llama-context.cpp +108 -92
  159. package/src/llama.cpp/src/llama-context.h +1 -2
  160. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  161. package/src/llama.cpp/src/llama-graph.h +26 -6
  162. package/src/llama.cpp/src/llama-hparams.h +13 -0
  163. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  164. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  165. package/src/llama.cpp/src/llama-memory.h +1 -1
  166. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  167. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  168. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  169. package/src/llama.cpp/src/llama-model.cpp +1544 -291
  170. package/src/llama.cpp/src/llama-model.h +13 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  172. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  173. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  174. package/src/llama.cpp/src/llama.cpp +1 -1
  175. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  176. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  177. package/src/llama.cpp/tests/test-backend-ops.cpp +139 -57
  178. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  179. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  180. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  181. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  182. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  183. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  184. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  185. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  186. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  188. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  189. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  190. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  191. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  192. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  193. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  203. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
package/src/llama.cpp/src/llama-arch.cpp

@@ -6,6 +6,7 @@
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_LLAMA4, "llama4" },
     { LLM_ARCH_DECI, "deci" },
     { LLM_ARCH_FALCON, "falcon" },
     { LLM_ARCH_GROK, "grok" },
@@ -18,6 +19,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT, "refact" },
     { LLM_ARCH_BERT, "bert" },
     { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+    { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
     { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
     { LLM_ARCH_BLOOM, "bloom" },
     { LLM_ARCH_STABLELM, "stablelm" },
@@ -25,6 +27,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2, "qwen2" },
     { LLM_ARCH_QWEN2MOE, "qwen2moe" },
     { LLM_ARCH_QWEN2VL, "qwen2vl" },
+    { LLM_ARCH_QWEN3, "qwen3" },
+    { LLM_ARCH_QWEN3MOE, "qwen3moe" },
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PHI3, "phi3" },
     { LLM_ARCH_PHIMOE, "phimoe" },
@@ -51,6 +55,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK, "deepseek" },
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_CHATGLM, "chatglm" },
+    { LLM_ARCH_GLM4, "glm4" },
     { LLM_ARCH_BITNET, "bitnet" },
     { LLM_ARCH_T5, "t5" },
     { LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -65,6 +70,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_CHAMELEON, "chameleon" },
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+    { LLM_ARCH_PLM, "plm" },
+    { LLM_ARCH_BAILINGMOE, "bailingmoe" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -73,6 +80,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
     { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
     { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
+    { LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
     { LLM_KV_GENERAL_NAME, "general.name" },
     { LLM_KV_GENERAL_AUTHOR, "general.author" },
     { LLM_KV_GENERAL_VERSION, "general.version" },
@@ -99,6 +107,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
     { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
     { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
+    { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -111,6 +120,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
     { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
     { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
+    { LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -132,6 +142,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+    { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
+    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -230,6 +242,35 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_LLAMA4,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+        },
+    },
     {
         LLM_ARCH_DECI,
         {
@@ -433,6 +474,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_NOMIC_BERT_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_JINA_BERT_V2,
         {
@@ -561,6 +620,45 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
         },
     },
+    {
+        LLM_ARCH_QWEN3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_QWEN3MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_PHI2,
         {
@@ -778,6 +876,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@@ -1026,6 +1125,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
             { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
             { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" },
+            { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" },
             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
             { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
@@ -1042,6 +1143,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
         },
     },
+    {
+        LLM_ARCH_PLM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
+            { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
+            { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_CHATGLM,
         {
@@ -1060,6 +1177,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_GLM4,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_BITNET,
         {
@@ -1391,6 +1527,29 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
         },
     },
+    {
+        LLM_ARCH_BAILINGMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1428,23 +1587,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
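For context on how these tables are consumed: the tensor-name strings in LLM_TENSOR_NAMES are printf-style patterns where %d stands for the layer (block) index, and the LLM_KV_NAMES patterns use %s for the architecture name. A minimal, self-contained sketch of the expansion; the tensor_name helper here is hypothetical, not llama.cpp's internal API:

#include <cstdio>
#include <string>

// Hypothetical helper illustrating how per-layer tensor names are resolved:
// the "%d" in a pattern like "blk.%d.ffn_gate_exps" is replaced by the
// block index when tensors are looked up in a GGUF file.
static std::string tensor_name(const char * pattern, int layer) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), pattern, layer);
    return buf;
}

int main() {
    // e.g. the LLM_ARCH_QWEN3MOE entries added above:
    std::printf("%s\n", tensor_name("blk.%d.ffn_gate_exps", 0).c_str());  // blk.0.ffn_gate_exps
    std::printf("%s\n", tensor_name("blk.%d.attn_q_norm", 12).c_str());   // blk.12.attn_q_norm
    return 0;
}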
package/src/llama.cpp/src/llama-arch.h

@@ -10,6 +10,7 @@
 
 enum llm_arch {
     LLM_ARCH_LLAMA,
+    LLM_ARCH_LLAMA4,
     LLM_ARCH_DECI,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,
@@ -22,6 +23,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
@@ -29,6 +31,8 @@ enum llm_arch {
     LLM_ARCH_QWEN2,
     LLM_ARCH_QWEN2MOE,
     LLM_ARCH_QWEN2VL,
+    LLM_ARCH_QWEN3,
+    LLM_ARCH_QWEN3MOE,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
     LLM_ARCH_PHIMOE,
@@ -55,6 +59,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
+    LLM_ARCH_GLM4,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -69,6 +74,8 @@ enum llm_arch {
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
     LLM_ARCH_WAVTOKENIZER_DEC,
+    LLM_ARCH_PLM,
+    LLM_ARCH_BAILINGMOE,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -77,6 +84,7 @@ enum llm_kv {
     LLM_KV_GENERAL_ARCHITECTURE,
     LLM_KV_GENERAL_QUANTIZATION_VERSION,
     LLM_KV_GENERAL_ALIGNMENT,
+    LLM_KV_GENERAL_FILE_TYPE,
     LLM_KV_GENERAL_NAME,
     LLM_KV_GENERAL_AUTHOR,
     LLM_KV_GENERAL_VERSION,
@@ -103,6 +111,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
+    LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -115,6 +124,7 @@ enum llm_kv {
     LLM_KV_RESIDUAL_SCALE,
     LLM_KV_EMBEDDING_SCALE,
     LLM_KV_TOKEN_SHIFT_COUNT,
+    LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -136,6 +146,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
+    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
+    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -249,6 +261,8 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
+    LLM_TENSOR_POST_ATTN_NORM,
+    LLM_TENSOR_POST_MLP_NORM,
     LLM_TENSOR_SSM_IN,
     LLM_TENSOR_SSM_CONV1D,
     LLM_TENSOR_SSM_X,
@@ -296,6 +310,8 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_B,
     LLM_TENSOR_ATTN_KV_A_MQA,
     LLM_TENSOR_ATTN_KV_B,
+    LLM_TENSOR_ATTN_K_B,
+    LLM_TENSOR_ATTN_V_B,
     LLM_TENSOR_ATTN_Q_A_NORM,
     LLM_TENSOR_ATTN_KV_A_NORM,
     LLM_TENSOR_ATTN_SUB_NORM,
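The new LLM_KV_ATTENTION_KEY_LENGTH_MLA / LLM_KV_ATTENTION_VALUE_LENGTH_MLA keys map to "%s.attention.key_length_mla" and "%s.attention.value_length_mla", with %s expanded to the architecture name; they accompany the new attn_k_b/attn_v_b tensors for multi-head latent attention (MLA). A sketch of reading such a key via ggml's public GGUF API; the file name is a placeholder and the exact gguf.h signatures may differ slightly between versions:

#include <cstdio>
#include "gguf.h"

int main() {
    // Load only the metadata of a GGUF file (no tensor data allocation).
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params); // placeholder path
    if (!ctx) {
        return 1;
    }
    // For a DeepSeek-2 model the "%s" prefix expands to "deepseek2".
    const int64_t id = gguf_find_key(ctx, "deepseek2.attention.key_length_mla");
    if (id >= 0) {
        std::printf("key_length_mla = %u\n", gguf_get_val_u32(ctx, id));
    }
    gguf_free(ctx);
    return 0;
}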
package/src/llama.cpp/src/llama-chat.cpp

@@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
     { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
     { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
-    { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
-    { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
+    { "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
+    { "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 },
     { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
     { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
     { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
@@ -59,6 +59,10 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "granite", LLM_CHAT_TEMPLATE_GRANITE },
     { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
     { "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
+    { "yandex", LLM_CHAT_TEMPLATE_YANDEX },
+    { "bailing", LLM_CHAT_TEMPLATE_BAILING },
+    { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
+    { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -78,7 +82,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     if (tmpl_contains("<|im_start|>")) {
         return tmpl_contains("<|im_sep|>")
             ? LLM_CHAT_TEMPLATE_PHI_4
-            : LLM_CHAT_TEMPLATE_CHATML;
+            : tmpl_contains("<end_of_utterance>")
+                ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
+                : LLM_CHAT_TEMPLATE_CHATML;
     } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
         if (tmpl_contains("[SYSTEM_PROMPT]")) {
             return LLM_CHAT_TEMPLATE_MISTRAL_V7;
@@ -116,8 +122,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("[gMASK]<sop>")) {
+        return LLM_CHAT_TEMPLATE_CHATGLM_4;
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
         return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
+    } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
+        return LLM_CHAT_TEMPLATE_GLMEDGE;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -146,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA_3;
     } else if (tmpl_contains("[gMASK]sop")) {
         // chatglm3-6b
-        return LLM_CHAT_TEMPLATE_CHATGML_3;
-    } else if (tmpl_contains("[gMASK]<sop>")) {
-        return LLM_CHAT_TEMPLATE_CHATGML_4;
+        return LLM_CHAT_TEMPLATE_CHATGLM_3;
     } else if (tmpl_contains(LU8("<用户>"))) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         return LLM_CHAT_TEMPLATE_MINICPM;
@@ -168,6 +176,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_GIGACHAT;
     } else if (tmpl_contains("<|role_start|>")) {
         return LLM_CHAT_TEMPLATE_MEGREZ;
+    } else if (tmpl_contains(" Ассистент:")) {
+        return LLM_CHAT_TEMPLATE_YANDEX;
+    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
+        return LLM_CHAT_TEMPLATE_BAILING;
+    } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
+        return LLM_CHAT_TEMPLATE_LLAMA4;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -423,7 +437,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
         // chatglm3-6b
         ss << "[gMASK]" << "sop";
         for (auto message : chat) {
@@ -433,7 +447,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
@@ -442,14 +456,6 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|" << role << "|>" << "\n" << message->content;
-        }
-        if (add_ass) {
-            ss << "<|assistant|>";
-        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         for (auto message : chat) {
@@ -567,6 +573,66 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|role_start|>assistant<|role_end|>";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_YANDEX) {
+        // Yandex template ("\n\n" is defined as EOT token)
+
+        ss << "<s>";
+
+        for (size_t i = 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (role == "user") {
+                ss << " Пользователь: " << chat[i]->content << "\n\n";
+            } else if (role == "assistant") {
+                ss << " Ассистент: " << chat[i]->content << "\n\n";
+            }
+        }
+
+        // Add generation prompt if needed
+        if (add_ass) {
+            ss << " Ассистент:[SEP]";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING) {
+        // Bailing (Ling) template
+        for (auto message : chat) {
+            std::string role(message->role);
+
+            if (role == "user") {
+                role = "HUMAN";
+            } else {
+                std::transform(role.begin(), role.end(), role.begin(), ::toupper);
+            }
+
+            ss << "<role>" << role << "</role>" << message->content;
+        }
+
+        if (add_ass) {
+            ss << "<role>ASSISTANT</role>";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA4) {
+        // Llama 4
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|header_start|>" << role << "<|header_end|>\n\n" << trim(message->content) << "<|eot|>";
+        }
+        if (add_ass) {
+            ss << "<|header_start|>assistant<|header_end|>\n\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
+        // SmolVLM
+        ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "\n\n";
+            } else if (role == "user") {
+                ss << "User: " << message->content << "<end_of_utterance>\n";
+            } else {
+                ss << "Assistant: " << message->content << "<end_of_utterance>\n";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
     } else {
         // template not supported
         return -1;
@@ -585,4 +651,3 @@ int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
     }
     return (int32_t) LLM_CHAT_TEMPLATES.size();
 }
-
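The new built-in templates are reachable through the public llama_chat_apply_template API. A usage sketch, assuming the llama.h signature shipped with this package (template name, message array, add_ass flag, output buffer); adjust if your header differs:

#include <cstdio>
#include <vector>
#include "llama.h"

int main() {
    // Two-message conversation rendered with the newly added "llama4" template.
    llama_chat_message chat[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };
    std::vector<char> buf(4096);
    const int32_t n = llama_chat_apply_template("llama4", chat, 2, /*add_ass =*/ true,
                                                buf.data(), (int32_t) buf.size());
    if (n < 0) {
        std::fprintf(stderr, "template not supported\n"); // the `return -1` path above
        return 1;
    }
    // Expected shape, per the LLM_CHAT_TEMPLATE_LLAMA4 branch:
    // <|header_start|>system<|header_end|>\n\n...<|eot|><|header_start|>assistant<|header_end|>\n\n
    std::printf("%.*s\n", n, buf.data());
    return 0;
}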
package/src/llama.cpp/src/llama-chat.h

@@ -29,8 +29,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_DEEPSEEK_3,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGML_3,
-    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_CHATGLM_3,
+    LLM_CHAT_TEMPLATE_CHATGLM_4,
     LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,
@@ -38,6 +38,10 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_GRANITE,
     LLM_CHAT_TEMPLATE_GIGACHAT,
     LLM_CHAT_TEMPLATE_MEGREZ,
+    LLM_CHAT_TEMPLATE_YANDEX,
+    LLM_CHAT_TEMPLATE_BAILING,
+    LLM_CHAT_TEMPLATE_LLAMA4,
+    LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
 
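The enum values above are internal, but the registered template names can be enumerated through llama_chat_builtin_templates, whose tail is visible in the llama-chat.cpp hunk above. A small sketch; after this update the list should also contain "yandex", "bailing", "llama4" and "smolvlm":

#include <cstdio>
#include "llama.h"

int main() {
    const char * names[128];
    // Fills up to 128 entries and returns the total number of built-in templates.
    const int32_t n = llama_chat_builtin_templates(names, 128);
    for (int32_t i = 0; i < n && i < 128; i++) {
        std::printf("%s\n", names[i]);
    }
    return 0;
}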