@fugood/llama.node 0.3.15 → 0.3.17
This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +243 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +14 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -8
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2413 -228
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1004 -13516
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +127 -33
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +29 -293
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +210 -286
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +692 -126
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +21 -10
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +161 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1544 -291
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +139 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
Selected source diffs from the bundled llama.cpp follow.

package/src/llama.cpp/src/llama-arch.cpp:

@@ -6,6 +6,7 @@
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_LLAMA4, "llama4" },
     { LLM_ARCH_DECI, "deci" },
     { LLM_ARCH_FALCON, "falcon" },
     { LLM_ARCH_GROK, "grok" },
@@ -18,6 +19,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT, "refact" },
     { LLM_ARCH_BERT, "bert" },
     { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+    { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
     { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
     { LLM_ARCH_BLOOM, "bloom" },
     { LLM_ARCH_STABLELM, "stablelm" },
@@ -25,6 +27,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2, "qwen2" },
     { LLM_ARCH_QWEN2MOE, "qwen2moe" },
     { LLM_ARCH_QWEN2VL, "qwen2vl" },
+    { LLM_ARCH_QWEN3, "qwen3" },
+    { LLM_ARCH_QWEN3MOE, "qwen3moe" },
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PHI3, "phi3" },
     { LLM_ARCH_PHIMOE, "phimoe" },
@@ -51,6 +55,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK, "deepseek" },
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_CHATGLM, "chatglm" },
+    { LLM_ARCH_GLM4, "glm4" },
     { LLM_ARCH_BITNET, "bitnet" },
     { LLM_ARCH_T5, "t5" },
     { LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -65,6 +70,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_CHAMELEON, "chameleon" },
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+    { LLM_ARCH_PLM, "plm" },
+    { LLM_ARCH_BAILINGMOE, "bailingmoe" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -73,6 +80,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
     { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
     { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
+    { LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
     { LLM_KV_GENERAL_NAME, "general.name" },
     { LLM_KV_GENERAL_AUTHOR, "general.author" },
     { LLM_KV_GENERAL_VERSION, "general.version" },
@@ -99,6 +107,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
     { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
     { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
+    { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -111,6 +120,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
     { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
     { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
+    { LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -132,6 +142,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+    { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
+    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -230,6 +242,35 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_LLAMA4,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+        },
+    },
     {
         LLM_ARCH_DECI,
         {
@@ -433,6 +474,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_NOMIC_BERT_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_JINA_BERT_V2,
         {
@@ -561,6 +620,45 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
        },
     },
+    {
+        LLM_ARCH_QWEN3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_QWEN3MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_PHI2,
         {
@@ -778,6 +876,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
@@ -1026,6 +1125,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
             { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
             { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" },
+            { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" },
             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
             { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
@@ -1042,6 +1143,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
         },
     },
+    {
+        LLM_ARCH_PLM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
+            { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
+            { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_CHATGLM,
         {
@@ -1060,6 +1177,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_GLM4,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_BITNET,
         {
@@ -1391,6 +1527,29 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
         },
     },
+    {
+        LLM_ARCH_BAILINGMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1428,23 +1587,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {
-    {
-    {LLM_TENSOR_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
package/src/llama.cpp/src/llama-arch.h:

@@ -10,6 +10,7 @@
 
 enum llm_arch {
     LLM_ARCH_LLAMA,
+    LLM_ARCH_LLAMA4,
     LLM_ARCH_DECI,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,
@@ -22,6 +23,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
@@ -29,6 +31,8 @@ enum llm_arch {
     LLM_ARCH_QWEN2,
     LLM_ARCH_QWEN2MOE,
     LLM_ARCH_QWEN2VL,
+    LLM_ARCH_QWEN3,
+    LLM_ARCH_QWEN3MOE,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
     LLM_ARCH_PHIMOE,
@@ -55,6 +59,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
+    LLM_ARCH_GLM4,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -69,6 +74,8 @@ enum llm_arch {
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
     LLM_ARCH_WAVTOKENIZER_DEC,
+    LLM_ARCH_PLM,
+    LLM_ARCH_BAILINGMOE,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -77,6 +84,7 @@ enum llm_kv {
     LLM_KV_GENERAL_ARCHITECTURE,
     LLM_KV_GENERAL_QUANTIZATION_VERSION,
     LLM_KV_GENERAL_ALIGNMENT,
+    LLM_KV_GENERAL_FILE_TYPE,
     LLM_KV_GENERAL_NAME,
     LLM_KV_GENERAL_AUTHOR,
     LLM_KV_GENERAL_VERSION,
@@ -103,6 +111,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
+    LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -115,6 +124,7 @@ enum llm_kv {
     LLM_KV_RESIDUAL_SCALE,
     LLM_KV_EMBEDDING_SCALE,
     LLM_KV_TOKEN_SHIFT_COUNT,
+    LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -136,6 +146,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
+    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
+    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -249,6 +261,8 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
+    LLM_TENSOR_POST_ATTN_NORM,
+    LLM_TENSOR_POST_MLP_NORM,
     LLM_TENSOR_SSM_IN,
     LLM_TENSOR_SSM_CONV1D,
     LLM_TENSOR_SSM_X,
@@ -296,6 +310,8 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_B,
     LLM_TENSOR_ATTN_KV_A_MQA,
     LLM_TENSOR_ATTN_KV_B,
+    LLM_TENSOR_ATTN_K_B,
+    LLM_TENSOR_ATTN_V_B,
     LLM_TENSOR_ATTN_Q_A_NORM,
     LLM_TENSOR_ATTN_KV_A_NORM,
     LLM_TENSOR_ATTN_SUB_NORM,
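The `blk.%d.*` strings registered in LLM_TENSOR_NAMES above are printf-style patterns: llama.cpp expands them with the block (layer) index and appends a `.weight` or `.bias` suffix when resolving tensors in a GGUF file (internally via its LLM_TN helper). A minimal sketch of that expansion; the `tensor_name` helper here is hypothetical and stands in for the real wrapper:

```cpp
#include <cstdio>
#include <string>

// Illustrative only: expand a LLM_TENSOR_NAMES pattern such as "blk.%d.attn_q"
// for a given layer index, then append the usual ".weight"/".bias" suffix.
static std::string tensor_name(const char * pattern, int layer, const char * suffix = "weight") {
    char buf[256];
    snprintf(buf, sizeof(buf), pattern, layer);
    return std::string(buf) + "." + suffix;
}

int main() {
    // For the new LLM_ARCH_QWEN3 entries, layer 0's query projection and its
    // per-head norm resolve to the following GGUF tensor names:
    printf("%s\n", tensor_name("blk.%d.attn_q", 0).c_str());      // blk.0.attn_q.weight
    printf("%s\n", tensor_name("blk.%d.attn_q_norm", 0).c_str()); // blk.0.attn_q_norm.weight
    return 0;
}
```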
package/src/llama.cpp/src/llama-chat.cpp:

@@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
     { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
     { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
-    { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
-    { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
+    { "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
+    { "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 },
     { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
     { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
     { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
@@ -59,6 +59,10 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "granite", LLM_CHAT_TEMPLATE_GRANITE },
     { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
     { "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
+    { "yandex", LLM_CHAT_TEMPLATE_YANDEX },
+    { "bailing", LLM_CHAT_TEMPLATE_BAILING },
+    { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
+    { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -78,7 +82,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     if (tmpl_contains("<|im_start|>")) {
         return tmpl_contains("<|im_sep|>")
             ? LLM_CHAT_TEMPLATE_PHI_4
-            : LLM_CHAT_TEMPLATE_CHATML;
+            : tmpl_contains("<end_of_utterance>")
+                ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
+                : LLM_CHAT_TEMPLATE_CHATML;
     } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
         if (tmpl_contains("[SYSTEM_PROMPT]")) {
             return LLM_CHAT_TEMPLATE_MISTRAL_V7;
@@ -116,8 +122,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("[gMASK]<sop>")) {
+        return LLM_CHAT_TEMPLATE_CHATGLM_4;
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
         return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
+    } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
+        return LLM_CHAT_TEMPLATE_GLMEDGE;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -146,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA_3;
     } else if (tmpl_contains("[gMASK]sop")) {
         // chatglm3-6b
-        return LLM_CHAT_TEMPLATE_CHATGML_3;
-    } else if (tmpl_contains("[gMASK]<sop>")) {
-        return LLM_CHAT_TEMPLATE_CHATGML_4;
+        return LLM_CHAT_TEMPLATE_CHATGLM_3;
     } else if (tmpl_contains(LU8("<用户>"))) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         return LLM_CHAT_TEMPLATE_MINICPM;
@@ -168,6 +176,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_GIGACHAT;
     } else if (tmpl_contains("<|role_start|>")) {
         return LLM_CHAT_TEMPLATE_MEGREZ;
+    } else if (tmpl_contains(" Ассистент:")) {
+        return LLM_CHAT_TEMPLATE_YANDEX;
+    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
+        return LLM_CHAT_TEMPLATE_BAILING;
+    } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
+        return LLM_CHAT_TEMPLATE_LLAMA4;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -423,7 +437,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
         // chatglm3-6b
         ss << "[gMASK]" << "sop";
         for (auto message : chat) {
@@ -433,7 +447,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
@@ -442,14 +456,6 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|" << role << "|>" << "\n" << message->content;
-        }
-        if (add_ass) {
-            ss << "<|assistant|>";
-        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         for (auto message : chat) {
@@ -567,6 +573,66 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|role_start|>assistant<|role_end|>";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_YANDEX) {
+        // Yandex template ("\n\n" is defined as EOT token)
+
+        ss << "<s>";
+
+        for (size_t i = 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (role == "user") {
+                ss << " Пользователь: " << chat[i]->content << "\n\n";
+            } else if (role == "assistant") {
+                ss << " Ассистент: " << chat[i]->content << "\n\n";
+            }
+        }
+
+        // Add generation prompt if needed
+        if (add_ass) {
+            ss << " Ассистент:[SEP]";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING) {
+        // Bailing (Ling) template
+        for (auto message : chat) {
+            std::string role(message->role);
+
+            if (role == "user") {
+                role = "HUMAN";
+            } else {
+                std::transform(role.begin(), role.end(), role.begin(), ::toupper);
+            }
+
+            ss << "<role>" << role << "</role>" << message->content;
+        }
+
+        if (add_ass) {
+            ss << "<role>ASSISTANT</role>";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA4) {
+        // Llama 4
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|header_start|>" << role << "<|header_end|>\n\n" << trim(message->content) << "<|eot|>";
+        }
+        if (add_ass) {
+            ss << "<|header_start|>assistant<|header_end|>\n\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
+        // SmolVLM
+        ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "\n\n";
+            } else if (role == "user") {
+                ss << "User: " << message->content << "<end_of_utterance>\n";
+            } else {
+                ss << "Assistant: " << message->content << "<end_of_utterance>\n";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
     } else {
         // template not supported
         return -1;
@@ -585,4 +651,3 @@ int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
     }
     return (int32_t) LLM_CHAT_TEMPLATES.size();
 }
-
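The new `llama4` branch above renders each turn as `<|header_start|>{role}<|header_end|>\n\n{content}<|eot|>`. A minimal sketch of exercising it through the public `llama_chat_apply_template` API; the signature shown is the one in llama.h at this revision (template name or Jinja source as first argument), so verify it against your checkout:

```cpp
#include <cstdio>
#include <string>
#include <vector>
#include "llama.h"

int main() {
    std::vector<llama_chat_message> chat = {
        { "user",      "Hello!"    },
        { "assistant", "Hi there." },
    };
    std::string buf(2048, '\0');
    // Passing a built-in template name ("llama4") selects the branch added above.
    int32_t n = llama_chat_apply_template("llama4", chat.data(), chat.size(),
                                          /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n < 0) {
        fprintf(stderr, "template not supported\n");
        return 1;
    }
    // If n exceeded buf.size(), the buffer would need to be grown and the call retried.
    buf.resize(n);
    // Expected rendering per the branch above:
    //   <|header_start|>user<|header_end|>\n\nHello!<|eot|>
    //   <|header_start|>assistant<|header_end|>\n\nHi there.<|eot|>
    //   <|header_start|>assistant<|header_end|>\n\n
    printf("%s\n", buf.c_str());
    return 0;
}
```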
package/src/llama.cpp/src/llama-chat.h:

@@ -29,8 +29,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_DEEPSEEK_3,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGML_3,
-    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_CHATGLM_3,
+    LLM_CHAT_TEMPLATE_CHATGLM_4,
     LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,
@@ -38,6 +38,10 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_GRANITE,
     LLM_CHAT_TEMPLATE_GIGACHAT,
     LLM_CHAT_TEMPLATE_MEGREZ,
+    LLM_CHAT_TEMPLATE_YANDEX,
+    LLM_CHAT_TEMPLATE_BAILING,
+    LLM_CHAT_TEMPLATE_LLAMA4,
+    LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
 
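The four enum values added above are also reachable by name ("yandex", "bailing", "llama4", "smolvlm") via the built-in template list. Per the last llama-chat.cpp hunk, `llama_chat_builtin_templates` fills a caller-provided array and returns the total count, which suggests the usual two-call pattern; a sketch, assuming that behavior:

```cpp
#include <cstdio>
#include <vector>
#include "llama.h"

int main() {
    // First call with len = 0: nothing is written, but the total count is returned.
    int32_t count = llama_chat_builtin_templates(nullptr, 0);
    std::vector<const char *> names(count);
    llama_chat_builtin_templates(names.data(), names.size());
    for (const char * name : names) {
        printf("%s\n", name); // includes the new "yandex", "bailing", "llama4", "smolvlm"
    }
    return 0;
}
```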