@fugood/llama.node 1.3.0 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +8 -8
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +44 -999
- package/src/llama.cpp/common/arg.h +2 -2
- package/src/llama.cpp/common/chat.cpp +17 -2
- package/src/llama.cpp/common/common.cpp +33 -0
- package/src/llama.cpp/common/common.h +15 -1
- package/src/llama.cpp/common/download.cpp +1054 -0
- package/src/llama.cpp/common/download.h +55 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/include/ggml.h +2 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +21 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -75
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +0 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
- package/src/llama.cpp/include/llama.h +7 -3
- package/src/llama.cpp/src/CMakeLists.txt +95 -0
- package/src/llama.cpp/src/llama-arch.cpp +108 -0
- package/src/llama.cpp/src/llama-arch.h +11 -0
- package/src/llama.cpp/src/llama-batch.cpp +63 -31
- package/src/llama.cpp/src/llama-batch.h +12 -1
- package/src/llama.cpp/src/llama-chat.cpp +32 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +36 -13
- package/src/llama.cpp/src/llama-context.h +5 -5
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +3 -3
- package/src/llama.cpp/src/llama-hparams.cpp +11 -1
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
- package/src/llama.cpp/src/llama-kv-cache.cpp +33 -1
- package/src/llama.cpp/src/llama-kv-cells.h +44 -2
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +4 -3
- package/src/llama.cpp/src/llama-model.cpp +320 -13171
- package/src/llama.cpp/src/llama-model.h +8 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +5 -0
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/models/apertus.cpp +125 -0
- package/src/llama.cpp/src/models/arcee.cpp +135 -0
- package/src/llama.cpp/src/models/arctic.cpp +138 -0
- package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
- package/src/llama.cpp/src/models/baichuan.cpp +122 -0
- package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
- package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
- package/src/llama.cpp/src/models/bert.cpp +176 -0
- package/src/llama.cpp/src/models/bitnet.cpp +160 -0
- package/src/llama.cpp/src/models/bloom.cpp +101 -0
- package/src/llama.cpp/src/models/chameleon.cpp +178 -0
- package/src/llama.cpp/src/models/chatglm.cpp +132 -0
- package/src/llama.cpp/src/models/codeshell.cpp +111 -0
- package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
- package/src/llama.cpp/src/models/command-r.cpp +122 -0
- package/src/llama.cpp/src/models/dbrx.cpp +123 -0
- package/src/llama.cpp/src/models/deci.cpp +135 -0
- package/src/llama.cpp/src/models/deepseek.cpp +144 -0
- package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
- package/src/llama.cpp/src/models/dots1.cpp +134 -0
- package/src/llama.cpp/src/models/dream.cpp +105 -0
- package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/src/llama.cpp/src/models/ernie4-5.cpp +110 -0
- package/src/llama.cpp/src/models/exaone.cpp +114 -0
- package/src/llama.cpp/src/models/exaone4.cpp +123 -0
- package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
- package/src/llama.cpp/src/models/falcon.cpp +120 -0
- package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
- package/src/llama.cpp/src/models/gemma.cpp +112 -0
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
- package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
- package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
- package/src/llama.cpp/src/models/glm4.cpp +127 -0
- package/src/llama.cpp/src/models/gpt2.cpp +105 -0
- package/src/llama.cpp/src/models/gptneox.cpp +144 -0
- package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
- package/src/llama.cpp/src/models/granite.cpp +211 -0
- package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
- package/src/llama.cpp/src/models/grok.cpp +159 -0
- package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
- package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
- package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
- package/src/llama.cpp/src/models/internlm2.cpp +120 -0
- package/src/llama.cpp/src/models/jais.cpp +86 -0
- package/src/llama.cpp/src/models/jamba.cpp +106 -0
- package/src/llama.cpp/src/models/lfm2.cpp +173 -0
- package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
- package/src/llama.cpp/src/models/llada.cpp +99 -0
- package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
- package/src/llama.cpp/src/models/llama.cpp +155 -0
- package/src/llama.cpp/src/models/mamba.cpp +55 -0
- package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
- package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
- package/src/llama.cpp/src/models/models.h +481 -0
- package/src/llama.cpp/src/models/mpt.cpp +126 -0
- package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
- package/src/llama.cpp/src/models/nemotron.cpp +122 -0
- package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
- package/src/llama.cpp/src/models/olmo.cpp +121 -0
- package/src/llama.cpp/src/models/olmo2.cpp +150 -0
- package/src/llama.cpp/src/models/olmoe.cpp +124 -0
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
- package/src/llama.cpp/src/models/openelm.cpp +124 -0
- package/src/llama.cpp/src/models/orion.cpp +123 -0
- package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
- package/src/llama.cpp/src/models/phi2.cpp +121 -0
- package/src/llama.cpp/src/models/phi3.cpp +152 -0
- package/src/llama.cpp/src/models/plamo.cpp +110 -0
- package/src/llama.cpp/src/models/plamo2.cpp +316 -0
- package/src/llama.cpp/src/models/plm.cpp +168 -0
- package/src/llama.cpp/src/models/qwen.cpp +108 -0
- package/src/llama.cpp/src/models/qwen2.cpp +117 -0
- package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
- package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
- package/src/llama.cpp/src/models/qwen3.cpp +117 -0
- package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
- package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
- package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
- package/src/llama.cpp/src/models/refact.cpp +94 -0
- package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
- package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
- package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
- package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
- package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
- package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
- package/src/llama.cpp/src/models/smollm3.cpp +128 -0
- package/src/llama.cpp/src/models/stablelm.cpp +146 -0
- package/src/llama.cpp/src/models/starcoder.cpp +100 -0
- package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
- package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
- package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
- package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/src/llama.cpp/src/models/xverse.cpp +108 -0

package/src/llama.cpp/src/CMakeLists.txt

```diff
@@ -35,6 +35,101 @@ add_library(llama
             unicode-data.cpp
             unicode.cpp
             unicode.h
+            models/apertus.cpp
+            models/arcee.cpp
+            models/arctic.cpp
+            models/arwkv7.cpp
+            models/baichuan.cpp
+            models/bailingmoe.cpp
+            models/bailingmoe2.cpp
+            models/bert.cpp
+            models/bitnet.cpp
+            models/bloom.cpp
+            models/chameleon.cpp
+            models/chatglm.cpp
+            models/codeshell.cpp
+            models/cogvlm.cpp
+            models/cohere2-iswa.cpp
+            models/command-r.cpp
+            models/dbrx.cpp
+            models/deci.cpp
+            models/deepseek.cpp
+            models/deepseek2.cpp
+            models/dots1.cpp
+            models/dream.cpp
+            models/ernie4-5-moe.cpp
+            models/ernie4-5.cpp
+            models/exaone.cpp
+            models/exaone4.cpp
+            models/falcon-h1.cpp
+            models/falcon.cpp
+            models/gemma-embedding.cpp
+            models/gemma.cpp
+            models/gemma2-iswa.cpp
+            models/gemma3-iswa.cpp
+            models/gemma3n-iswa.cpp
+            models/glm4-moe.cpp
+            models/glm4.cpp
+            models/gpt2.cpp
+            models/gptneox.cpp
+            models/granite-hybrid.cpp
+            models/granite.cpp
+            models/grok.cpp
+            models/grovemoe.cpp
+            models/hunyuan-dense.cpp
+            models/hunyuan-moe.cpp
+            models/internlm2.cpp
+            models/jais.cpp
+            models/jamba.cpp
+            models/lfm2.cpp
+            models/llada-moe.cpp
+            models/llada.cpp
+            models/llama-iswa.cpp
+            models/llama.cpp
+            models/mamba.cpp
+            models/minicpm3.cpp
+            models/minimax-m2.cpp
+            models/mpt.cpp
+            models/nemotron-h.cpp
+            models/nemotron.cpp
+            models/neo-bert.cpp
+            models/olmo.cpp
+            models/olmo2.cpp
+            models/olmoe.cpp
+            models/openai-moe-iswa.cpp
+            models/openelm.cpp
+            models/orion.cpp
+            models/pangu-embedded.cpp
+            models/phi2.cpp
+            models/phi3.cpp
+            models/plamo.cpp
+            models/plamo2.cpp
+            models/plm.cpp
+            models/qwen.cpp
+            models/qwen2.cpp
+            models/qwen2moe.cpp
+            models/qwen2vl.cpp
+            models/qwen3.cpp
+            models/qwen3vl.cpp
+            models/qwen3vl-moe.cpp
+            models/qwen3moe.cpp
+            models/refact.cpp
+            models/rwkv6-base.cpp
+            models/rwkv6.cpp
+            models/rwkv6qwen2.cpp
+            models/rwkv7-base.cpp
+            models/rwkv7.cpp
+            models/seed-oss.cpp
+            models/smallthinker.cpp
+            models/smollm3.cpp
+            models/stablelm.cpp
+            models/starcoder.cpp
+            models/starcoder2.cpp
+            models/t5-dec.cpp
+            models/t5-enc.cpp
+            models/wavtokenizer-dec.cpp
+            models/xverse.cpp
+            models/graph-context-mamba.cpp
             )
 
 target_include_directories(llama PRIVATE .)
```

package/src/llama.cpp/src/llama-arch.cpp

```diff
@@ -32,6 +32,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2VL, "qwen2vl" },
     { LLM_ARCH_QWEN3, "qwen3" },
     { LLM_ARCH_QWEN3MOE, "qwen3moe" },
+    { LLM_ARCH_QWEN3VL, "qwen3vl" },
+    { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PHI3, "phi3" },
     { LLM_ARCH_PHIMOE, "phimoe" },
@@ -103,6 +105,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_SEED_OSS, "seed_oss" },
     { LLM_ARCH_GROVEMOE, "grovemoe" },
     { LLM_ARCH_APERTUS, "apertus" },
+    { LLM_ARCH_MINIMAX_M2, "minimax-m2" },
+    { LLM_ARCH_COGVLM, "cogvlm" },
+    { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -145,6 +150,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
     { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
     { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
+    { LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -779,6 +785,45 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_QWEN3VL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_QWEN3VLMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_PHI2,
         {
@@ -2312,6 +2357,64 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" },
         },
     },
+    {
+        LLM_ARCH_MINIMAX_M2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+        },
+    },
+    {
+        LLM_ARCH_PANGU_EMBED,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_COGVLM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_VISEXP_ATTN_QKV, "blk.%d.vis_attn_qkv" },
+            { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.vis_attn_output" },
+            { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" },
+            { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" },
+            { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2488,6 +2591,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     // NextN/MTP tensors are currently ignored (reserved for future MTP support)
     // These tensors only exist in the last layer(s) and are treated as output tensors
     {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
```

package/src/llama.cpp/src/llama-arch.h

```diff
@@ -36,6 +36,8 @@ enum llm_arch {
     LLM_ARCH_QWEN2VL,
     LLM_ARCH_QWEN3,
     LLM_ARCH_QWEN3MOE,
+    LLM_ARCH_QWEN3VL,
+    LLM_ARCH_QWEN3VLMOE,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
     LLM_ARCH_PHIMOE,
@@ -107,6 +109,9 @@ enum llm_arch {
     LLM_ARCH_SEED_OSS,
     LLM_ARCH_GROVEMOE,
     LLM_ARCH_APERTUS,
+    LLM_ARCH_MINIMAX_M2,
+    LLM_ARCH_COGVLM,
+    LLM_ARCH_PANGU_EMBED,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -149,6 +154,7 @@ enum llm_kv {
     LLM_KV_EXPERTS_PER_GROUP,
     LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_NEXTN_PREDICT_LAYERS,
+    LLM_KV_NUM_DEEPSTACK_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -455,6 +461,11 @@ enum llm_tensor {
     LLM_TENSOR_SHORTCONV_CONV,
     LLM_TENSOR_SHORTCONV_INPROJ,
     LLM_TENSOR_SHORTCONV_OUTPROJ,
+    LLM_TENSOR_VISEXP_ATTN_QKV,
+    LLM_TENSOR_VISEXP_ATTN_OUT,
+    LLM_TENSOR_VISEXP_FFN_GATE,
+    LLM_TENSOR_VISEXP_FFN_DOWN,
+    LLM_TENSOR_VISEXP_FFN_UP,
     LLM_TENSOR_NEXTN_EH_PROJ,
     LLM_TENSOR_NEXTN_EMBED_TOKENS,
     LLM_TENSOR_NEXTN_ENORM,
```

package/src/llama.cpp/src/llama-batch.cpp

(Several removed lines in the hunks below were only partially captured by the diff viewer; they are kept truncated or blank rather than reconstructed.)

```diff
@@ -215,6 +215,7 @@ bool llama_batch_allocr::init(
         /*.n_seq_tokens =*/ (uint32_t) 1,
         /*.n_seqs       =*/ (uint32_t) batch.n_tokens,
         /*.n_seqs_unq   =*/ (uint32_t) this->seq_id_unq.size(),
+        /*.n_pos        =*/ n_pos_per_embd,
         /*.token        =*/ batch.token,
         /*.embd         =*/ batch.embd,
         /*.pos          =*/ batch.pos,
@@ -251,46 +252,72 @@ bool llama_batch_allocr::init(
     // consistency checks
     //
 
-
-
-
+    if (n_pos_per_embd > 1) {
+        // M-RoPE case: allow position to "jump" forward only (non-continuous positions are allowed)
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
+            if (seq_pos[s].empty()) {
+                continue;
+            }
+
+            const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+
+            if (batch.token) {
+                if (p0 >= 0 && p0 >= seq_pos_min(s)) {
+                    LLAMA_LOG_ERROR(
+                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                        " for M-RoPE, it is required that the position satisfies: X < Y\n",
+                        __func__, s, s, p0, s, seq_pos_min(s));
+
+                    return false;
+                }
+            } else {
+                // embedding inputs can have overlapping positions
+                if (p0 >= 0 && p0 > seq_pos_min(s)) {
+                    LLAMA_LOG_ERROR(
+                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                        " for M-RoPE, it is required that the position satisfies: X <= Y\n",
+                        __func__, s, s, p0, s, seq_pos_min(s));
+
+                    return false;
+                }
+            }
         }
+    } else {
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
+            if (seq_pos[s].empty()) {
+                continue;
+            }
 
-
+            const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
 
-
-
+            if (p0 >= 0) {
+                bool ok = true;
 
-            if (batch.token) {
                 if (seq_pos_min(s) != p0 + 1) {
                     ok = false;
                 }
-            } else {
-                assert(batch.embd);
 
-
-
-
-
+                if (!ok) {
+                    LLAMA_LOG_ERROR(
+                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                        " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+                        __func__, s, s, p0, s, seq_pos_min(s));
+
+                    return false;
                 }
             }
 
-            if (
-                LLAMA_LOG_ERROR(
-                    "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
-                    " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
-                    " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
-                    " it is required that the sequence positions remain consecutive: Y = X + 1\n",
-                    __func__, s, s, p0, s, seq_pos_min(s));
-
+            if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
+                LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
                 return false;
             }
         }
-
-        if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
-            LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
-            return false;
-        }
     }
 
     if (memory) {
@@ -389,6 +416,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
         /*.n_seq_tokens =*/ n_seq_tokens,
         /*.n_seqs       =*/ n_seqs,
         /*.n_seqs_unq   =*/ n_seqs,
+        /*.n_pos        =*/ n_pos_per_embd,
 
         /*.token        =*/ udata->token.data(),
         /*.embd         =*/ nullptr,
@@ -655,10 +683,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
 
     auto udata = std::make_shared<llama_ubatch::data_t>();
 
-    const int32_t n_pos_cur = batch.embd ? n_pos_per_embd : 1;
-
     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
-    const int64_t n_pos_all = (int64_t) n_tokens*
+    const int64_t n_pos_all = (int64_t) n_tokens*n_pos_per_embd;
 
     udata->token .resize(n_tokens);
     udata->embd .resize(n_embd_all);
@@ -680,8 +706,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
             memcpy(udata->embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float));
         }
 
-        for (
-
+        for (size_t j = 0; j < (size_t)n_pos_per_embd; ++j) {
+            // if we are using M-RoPE
+            // if the current batch is text, we need to broadcast the same position across all RoPE sections
+            // otherwise, the input batch is image embeddings, we copy the positions as-is
+            // if we are not using M-RoPE, there is only one position per token (this loop runs only once)
+            size_t src_off = batch.token ? 0 : j*batch.n_tokens;
+            udata->pos[j*n_tokens + i] = batch.pos[src_off + idxs[i]];
         }
 
         udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
@@ -710,6 +741,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         /*.n_seq_tokens =*/ n_tokens/n_seqs,
         /*.n_seqs       =*/ n_seqs,
         /*.n_seqs_unq   =*/ (uint32_t) udata->seq_id_unq.size(),
+        /*.n_pos        =*/ n_pos_per_embd,
 
         /*.token        =*/ batch.token ? udata->token.data() : nullptr,
         /*.embd         =*/ batch.embd ? udata->embd.data() : nullptr,
```

package/src/llama.cpp/src/llama-batch.h

```diff
@@ -17,6 +17,16 @@ struct llama_ubatch {
         return b_equal_seqs != 0;
     }
 
+    // typical for M-RoPE cases:
+    //   0 - sequantial position of the tokens/embeddings in the sequence
+    //   1 - y position in the image
+    //   2 - x position in the image
+    //   3 - other
+    bool is_pos_2d() const {
+        // TODO @ngxson : we may need to check for model arch when more models use >1 positions
+        return n_pos >= 3;
+    }
+
     uint32_t b_equal_seqs; // note: this is a boolean, but we use an int32_t for alignment
                            // otherwise address sanitizer complains
     // TODO: whole_seqs for embeddings?
@@ -25,6 +35,7 @@ struct llama_ubatch {
     uint32_t n_seq_tokens; // tokens per sequence set
     uint32_t n_seqs;       // sequence sets in the ubatch
     uint32_t n_seqs_unq;   // unique sequence ids in the ubatch
+    uint32_t n_pos;        // number of position inputs for each token/embedding
 
     // seq_id_unq: unique sequence ids in the ubatch
     // seq_idx:    indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
@@ -33,7 +44,7 @@ struct llama_ubatch {
     //                          // size         | idx | val
     llama_token  *  token;      // [n_tokens]           | i   | id, token
     float        *  embd;       // [n_embd, n_tokens]   | i   | embd
-    llama_pos    *  pos;        // [n_tokens]
+    llama_pos    *  pos;        // [n_tokens*n_pos]     | i   | pos
     int32_t      *  n_seq_id;   // [n_tokens]           | i   | -
    llama_seq_id **  seq_id;     // [n_tokens]           | s   | s0, s1, seq_id
    llama_seq_id  *  seq_id_unq; // [n_seqs_unq]         | s   | seq_id
```
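
The `pos` layout change above is easy to miss: with M-RoPE the buffer now holds `n_pos` planes of `n_tokens` positions each, and `ubatch_add` writes component `j` of token `i` at `pos[j*n_tokens + i]`. The sketch below only illustrates that indexing; the helper name `ubatch_pos` is not part of the package.

```cpp
#include <cstdint>

// Illustrative helper (not part of llama.cpp): read position component j of
// token i from a plane-major buffer of size n_tokens * n_pos, as implied by
// the `udata->pos[j*n_tokens + i]` write in ubatch_add above.
//   j = 0: sequential position in the sequence
//   j = 1: image y position (M-RoPE)
//   j = 2: image x position (M-RoPE)
static inline int32_t ubatch_pos(const int32_t * pos, uint32_t n_tokens,
                                 uint32_t i, uint32_t j) {
    return pos[j * n_tokens + i];
}
```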
@@ -73,6 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
|
|
73
73
|
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
|
|
74
74
|
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
|
|
75
75
|
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
|
|
76
|
+
{ "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
|
|
76
77
|
};
|
|
77
78
|
|
|
78
79
|
llm_chat_template llm_chat_template_from_str(const std::string & name) {
|
|
@@ -213,6 +214,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
|
|
213
214
|
return LLM_CHAT_TEMPLATE_SEED_OSS;
|
|
214
215
|
} else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
|
|
215
216
|
return LLM_CHAT_TEMPLATE_GROK_2;
|
|
217
|
+
} else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
|
|
218
|
+
return LLM_CHAT_TEMPLATE_PANGU_EMBED;
|
|
216
219
|
}
|
|
217
220
|
return LLM_CHAT_TEMPLATE_UNKNOWN;
|
|
218
221
|
}
|
|
@@ -813,6 +816,35 @@ int32_t llm_chat_apply_template(
|
|
|
813
816
|
if (add_ass) {
|
|
814
817
|
ss << "Assistant:";
|
|
815
818
|
}
|
|
819
|
+
}else if (tmpl == LLM_CHAT_TEMPLATE_PANGU_EMBED) {
|
|
820
|
+
// [unused9]系统:xxx[unused10]
|
|
821
|
+
// [unused9]用户:xxx[unused10]
|
|
822
|
+
// [unused9]助手:xxx[unused10]
|
|
823
|
+
// ...
|
|
824
|
+
for (size_t i = 0; i < chat.size(); ++i) {
|
|
825
|
+
const auto & msg = chat[i];
|
|
826
|
+
const std::string & role = msg->role;
|
|
827
|
+
const std::string & content = msg->content;
|
|
828
|
+
|
|
829
|
+
if (i == 0 && role != "system") {
|
|
830
|
+
ss << "[unused9]系统:[unused10]";
|
|
831
|
+
}
|
|
832
|
+
|
|
833
|
+
if (role == "system") {
|
|
834
|
+
ss << "[unused9]系统:" << content << "[unused10]";
|
|
835
|
+
} else if (role == "user") {
|
|
836
|
+
ss << "[unused9]用户:" << content << "[unused10]";
|
|
837
|
+
} else if (role == "assistant") {
|
|
838
|
+
ss << "[unused9]助手:" << content << "[unused10]";
|
|
839
|
+
} else if (role == "tool") {
|
|
840
|
+
ss << "[unused9]工具:" << content << "[unused10]";
|
|
841
|
+
} else if (role == "function") {
|
|
842
|
+
ss << "[unused9]方法:" << content << "[unused10]";
|
|
843
|
+
}
|
|
844
|
+
}
|
|
845
|
+
if (add_ass) {
|
|
846
|
+
ss << "[unused9]助手:";
|
|
847
|
+
}
|
|
816
848
|
} else {
|
|
817
849
|
// template not supported
|
|
818
850
|
return -1;
|
|
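
For context, the new `pangu-embedded` branch renders each message as `[unused9]<role>:<content>[unused10]` and, when `add_ass` is set, appends the assistant prefix. Below is a minimal sketch of exercising it through the public `llama_chat_apply_template` API; it assumes the standard llama.h signature and a buffer large enough for the rendered prompt.

```cpp
#include <cstdio>
#include <vector>
#include "llama.h"

int main() {
    // Two messages rendered with the built-in "pangu-embedded" template added above.
    std::vector<llama_chat_message> msgs = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello" },
    };

    std::vector<char> buf(1024);
    const int32_t n = llama_chat_apply_template("pangu-embedded", msgs.data(), msgs.size(),
                                                /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n < 0 || n > (int32_t) buf.size()) {
        fprintf(stderr, "template not supported or buffer too small\n");
        return 1;
    }

    // Expected output, following the branch in the hunk above:
    // [unused9]系统:You are a helpful assistant.[unused10][unused9]用户:Hello[unused10][unused9]助手:
    printf("%.*s\n", n, buf.data());
    return 0;
}
```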

package/src/llama.cpp/src/llama-context.cpp

(As above, removed lines that the diff viewer only partially captured are kept truncated.)

```diff
@@ -21,6 +21,8 @@ llama_context::llama_context(
             llama_context_params params) :
     model(model),
     balloc(std::make_unique<llama_batch_allocr>(model.hparams.n_pos_per_embd())) {
+    // TODO warning when creating llama_context with awkward ctx size that is not a power of 2,
+    // may need to be backend-dependent
     LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);
 
     t_start_us = model.t_start_us;
@@ -112,11 +114,28 @@
         }
     }
 
-
+    // ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
+    cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
+
+    if (cparams.kv_unified) {
+        cparams.n_ctx_seq = cparams.n_ctx;
+    } else {
+        cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
+        cparams.n_ctx_seq = GGML_PAD(cparams.n_ctx_seq, 256);
+
+        if (cparams.n_ctx_seq == 0) {
+            throw std::runtime_error("n_ctx_seq == 0");
+        }
+
+        if (cparams.n_ctx != cparams.n_ctx_seq * cparams.n_seq_max) {
+            cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max;
+            LLAMA_LOG_WARN("%s: n_ctx is not divisible by n_seq_max - rounding down to %u\n", __func__, cparams.n_ctx);
+        }
+    }
 
     LLAMA_LOG_INFO("%s: n_seq_max     = %u\n", __func__, cparams.n_seq_max);
     LLAMA_LOG_INFO("%s: n_ctx         = %u\n", __func__, cparams.n_ctx);
-    LLAMA_LOG_INFO("%s:
+    LLAMA_LOG_INFO("%s: n_ctx_seq     = %u\n", __func__, cparams.n_ctx_seq);
     LLAMA_LOG_INFO("%s: n_batch       = %u\n", __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch      = %u\n", __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: causal_attn   = %d\n", __func__, cparams.causal_attn);
@@ -125,14 +144,14 @@
     LLAMA_LOG_INFO("%s: freq_base     = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale    = %g\n", __func__, cparams.rope_freq_scale);
 
-    if (
-        LLAMA_LOG_WARN("%s:
-                __func__,
+    if (cparams.n_ctx_seq < hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
+                __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
     }
 
-    if (
-        LLAMA_LOG_WARN("%s:
-                __func__,
+    if (cparams.n_ctx_seq > hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+                __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
     }
 
     if (!hparams.vocab_only) {
@@ -453,8 +472,8 @@ uint32_t llama_context::n_ctx() const {
     return cparams.n_ctx;
 }
 
-uint32_t llama_context::
-    return cparams.
+uint32_t llama_context::n_ctx_seq() const {
+    return cparams.n_ctx_seq;
 }
 
 uint32_t llama_context::n_batch() const {
@@ -808,7 +827,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
 
     const auto & hparams = model.hparams;
 
-    const int64_t n_embd = hparams.
+    const int64_t n_embd = hparams.n_embd_inp();
     const int64_t n_vocab = model.vocab.n_tokens();
 
     // note: during encode, we always pass the full sequence starting from pos = 0
@@ -977,7 +996,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const auto & hparams = model.hparams;
 
     const int64_t n_vocab = vocab.n_tokens();
-    const int64_t n_embd = hparams.
+    const int64_t n_embd = hparams.n_embd_inp();
 
     // when computing embeddings, all tokens are output
     const bool output_all = cparams.embeddings;
@@ -2135,7 +2154,7 @@ void llama_context::opt_epoch_iter(
             batch.logits [pos_batch] = true;
         }
 
-        if (!balloc->init(batch, model.vocab, nullptr, model.hparams.
+        if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd_inp(), cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
             LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
             return;
         }
@@ -2383,6 +2402,10 @@ uint32_t llama_n_ctx(const llama_context * ctx) {
     return ctx->n_ctx();
 }
 
+uint32_t llama_n_ctx_seq(const llama_context * ctx) {
+    return ctx->n_ctx_seq();
+}
+
 uint32_t llama_n_batch(const llama_context * ctx) {
     return ctx->n_batch();
 }
```
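
The context handling above is the most behaviour-visible change in this file: `n_ctx` is now padded to a multiple of 256, a per-sequence `n_ctx_seq` is derived from it, and a new `llama_n_ctx_seq()` accessor is exported. A small standalone sketch of the rounding arithmetic follows; the `GGML_PAD` macro is copied from ggml.h, and the concrete numbers are only an example.

```cpp
#include <cstdint>
#include <cstdio>

// GGML_PAD from ggml.h: round x up to the next multiple of n (n a power of two).
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

int main() {
    // Example: requested n_ctx = 10000 with n_seq_max = 4 and kv_unified == false.
    uint32_t n_ctx     = GGML_PAD(10000u, 256);       // 10240
    uint32_t n_ctx_seq = GGML_PAD(n_ctx / 4, 256);    // 2560
    if (n_ctx != n_ctx_seq * 4) {
        n_ctx = n_ctx_seq * 4;                        // unchanged here (10240)
    }
    printf("n_ctx = %u, n_ctx_seq = %u\n", n_ctx, n_ctx_seq);
    return 0;
}
```

At runtime the derived per-sequence value can be queried with the new `llama_n_ctx_seq(ctx)` accessor added near the bottom of this file.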

package/src/llama.cpp/src/llama-context.h

```diff
@@ -43,11 +43,11 @@ struct llama_context {
 
     ggml_backend_sched_t get_sched() const;
 
-    uint32_t n_ctx()
-    uint32_t
-    uint32_t n_batch()
-    uint32_t n_ubatch()
-    uint32_t n_seq_max()
+    uint32_t n_ctx()     const;
+    uint32_t n_ctx_seq() const;
+    uint32_t n_batch()   const;
+    uint32_t n_ubatch()  const;
+    uint32_t n_seq_max() const;
 
     uint32_t n_threads() const;
     uint32_t n_threads_batch() const;
```