@fugood/llama.node 1.3.0-rc.6 → 1.3.1
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- package/CMakeLists.txt +12 -2
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +8 -9
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +39 -1001
- package/src/llama.cpp/common/arg.h +2 -2
- package/src/llama.cpp/common/chat.cpp +216 -2
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +33 -0
- package/src/llama.cpp/common/common.h +13 -0
- package/src/llama.cpp/common/download.cpp +1054 -0
- package/src/llama.cpp/common/download.h +55 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +19 -3
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
- package/src/llama.cpp/ggml/include/ggml.h +2 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +7 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +10 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +0 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -35
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
- package/src/llama.cpp/include/llama.h +7 -3
- package/src/llama.cpp/src/CMakeLists.txt +95 -0
- package/src/llama.cpp/src/llama-arch.cpp +108 -0
- package/src/llama.cpp/src/llama-arch.h +11 -0
- package/src/llama.cpp/src/llama-batch.cpp +63 -31
- package/src/llama.cpp/src/llama-batch.h +12 -1
- package/src/llama.cpp/src/llama-chat.cpp +32 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +44 -16
- package/src/llama.cpp/src/llama-context.h +5 -5
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +12 -7
- package/src/llama.cpp/src/llama-hparams.cpp +11 -1
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
- package/src/llama.cpp/src/llama-kv-cache.cpp +56 -21
- package/src/llama.cpp/src/llama-kv-cache.h +2 -4
- package/src/llama.cpp/src/llama-kv-cells.h +44 -2
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +18 -14
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-model.cpp +350 -13194
- package/src/llama.cpp/src/llama-model.h +9 -2
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +5 -0
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/models/apertus.cpp +125 -0
- package/src/llama.cpp/src/models/arcee.cpp +135 -0
- package/src/llama.cpp/src/models/arctic.cpp +138 -0
- package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
- package/src/llama.cpp/src/models/baichuan.cpp +122 -0
- package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
- package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
- package/src/llama.cpp/src/models/bert.cpp +176 -0
- package/src/llama.cpp/src/models/bitnet.cpp +160 -0
- package/src/llama.cpp/src/models/bloom.cpp +101 -0
- package/src/llama.cpp/src/models/chameleon.cpp +178 -0
- package/src/llama.cpp/src/models/chatglm.cpp +132 -0
- package/src/llama.cpp/src/models/codeshell.cpp +111 -0
- package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
- package/src/llama.cpp/src/models/command-r.cpp +122 -0
- package/src/llama.cpp/src/models/dbrx.cpp +123 -0
- package/src/llama.cpp/src/models/deci.cpp +135 -0
- package/src/llama.cpp/src/models/deepseek.cpp +144 -0
- package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
- package/src/llama.cpp/src/models/dots1.cpp +134 -0
- package/src/llama.cpp/src/models/dream.cpp +105 -0
- package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/src/llama.cpp/src/models/ernie4-5.cpp +111 -0
- package/src/llama.cpp/src/models/exaone.cpp +114 -0
- package/src/llama.cpp/src/models/exaone4.cpp +123 -0
- package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
- package/src/llama.cpp/src/models/falcon.cpp +120 -0
- package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
- package/src/llama.cpp/src/models/gemma.cpp +112 -0
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
- package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
- package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
- package/src/llama.cpp/src/models/glm4.cpp +127 -0
- package/src/llama.cpp/src/models/gpt2.cpp +105 -0
- package/src/llama.cpp/src/models/gptneox.cpp +144 -0
- package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
- package/src/llama.cpp/src/models/granite.cpp +211 -0
- package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
- package/src/llama.cpp/src/models/grok.cpp +159 -0
- package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
- package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
- package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
- package/src/llama.cpp/src/models/internlm2.cpp +120 -0
- package/src/llama.cpp/src/models/jais.cpp +86 -0
- package/src/llama.cpp/src/models/jamba.cpp +106 -0
- package/src/llama.cpp/src/models/lfm2.cpp +173 -0
- package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
- package/src/llama.cpp/src/models/llada.cpp +99 -0
- package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
- package/src/llama.cpp/src/models/llama.cpp +155 -0
- package/src/llama.cpp/src/models/mamba.cpp +55 -0
- package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
- package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
- package/src/llama.cpp/src/models/models.h +481 -0
- package/src/llama.cpp/src/models/mpt.cpp +126 -0
- package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
- package/src/llama.cpp/src/models/nemotron.cpp +122 -0
- package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
- package/src/llama.cpp/src/models/olmo.cpp +121 -0
- package/src/llama.cpp/src/models/olmo2.cpp +150 -0
- package/src/llama.cpp/src/models/olmoe.cpp +124 -0
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/openelm.cpp +124 -0
- package/src/llama.cpp/src/models/orion.cpp +123 -0
- package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
- package/src/llama.cpp/src/models/phi2.cpp +121 -0
- package/src/llama.cpp/src/models/phi3.cpp +152 -0
- package/src/llama.cpp/src/models/plamo.cpp +110 -0
- package/src/llama.cpp/src/models/plamo2.cpp +316 -0
- package/src/llama.cpp/src/models/plm.cpp +168 -0
- package/src/llama.cpp/src/models/qwen.cpp +108 -0
- package/src/llama.cpp/src/models/qwen2.cpp +117 -0
- package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
- package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
- package/src/llama.cpp/src/models/qwen3.cpp +117 -0
- package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
- package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
- package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
- package/src/llama.cpp/src/models/refact.cpp +94 -0
- package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
- package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
- package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
- package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
- package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
- package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
- package/src/llama.cpp/src/models/smollm3.cpp +128 -0
- package/src/llama.cpp/src/models/stablelm.cpp +146 -0
- package/src/llama.cpp/src/models/starcoder.cpp +100 -0
- package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
- package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
- package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
- package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/src/llama.cpp/src/models/xverse.cpp +108 -0
package/src/llama.cpp/src/llama-arch.cpp

@@ -32,6 +32,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2VL, "qwen2vl" },
     { LLM_ARCH_QWEN3, "qwen3" },
     { LLM_ARCH_QWEN3MOE, "qwen3moe" },
+    { LLM_ARCH_QWEN3VL, "qwen3vl" },
+    { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PHI3, "phi3" },
     { LLM_ARCH_PHIMOE, "phimoe" },
@@ -103,6 +105,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_SEED_OSS, "seed_oss" },
     { LLM_ARCH_GROVEMOE, "grovemoe" },
     { LLM_ARCH_APERTUS, "apertus" },
+    { LLM_ARCH_MINIMAX_M2, "minimax-m2" },
+    { LLM_ARCH_COGVLM, "cogvlm" },
+    { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -145,6 +150,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
     { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
     { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
+    { LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -779,6 +785,45 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_QWEN3VL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_QWEN3VLMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_PHI2,
         {
@@ -2312,6 +2357,64 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" },
         },
     },
+    {
+        LLM_ARCH_MINIMAX_M2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+        },
+    },
+    {
+        LLM_ARCH_PANGU_EMBED,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_COGVLM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_VISEXP_ATTN_QKV, "blk.%d.vis_attn_qkv" },
+            { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.vis_attn_output" },
+            { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" },
+            { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" },
+            { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2488,6 +2591,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_VISEXP_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     // NextN/MTP tensors are currently ignored (reserved for future MTP support)
     // These tensors only exist in the last layer(s) and are treated as output tensors
     {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
package/src/llama.cpp/src/llama-arch.h

@@ -36,6 +36,8 @@ enum llm_arch {
     LLM_ARCH_QWEN2VL,
     LLM_ARCH_QWEN3,
     LLM_ARCH_QWEN3MOE,
+    LLM_ARCH_QWEN3VL,
+    LLM_ARCH_QWEN3VLMOE,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
     LLM_ARCH_PHIMOE,
@@ -107,6 +109,9 @@ enum llm_arch {
     LLM_ARCH_SEED_OSS,
     LLM_ARCH_GROVEMOE,
     LLM_ARCH_APERTUS,
+    LLM_ARCH_MINIMAX_M2,
+    LLM_ARCH_COGVLM,
+    LLM_ARCH_PANGU_EMBED,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -149,6 +154,7 @@ enum llm_kv {
     LLM_KV_EXPERTS_PER_GROUP,
     LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_NEXTN_PREDICT_LAYERS,
+    LLM_KV_NUM_DEEPSTACK_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -455,6 +461,11 @@ enum llm_tensor {
     LLM_TENSOR_SHORTCONV_CONV,
     LLM_TENSOR_SHORTCONV_INPROJ,
     LLM_TENSOR_SHORTCONV_OUTPROJ,
+    LLM_TENSOR_VISEXP_ATTN_QKV,
+    LLM_TENSOR_VISEXP_ATTN_OUT,
+    LLM_TENSOR_VISEXP_FFN_GATE,
+    LLM_TENSOR_VISEXP_FFN_DOWN,
+    LLM_TENSOR_VISEXP_FFN_UP,
     LLM_TENSOR_NEXTN_EH_PROJ,
     LLM_TENSOR_NEXTN_EMBED_TOKENS,
     LLM_TENSOR_NEXTN_ENORM,
package/src/llama.cpp/src/llama-batch.cpp

@@ -215,6 +215,7 @@ bool llama_batch_allocr::init(
         /*.n_seq_tokens =*/ (uint32_t) 1,
         /*.n_seqs =*/ (uint32_t) batch.n_tokens,
         /*.n_seqs_unq =*/ (uint32_t) this->seq_id_unq.size(),
+        /*.n_pos =*/ n_pos_per_embd,
         /*.token =*/ batch.token,
         /*.embd =*/ batch.embd,
         /*.pos =*/ batch.pos,
@@ -251,46 +252,72 @@ bool llama_batch_allocr::init(
     // consistency checks
     //
 
-
-
-
+    if (n_pos_per_embd > 1) {
+        // M-RoPE case: allow position to "jump" forward only (non-continuous positions are allowed)
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
+            if (seq_pos[s].empty()) {
+                continue;
+            }
+
+            const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+
+            if (batch.token) {
+                if (p0 >= 0 && p0 >= seq_pos_min(s)) {
+                    LLAMA_LOG_ERROR(
+                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                        " for M-RoPE, it is required that the position satisfies: X < Y\n",
+                        __func__, s, s, p0, s, seq_pos_min(s));
+
+                    return false;
+                }
+            } else {
+                // embedding inputs can have overlapping positions
+                if (p0 >= 0 && p0 > seq_pos_min(s)) {
+                    LLAMA_LOG_ERROR(
+                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                        " for M-RoPE, it is required that the position satisfies: X <= Y\n",
+                        __func__, s, s, p0, s, seq_pos_min(s));
+
+                    return false;
+                }
+            }
         }
+    } else {
+        for (uint32_t s = 0; s < n_seq_max; ++s) {
+            if (seq_pos[s].empty()) {
+                continue;
+            }
 
-
+            const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
 
-
-
+            if (p0 >= 0) {
+                bool ok = true;
 
-            if (batch.token) {
                 if (seq_pos_min(s) != p0 + 1) {
                     ok = false;
                 }
-            } else {
-                assert(batch.embd);
 
-
-
-
-
+                if (!ok) {
+                    LLAMA_LOG_ERROR(
+                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                        " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+                        __func__, s, s, p0, s, seq_pos_min(s));
+
+                    return false;
                 }
             }
 
-            if (
-                LLAMA_LOG_ERROR(
-                    "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
-                    " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
-                    " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
-                    " it is required that the sequence positions remain consecutive: Y = X + 1\n",
-                    __func__, s, s, p0, s, seq_pos_min(s));
-
+            if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
+                LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
                 return false;
             }
         }
-
-        if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
-            LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
-            return false;
-        }
     }
 
     if (memory) {
@@ -389,6 +416,7 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
         /*.n_seq_tokens =*/ n_seq_tokens,
         /*.n_seqs =*/ n_seqs,
         /*.n_seqs_unq =*/ n_seqs,
+        /*.n_pos =*/ n_pos_per_embd,
 
         /*.token =*/ udata->token.data(),
         /*.embd =*/ nullptr,
@@ -655,10 +683,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
 
     auto udata = std::make_shared<llama_ubatch::data_t>();
 
-    const int32_t n_pos_cur = batch.embd ? n_pos_per_embd : 1;
-
     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
-    const int64_t n_pos_all = (int64_t) n_tokens*
+    const int64_t n_pos_all = (int64_t) n_tokens*n_pos_per_embd;
 
     udata->token .resize(n_tokens);
     udata->embd .resize(n_embd_all);
@@ -680,8 +706,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
             memcpy(udata->embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float));
         }
 
-        for (
-
+        for (size_t j = 0; j < (size_t)n_pos_per_embd; ++j) {
+            // if we are using M-RoPE
+            // if the current batch is text, we need to broadcast the same position across all RoPE sections
+            // otherwise, the input batch is image embeddings, we copy the positions as-is
+            // if we are not using M-RoPE, there is only one position per token (this loop runs only once)
+            size_t src_off = batch.token ? 0 : j*batch.n_tokens;
+            udata->pos[j*n_tokens + i] = batch.pos[src_off + idxs[i]];
         }
 
         udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
@@ -710,6 +741,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
         /*.n_seq_tokens =*/ n_tokens/n_seqs,
         /*.n_seqs =*/ n_seqs,
         /*.n_seqs_unq =*/ (uint32_t) udata->seq_id_unq.size(),
+        /*.n_pos =*/ n_pos_per_embd,
 
         /*.token =*/ batch.token ? udata->token.data() : nullptr,
         /*.embd =*/ batch.embd ? udata->embd.data() : nullptr,
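The position-consistency check added to llama_batch_allocr::init above applies different rules depending on whether the model uses M-RoPE (n_pos_per_embd > 1): a token batch must start strictly after the last cached position (X < Y), an embedding batch may start at it (X <= Y), and without M-RoPE positions must stay strictly consecutive (Y = X + 1). A minimal standalone sketch of that rule follows; the helper name and signature are illustrative, not llama.cpp API.

    #include <cstdint>

    // p0    : last position stored in the KV cache for the sequence (-1 if empty)
    // y_min : smallest position of the sequence in the incoming batch
    static bool seq_pos_ok(int32_t p0, int32_t y_min, bool mrope, bool token_batch) {
        if (p0 < 0) {
            return true;                        // nothing cached yet - any start is fine
        }
        if (mrope) {
            return token_batch ? p0 <  y_min    // tokens may jump forward: X < Y
                               : p0 <= y_min;   // embeddings may overlap:  X <= Y
        }
        return y_min == p0 + 1;                 // otherwise consecutive:   Y = X + 1
    }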
package/src/llama.cpp/src/llama-batch.h

@@ -17,6 +17,16 @@ struct llama_ubatch {
         return b_equal_seqs != 0;
     }
 
+    // typical for M-RoPE cases:
+    // 0 - sequantial position of the tokens/embeddings in the sequence
+    // 1 - y position in the image
+    // 2 - x position in the image
+    // 3 - other
+    bool is_pos_2d() const {
+        // TODO @ngxson : we may need to check for model arch when more models use >1 positions
+        return n_pos >= 3;
+    }
+
     uint32_t b_equal_seqs; // note: this is a boolean, but we use an int32_t for alignment
                            // otherwise address sanitizer complains
     // TODO: whole_seqs for embeddings?
@@ -25,6 +35,7 @@ struct llama_ubatch {
     uint32_t n_seq_tokens; // tokens per sequence set
     uint32_t n_seqs;       // sequence sets in the ubatch
     uint32_t n_seqs_unq;   // unique sequence ids in the ubatch
+    uint32_t n_pos;        // number of position inputs for each token/embedding
 
     // seq_id_unq: unique sequence ids in the ubatch
     // seq_idx: indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
@@ -33,7 +44,7 @@ struct llama_ubatch {
     //                    // size | idx | val
     llama_token * token;  // [n_tokens] | i | id, token
     float * embd;         // [n_embd, n_tokens] | i | embd
-    llama_pos * pos;      // [n_tokens]
+    llama_pos * pos;      // [n_tokens*n_pos] | i | pos
     int32_t * n_seq_id;   // [n_tokens] | i | -
     llama_seq_id ** seq_id;     // [n_tokens] | s | s0, s1, seq_id
     llama_seq_id * seq_id_unq;  // [n_seqs_unq] | s | seq_id
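The new n_pos field sizes the pos array as [n_tokens*n_pos] in plane-major order: plane j (sequence position, image y, image x, ...) occupies pos[j*n_tokens + i]. For a text batch, ubatch_add broadcasts the single per-token position into every plane; for image embeddings the caller already supplies n_pos values per token. A rough sketch of that fill, with a hypothetical helper name that is not part of llama.cpp:

    #include <cstdint>
    #include <vector>

    // Build a plane-major pos buffer of size n_tokens*n_pos from the batch input.
    std::vector<int32_t> make_pos(const std::vector<int32_t> & in, uint32_t n_tokens, uint32_t n_pos, bool is_text) {
        std::vector<int32_t> out((size_t) n_tokens*n_pos);
        for (uint32_t j = 0; j < n_pos; ++j) {
            for (uint32_t i = 0; i < n_tokens; ++i) {
                const size_t src = is_text ? i : (size_t) j*n_tokens + i; // broadcast vs. copy
                out[(size_t) j*n_tokens + i] = in[src];
            }
        }
        return out;
    }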
package/src/llama.cpp/src/llama-chat.cpp

@@ -73,6 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
     { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
     { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
+    { "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -213,6 +214,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_SEED_OSS;
     } else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
         return LLM_CHAT_TEMPLATE_GROK_2;
+    } else if (tmpl_contains(LU8("[unused9]系统:[unused10]"))) {
+        return LLM_CHAT_TEMPLATE_PANGU_EMBED;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -813,6 +816,35 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "Assistant:";
         }
+    }else if (tmpl == LLM_CHAT_TEMPLATE_PANGU_EMBED) {
+        // [unused9]系统:xxx[unused10]
+        // [unused9]用户:xxx[unused10]
+        // [unused9]助手:xxx[unused10]
+        // ...
+        for (size_t i = 0; i < chat.size(); ++i) {
+            const auto & msg = chat[i];
+            const std::string & role = msg->role;
+            const std::string & content = msg->content;
+
+            if (i == 0 && role != "system") {
+                ss << "[unused9]系统:[unused10]";
+            }
+
+            if (role == "system") {
+                ss << "[unused9]系统:" << content << "[unused10]";
+            } else if (role == "user") {
+                ss << "[unused9]用户:" << content << "[unused10]";
+            } else if (role == "assistant") {
+                ss << "[unused9]助手:" << content << "[unused10]";
+            } else if (role == "tool") {
+                ss << "[unused9]工具:" << content << "[unused10]";
+            } else if (role == "function") {
+                ss << "[unused9]方法:" << content << "[unused10]";
+            }
+        }
+        if (add_ass) {
+            ss << "[unused9]助手:";
+        }
     } else {
         // template not supported
         return -1;
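Applied to a system message followed by one user turn with add_ass enabled, the template added above renders the prompt as a single string with no separators between messages, for example (message contents are made up):

    [unused9]系统:You are a helpful assistant[unused10][unused9]用户:你好[unused10][unused9]助手:

A conversation that does not start with a system message gets an empty [unused9]系统:[unused10] prefix, which is also the marker that llm_chat_detect_template looks for.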
package/src/llama.cpp/src/llama-context.cpp

@@ -21,6 +21,8 @@ llama_context::llama_context(
         llama_context_params params) :
     model(model),
     balloc(std::make_unique<llama_batch_allocr>(model.hparams.n_pos_per_embd())) {
+    // TODO warning when creating llama_context with awkward ctx size that is not a power of 2,
+    // may need to be backend-dependent
     LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);
 
     t_start_us = model.t_start_us;
@@ -112,11 +114,28 @@ llama_context::llama_context(
         }
     }
 
-
+    // ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
+    cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
+
+    if (cparams.kv_unified) {
+        cparams.n_ctx_seq = cparams.n_ctx;
+    } else {
+        cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
+        cparams.n_ctx_seq = GGML_PAD(cparams.n_ctx_seq, 256);
+
+        if (cparams.n_ctx_seq == 0) {
+            throw std::runtime_error("n_ctx_seq == 0");
+        }
+
+        if (cparams.n_ctx != cparams.n_ctx_seq * cparams.n_seq_max) {
+            cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max;
+            LLAMA_LOG_WARN("%s: n_ctx is not divisible by n_seq_max - rounding down to %u\n", __func__, cparams.n_ctx);
+        }
+    }
 
     LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
     LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
-    LLAMA_LOG_INFO("%s:
+    LLAMA_LOG_INFO("%s: n_ctx_seq = %u\n", __func__, cparams.n_ctx_seq);
     LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
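With the change above, n_ctx is first padded to a multiple of 256; for a split (non-unified) KV cache the per-sequence context n_ctx_seq is derived from it, padded to 256 as well, and n_ctx is then recomputed as n_ctx_seq * n_seq_max. A worked example of the arithmetic, assuming GGML_PAD rounds up to the next multiple as in ggml.h:

    #include <cstdint>
    #include <cstdio>

    // Assumption: GGML_PAD(x, n) rounds x up to a multiple of n.
    static uint32_t pad(uint32_t x, uint32_t n) { return (x + n - 1) / n * n; }

    int main() {
        uint32_t n_ctx = 10000, n_seq_max = 4;             // hypothetical request
        n_ctx = pad(n_ctx, 256);                           // -> 10240
        uint32_t n_ctx_seq = pad(n_ctx / n_seq_max, 256);  // 10240/4 = 2560 -> 2560
        if (n_ctx != n_ctx_seq * n_seq_max) {
            n_ctx = n_ctx_seq * n_seq_max;                 // keep the product consistent
        }
        printf("n_ctx = %u, n_ctx_seq = %u\n", n_ctx, n_ctx_seq); // 10240, 2560
        return 0;
    }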
@@ -125,14 +144,14 @@ llama_context::llama_context(
     LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
 
-    if (
-        LLAMA_LOG_WARN("%s:
-                __func__,
+    if (cparams.n_ctx_seq < hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
+                __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
     }
 
-    if (
-        LLAMA_LOG_WARN("%s:
-                __func__,
+    if (cparams.n_ctx_seq > hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+                __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
     }
 
     if (!hparams.vocab_only) {
@@ -268,9 +287,7 @@ llama_context::llama_context(
         if (pipeline_parallel) {
             LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
         }
-    }
 
-    if (!hparams.vocab_only) {
         llama_memory_context_ptr mctx;
         if (memory) {
             LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
@@ -343,7 +360,14 @@ llama_context::llama_context(
         {
             auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
             if (!gf) {
-
+                if (pipeline_parallel) {
+                    LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
+                    sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
+                    gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+                }
+                if (!gf) {
+                    throw std::runtime_error("failed to allocate compute pp buffers");
+                }
             }
 
             n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
@@ -448,8 +472,8 @@ uint32_t llama_context::n_ctx() const {
     return cparams.n_ctx;
 }
 
-uint32_t llama_context::
-    return cparams.
+uint32_t llama_context::n_ctx_seq() const {
+    return cparams.n_ctx_seq;
 }
 
 uint32_t llama_context::n_batch() const {
@@ -803,7 +827,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
 
     const auto & hparams = model.hparams;
 
-    const int64_t n_embd = hparams.
+    const int64_t n_embd = hparams.n_embd_inp();
     const int64_t n_vocab = model.vocab.n_tokens();
 
     // note: during encode, we always pass the full sequence starting from pos = 0
@@ -972,7 +996,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const auto & hparams = model.hparams;
 
     const int64_t n_vocab = vocab.n_tokens();
-    const int64_t n_embd = hparams.
+    const int64_t n_embd = hparams.n_embd_inp();
 
     // when computing embeddings, all tokens are output
     const bool output_all = cparams.embeddings;
@@ -2130,7 +2154,7 @@ void llama_context::opt_epoch_iter(
             batch.logits [pos_batch] = true;
         }
 
-        if (!balloc->init(batch, model.vocab, nullptr, model.hparams.
+        if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd_inp(), cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
            LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
            return;
        }
@@ -2378,6 +2402,10 @@ uint32_t llama_n_ctx(const llama_context * ctx) {
     return ctx->n_ctx();
 }
 
+uint32_t llama_n_ctx_seq(const llama_context * ctx) {
+    return ctx->n_ctx_seq();
+}
+
 uint32_t llama_n_batch(const llama_context * ctx) {
     return ctx->n_batch();
 }
package/src/llama.cpp/src/llama-context.h

@@ -43,11 +43,11 @@ struct llama_context {
 
     ggml_backend_sched_t get_sched() const;
 
-    uint32_t n_ctx()
-    uint32_t
-    uint32_t n_batch()
-    uint32_t n_ubatch()
-    uint32_t n_seq_max()
+    uint32_t n_ctx() const;
+    uint32_t n_ctx_seq() const;
+    uint32_t n_batch() const;
+    uint32_t n_ubatch() const;
+    uint32_t n_seq_max() const;
 
     uint32_t n_threads() const;
     uint32_t n_threads_batch() const;
package/src/llama.cpp/src/llama-graph.cpp

@@ -810,6 +810,9 @@ ggml_tensor * llm_graph_context::build_ffn(
             GGML_ABORT("fatal error");
     }
 
+    //expand here so that we can fuse ffn gate
+    ggml_build_forward_expand(gf, cur);
+
     if (gate && type_gate == LLM_FFN_PAR) {
         cur = ggml_mul(ctx0, cur, tmp);
         cb(cur, "ffn_gate_par", il);
@@ -1006,10 +1009,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
         cb(weights_sum, "ffn_moe_weights_sum", il);
 
-
-
-
-        }
+        // Avoid division by zero, clamp to smallest number representable by F16
+        weights_sum = ggml_clamp(ctx0, weights_sum, 6.103515625e-5, INFINITY);
+        cb(weights_sum, "ffn_moe_weights_sum_clamped", il);
 
         weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
         cb(weights, "ffn_moe_weights_norm", il);
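The clamp constant above, 6.103515625e-5, is 2^-14, the smallest positive normal FP16 value, so normalizing the expert weights can no longer divide by a sum that underflows to zero in half precision. A quick check of the constant:

    #include <cstdio>

    int main() {
        const double fp16_min_normal = 1.0 / 16384.0; // 2^-14
        printf("%.10g\n", fp16_min_normal);           // prints 6.103515625e-05
        return 0;
    }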
@@ -1091,6 +1093,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             GGML_ABORT("fatal error");
     }
 
+    //expand here so that we can fuse ffn gate
+    ggml_build_forward_expand(gf, cur);
+
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
@@ -1137,7 +1142,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 
 // input embeddings with optional lora
 ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
-    const int64_t n_embd = hparams.
+    const int64_t n_embd = hparams.n_embd_inp();
 
     auto inp = std::make_unique<llm_graph_input_embd>();
 
@@ -1274,7 +1279,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
     //     return cur;
     //}
 
-    const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.
+    const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
     const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
 
     cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
@@ -2030,7 +2035,7 @@ int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buck
 
     if (bidirectional) {
         relative_bucket += (relative_position > 0) * n_buckets;
-        relative_position = abs(relative_position);
+        relative_position = std::abs(relative_position);
     } else {
         relative_position = -std::min<int32_t>(relative_position, 0);
     }