cui-llama.rn 1.3.4 → 1.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/common.cpp +7 -4
- package/cpp/common.h +14 -2
- package/cpp/ggml-alloc.c +0 -1
- package/cpp/ggml-backend-reg.cpp +74 -49
- package/cpp/ggml-cpu-aarch64.cpp +51 -71
- package/cpp/ggml-cpu.c +6 -6
- package/cpp/ggml-cpu.cpp +9 -0
- package/cpp/ggml-impl.h +16 -0
- package/cpp/ggml.c +153 -136
- package/cpp/ggml.h +29 -12
- package/cpp/llama-grammar.cpp +15 -15
- package/cpp/llama-grammar.h +2 -5
- package/cpp/llama-vocab.cpp +5 -1
- package/cpp/llama-vocab.h +1 -1
- package/cpp/llama.cpp +992 -300
- package/cpp/llama.h +0 -3
- package/cpp/sgemm.cpp +265 -258
- package/cpp/sgemm.h +2 -2
- package/package.json +1 -1
package/cpp/llama.cpp
CHANGED
@@ -157,6 +157,7 @@ static std::string format(const char * fmt, ...) {
 
 enum llm_arch {
     LLM_ARCH_LLAMA,
+    LLM_ARCH_DECI,
     LLM_ARCH_FALCON,
     LLM_ARCH_BAICHUAN,
     LLM_ARCH_GROK,
@@ -208,63 +209,66 @@ enum llm_arch {
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
+    LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_UNKNOWN,
 };
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA, "llama" },
-    { LLM_ARCH_FALCON, "falcon" },
-    { LLM_ARCH_GROK, "grok" },
-    { LLM_ARCH_GPT2, "gpt2" },
-    { LLM_ARCH_GPTJ, "gptj" },
-    { LLM_ARCH_GPTNEOX, "gptneox" },
-    { LLM_ARCH_MPT, "mpt" },
-    { LLM_ARCH_BAICHUAN, "baichuan" },
-    { LLM_ARCH_STARCODER, "starcoder" },
-    { LLM_ARCH_REFACT, "refact" },
-    { LLM_ARCH_BERT, "bert" },
-    { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
-    { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
-    { LLM_ARCH_BLOOM, "bloom" },
-    { LLM_ARCH_STABLELM, "stablelm" },
-    { LLM_ARCH_QWEN, "qwen" },
-    { LLM_ARCH_QWEN2, "qwen2" },
-    { LLM_ARCH_QWEN2MOE, "qwen2moe" },
-    { LLM_ARCH_QWEN2VL, "qwen2vl" },
-    { LLM_ARCH_PHI2, "phi2" },
-    { LLM_ARCH_PHI3, "phi3" },
-    { LLM_ARCH_PLAMO, "plamo" },
-    { LLM_ARCH_CODESHELL, "codeshell" },
-    { LLM_ARCH_ORION, "orion" },
-    { LLM_ARCH_INTERNLM2, "internlm2" },
-    { LLM_ARCH_MINICPM, "minicpm" },
-    { LLM_ARCH_MINICPM3, "minicpm3" },
-    { LLM_ARCH_GEMMA, "gemma" },
-    { LLM_ARCH_GEMMA2, "gemma2" },
-    { LLM_ARCH_STARCODER2, "starcoder2" },
-    { LLM_ARCH_MAMBA, "mamba" },
-    { LLM_ARCH_XVERSE, "xverse" },
-    { LLM_ARCH_COMMAND_R, "command-r" },
-    { LLM_ARCH_DBRX, "dbrx" },
-    { LLM_ARCH_OLMO, "olmo" },
-    { LLM_ARCH_OLMO2, "olmo2" },
-    { LLM_ARCH_OLMOE, "olmoe" },
-    { LLM_ARCH_OPENELM, "openelm" },
-    { LLM_ARCH_ARCTIC, "arctic" },
-    { LLM_ARCH_DEEPSEEK, "deepseek" },
-    { LLM_ARCH_DEEPSEEK2, "deepseek2" },
-    { LLM_ARCH_CHATGLM, "chatglm" },
-    { LLM_ARCH_BITNET, "bitnet" },
-    { LLM_ARCH_T5, "t5" },
-    { LLM_ARCH_T5ENCODER, "t5encoder" },
-    { LLM_ARCH_JAIS, "jais" },
-    { LLM_ARCH_NEMOTRON, "nemotron" },
-    { LLM_ARCH_EXAONE, "exaone" },
-    { LLM_ARCH_RWKV6, "rwkv6" },
-    { LLM_ARCH_GRANITE, "granite" },
-    { LLM_ARCH_GRANITE_MOE, "granitemoe" },
-    { LLM_ARCH_CHAMELEON, "chameleon" },
-    { LLM_ARCH_UNKNOWN, "(unknown)" },
+    { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_DECI, "deci" },
+    { LLM_ARCH_FALCON, "falcon" },
+    { LLM_ARCH_GROK, "grok" },
+    { LLM_ARCH_GPT2, "gpt2" },
+    { LLM_ARCH_GPTJ, "gptj" },
+    { LLM_ARCH_GPTNEOX, "gptneox" },
+    { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
+    { LLM_ARCH_STARCODER, "starcoder" },
+    { LLM_ARCH_REFACT, "refact" },
+    { LLM_ARCH_BERT, "bert" },
+    { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+    { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+    { LLM_ARCH_BLOOM, "bloom" },
+    { LLM_ARCH_STABLELM, "stablelm" },
+    { LLM_ARCH_QWEN, "qwen" },
+    { LLM_ARCH_QWEN2, "qwen2" },
+    { LLM_ARCH_QWEN2MOE, "qwen2moe" },
+    { LLM_ARCH_QWEN2VL, "qwen2vl" },
+    { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PHI3, "phi3" },
+    { LLM_ARCH_PLAMO, "plamo" },
+    { LLM_ARCH_CODESHELL, "codeshell" },
+    { LLM_ARCH_ORION, "orion" },
+    { LLM_ARCH_INTERNLM2, "internlm2" },
+    { LLM_ARCH_MINICPM, "minicpm" },
+    { LLM_ARCH_MINICPM3, "minicpm3" },
+    { LLM_ARCH_GEMMA, "gemma" },
+    { LLM_ARCH_GEMMA2, "gemma2" },
+    { LLM_ARCH_STARCODER2, "starcoder2" },
+    { LLM_ARCH_MAMBA, "mamba" },
+    { LLM_ARCH_XVERSE, "xverse" },
+    { LLM_ARCH_COMMAND_R, "command-r" },
+    { LLM_ARCH_DBRX, "dbrx" },
+    { LLM_ARCH_OLMO, "olmo" },
+    { LLM_ARCH_OLMO2, "olmo2" },
+    { LLM_ARCH_OLMOE, "olmoe" },
+    { LLM_ARCH_OPENELM, "openelm" },
+    { LLM_ARCH_ARCTIC, "arctic" },
+    { LLM_ARCH_DEEPSEEK, "deepseek" },
+    { LLM_ARCH_DEEPSEEK2, "deepseek2" },
+    { LLM_ARCH_CHATGLM, "chatglm" },
+    { LLM_ARCH_BITNET, "bitnet" },
+    { LLM_ARCH_T5, "t5" },
+    { LLM_ARCH_T5ENCODER, "t5encoder" },
+    { LLM_ARCH_JAIS, "jais" },
+    { LLM_ARCH_NEMOTRON, "nemotron" },
+    { LLM_ARCH_EXAONE, "exaone" },
+    { LLM_ARCH_RWKV6, "rwkv6" },
+    { LLM_ARCH_GRANITE, "granite" },
+    { LLM_ARCH_GRANITE_MOE, "granitemoe" },
+    { LLM_ARCH_CHAMELEON, "chameleon" },
+    { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+    { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
 enum llm_kv {
@@ -284,6 +288,7 @@ enum llm_kv {
     LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
+    LLM_KV_FEATURES_LENGTH,
     LLM_KV_BLOCK_COUNT,
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
@@ -315,6 +320,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_VALUE_LENGTH,
     LLM_KV_ATTENTION_LAYERNORM_EPS,
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+    LLM_KV_ATTENTION_GROUPNORM_EPS,
+    LLM_KV_ATTENTION_GROUPNORM_GROUPS,
     LLM_KV_ATTENTION_CAUSAL,
     LLM_KV_ATTENTION_Q_LORA_RANK,
     LLM_KV_ATTENTION_KV_LORA_RANK,
@@ -378,6 +385,12 @@ enum llm_kv {
     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
 
+    LLM_KV_POSNET_EMBEDDING_LENGTH,
+    LLM_KV_POSNET_BLOCK_COUNT,
+
+    LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
+    LLM_KV_CONVNEXT_BLOCK_COUNT,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -401,6 +414,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
     { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
     { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+    { LLM_KV_FEATURES_LENGTH, "%s.features_length" },
     { LLM_KV_BLOCK_COUNT, "%s.block_count" },
     { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
     { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
@@ -432,6 +446,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
     { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+    { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+    { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
     { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
     { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
     { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
@@ -462,6 +478,12 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 
     { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
 
+    { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" },
+    { LLM_KV_POSNET_BLOCK_COUNT, "%s.posnet.block_count" },
+
+    { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
+    { LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" },
+
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -620,6 +642,22 @@ enum llm_tensor {
     LLM_TENSOR_ENC_OUTPUT_NORM,
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
+    LLM_TENSOR_CONV1D,
+    LLM_TENSOR_CONVNEXT_DW,
+    LLM_TENSOR_CONVNEXT_NORM,
+    LLM_TENSOR_CONVNEXT_PW1,
+    LLM_TENSOR_CONVNEXT_PW2,
+    LLM_TENSOR_CONVNEXT_GAMMA,
+    LLM_TENSOR_POS_NET_CONV1,
+    LLM_TENSOR_POS_NET_CONV2,
+    LLM_TENSOR_POS_NET_NORM,
+    LLM_TENSOR_POS_NET_NORM1,
+    LLM_TENSOR_POS_NET_NORM2,
+    LLM_TENSOR_POS_NET_ATTN_NORM,
+    LLM_TENSOR_POS_NET_ATTN_Q,
+    LLM_TENSOR_POS_NET_ATTN_K,
+    LLM_TENSOR_POS_NET_ATTN_V,
+    LLM_TENSOR_POS_NET_ATTN_OUT,
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -649,6 +687,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_DECI,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_BAICHUAN,
         {
@@ -1604,6 +1668,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
         },
     },
+    {
+        LLM_ARCH_WAVTOKENIZER_DEC,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_CONV1D, "conv1d" },
+            { LLM_TENSOR_CONVNEXT_DW, "convnext.%d.dw" },
+            { LLM_TENSOR_CONVNEXT_NORM, "convnext.%d.norm" },
+            { LLM_TENSOR_CONVNEXT_PW1, "convnext.%d.pw1" },
+            { LLM_TENSOR_CONVNEXT_PW2, "convnext.%d.pw2" },
+            { LLM_TENSOR_CONVNEXT_GAMMA, "convnext.%d.gamma" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_POS_NET_CONV1, "posnet.%d.conv1" },
+            { LLM_TENSOR_POS_NET_CONV2, "posnet.%d.conv2" },
+            { LLM_TENSOR_POS_NET_NORM, "posnet.%d.norm" },
+            { LLM_TENSOR_POS_NET_NORM1, "posnet.%d.norm1" },
+            { LLM_TENSOR_POS_NET_NORM2, "posnet.%d.norm2" },
+            { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" },
+            { LLM_TENSOR_POS_NET_ATTN_Q, "posnet.%d.attn_q" },
+            { LLM_TENSOR_POS_NET_ATTN_K, "posnet.%d.attn_k" },
+            { LLM_TENSOR_POS_NET_ATTN_V, "posnet.%d.attn_v" },
+            { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1623,6 +1712,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
    LLM_CHAT_TEMPLATE_MISTRAL_V7,
    LLM_CHAT_TEMPLATE_PHI_3,
+    LLM_CHAT_TEMPLATE_FALCON_3,
    LLM_CHAT_TEMPLATE_ZEPHYR,
    LLM_CHAT_TEMPLATE_MONARCH,
    LLM_CHAT_TEMPLATE_GEMMA,
@@ -1641,6 +1731,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_RWKV_WORLD,
    LLM_CHAT_TEMPLATE_GRANITE,
    LLM_CHAT_TEMPLATE_GIGACHAT,
+    LLM_CHAT_TEMPLATE_MEGREZ,
    LLM_CHAT_TEMPLATE_UNKNOWN,
 };
 
@@ -1655,6 +1746,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
     { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
+    { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
     { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
     { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
     { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
@@ -1673,6 +1765,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
     { "granite", LLM_CHAT_TEMPLATE_GRANITE },
     { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
+    { "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
 };
 
 static llm_arch llm_arch_from_string(const std::string & name) {
@@ -2494,15 +2587,26 @@ static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;
 
+struct llama_hparams_posnet {
+    uint32_t n_embd;
+    uint32_t n_layer;
+};
+
+struct llama_hparams_convnext {
+    uint32_t n_embd;
+    uint32_t n_layer;
+};
+
 struct llama_hparams {
     bool vocab_only;
     bool rope_finetuned;
     bool use_par_res;
     bool swin_norm;
 
-    uint32_t n_vocab;
+    uint32_t n_vocab = 0;
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
+    uint32_t n_embd_features = 0;
     uint32_t n_layer;
     uint32_t n_rot;
     uint32_t n_swa = 0; // sliding window attention (SWA)
|
|
2513
2617
|
uint32_t n_vocab_type = 0; // for BERT-style token types
|
2514
2618
|
uint32_t n_rel_attn_bkts = 0;
|
2515
2619
|
|
2620
|
+
// for WavTokenizer
|
2621
|
+
struct llama_hparams_posnet posnet;
|
2622
|
+
struct llama_hparams_convnext convnext;
|
2623
|
+
|
2516
2624
|
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
|
2517
2625
|
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
|
2518
2626
|
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
|
@@ -2527,6 +2635,9 @@ struct llama_hparams {
|
|
2527
2635
|
|
2528
2636
|
float f_norm_eps;
|
2529
2637
|
float f_norm_rms_eps;
|
2638
|
+
float f_norm_group_eps;
|
2639
|
+
|
2640
|
+
uint32_t n_norm_groups;
|
2530
2641
|
|
2531
2642
|
float f_attn_logit_softcapping = 50.0f;
|
2532
2643
|
float f_final_logit_softcapping = 30.0f;
|
@@ -2572,66 +2683,6 @@ struct llama_hparams {
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
     enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
 
-    bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only != other.vocab_only) return true;
-        if (this->n_vocab != other.n_vocab) return true;
-        if (this->n_ctx_train != other.n_ctx_train) return true;
-        if (this->n_embd != other.n_embd) return true;
-        if (this->n_layer != other.n_layer) return true;
-        if (this->n_rot != other.n_rot) return true;
-        if (this->n_swa != other.n_swa) return true;
-        if (this->n_embd_head_k != other.n_embd_head_k) return true;
-        if (this->n_embd_head_v != other.n_embd_head_v) return true;
-        if (this->n_expert != other.n_expert) return true;
-        if (this->n_expert_used != other.n_expert_used) return true;
-
-        if (this->n_head_arr != other.n_head_arr) return true;
-        if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
-        if (this->n_ff_arr != other.n_ff_arr) return true;
-
-        if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
-        if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-        if (this->n_lora_q != other.n_lora_q) return true;
-        if (this->n_lora_kv != other.n_lora_kv) return true;
-        if (this->n_ff_exp != other.n_ff_exp) return true;
-        if (this->n_ff_shexp != other.n_ff_shexp) return true;
-        if (this->n_expert_shared != other.n_expert_shared) return true;
-
-        if (this->rope_finetuned != other.rope_finetuned) return true;
-        if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
-        if (std::equal(std::begin(this->rope_sections),
-                std::end(this->rope_sections),
-                std::begin(other.rope_sections))) return true;
-
-        if (this->ssm_d_conv != other.ssm_d_conv) return true;
-        if (this->ssm_d_inner != other.ssm_d_inner) return true;
-        if (this->ssm_d_state != other.ssm_d_state) return true;
-        if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
-        if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
-
-        if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
-        if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true;
-        if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
-        if (this->wkv_head_size != other.wkv_head_size) return true;
-
-        if (this->dec_start_token_id != other.dec_start_token_id) return true;
-
-        const float EPSILON = 1e-9f;
-
-        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
-        if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
-        if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
-        if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
-        if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
-        if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
-        if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;
-
-        return false;
-    }
-
     uint32_t n_head(uint32_t il = 0) const {
         if (il < n_layer) {
             return n_head_arr[il];
@@ -2684,21 +2735,21 @@ struct llama_hparams {
         if (wkv_head_size != 0) {
             // for RWKV models
             return 2 * n_embd;
-        } else {
-            // TODO: maybe support other convolution strides than 1
-            // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
-            return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
         }
+
+        // TODO: maybe support other convolution strides than 1
+        // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+        return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
     }
 
     uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
         if (wkv_head_size != 0) {
             // corresponds to RWKV's wkv_states size
             return n_embd * wkv_head_size;
-        } else {
-            // corresponds to Mamba's ssm_states size
-            return ssm_d_state * ssm_d_inner;
         }
+
+        // corresponds to Mamba's ssm_states size
+        return ssm_d_state * ssm_d_inner;
     }
 };
 
@@ -2736,142 +2787,187 @@ struct llama_cparams {
     void * cb_eval_user_data;
 };
 
-
-
-
-
-
-
+struct llama_layer_posnet {
+    // resnet
+    struct lm_ggml_tensor * norm1 = nullptr;
+    struct lm_ggml_tensor * norm1_b = nullptr;
+
+    struct lm_ggml_tensor * conv1 = nullptr;
+    struct lm_ggml_tensor * conv1_b = nullptr;
+
+    struct lm_ggml_tensor * norm2 = nullptr;
+    struct lm_ggml_tensor * norm2_b = nullptr;
+
+    struct lm_ggml_tensor * conv2 = nullptr;
+    struct lm_ggml_tensor * conv2_b = nullptr;
 
+    // attention
+    struct lm_ggml_tensor * attn_norm = nullptr;
+    struct lm_ggml_tensor * attn_norm_b = nullptr;
+
+    struct lm_ggml_tensor * attn_q = nullptr;
+    struct lm_ggml_tensor * attn_q_b = nullptr;
+
+    struct lm_ggml_tensor * attn_k = nullptr;
+    struct lm_ggml_tensor * attn_k_b = nullptr;
+
+    struct lm_ggml_tensor * attn_v = nullptr;
+    struct lm_ggml_tensor * attn_v_b = nullptr;
+
+    struct lm_ggml_tensor * attn_o = nullptr;
+    struct lm_ggml_tensor * attn_o_b = nullptr;
+
+    // normalize
+    struct lm_ggml_tensor * norm = nullptr;
+    struct lm_ggml_tensor * norm_b = nullptr;
+};
+
+struct llama_layer_convnext {
+    struct lm_ggml_tensor * dw = nullptr;
+    struct lm_ggml_tensor * dw_b = nullptr;
+
+    struct lm_ggml_tensor * norm = nullptr;
+    struct lm_ggml_tensor * norm_b = nullptr;
+
+    struct lm_ggml_tensor * pw1 = nullptr;
+    struct lm_ggml_tensor * pw1_b = nullptr;
+
+    struct lm_ggml_tensor * pw2 = nullptr;
+    struct lm_ggml_tensor * pw2_b = nullptr;
+
+    struct lm_ggml_tensor * gamma = nullptr;
+};
+
+struct llama_layer {
     // normalization
-    struct lm_ggml_tensor * attn_norm;
-    struct lm_ggml_tensor * attn_norm_b;
-    struct lm_ggml_tensor * attn_norm_2;
-    struct lm_ggml_tensor * attn_norm_2_b;
-    struct lm_ggml_tensor * attn_q_norm;
-    struct lm_ggml_tensor * attn_q_norm_b;
-    struct lm_ggml_tensor * attn_k_norm;
-    struct lm_ggml_tensor * attn_k_norm_b;
-    struct lm_ggml_tensor * attn_out_norm;
-    struct lm_ggml_tensor * attn_out_norm_b;
-    struct lm_ggml_tensor * attn_q_a_norm;
-    struct lm_ggml_tensor * attn_kv_a_norm;
-    struct lm_ggml_tensor * attn_sub_norm;
-    struct lm_ggml_tensor * attn_post_norm;
-    struct lm_ggml_tensor * ffn_sub_norm;
-    struct lm_ggml_tensor * attn_norm_cross;
-    struct lm_ggml_tensor * attn_norm_enc;
+    struct lm_ggml_tensor * attn_norm = nullptr;
+    struct lm_ggml_tensor * attn_norm_b = nullptr;
+    struct lm_ggml_tensor * attn_norm_2 = nullptr;
+    struct lm_ggml_tensor * attn_norm_2_b = nullptr;
+    struct lm_ggml_tensor * attn_q_norm = nullptr;
+    struct lm_ggml_tensor * attn_q_norm_b = nullptr;
+    struct lm_ggml_tensor * attn_k_norm = nullptr;
+    struct lm_ggml_tensor * attn_k_norm_b = nullptr;
+    struct lm_ggml_tensor * attn_out_norm = nullptr;
+    struct lm_ggml_tensor * attn_out_norm_b = nullptr;
+    struct lm_ggml_tensor * attn_q_a_norm = nullptr;
+    struct lm_ggml_tensor * attn_kv_a_norm = nullptr;
+    struct lm_ggml_tensor * attn_sub_norm = nullptr;
+    struct lm_ggml_tensor * attn_post_norm = nullptr;
+    struct lm_ggml_tensor * ffn_sub_norm = nullptr;
+    struct lm_ggml_tensor * attn_norm_cross = nullptr;
+    struct lm_ggml_tensor * attn_norm_enc = nullptr;
 
     // attention
-    struct lm_ggml_tensor * wq;
-    struct lm_ggml_tensor * wk;
-    struct lm_ggml_tensor * wv;
-    struct lm_ggml_tensor * wo;
-    struct lm_ggml_tensor * wqkv;
-    struct lm_ggml_tensor * wq_a;
-    struct lm_ggml_tensor * wq_b;
-    struct lm_ggml_tensor * wkv_a_mqa;
-    struct lm_ggml_tensor * wkv_b;
-    struct lm_ggml_tensor * wq_cross;
-    struct lm_ggml_tensor * wk_cross;
-    struct lm_ggml_tensor * wv_cross;
-    struct lm_ggml_tensor * wo_cross;
-    struct lm_ggml_tensor * wq_enc;
-    struct lm_ggml_tensor * wk_enc;
-    struct lm_ggml_tensor * wv_enc;
-    struct lm_ggml_tensor * wo_enc;
+    struct lm_ggml_tensor * wq = nullptr;
+    struct lm_ggml_tensor * wk = nullptr;
+    struct lm_ggml_tensor * wv = nullptr;
+    struct lm_ggml_tensor * wo = nullptr;
+    struct lm_ggml_tensor * wqkv = nullptr;
+    struct lm_ggml_tensor * wq_a = nullptr;
+    struct lm_ggml_tensor * wq_b = nullptr;
+    struct lm_ggml_tensor * wkv_a_mqa = nullptr;
+    struct lm_ggml_tensor * wkv_b = nullptr;
+    struct lm_ggml_tensor * wq_cross = nullptr;
+    struct lm_ggml_tensor * wk_cross = nullptr;
+    struct lm_ggml_tensor * wv_cross = nullptr;
+    struct lm_ggml_tensor * wo_cross = nullptr;
+    struct lm_ggml_tensor * wq_enc = nullptr;
+    struct lm_ggml_tensor * wk_enc = nullptr;
+    struct lm_ggml_tensor * wv_enc = nullptr;
+    struct lm_ggml_tensor * wo_enc = nullptr;
 
     // attention bias
-    struct lm_ggml_tensor * bq;
-    struct lm_ggml_tensor * bk;
-    struct lm_ggml_tensor * bv;
-    struct lm_ggml_tensor * bo;
-    struct lm_ggml_tensor * bqkv;
+    struct lm_ggml_tensor * bq = nullptr;
+    struct lm_ggml_tensor * bk = nullptr;
+    struct lm_ggml_tensor * bv = nullptr;
+    struct lm_ggml_tensor * bo = nullptr;
+    struct lm_ggml_tensor * bqkv = nullptr;
 
     // relative position bias
-    struct lm_ggml_tensor * attn_rel_b;
-    struct lm_ggml_tensor * attn_rel_b_enc;
-    struct lm_ggml_tensor * attn_rel_b_cross;
+    struct lm_ggml_tensor * attn_rel_b = nullptr;
+    struct lm_ggml_tensor * attn_rel_b_enc = nullptr;
+    struct lm_ggml_tensor * attn_rel_b_cross = nullptr;
 
     // normalization
-    struct lm_ggml_tensor * ffn_norm;
-    struct lm_ggml_tensor * ffn_norm_b;
-    struct lm_ggml_tensor * ffn_post_norm;
-    struct lm_ggml_tensor * layer_out_norm;
-    struct lm_ggml_tensor * layer_out_norm_b;
-    struct lm_ggml_tensor * ffn_norm_exps;
-    struct lm_ggml_tensor * ffn_norm_enc;
+    struct lm_ggml_tensor * ffn_norm = nullptr;
+    struct lm_ggml_tensor * ffn_norm_b = nullptr;
+    struct lm_ggml_tensor * ffn_post_norm = nullptr;
+    struct lm_ggml_tensor * layer_out_norm = nullptr;
+    struct lm_ggml_tensor * layer_out_norm_b = nullptr;
+    struct lm_ggml_tensor * ffn_norm_exps = nullptr;
+    struct lm_ggml_tensor * ffn_norm_enc = nullptr;
 
     // ff
-    struct lm_ggml_tensor * ffn_gate; // w1
-    struct lm_ggml_tensor * ffn_down; // w2
-    struct lm_ggml_tensor * ffn_up;
-    struct lm_ggml_tensor * ffn_gate_enc;
-    struct lm_ggml_tensor * ffn_down_enc;
-    struct lm_ggml_tensor * ffn_up_enc;
+    struct lm_ggml_tensor * ffn_gate = nullptr; // w1
+    struct lm_ggml_tensor * ffn_down = nullptr; // w2
+    struct lm_ggml_tensor * ffn_up = nullptr; // w3
+    struct lm_ggml_tensor * ffn_gate_enc = nullptr;
+    struct lm_ggml_tensor * ffn_down_enc = nullptr;
+    struct lm_ggml_tensor * ffn_up_enc = nullptr;
 
     // ff MoE
-    struct lm_ggml_tensor * ffn_gate_inp;
-    struct lm_ggml_tensor * ffn_gate_exps;
-    struct lm_ggml_tensor * ffn_down_exps;
-    struct lm_ggml_tensor * ffn_up_exps ;
+    struct lm_ggml_tensor * ffn_gate_inp = nullptr;
+    struct lm_ggml_tensor * ffn_gate_exps = nullptr;
+    struct lm_ggml_tensor * ffn_down_exps = nullptr;
+    struct lm_ggml_tensor * ffn_up_exps = nullptr;
 
     // ff shared expert (shexp)
-    struct lm_ggml_tensor * ffn_gate_inp_shexp;
-    struct lm_ggml_tensor * ffn_gate_shexp;
-    struct lm_ggml_tensor * ffn_down_shexp;
-    struct lm_ggml_tensor * ffn_up_shexp;
+    struct lm_ggml_tensor * ffn_gate_inp_shexp = nullptr;
+    struct lm_ggml_tensor * ffn_gate_shexp = nullptr;
+    struct lm_ggml_tensor * ffn_down_shexp = nullptr;
+    struct lm_ggml_tensor * ffn_up_shexp = nullptr;
 
     // ff bias
-    struct lm_ggml_tensor * ffn_gate_b;
-    struct lm_ggml_tensor * ffn_down_b; // b2
-    struct lm_ggml_tensor * ffn_up_b; // b3
-    struct lm_ggml_tensor * ffn_act;
+    struct lm_ggml_tensor * ffn_gate_b = nullptr;
+    struct lm_ggml_tensor * ffn_down_b = nullptr; // b2
+    struct lm_ggml_tensor * ffn_up_b = nullptr; // b3
+    struct lm_ggml_tensor * ffn_act = nullptr;
 
     // mamba proj
-    struct lm_ggml_tensor * ssm_in;
-    struct lm_ggml_tensor * ssm_x;
-    struct lm_ggml_tensor * ssm_dt;
-    struct lm_ggml_tensor * ssm_out;
+    struct lm_ggml_tensor * ssm_in = nullptr;
+    struct lm_ggml_tensor * ssm_x = nullptr;
+    struct lm_ggml_tensor * ssm_dt = nullptr;
+    struct lm_ggml_tensor * ssm_out = nullptr;
 
     // mamba
-    struct lm_ggml_tensor * ssm_conv1d;
-    struct lm_ggml_tensor * ssm_a;
-    struct lm_ggml_tensor * ssm_d;
+    struct lm_ggml_tensor * ssm_conv1d = nullptr;
+    struct lm_ggml_tensor * ssm_a = nullptr;
+    struct lm_ggml_tensor * ssm_d = nullptr;
 
     // mamba bias
-    struct lm_ggml_tensor * ssm_conv1d_b;
-    struct lm_ggml_tensor * ssm_dt_b;
+    struct lm_ggml_tensor * ssm_conv1d_b = nullptr;
+    struct lm_ggml_tensor * ssm_dt_b = nullptr;
 
     // rwkv
-    struct lm_ggml_tensor * time_mix_w1;
-    struct lm_ggml_tensor * time_mix_w2;
-    struct lm_ggml_tensor * time_mix_lerp_x;
-    struct lm_ggml_tensor * time_mix_lerp_w;
-    struct lm_ggml_tensor * time_mix_lerp_k;
-    struct lm_ggml_tensor * time_mix_lerp_v;
-    struct lm_ggml_tensor * time_mix_lerp_r;
-    struct lm_ggml_tensor * time_mix_lerp_g;
-
-    struct lm_ggml_tensor * time_mix_first;
-    struct lm_ggml_tensor * time_mix_decay;
-    struct lm_ggml_tensor * time_mix_decay_w1;
-    struct lm_ggml_tensor * time_mix_decay_w2;
-    struct lm_ggml_tensor * time_mix_key;
-    struct lm_ggml_tensor * time_mix_value;
-    struct lm_ggml_tensor * time_mix_receptance;
-    struct lm_ggml_tensor * time_mix_gate;
-
-    struct lm_ggml_tensor * time_mix_ln;
-    struct lm_ggml_tensor * time_mix_ln_b;
-    struct lm_ggml_tensor * time_mix_output;
-
-    struct lm_ggml_tensor * channel_mix_lerp_k;
-    struct lm_ggml_tensor * channel_mix_lerp_r;
-
-    struct lm_ggml_tensor * channel_mix_key;
-    struct lm_ggml_tensor * channel_mix_receptance;
-    struct lm_ggml_tensor * channel_mix_value;
+    struct lm_ggml_tensor * time_mix_w1 = nullptr;
+    struct lm_ggml_tensor * time_mix_w2 = nullptr;
+    struct lm_ggml_tensor * time_mix_lerp_x = nullptr;
+    struct lm_ggml_tensor * time_mix_lerp_w = nullptr;
+    struct lm_ggml_tensor * time_mix_lerp_k = nullptr;
+    struct lm_ggml_tensor * time_mix_lerp_v = nullptr;
+    struct lm_ggml_tensor * time_mix_lerp_r = nullptr;
+    struct lm_ggml_tensor * time_mix_lerp_g = nullptr;
+
+    struct lm_ggml_tensor * time_mix_first = nullptr;
+    struct lm_ggml_tensor * time_mix_decay = nullptr;
+    struct lm_ggml_tensor * time_mix_decay_w1 = nullptr;
+    struct lm_ggml_tensor * time_mix_decay_w2 = nullptr;
+    struct lm_ggml_tensor * time_mix_key = nullptr;
+    struct lm_ggml_tensor * time_mix_value = nullptr;
+    struct lm_ggml_tensor * time_mix_receptance = nullptr;
+    struct lm_ggml_tensor * time_mix_gate = nullptr;
+
+    struct lm_ggml_tensor * time_mix_ln = nullptr;
+    struct lm_ggml_tensor * time_mix_ln_b = nullptr;
+    struct lm_ggml_tensor * time_mix_output = nullptr;
+
+    struct lm_ggml_tensor * channel_mix_lerp_k = nullptr;
+    struct lm_ggml_tensor * channel_mix_lerp_r = nullptr;
+
+    struct lm_ggml_tensor * channel_mix_key = nullptr;
+    struct lm_ggml_tensor * channel_mix_receptance = nullptr;
+    struct lm_ggml_tensor * channel_mix_value = nullptr;
 
     // long rope factors
     struct lm_ggml_tensor * rope_long = nullptr;
@@ -2879,13 +2975,17 @@ struct llama_layer {
     struct lm_ggml_tensor * rope_freqs = nullptr;
 
     // bitnet scale
-    struct lm_ggml_tensor * wq_scale;
-    struct lm_ggml_tensor * wk_scale;
-    struct lm_ggml_tensor * wv_scale;
-    struct lm_ggml_tensor * wo_scale;
-    struct lm_ggml_tensor * ffn_gate_scale;
-    struct lm_ggml_tensor * ffn_up_scale;
-    struct lm_ggml_tensor * ffn_down_scale;
+    struct lm_ggml_tensor * wq_scale = nullptr;
+    struct lm_ggml_tensor * wk_scale = nullptr;
+    struct lm_ggml_tensor * wv_scale = nullptr;
+    struct lm_ggml_tensor * wo_scale = nullptr;
+    struct lm_ggml_tensor * ffn_gate_scale = nullptr;
+    struct lm_ggml_tensor * ffn_up_scale = nullptr;
+    struct lm_ggml_tensor * ffn_down_scale = nullptr;
+
+    struct llama_layer_posnet posnet;
+
+    struct llama_layer_convnext convnext;
 };
 
 // very similar to llama_batch,
@@ -3016,6 +3116,9 @@ struct llama_model {
     struct lm_ggml_tensor * cls_out = nullptr;
     struct lm_ggml_tensor * cls_out_b = nullptr;
 
+    struct lm_ggml_tensor * conv1d = nullptr;
+    struct lm_ggml_tensor * conv1d_b = nullptr;
+
     std::vector<llama_layer> layers;
 
     // gguf metadata
@@ -3100,6 +3203,7 @@ struct llama_sbatch {
     // batch indices of the output
     std::vector<size_t> out_ids;
     std::vector<llama_sbatch_seq> seq;
+
     const llama_batch * batch = nullptr;
 
     // buffers for the ubatch
@@ -3520,6 +3624,17 @@ static int llama_get_device_count(const llama_model & model) {
     return (int) model.devices.size();
 }
 
+static struct lm_ggml_tensor * llama_get_model_tensor(const struct llama_model * model, const char * name) {
+    auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
+            [name](const std::pair<std::string, struct lm_ggml_tensor *> & it) {
+                return it.first == name;
+            });
+    if (it == model->tensors_by_name.end()) {
+        return nullptr;
+    }
+    return it->second;
+}
+
 template<typename F>
 static bool buft_supported(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_dev_t dev, F & fn) {
     lm_ggml_init_params params = {
@@ -3573,7 +3688,9 @@ static bool llama_kv_cache_init(
 
     const struct llama_hparams & hparams = model.hparams;
 
-    const
+    const int32_t n_layer = hparams.n_layer;
+
+    LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", __func__, kv_size, offload, lm_ggml_type_name(type_k), lm_ggml_type_name(type_v), n_layer);
 
     cache.has_shift = false;
 
@@ -3614,10 +3731,12 @@ static bool llama_kv_cache_init(
     cache.k_l.reserve(n_layer);
     cache.v_l.reserve(n_layer);
 
-    for (int i = 0; i <
+    for (int i = 0; i < n_layer; i++) {
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
+        LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
+
         lm_ggml_backend_buffer_type_t buft;
         if (offload) {
             auto * dev = model.dev_layer.at(i).dev;
@@ -5530,7 +5649,7 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
     // get hparams kv
-    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
 
     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
@@ -5543,6 +5662,16 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
 
+    if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) {
+        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
+
+        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
+        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
+
+        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
+        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
+    }
+
     LM_GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     LM_GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
     if (hparams.n_expert > 0) {
@@ -5551,13 +5680,13 @@ static void llm_load_hparams(
         LM_GGML_ASSERT(hparams.n_expert_used == 0);
     }
 
-    // zero-out the
+    // zero-out the array hparams
     std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
     std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
 
-    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
-    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
+    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -5606,7 +5735,7 @@ static void llm_load_hparams(
 
     ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
 
-    if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+    if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
         if (hparams.n_rot != hparams.n_embd_head_k) {
             throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
         }
@@ -5646,6 +5775,15 @@ static void llm_load_hparams(
                     }
                 }
             } break;
+        case LLM_ARCH_DECI:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 80: model.type = e_model::MODEL_70B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_MINICPM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6302,6 +6440,13 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
+                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+            } break;
         default: (void)0;
     }
 
@@ -6331,7 +6476,7 @@ static void llm_load_vocab(
     ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
     ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
 
-    if (tokenizer_model == "no_vocab") {
+    if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
         vocab.type = LLAMA_VOCAB_TYPE_NONE;
 
         // default special tokens
@@ -6469,7 +6614,8 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "llama3" ||
                 tokenizer_pre == "llama-v3" ||
-                tokenizer_pre == "llama-bpe") {
+                tokenizer_pre == "llama-bpe"||
+                tokenizer_pre == "falcon3") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
             vocab.tokenizer_ignore_merges = true;
             vocab.tokenizer_add_bos = true;
@@ -6499,7 +6645,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "jina-v1-en" ||
                 tokenizer_pre == "jina-v2-es" ||
                 tokenizer_pre == "jina-v2-de" ||
-                tokenizer_pre == "jina-v2-code") {
+                tokenizer_pre == "jina-v2-code" ||
+                tokenizer_pre == "roberta-bpe") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {
@@ -6569,6 +6716,9 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "minerva-7b") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
+        } else if (
+                tokenizer_pre == "megrez") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -7310,6 +7460,22 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
     {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT_ID}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_NONE}},
+    {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, LM_GGML_OP_IM2COL}},
+    {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_NORM2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_CONV1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_IM2COL}},
+    {LLM_TENSOR_POS_NET_CONV2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_IM2COL}},
+    {LLM_TENSOR_POS_NET_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_POS_NET_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_POS_NET_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_POS_NET_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CONVNEXT_DW, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_IM2COL}},
+    {LLM_TENSOR_CONVNEXT_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
+    {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
 };
 
 // checks if the weight tensor can be used with the specified buffer type and device
@@ -7414,6 +7580,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, lm_ggml_tensor
                 lm_ggml_tensor * state = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, S, n_seqs, S, H);
                 op_tensor = lm_ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
             } break;
+        case LM_GGML_OP_IM2COL:
+            {
+                const int n_embd = hparams.n_embd;
+                lm_ggml_tensor * b = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
+                op_tensor = lm_ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, LM_GGML_TYPE_F16);
+            } break;
         default:
             LM_GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, lm_ggml_op_name(op), w->name);
     }
@@ -7544,7 +7716,8 @@ static bool llm_load_tensors(
     model.main_gpu = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
 
-    const int n_layer
+    const int n_layer = hparams.n_layer;
+
     bool use_mmap_buffer = true;
 
     // build a list of buffer types for the CPU and GPU devices
@@ -7819,6 +7992,68 @@ static bool llm_load_tensors(
                     }
                 }
             } break;
+        case LLM_ARCH_DECI:
+            {
+                model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                // if output is NULL, init from the input tok embed
+                if (model.output == NULL) {
+                    model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = model.layers[i];
+                    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
+                    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
+                    const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
+                    const int64_t n_ff = hparams.n_ff(i);
+                    const int64_t n_head = hparams.n_head(i);
+                    const int64_t n_head_kv = hparams.n_head_kv(i);
+
+                    if (n_head_kv == 0 && n_head > 0) {
+                        // linear attention for DeciLMCausalModel
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                    }
+                    else if (n_head_kv > 0) {
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+                    }
+
+                    // optional bias tensors
+                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                    if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+                        layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    }
+                    else {
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    }
+
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+                    // optional MLP bias
+                    layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                }
+            } break;
         case LLM_ARCH_MINICPM3:
            {
                 const int64_t n_embd_head_qk_rope = hparams.n_rot;
@@ -9347,9 +9582,9 @@ static bool llm_load_tensors(
             } break;
         case LLM_ARCH_CHAMELEON:
             {
-
+                model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
-
+                // output
                 model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                 model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                 // if output is NULL, init from the input tok embed
@@ -9378,6 +9613,109 @@ static bool llm_load_tensors(
|
|
9378
9613
|
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
9379
9614
|
}
|
9380
9615
|
} break;
|
9616
|
+
case LLM_ARCH_WAVTOKENIZER_DEC:
|
9617
|
+
{
|
9618
|
+
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
|
9619
|
+
|
9620
|
+
model.conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
|
9621
|
+
model.conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
|
9622
|
+
|
9623
|
+
// posnet
|
9624
|
+
{
|
9625
|
+
const int64_t n_embd = hparams.posnet.n_embd;
|
9626
|
+
|
9627
|
+
for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
|
9628
|
+
auto & layer = model.layers[i].posnet;
|
9629
|
+
|
9630
|
+
// posnet:
|
9631
|
+
//
|
9632
|
+
// - resnet
|
9633
|
+
// - resnet
|
9634
|
+
// - attn
|
9635
|
+
// - resnet
|
9636
|
+
// - resnet
|
9637
|
+
// - norm
|
9638
|
+
//
|
9639
|
+
switch (i) {
|
9640
|
+
case 0:
|
9641
|
+
case 1:
|
9642
|
+
case 3:
|
9643
|
+
case 4:
|
9644
|
+
{
|
9645
|
+
layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
|
9646
|
+
layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
|
9647
|
+
|
9648
|
+
layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
|
9649
|
+
layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
|
9650
|
+
|
9651
|
+
layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
|
9652
|
+
layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
|
9653
|
+
|
9654
|
+
layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
|
9655
|
+
+                        layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
+                    } break;
+                    case 2:
+                    {
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
+
+                        layer.attn_q      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,   "weight", i), {1, n_embd, n_embd}, 0);
+                        layer.attn_q_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q,   "bias",   i), {1, n_embd}, 0);
+
+                        layer.attn_k      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,   "weight", i), {1, n_embd, n_embd}, 0);
+                        layer.attn_k_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K,   "bias",   i), {1, n_embd}, 0);
+
+                        layer.attn_v      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,   "weight", i), {1, n_embd, n_embd}, 0);
+                        layer.attn_v_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V,   "bias",   i), {1, n_embd}, 0);
+
+                        layer.attn_o      = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
+                        layer.attn_o_b    = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias",   i), {1, n_embd}, 0);
+                    } break;
+                    case 5:
+                    {
+                        layer.norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+                        layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
+                    } break;
+                    default: LM_GGML_ABORT("unknown posnet layer");
+                };
+            }
+        }
+
+        LM_GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
+
+        model.tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
+        model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {hparams.posnet.n_embd}, 0);
+
+        // convnext
+        {
+            const int64_t n_embd = hparams.convnext.n_embd;
+
+            for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
+                auto & layer = model.layers[i].convnext;
+
+                layer.dw     = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "weight", i), {7, 1, n_embd}, 0);
+                layer.dw_b   = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW,    "bias",   i), {1, n_embd}, 0);
+
+                layer.norm   = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "weight", i), {n_embd}, 0);
+                layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM,  "bias",   i), {n_embd}, 0);
+
+                layer.pw1    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "weight", i), {n_embd, n_ff}, 0);
+                layer.pw1_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1,   "bias",   i), {n_ff}, 0);
+
+                layer.pw2    = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "weight", i), {n_ff, n_embd}, 0);
+                layer.pw2_b  = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2,   "bias",   i), {n_embd}, 0);
+
+                layer.gamma  = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
+            }
+
+            // output
+            model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+            model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+        }
+
+        model.output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
+        model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {n_embd}, 0);
+    } break;
 default:
     throw std::runtime_error("unknown architecture");
 }
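Note on the shapes above: the posnet attention weights are stored as width-1 1D convolutions ({1, n_embd, n_embd}), which act as per-position linear maps. A minimal reference of that equivalence (illustrative only, not package code):

    // For a width-1 conv kernel W of shape [n_out, n_in], each time step is
    // transformed independently: out[:, t] = W * in[:, t] + b.
    static void conv1x1_ref(const float * W, const float * b, const float * in,
                            float * out, int n_in, int n_out, int n_tokens) {
        for (int t = 0; t < n_tokens; ++t) {
            for (int o = 0; o < n_out; ++o) {
                float acc = b ? b[o] : 0.0f;
                for (int i = 0; i < n_in; ++i) {
                    acc += W[o*n_in + i] * in[t*n_in + i];
                }
                out[t*n_out + o] = acc;
            }
        }
    }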
@@ -9597,6 +9935,7 @@ enum llm_ffn_gate_type {
 enum llm_norm_type {
     LLM_NORM,
     LLM_NORM_RMS,
+    LLM_NORM_GROUP,
 };
 
 static struct lm_ggml_tensor * llm_build_inp_embd(
@@ -9617,7 +9956,7 @@ static struct lm_ggml_tensor * llm_build_inp_embd(
 
         inpL = lm_ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
     } else {
-
+        lctx.inp_embd = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, n_embd, batch.n_tokens);
         inpL = lctx.inp_embd;
         lm_ggml_set_input(lctx.inp_embd);
     }
@@ -9738,8 +10077,14 @@ static struct lm_ggml_tensor * llm_build_norm(
         const llm_build_cb & cb,
         int il) {
     switch (type) {
-        case LLM_NORM:
-        case LLM_NORM_RMS:
+        case LLM_NORM:     cur = lm_ggml_norm    (ctx, cur, hparams.f_norm_eps);     break;
+        case LLM_NORM_RMS: cur = lm_ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break;
+        case LLM_NORM_GROUP:
+            {
+                cur = lm_ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
+                cur = lm_ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps);
+                cur = lm_ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
+            } break;
     }
 
     if (mw || mb) {
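The new LLM_NORM_GROUP case reshapes the [n_embd, n_tokens] activations to 3D before calling lm_ggml_group_norm and reshapes back afterwards. For readers who want the underlying math, a plain reference implementation might look like this (illustrative layout and names, not the package's code):

    #include <cmath>
    #include <vector>

    // Normalize each of n_groups channel groups independently per token.
    static void group_norm_ref(std::vector<float> & x, int n_embd, int n_tokens,
                               int n_groups, float eps) {
        const int group_size = n_embd / n_groups; // assumes n_embd % n_groups == 0
        for (int t = 0; t < n_tokens; ++t) {
            for (int g = 0; g < n_groups; ++g) {
                float * v = x.data() + t*n_embd + g*group_size;
                float mean = 0.0f;
                for (int i = 0; i < group_size; ++i) mean += v[i];
                mean /= group_size;
                float var = 0.0f;
                for (int i = 0; i < group_size; ++i) var += (v[i] - mean)*(v[i] - mean);
                var /= group_size;
                const float inv_std = 1.0f/std::sqrt(var + eps);
                for (int i = 0; i < group_size; ++i) v[i] = (v[i] - mean)*inv_std;
            }
        }
    }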
@@ -11078,6 +11423,167 @@ struct llm_build_context {
         return gf;
     }
 
+    struct lm_ggml_cgraph * build_deci() {
+        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;
+            const int64_t n_head_kv = hparams.n_head_kv(il);
+            const int64_t n_head    = hparams.n_head(il);
+
+            if (n_head == 0) {
+                // attention-free layer of Llama-3_1-Nemotron-51B
+                cur = inpL;
+            } else {
+                // norm
+                cur = llm_build_norm(ctx0, inpL, hparams,
+                        model.layers[il].attn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "attn_norm", il);
+            }
+
+            if (n_head > 0 && n_head_kv == 0) {
+                // "linear attention" of Llama-3_1-Nemotron-51B
+                cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
+                cb(cur, "wo", il);
+            } else if (n_head > 0) {
+                // self-attention
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
+
+                // compute Q and K and RoPE them
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = lm_ggml_rope_ext(
+                    ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = lm_ggml_rope_ext(
+                    ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = lm_ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
+            // modified to support attention-free layer of Llama-3_1-Nemotron-51B
+            struct lm_ggml_tensor * ffn_inp = cur;
+            if (n_head > 0) {
+                ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+                cb(ffn_inp, "ffn_inp", il);
+            }
+
+            // feed-forward network
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
+        cb(cur, "result_output", -1);
+
+        lm_ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct lm_ggml_cgraph * build_baichuan() {
         struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
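build_deci() generalizes the llama graph for Deci's NAS-produced models, where head counts vary per layer. A small sketch of the dispatch it performs on hparams.n_head(il) and hparams.n_head_kv(il) (the helper is illustrative, not package code):

    enum class deci_layer_kind {
        attention_free,   // n_head == 0: block is skipped (Llama-3_1-Nemotron-51B)
        linear_attention, // n_head > 0, n_head_kv == 0: only the wo projection runs
        full_attention,   // otherwise: the regular RoPE + KV attention path
    };

    static deci_layer_kind classify_deci_layer(int64_t n_head, int64_t n_head_kv) {
        if (n_head == 0)    return deci_layer_kind::attention_free;
        if (n_head_kv == 0) return deci_layer_kind::linear_attention;
        return deci_layer_kind::full_attention;
    }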
@@ -13107,7 +13613,13 @@ struct llm_build_context {
         struct lm_ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct lm_ggml_tensor *
+        struct lm_ggml_tensor * KQ_mask = nullptr;
+        if (hparams.n_swa == 0) {
+            // Phi-4 doesn't use sliding window attention
+            KQ_mask = build_inp_KQ_mask();
+        } else {
+            KQ_mask = build_inp_KQ_mask_swa();
+        }
 
         for (int il = 0; il < n_layer; ++il) {
             auto residual = inpL;
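This branch distinguishes Phi-3-style sliding-window attention (n_swa > 0) from Phi-4's dense causal attention (n_swa == 0). A sketch of the visibility rule the two masks encode (assumed semantics, for illustration only):

    // Whether key position j is visible to query position i.
    static bool kq_visible(int64_t i, int64_t j, int64_t n_swa) {
        if (j > i)      return false;  // causal in both cases
        if (n_swa == 0) return true;   // dense mask: build_inp_KQ_mask()
        return (i - j) < n_swa;        // sliding window: build_inp_KQ_mask_swa()
    }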
@@ -13165,7 +13677,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -15865,7 +16377,7 @@ struct llm_build_context {
         return gf;
     }
 
-    struct lm_ggml_cgraph *
+    struct lm_ggml_cgraph * build_t5_enc() {
         struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -15997,7 +16509,7 @@ struct llm_build_context {
         return gf;
     }
 
-    struct lm_ggml_cgraph *
+    struct lm_ggml_cgraph * build_t5_dec() {
         struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -16946,6 +17458,158 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct lm_ggml_cgraph * build_wavtokenizer_dec() {
+        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, inpL));
+
+        cur = lm_ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
+        cur = lm_ggml_add(ctx0, cur, model.conv1d_b);
+
+        // posnet
+        for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
+            const auto & layer = model.layers[il].posnet;
+
+            inpL = cur;
+
+            switch (il) {
+                case 0:
+                case 1:
+                case 3:
+                case 4:
+                {
+                    cur = llm_build_norm(ctx0, cur, hparams,
+                            layer.norm1,
+                            layer.norm1_b,
+                            LLM_NORM_GROUP, cb, 0);
+
+                    cur = lm_ggml_mul(ctx0, lm_ggml_sigmoid(ctx0, cur), cur);
+
+                    cur = lm_ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
+                    cur = lm_ggml_add(ctx0, cur, layer.conv1_b);
+
+                    cur = llm_build_norm(ctx0, cur, hparams,
+                            layer.norm2,
+                            layer.norm2_b,
+                            LLM_NORM_GROUP, cb, 0);
+
+                    cur = lm_ggml_mul(ctx0, lm_ggml_sigmoid(ctx0, cur), cur);
+
+                    cur = lm_ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
+                    cur = lm_ggml_add(ctx0, cur, layer.conv2_b);
+
+                    cur = lm_ggml_add(ctx0, cur, inpL);
+                } break;
+                case 2:
+                {
+                    cur = llm_build_norm(ctx0, cur, hparams,
+                            layer.attn_norm,
+                            layer.attn_norm_b,
+                            LLM_NORM_GROUP, cb, 0);
+
+                    struct lm_ggml_tensor * q;
+                    struct lm_ggml_tensor * k;
+                    struct lm_ggml_tensor * v;
+
+                    q = lm_ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
+                    k = lm_ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
+                    v = lm_ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
+
+                    q = lm_ggml_add(ctx0, q, layer.attn_q_b);
+                    k = lm_ggml_add(ctx0, k, layer.attn_k_b);
+                    v = lm_ggml_add(ctx0, v, layer.attn_v_b);
+
+                    q = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, q));
+                    k = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, k));
+
+                    struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
+
+                    kq = lm_ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
+
+                    cur = lm_ggml_mul_mat(ctx0, kq, v);
+
+                    cur = lm_ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
+                    cur = lm_ggml_add(ctx0, cur, layer.attn_o_b);
+
+                    cur = lm_ggml_add(ctx0, cur, inpL);
+                } break;
+                case 5:
+                {
+                    cur = llm_build_norm(ctx0, cur, hparams,
+                            layer.norm,
+                            layer.norm_b,
+                            LLM_NORM_GROUP, cb, 0);
+                } break;
+                default: LM_GGML_ABORT("unknown posnet layer");
+            };
+        }
+
+        cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.tok_norm,
+                model.tok_norm_b,
+                LLM_NORM, cb, -1);
+
+        cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
+
+        inpL = cur;
+
+        // convnext
+        for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
+            const auto & layer = model.layers[il].convnext;
+
+            cur = inpL;
+
+            cur = lm_ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
+            cur = lm_ggml_add(ctx0, cur, layer.dw_b);
+
+            cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    layer.norm,
+                    layer.norm_b,
+                    LLM_NORM, cb, -1);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    layer.pw1, layer.pw1_b, NULL,
+                    NULL,      NULL,        NULL,
+                    layer.pw2, layer.pw2_b, NULL,
+                    NULL,
+                    LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+
+            cur = lm_ggml_mul(ctx0, cur, layer.gamma);
+
+            cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
+
+            inpL = lm_ggml_add(ctx0, cur, inpL);
+        }
+
+        cur = inpL;
+
+        cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        cur = lm_ggml_add(ctx0, cur, model.output_b);
+        cb(cur, "result_embd", -1);
+
+        lm_ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
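One detail worth calling out in build_wavtokenizer_dec(): the recurring lm_ggml_mul(ctx0, lm_ggml_sigmoid(ctx0, cur), cur) pattern in the posnet blocks is the SiLU (swish) activation written out explicitly. As a scalar reference:

    #include <cmath>

    // x * sigmoid(x): the activation the posnet residual blocks apply
    // between their group norms and convolutions.
    static inline float silu(float x) {
        return x / (1.0f + std::exp(-x));
    }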
@@ -17034,6 +17698,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_llama();
             } break;
+        case LLM_ARCH_DECI:
+            {
+                result = llm.build_deci();
+            } break;
         case LLM_ARCH_BAICHUAN:
             {
                 result = llm.build_baichuan();
@@ -17192,14 +17860,14 @@ static struct lm_ggml_cgraph * llama_build_graph(
         case LLM_ARCH_T5:
             {
                 if (lctx.is_encoding) {
-                    result = llm.
+                    result = llm.build_t5_enc();
                 } else {
-                    result = llm.
+                    result = llm.build_t5_dec();
                 }
             } break;
         case LLM_ARCH_T5ENCODER:
             {
-                result = llm.
+                result = llm.build_t5_enc();
             } break;
         case LLM_ARCH_JAIS:
             {
@@ -17221,6 +17889,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_chameleon();
             } break;
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            {
+                result = llm.build_wavtokenizer_dec();
+            } break;
         default:
             LM_GGML_ABORT("fatal error");
     }
@@ -17312,30 +17984,35 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
     }
 
     if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        LM_GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
-        const int64_t n_tokens = ubatch.n_tokens;
+        //LM_GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
 
-
-
+        if (!lctx.inp_out_ids) {
+            LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__);
+        } else {
+            const int64_t n_tokens = ubatch.n_tokens;
 
-
-
-
-
-
-
-            for (int i = 0; i < n_tokens; ++i) {
-                if (ubatch.output[i]) {
-                    data[n_outputs++] = i;
+            LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
+            int32_t * data = (int32_t *) lctx.inp_out_ids->data;
+
+            if (lctx.n_outputs == n_tokens) {
+                for (int i = 0; i < n_tokens; ++i) {
+                    data[i] = i;
                 }
+            } else if (ubatch.output) {
+                int32_t n_outputs = 0;
+                for (int i = 0; i < n_tokens; ++i) {
+                    if (ubatch.output[i]) {
+                        data[n_outputs++] = i;
+                    }
+                }
+                // the graph needs to have been passed the correct number of outputs
+                LM_GGML_ASSERT(lctx.n_outputs == n_outputs);
+            } else if (lctx.n_outputs == 1) {
+                // only keep last output
+                data[0] = n_tokens - 1;
+            } else {
+                LM_GGML_ASSERT(lctx.n_outputs == 0);
             }
-            // the graph needs to have been passed the correct number of outputs
-            LM_GGML_ASSERT(lctx.n_outputs == n_outputs);
-        } else if (lctx.n_outputs == 1) {
-            // only keep last output
-            data[0] = n_tokens - 1;
-        } else {
-            LM_GGML_ASSERT(lctx.n_outputs == 0);
         }
     }
 
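The rewritten block replaces a hard assert with a warning and makes the output-row selection explicit. Extracted as a standalone sketch (assumed names; it mirrors the branches above):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Decide which token positions of the batch produce output rows.
    static std::vector<int32_t> select_out_ids(int32_t n_tokens, int32_t n_outputs,
                                               const int8_t * output /* may be null */) {
        std::vector<int32_t> ids;
        if (n_outputs == n_tokens) {
            for (int32_t i = 0; i < n_tokens; ++i) ids.push_back(i);   // keep all
        } else if (output) {
            for (int32_t i = 0; i < n_tokens; ++i) {
                if (output[i]) ids.push_back(i);                        // flagged only
            }
            assert((int32_t) ids.size() == n_outputs); // graph expects this count
        } else if (n_outputs == 1) {
            ids.push_back(n_tokens - 1);                                // last token
        } else {
            assert(n_outputs == 0);
        }
        return ids;
    }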
@@ -18006,6 +18683,7 @@ static int llama_decode_internal(
         embd = nullptr; // do not extract embeddings when not needed
         LM_GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
     }
+
     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (lm_ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
     lm_ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
@@ -20394,10 +21072,12 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_T5ENCODER:
         case LLM_ARCH_JAIS:
         case LLM_ARCH_RWKV6:
+        case LLM_ARCH_WAVTOKENIZER_DEC:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_DECI:
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
         case LLM_ARCH_PLAMO:
@@ -20511,17 +21191,6 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
     return model->n_elements;
 }
 
-struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
-    auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
-            [name](const std::pair<std::string, struct lm_ggml_tensor *> & it) {
-                return it.first == name;
-            });
-    if (it == model->tensors_by_name.end()) {
-        return nullptr;
-    }
-    return it->second;
-}
-
 bool llama_model_has_encoder(const struct llama_model * model) {
     switch (model->arch) {
         case LLM_ARCH_T5: return true;
@@ -22230,6 +22899,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
+        return LLM_CHAT_TEMPLATE_FALCON_3;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -22276,6 +22947,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_GRANITE;
     } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
         return LLM_CHAT_TEMPLATE_GIGACHAT;
+    } else if (tmpl_contains("<|role_start|>")) {
+        return LLM_CHAT_TEMPLATE_MEGREZ;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -22382,6 +23055,15 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
+        // Falcon 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
         // zephyr template
         for (auto message : chat) {
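Derived from the loop above: a system + user exchange with add_ass = true renders under the new Falcon 3 branch as

    <|system|>
    You are a helpful assistant
    <|user|>
    Hello
    <|assistant|>

(the message contents here are placeholders).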
@@ -22625,6 +23307,16 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "assistant<|role_sep|>";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
+        // Megrez template
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
+        }
+
+        if (add_ass) {
+            ss << "<|role_start|>assistant<|role_end|>";
+        }
     } else {
         // template not supported
         return -1;
@@ -22644,15 +23336,15 @@ int32_t llama_chat_apply_template(
     std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
     if (tmpl == nullptr) {
         LM_GGML_ASSERT(model != nullptr);
-
-
-
-
-
+
+        // load template from model, if available
+        const auto & it = model->lm_gguf_kv.find("tokenizer.chat_template");
+        if (it != model->lm_gguf_kv.end() && it->second.size() > 0) {
+            curr_tmpl = it->second;
+        }
+        else {
             // worst case: there is no information about template, we will use chatml by default
-            curr_tmpl = "chatml";
-        } else {
-            curr_tmpl = std::string(model_template.data(), model_template.size());
+            curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
         }
     }
 
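For context, the common way to exercise this path is to pass tmpl == nullptr so the template is read from the model's GGUF metadata ("tokenizer.chat_template"), falling back to chatml. A usage sketch (buffer sizing and error handling simplified; this is an assumption-laden example, not package code):

    #include "llama.h"
    #include <string>
    #include <vector>

    // Renders a chat with the model's own chat template (or the chatml
    // fallback). The return value is the required length; retry with a
    // larger buffer when it exceeds the initial size.
    std::string render_chat(const llama_model * model,
                            const std::vector<llama_chat_message> & msgs) {
        std::vector<char> buf(4096);
        int32_t n = llama_chat_apply_template(model, /*tmpl=*/nullptr,
                                              msgs.data(), msgs.size(),
                                              /*add_ass=*/true,
                                              buf.data(), (int32_t) buf.size());
        if (n < 0) return {}; // template not supported
        if (n > (int32_t) buf.size()) {
            buf.resize(n);
            n = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(),
                                          true, buf.data(), (int32_t) buf.size());
        }
        return std::string(buf.data(), n);
    }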