cui-llama.rn 1.3.4 → 1.3.5

package/cpp/llama.cpp CHANGED
@@ -157,6 +157,7 @@ static std::string format(const char * fmt, ...) {
157
157
 
158
158
  enum llm_arch {
159
159
  LLM_ARCH_LLAMA,
160
+ LLM_ARCH_DECI,
160
161
  LLM_ARCH_FALCON,
161
162
  LLM_ARCH_BAICHUAN,
162
163
  LLM_ARCH_GROK,
@@ -208,63 +209,66 @@ enum llm_arch {
208
209
  LLM_ARCH_GRANITE,
209
210
  LLM_ARCH_GRANITE_MOE,
210
211
  LLM_ARCH_CHAMELEON,
212
+ LLM_ARCH_WAVTOKENIZER_DEC,
211
213
  LLM_ARCH_UNKNOWN,
212
214
  };
213
215
 
214
216
  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
215
- { LLM_ARCH_LLAMA, "llama" },
216
- { LLM_ARCH_FALCON, "falcon" },
217
- { LLM_ARCH_GROK, "grok" },
218
- { LLM_ARCH_GPT2, "gpt2" },
219
- { LLM_ARCH_GPTJ, "gptj" },
220
- { LLM_ARCH_GPTNEOX, "gptneox" },
221
- { LLM_ARCH_MPT, "mpt" },
222
- { LLM_ARCH_BAICHUAN, "baichuan" },
223
- { LLM_ARCH_STARCODER, "starcoder" },
224
- { LLM_ARCH_REFACT, "refact" },
225
- { LLM_ARCH_BERT, "bert" },
226
- { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
227
- { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
228
- { LLM_ARCH_BLOOM, "bloom" },
229
- { LLM_ARCH_STABLELM, "stablelm" },
230
- { LLM_ARCH_QWEN, "qwen" },
231
- { LLM_ARCH_QWEN2, "qwen2" },
232
- { LLM_ARCH_QWEN2MOE, "qwen2moe" },
233
- { LLM_ARCH_QWEN2VL, "qwen2vl" },
234
- { LLM_ARCH_PHI2, "phi2" },
235
- { LLM_ARCH_PHI3, "phi3" },
236
- { LLM_ARCH_PLAMO, "plamo" },
237
- { LLM_ARCH_CODESHELL, "codeshell" },
238
- { LLM_ARCH_ORION, "orion" },
239
- { LLM_ARCH_INTERNLM2, "internlm2" },
240
- { LLM_ARCH_MINICPM, "minicpm" },
241
- { LLM_ARCH_MINICPM3, "minicpm3" },
242
- { LLM_ARCH_GEMMA, "gemma" },
243
- { LLM_ARCH_GEMMA2, "gemma2" },
244
- { LLM_ARCH_STARCODER2, "starcoder2" },
245
- { LLM_ARCH_MAMBA, "mamba" },
246
- { LLM_ARCH_XVERSE, "xverse" },
247
- { LLM_ARCH_COMMAND_R, "command-r" },
248
- { LLM_ARCH_DBRX, "dbrx" },
249
- { LLM_ARCH_OLMO, "olmo" },
250
- { LLM_ARCH_OLMO2, "olmo2" },
251
- { LLM_ARCH_OLMOE, "olmoe" },
252
- { LLM_ARCH_OPENELM, "openelm" },
253
- { LLM_ARCH_ARCTIC, "arctic" },
254
- { LLM_ARCH_DEEPSEEK, "deepseek" },
255
- { LLM_ARCH_DEEPSEEK2, "deepseek2" },
256
- { LLM_ARCH_CHATGLM, "chatglm" },
257
- { LLM_ARCH_BITNET, "bitnet" },
258
- { LLM_ARCH_T5, "t5" },
259
- { LLM_ARCH_T5ENCODER, "t5encoder" },
260
- { LLM_ARCH_JAIS, "jais" },
261
- { LLM_ARCH_NEMOTRON, "nemotron" },
262
- { LLM_ARCH_EXAONE, "exaone" },
263
- { LLM_ARCH_RWKV6, "rwkv6" },
264
- { LLM_ARCH_GRANITE, "granite" },
265
- { LLM_ARCH_GRANITE_MOE, "granitemoe" },
266
- { LLM_ARCH_CHAMELEON, "chameleon" },
267
- { LLM_ARCH_UNKNOWN, "(unknown)" },
217
+ { LLM_ARCH_LLAMA, "llama" },
218
+ { LLM_ARCH_DECI, "deci" },
219
+ { LLM_ARCH_FALCON, "falcon" },
220
+ { LLM_ARCH_GROK, "grok" },
221
+ { LLM_ARCH_GPT2, "gpt2" },
222
+ { LLM_ARCH_GPTJ, "gptj" },
223
+ { LLM_ARCH_GPTNEOX, "gptneox" },
224
+ { LLM_ARCH_MPT, "mpt" },
225
+ { LLM_ARCH_BAICHUAN, "baichuan" },
226
+ { LLM_ARCH_STARCODER, "starcoder" },
227
+ { LLM_ARCH_REFACT, "refact" },
228
+ { LLM_ARCH_BERT, "bert" },
229
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
230
+ { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
231
+ { LLM_ARCH_BLOOM, "bloom" },
232
+ { LLM_ARCH_STABLELM, "stablelm" },
233
+ { LLM_ARCH_QWEN, "qwen" },
234
+ { LLM_ARCH_QWEN2, "qwen2" },
235
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
236
+ { LLM_ARCH_QWEN2VL, "qwen2vl" },
237
+ { LLM_ARCH_PHI2, "phi2" },
238
+ { LLM_ARCH_PHI3, "phi3" },
239
+ { LLM_ARCH_PLAMO, "plamo" },
240
+ { LLM_ARCH_CODESHELL, "codeshell" },
241
+ { LLM_ARCH_ORION, "orion" },
242
+ { LLM_ARCH_INTERNLM2, "internlm2" },
243
+ { LLM_ARCH_MINICPM, "minicpm" },
244
+ { LLM_ARCH_MINICPM3, "minicpm3" },
245
+ { LLM_ARCH_GEMMA, "gemma" },
246
+ { LLM_ARCH_GEMMA2, "gemma2" },
247
+ { LLM_ARCH_STARCODER2, "starcoder2" },
248
+ { LLM_ARCH_MAMBA, "mamba" },
249
+ { LLM_ARCH_XVERSE, "xverse" },
250
+ { LLM_ARCH_COMMAND_R, "command-r" },
251
+ { LLM_ARCH_DBRX, "dbrx" },
252
+ { LLM_ARCH_OLMO, "olmo" },
253
+ { LLM_ARCH_OLMO2, "olmo2" },
254
+ { LLM_ARCH_OLMOE, "olmoe" },
255
+ { LLM_ARCH_OPENELM, "openelm" },
256
+ { LLM_ARCH_ARCTIC, "arctic" },
257
+ { LLM_ARCH_DEEPSEEK, "deepseek" },
258
+ { LLM_ARCH_DEEPSEEK2, "deepseek2" },
259
+ { LLM_ARCH_CHATGLM, "chatglm" },
260
+ { LLM_ARCH_BITNET, "bitnet" },
261
+ { LLM_ARCH_T5, "t5" },
262
+ { LLM_ARCH_T5ENCODER, "t5encoder" },
263
+ { LLM_ARCH_JAIS, "jais" },
264
+ { LLM_ARCH_NEMOTRON, "nemotron" },
265
+ { LLM_ARCH_EXAONE, "exaone" },
266
+ { LLM_ARCH_RWKV6, "rwkv6" },
267
+ { LLM_ARCH_GRANITE, "granite" },
268
+ { LLM_ARCH_GRANITE_MOE, "granitemoe" },
269
+ { LLM_ARCH_CHAMELEON, "chameleon" },
270
+ { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
271
+ { LLM_ARCH_UNKNOWN, "(unknown)" },
268
272
  };
269
273
 
270
274
  enum llm_kv {
@@ -284,6 +288,7 @@ enum llm_kv {
284
288
  LLM_KV_VOCAB_SIZE,
285
289
  LLM_KV_CONTEXT_LENGTH,
286
290
  LLM_KV_EMBEDDING_LENGTH,
291
+ LLM_KV_FEATURES_LENGTH,
287
292
  LLM_KV_BLOCK_COUNT,
288
293
  LLM_KV_LEADING_DENSE_BLOCK_COUNT,
289
294
  LLM_KV_FEED_FORWARD_LENGTH,
@@ -315,6 +320,8 @@ enum llm_kv {
315
320
  LLM_KV_ATTENTION_VALUE_LENGTH,
316
321
  LLM_KV_ATTENTION_LAYERNORM_EPS,
317
322
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
323
+ LLM_KV_ATTENTION_GROUPNORM_EPS,
324
+ LLM_KV_ATTENTION_GROUPNORM_GROUPS,
318
325
  LLM_KV_ATTENTION_CAUSAL,
319
326
  LLM_KV_ATTENTION_Q_LORA_RANK,
320
327
  LLM_KV_ATTENTION_KV_LORA_RANK,
@@ -378,6 +385,12 @@ enum llm_kv {
378
385
  LLM_KV_ADAPTER_TYPE,
379
386
  LLM_KV_ADAPTER_LORA_ALPHA,
380
387
 
388
+ LLM_KV_POSNET_EMBEDDING_LENGTH,
389
+ LLM_KV_POSNET_BLOCK_COUNT,
390
+
391
+ LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
392
+ LLM_KV_CONVNEXT_BLOCK_COUNT,
393
+
381
394
  // deprecated:
382
395
  LLM_KV_TOKENIZER_PREFIX_ID,
383
396
  LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -401,6 +414,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
401
414
  { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
402
415
  { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
403
416
  { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
417
+ { LLM_KV_FEATURES_LENGTH, "%s.features_length" },
404
418
  { LLM_KV_BLOCK_COUNT, "%s.block_count" },
405
419
  { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
406
420
  { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
@@ -432,6 +446,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
432
446
  { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
433
447
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
434
448
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
449
+ { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
450
+ { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
435
451
  { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
436
452
  { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
437
453
  { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
@@ -462,6 +478,12 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
462
478
 
463
479
  { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
464
480
 
481
+ { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" },
482
+ { LLM_KV_POSNET_BLOCK_COUNT, "%s.posnet.block_count" },
483
+
484
+ { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
485
+ { LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" },
486
+
465
487
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
466
488
  { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
467
489
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -620,6 +642,22 @@ enum llm_tensor {
620
642
  LLM_TENSOR_ENC_OUTPUT_NORM,
621
643
  LLM_TENSOR_CLS,
622
644
  LLM_TENSOR_CLS_OUT,
645
+ LLM_TENSOR_CONV1D,
646
+ LLM_TENSOR_CONVNEXT_DW,
647
+ LLM_TENSOR_CONVNEXT_NORM,
648
+ LLM_TENSOR_CONVNEXT_PW1,
649
+ LLM_TENSOR_CONVNEXT_PW2,
650
+ LLM_TENSOR_CONVNEXT_GAMMA,
651
+ LLM_TENSOR_POS_NET_CONV1,
652
+ LLM_TENSOR_POS_NET_CONV2,
653
+ LLM_TENSOR_POS_NET_NORM,
654
+ LLM_TENSOR_POS_NET_NORM1,
655
+ LLM_TENSOR_POS_NET_NORM2,
656
+ LLM_TENSOR_POS_NET_ATTN_NORM,
657
+ LLM_TENSOR_POS_NET_ATTN_Q,
658
+ LLM_TENSOR_POS_NET_ATTN_K,
659
+ LLM_TENSOR_POS_NET_ATTN_V,
660
+ LLM_TENSOR_POS_NET_ATTN_OUT,
623
661
  };
624
662
 
625
663
  static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -649,6 +687,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
649
687
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
650
688
  },
651
689
  },
690
+ {
691
+ LLM_ARCH_DECI,
692
+ {
693
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
694
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
695
+ { LLM_TENSOR_OUTPUT, "output" },
696
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
697
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
698
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
699
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
700
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
701
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
702
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
703
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
704
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
705
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
706
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
707
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
708
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
709
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
710
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
711
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
712
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
713
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
714
+ },
715
+ },
652
716
  {
653
717
  LLM_ARCH_BAICHUAN,
654
718
  {
@@ -1604,6 +1668,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
1604
1668
  { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
1605
1669
  },
1606
1670
  },
1671
+ {
1672
+ LLM_ARCH_WAVTOKENIZER_DEC,
1673
+ {
1674
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1675
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
1676
+ { LLM_TENSOR_CONV1D, "conv1d" },
1677
+ { LLM_TENSOR_CONVNEXT_DW, "convnext.%d.dw" },
1678
+ { LLM_TENSOR_CONVNEXT_NORM, "convnext.%d.norm" },
1679
+ { LLM_TENSOR_CONVNEXT_PW1, "convnext.%d.pw1" },
1680
+ { LLM_TENSOR_CONVNEXT_PW2, "convnext.%d.pw2" },
1681
+ { LLM_TENSOR_CONVNEXT_GAMMA, "convnext.%d.gamma" },
1682
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1683
+ { LLM_TENSOR_OUTPUT, "output" },
1684
+ { LLM_TENSOR_POS_NET_CONV1, "posnet.%d.conv1" },
1685
+ { LLM_TENSOR_POS_NET_CONV2, "posnet.%d.conv2" },
1686
+ { LLM_TENSOR_POS_NET_NORM, "posnet.%d.norm" },
1687
+ { LLM_TENSOR_POS_NET_NORM1, "posnet.%d.norm1" },
1688
+ { LLM_TENSOR_POS_NET_NORM2, "posnet.%d.norm2" },
1689
+ { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" },
1690
+ { LLM_TENSOR_POS_NET_ATTN_Q, "posnet.%d.attn_q" },
1691
+ { LLM_TENSOR_POS_NET_ATTN_K, "posnet.%d.attn_k" },
1692
+ { LLM_TENSOR_POS_NET_ATTN_V, "posnet.%d.attn_v" },
1693
+ { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
1694
+ },
1695
+ },
1607
1696
  {
1608
1697
  LLM_ARCH_UNKNOWN,
1609
1698
  {
@@ -1623,6 +1712,7 @@ enum llm_chat_template {
1623
1712
  LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
1624
1713
  LLM_CHAT_TEMPLATE_MISTRAL_V7,
1625
1714
  LLM_CHAT_TEMPLATE_PHI_3,
1715
+ LLM_CHAT_TEMPLATE_FALCON_3,
1626
1716
  LLM_CHAT_TEMPLATE_ZEPHYR,
1627
1717
  LLM_CHAT_TEMPLATE_MONARCH,
1628
1718
  LLM_CHAT_TEMPLATE_GEMMA,
@@ -1641,6 +1731,7 @@ enum llm_chat_template {
1641
1731
  LLM_CHAT_TEMPLATE_RWKV_WORLD,
1642
1732
  LLM_CHAT_TEMPLATE_GRANITE,
1643
1733
  LLM_CHAT_TEMPLATE_GIGACHAT,
1734
+ LLM_CHAT_TEMPLATE_MEGREZ,
1644
1735
  LLM_CHAT_TEMPLATE_UNKNOWN,
1645
1736
  };
1646
1737
 
@@ -1655,6 +1746,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
1655
1746
  { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
1656
1747
  { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
1657
1748
  { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
1749
+ { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
1658
1750
  { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
1659
1751
  { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
1660
1752
  { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
@@ -1673,6 +1765,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
1673
1765
  { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
1674
1766
  { "granite", LLM_CHAT_TEMPLATE_GRANITE },
1675
1767
  { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
1768
+ { "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
1676
1769
  };
1677
1770
 
1678
1771
  static llm_arch llm_arch_from_string(const std::string & name) {
@@ -2494,15 +2587,26 @@ static const size_t kiB = 1024;
2494
2587
  static const size_t MiB = 1024*kiB;
2495
2588
  static const size_t GiB = 1024*MiB;
2496
2589
 
2590
+ struct llama_hparams_posnet {
2591
+ uint32_t n_embd;
2592
+ uint32_t n_layer;
2593
+ };
2594
+
2595
+ struct llama_hparams_convnext {
2596
+ uint32_t n_embd;
2597
+ uint32_t n_layer;
2598
+ };
2599
+
2497
2600
  struct llama_hparams {
2498
2601
  bool vocab_only;
2499
2602
  bool rope_finetuned;
2500
2603
  bool use_par_res;
2501
2604
  bool swin_norm;
2502
2605
 
2503
- uint32_t n_vocab;
2606
+ uint32_t n_vocab = 0;
2504
2607
  uint32_t n_ctx_train; // context size the model was trained on
2505
2608
  uint32_t n_embd;
2609
+ uint32_t n_embd_features = 0;
2506
2610
  uint32_t n_layer;
2507
2611
  uint32_t n_rot;
2508
2612
  uint32_t n_swa = 0; // sliding window attention (SWA)
@@ -2513,6 +2617,10 @@ struct llama_hparams {
2513
2617
  uint32_t n_vocab_type = 0; // for BERT-style token types
2514
2618
  uint32_t n_rel_attn_bkts = 0;
2515
2619
 
2620
+ // for WavTokenizer
2621
+ struct llama_hparams_posnet posnet;
2622
+ struct llama_hparams_convnext convnext;
2623
+
2516
2624
  std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
2517
2625
  std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
2518
2626
  std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -2527,6 +2635,9 @@ struct llama_hparams {
2527
2635
 
2528
2636
  float f_norm_eps;
2529
2637
  float f_norm_rms_eps;
2638
+ float f_norm_group_eps;
2639
+
2640
+ uint32_t n_norm_groups;
2530
2641
 
2531
2642
  float f_attn_logit_softcapping = 50.0f;
2532
2643
  float f_final_logit_softcapping = 30.0f;
@@ -2572,66 +2683,6 @@ struct llama_hparams {
2572
2683
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
2573
2684
  enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
2574
2685
 
2575
- bool operator!=(const llama_hparams & other) const {
2576
- if (this->vocab_only != other.vocab_only) return true;
2577
- if (this->n_vocab != other.n_vocab) return true;
2578
- if (this->n_ctx_train != other.n_ctx_train) return true;
2579
- if (this->n_embd != other.n_embd) return true;
2580
- if (this->n_layer != other.n_layer) return true;
2581
- if (this->n_rot != other.n_rot) return true;
2582
- if (this->n_swa != other.n_swa) return true;
2583
- if (this->n_embd_head_k != other.n_embd_head_k) return true;
2584
- if (this->n_embd_head_v != other.n_embd_head_v) return true;
2585
- if (this->n_expert != other.n_expert) return true;
2586
- if (this->n_expert_used != other.n_expert_used) return true;
2587
-
2588
- if (this->n_head_arr != other.n_head_arr) return true;
2589
- if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
2590
- if (this->n_ff_arr != other.n_ff_arr) return true;
2591
-
2592
- if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
2593
- if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
2594
- if (this->n_lora_q != other.n_lora_q) return true;
2595
- if (this->n_lora_kv != other.n_lora_kv) return true;
2596
- if (this->n_ff_exp != other.n_ff_exp) return true;
2597
- if (this->n_ff_shexp != other.n_ff_shexp) return true;
2598
- if (this->n_expert_shared != other.n_expert_shared) return true;
2599
-
2600
- if (this->rope_finetuned != other.rope_finetuned) return true;
2601
- if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
2602
- if (std::equal(std::begin(this->rope_sections),
2603
- std::end(this->rope_sections),
2604
- std::begin(other.rope_sections))) return true;
2605
-
2606
- if (this->ssm_d_conv != other.ssm_d_conv) return true;
2607
- if (this->ssm_d_inner != other.ssm_d_inner) return true;
2608
- if (this->ssm_d_state != other.ssm_d_state) return true;
2609
- if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
2610
- if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
2611
-
2612
- if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
2613
- if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true;
2614
- if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
2615
- if (this->wkv_head_size != other.wkv_head_size) return true;
2616
-
2617
- if (this->dec_start_token_id != other.dec_start_token_id) return true;
2618
-
2619
- const float EPSILON = 1e-9f;
2620
-
2621
- if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
2622
- if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
2623
- if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
2624
- if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
2625
- if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
2626
- if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
2627
- if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
2628
- if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
2629
- if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
2630
- if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;
2631
-
2632
- return false;
2633
- }
2634
-
2635
2686
  uint32_t n_head(uint32_t il = 0) const {
2636
2687
  if (il < n_layer) {
2637
2688
  return n_head_arr[il];
@@ -2684,21 +2735,21 @@ struct llama_hparams {
2684
2735
  if (wkv_head_size != 0) {
2685
2736
  // for RWKV models
2686
2737
  return 2 * n_embd;
2687
- } else {
2688
- // TODO: maybe support other convolution strides than 1
2689
- // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
2690
- return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
2691
2738
  }
2739
+
2740
+ // TODO: maybe support other convolution strides than 1
2741
+ // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
2742
+ return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
2692
2743
  }
2693
2744
 
2694
2745
  uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
2695
2746
  if (wkv_head_size != 0) {
2696
2747
  // corresponds to RWKV's wkv_states size
2697
2748
  return n_embd * wkv_head_size;
2698
- } else {
2699
- // corresponds to Mamba's ssm_states size
2700
- return ssm_d_state * ssm_d_inner;
2701
2749
  }
2750
+
2751
+ // corresponds to Mamba's ssm_states size
2752
+ return ssm_d_state * ssm_d_inner;
2702
2753
  }
2703
2754
  };
2704
2755
 
@@ -2736,142 +2787,187 @@ struct llama_cparams {
2736
2787
  void * cb_eval_user_data;
2737
2788
  };
2738
2789
 
2739
- // TODO: separate into "llama_layer_enc" and "llama_layer_dec"
2740
- struct llama_layer {
2741
- llama_layer() {
2742
- // initialize all pointers to NULL
2743
- std::memset(this, 0, sizeof(*this));
2744
- }
2790
+ struct llama_layer_posnet {
2791
+ // resnet
2792
+ struct lm_ggml_tensor * norm1 = nullptr;
2793
+ struct lm_ggml_tensor * norm1_b = nullptr;
2794
+
2795
+ struct lm_ggml_tensor * conv1 = nullptr;
2796
+ struct lm_ggml_tensor * conv1_b = nullptr;
2797
+
2798
+ struct lm_ggml_tensor * norm2 = nullptr;
2799
+ struct lm_ggml_tensor * norm2_b = nullptr;
2800
+
2801
+ struct lm_ggml_tensor * conv2 = nullptr;
2802
+ struct lm_ggml_tensor * conv2_b = nullptr;
2745
2803
 
2804
+ // attention
2805
+ struct lm_ggml_tensor * attn_norm = nullptr;
2806
+ struct lm_ggml_tensor * attn_norm_b = nullptr;
2807
+
2808
+ struct lm_ggml_tensor * attn_q = nullptr;
2809
+ struct lm_ggml_tensor * attn_q_b = nullptr;
2810
+
2811
+ struct lm_ggml_tensor * attn_k = nullptr;
2812
+ struct lm_ggml_tensor * attn_k_b = nullptr;
2813
+
2814
+ struct lm_ggml_tensor * attn_v = nullptr;
2815
+ struct lm_ggml_tensor * attn_v_b = nullptr;
2816
+
2817
+ struct lm_ggml_tensor * attn_o = nullptr;
2818
+ struct lm_ggml_tensor * attn_o_b = nullptr;
2819
+
2820
+ // normalize
2821
+ struct lm_ggml_tensor * norm = nullptr;
2822
+ struct lm_ggml_tensor * norm_b = nullptr;
2823
+ };
2824
+
2825
+ struct llama_layer_convnext {
2826
+ struct lm_ggml_tensor * dw = nullptr;
2827
+ struct lm_ggml_tensor * dw_b = nullptr;
2828
+
2829
+ struct lm_ggml_tensor * norm = nullptr;
2830
+ struct lm_ggml_tensor * norm_b = nullptr;
2831
+
2832
+ struct lm_ggml_tensor * pw1 = nullptr;
2833
+ struct lm_ggml_tensor * pw1_b = nullptr;
2834
+
2835
+ struct lm_ggml_tensor * pw2 = nullptr;
2836
+ struct lm_ggml_tensor * pw2_b = nullptr;
2837
+
2838
+ struct lm_ggml_tensor * gamma = nullptr;
2839
+ };
2840
+
2841
+ struct llama_layer {
2746
2842
  // normalization
2747
- struct lm_ggml_tensor * attn_norm;
2748
- struct lm_ggml_tensor * attn_norm_b;
2749
- struct lm_ggml_tensor * attn_norm_2;
2750
- struct lm_ggml_tensor * attn_norm_2_b;
2751
- struct lm_ggml_tensor * attn_q_norm;
2752
- struct lm_ggml_tensor * attn_q_norm_b;
2753
- struct lm_ggml_tensor * attn_k_norm;
2754
- struct lm_ggml_tensor * attn_k_norm_b;
2755
- struct lm_ggml_tensor * attn_out_norm;
2756
- struct lm_ggml_tensor * attn_out_norm_b;
2757
- struct lm_ggml_tensor * attn_q_a_norm;
2758
- struct lm_ggml_tensor * attn_kv_a_norm;
2759
- struct lm_ggml_tensor * attn_sub_norm;
2760
- struct lm_ggml_tensor * attn_post_norm;
2761
- struct lm_ggml_tensor * ffn_sub_norm;
2762
- struct lm_ggml_tensor * attn_norm_cross;
2763
- struct lm_ggml_tensor * attn_norm_enc;
2843
+ struct lm_ggml_tensor * attn_norm = nullptr;
2844
+ struct lm_ggml_tensor * attn_norm_b = nullptr;
2845
+ struct lm_ggml_tensor * attn_norm_2 = nullptr;
2846
+ struct lm_ggml_tensor * attn_norm_2_b = nullptr;
2847
+ struct lm_ggml_tensor * attn_q_norm = nullptr;
2848
+ struct lm_ggml_tensor * attn_q_norm_b = nullptr;
2849
+ struct lm_ggml_tensor * attn_k_norm = nullptr;
2850
+ struct lm_ggml_tensor * attn_k_norm_b = nullptr;
2851
+ struct lm_ggml_tensor * attn_out_norm = nullptr;
2852
+ struct lm_ggml_tensor * attn_out_norm_b = nullptr;
2853
+ struct lm_ggml_tensor * attn_q_a_norm = nullptr;
2854
+ struct lm_ggml_tensor * attn_kv_a_norm = nullptr;
2855
+ struct lm_ggml_tensor * attn_sub_norm = nullptr;
2856
+ struct lm_ggml_tensor * attn_post_norm = nullptr;
2857
+ struct lm_ggml_tensor * ffn_sub_norm = nullptr;
2858
+ struct lm_ggml_tensor * attn_norm_cross = nullptr;
2859
+ struct lm_ggml_tensor * attn_norm_enc = nullptr;
2764
2860
 
2765
2861
  // attention
2766
- struct lm_ggml_tensor * wq;
2767
- struct lm_ggml_tensor * wk;
2768
- struct lm_ggml_tensor * wv;
2769
- struct lm_ggml_tensor * wo;
2770
- struct lm_ggml_tensor * wqkv;
2771
- struct lm_ggml_tensor * wq_a;
2772
- struct lm_ggml_tensor * wq_b;
2773
- struct lm_ggml_tensor * wkv_a_mqa;
2774
- struct lm_ggml_tensor * wkv_b;
2775
- struct lm_ggml_tensor * wq_cross;
2776
- struct lm_ggml_tensor * wk_cross;
2777
- struct lm_ggml_tensor * wv_cross;
2778
- struct lm_ggml_tensor * wo_cross;
2779
- struct lm_ggml_tensor * wq_enc;
2780
- struct lm_ggml_tensor * wk_enc;
2781
- struct lm_ggml_tensor * wv_enc;
2782
- struct lm_ggml_tensor * wo_enc;
2862
+ struct lm_ggml_tensor * wq = nullptr;
2863
+ struct lm_ggml_tensor * wk = nullptr;
2864
+ struct lm_ggml_tensor * wv = nullptr;
2865
+ struct lm_ggml_tensor * wo = nullptr;
2866
+ struct lm_ggml_tensor * wqkv = nullptr;
2867
+ struct lm_ggml_tensor * wq_a = nullptr;
2868
+ struct lm_ggml_tensor * wq_b = nullptr;
2869
+ struct lm_ggml_tensor * wkv_a_mqa = nullptr;
2870
+ struct lm_ggml_tensor * wkv_b = nullptr;
2871
+ struct lm_ggml_tensor * wq_cross = nullptr;
2872
+ struct lm_ggml_tensor * wk_cross = nullptr;
2873
+ struct lm_ggml_tensor * wv_cross = nullptr;
2874
+ struct lm_ggml_tensor * wo_cross = nullptr;
2875
+ struct lm_ggml_tensor * wq_enc = nullptr;
2876
+ struct lm_ggml_tensor * wk_enc = nullptr;
2877
+ struct lm_ggml_tensor * wv_enc = nullptr;
2878
+ struct lm_ggml_tensor * wo_enc = nullptr;
2783
2879
 
2784
2880
  // attention bias
2785
- struct lm_ggml_tensor * bq;
2786
- struct lm_ggml_tensor * bk;
2787
- struct lm_ggml_tensor * bv;
2788
- struct lm_ggml_tensor * bo;
2789
- struct lm_ggml_tensor * bqkv;
2881
+ struct lm_ggml_tensor * bq = nullptr;
2882
+ struct lm_ggml_tensor * bk = nullptr;
2883
+ struct lm_ggml_tensor * bv = nullptr;
2884
+ struct lm_ggml_tensor * bo = nullptr;
2885
+ struct lm_ggml_tensor * bqkv = nullptr;
2790
2886
 
2791
2887
  // relative position bias
2792
- struct lm_ggml_tensor * attn_rel_b;
2793
- struct lm_ggml_tensor * attn_rel_b_enc;
2794
- struct lm_ggml_tensor * attn_rel_b_cross;
2888
+ struct lm_ggml_tensor * attn_rel_b = nullptr;
2889
+ struct lm_ggml_tensor * attn_rel_b_enc = nullptr;
2890
+ struct lm_ggml_tensor * attn_rel_b_cross = nullptr;
2795
2891
 
2796
2892
  // normalization
2797
- struct lm_ggml_tensor * ffn_norm;
2798
- struct lm_ggml_tensor * ffn_norm_b;
2799
- struct lm_ggml_tensor * ffn_post_norm;
2800
- struct lm_ggml_tensor * layer_out_norm;
2801
- struct lm_ggml_tensor * layer_out_norm_b;
2802
- struct lm_ggml_tensor * ffn_norm_exps;
2803
- struct lm_ggml_tensor * ffn_norm_enc;
2893
+ struct lm_ggml_tensor * ffn_norm = nullptr;
2894
+ struct lm_ggml_tensor * ffn_norm_b = nullptr;
2895
+ struct lm_ggml_tensor * ffn_post_norm = nullptr;
2896
+ struct lm_ggml_tensor * layer_out_norm = nullptr;
2897
+ struct lm_ggml_tensor * layer_out_norm_b = nullptr;
2898
+ struct lm_ggml_tensor * ffn_norm_exps = nullptr;
2899
+ struct lm_ggml_tensor * ffn_norm_enc = nullptr;
2804
2900
 
2805
2901
  // ff
2806
- struct lm_ggml_tensor * ffn_gate; // w1
2807
- struct lm_ggml_tensor * ffn_down; // w2
2808
- struct lm_ggml_tensor * ffn_up; // w3
2809
- struct lm_ggml_tensor * ffn_gate_enc;
2810
- struct lm_ggml_tensor * ffn_down_enc;
2811
- struct lm_ggml_tensor * ffn_up_enc;
2902
+ struct lm_ggml_tensor * ffn_gate = nullptr; // w1
2903
+ struct lm_ggml_tensor * ffn_down = nullptr; // w2
2904
+ struct lm_ggml_tensor * ffn_up = nullptr; // w3
2905
+ struct lm_ggml_tensor * ffn_gate_enc = nullptr;
2906
+ struct lm_ggml_tensor * ffn_down_enc = nullptr;
2907
+ struct lm_ggml_tensor * ffn_up_enc = nullptr;
2812
2908
 
2813
2909
  // ff MoE
2814
- struct lm_ggml_tensor * ffn_gate_inp;
2815
- struct lm_ggml_tensor * ffn_gate_exps;
2816
- struct lm_ggml_tensor * ffn_down_exps;
2817
- struct lm_ggml_tensor * ffn_up_exps ;
2910
+ struct lm_ggml_tensor * ffn_gate_inp = nullptr;
2911
+ struct lm_ggml_tensor * ffn_gate_exps = nullptr;
2912
+ struct lm_ggml_tensor * ffn_down_exps = nullptr;
2913
+ struct lm_ggml_tensor * ffn_up_exps = nullptr;
2818
2914
 
2819
2915
  // ff shared expert (shexp)
2820
- struct lm_ggml_tensor * ffn_gate_inp_shexp;
2821
- struct lm_ggml_tensor * ffn_gate_shexp;
2822
- struct lm_ggml_tensor * ffn_down_shexp;
2823
- struct lm_ggml_tensor * ffn_up_shexp;
2916
+ struct lm_ggml_tensor * ffn_gate_inp_shexp = nullptr;
2917
+ struct lm_ggml_tensor * ffn_gate_shexp = nullptr;
2918
+ struct lm_ggml_tensor * ffn_down_shexp = nullptr;
2919
+ struct lm_ggml_tensor * ffn_up_shexp = nullptr;
2824
2920
 
2825
2921
  // ff bias
2826
- struct lm_ggml_tensor * ffn_gate_b;
2827
- struct lm_ggml_tensor * ffn_down_b; // b2
2828
- struct lm_ggml_tensor * ffn_up_b; // b3
2829
- struct lm_ggml_tensor * ffn_act;
2922
+ struct lm_ggml_tensor * ffn_gate_b = nullptr;
2923
+ struct lm_ggml_tensor * ffn_down_b = nullptr; // b2
2924
+ struct lm_ggml_tensor * ffn_up_b = nullptr; // b3
2925
+ struct lm_ggml_tensor * ffn_act = nullptr;
2830
2926
 
2831
2927
  // mamba proj
2832
- struct lm_ggml_tensor * ssm_in;
2833
- struct lm_ggml_tensor * ssm_x;
2834
- struct lm_ggml_tensor * ssm_dt;
2835
- struct lm_ggml_tensor * ssm_out;
2928
+ struct lm_ggml_tensor * ssm_in = nullptr;
2929
+ struct lm_ggml_tensor * ssm_x = nullptr;
2930
+ struct lm_ggml_tensor * ssm_dt = nullptr;
2931
+ struct lm_ggml_tensor * ssm_out = nullptr;
2836
2932
 
2837
2933
  // mamba
2838
- struct lm_ggml_tensor * ssm_conv1d;
2839
- struct lm_ggml_tensor * ssm_a;
2840
- struct lm_ggml_tensor * ssm_d;
2934
+ struct lm_ggml_tensor * ssm_conv1d = nullptr;
2935
+ struct lm_ggml_tensor * ssm_a = nullptr;
2936
+ struct lm_ggml_tensor * ssm_d = nullptr;
2841
2937
 
2842
2938
  // mamba bias
2843
- struct lm_ggml_tensor * ssm_conv1d_b;
2844
- struct lm_ggml_tensor * ssm_dt_b;
2939
+ struct lm_ggml_tensor * ssm_conv1d_b = nullptr;
2940
+ struct lm_ggml_tensor * ssm_dt_b = nullptr;
2845
2941
 
2846
2942
  // rwkv
2847
- struct lm_ggml_tensor * time_mix_w1;
2848
- struct lm_ggml_tensor * time_mix_w2;
2849
- struct lm_ggml_tensor * time_mix_lerp_x;
2850
- struct lm_ggml_tensor * time_mix_lerp_w;
2851
- struct lm_ggml_tensor * time_mix_lerp_k;
2852
- struct lm_ggml_tensor * time_mix_lerp_v;
2853
- struct lm_ggml_tensor * time_mix_lerp_r;
2854
- struct lm_ggml_tensor * time_mix_lerp_g;
2855
-
2856
- struct lm_ggml_tensor * time_mix_first;
2857
- struct lm_ggml_tensor * time_mix_decay;
2858
- struct lm_ggml_tensor * time_mix_decay_w1;
2859
- struct lm_ggml_tensor * time_mix_decay_w2;
2860
- struct lm_ggml_tensor * time_mix_key;
2861
- struct lm_ggml_tensor * time_mix_value;
2862
- struct lm_ggml_tensor * time_mix_receptance;
2863
- struct lm_ggml_tensor * time_mix_gate;
2864
-
2865
- struct lm_ggml_tensor * time_mix_ln;
2866
- struct lm_ggml_tensor * time_mix_ln_b;
2867
- struct lm_ggml_tensor * time_mix_output;
2868
-
2869
- struct lm_ggml_tensor * channel_mix_lerp_k;
2870
- struct lm_ggml_tensor * channel_mix_lerp_r;
2871
-
2872
- struct lm_ggml_tensor * channel_mix_key;
2873
- struct lm_ggml_tensor * channel_mix_receptance;
2874
- struct lm_ggml_tensor * channel_mix_value;
2943
+ struct lm_ggml_tensor * time_mix_w1 = nullptr;
2944
+ struct lm_ggml_tensor * time_mix_w2 = nullptr;
2945
+ struct lm_ggml_tensor * time_mix_lerp_x = nullptr;
2946
+ struct lm_ggml_tensor * time_mix_lerp_w = nullptr;
2947
+ struct lm_ggml_tensor * time_mix_lerp_k = nullptr;
2948
+ struct lm_ggml_tensor * time_mix_lerp_v = nullptr;
2949
+ struct lm_ggml_tensor * time_mix_lerp_r = nullptr;
2950
+ struct lm_ggml_tensor * time_mix_lerp_g = nullptr;
2951
+
2952
+ struct lm_ggml_tensor * time_mix_first = nullptr;
2953
+ struct lm_ggml_tensor * time_mix_decay = nullptr;
2954
+ struct lm_ggml_tensor * time_mix_decay_w1 = nullptr;
2955
+ struct lm_ggml_tensor * time_mix_decay_w2 = nullptr;
2956
+ struct lm_ggml_tensor * time_mix_key = nullptr;
2957
+ struct lm_ggml_tensor * time_mix_value = nullptr;
2958
+ struct lm_ggml_tensor * time_mix_receptance = nullptr;
2959
+ struct lm_ggml_tensor * time_mix_gate = nullptr;
2960
+
2961
+ struct lm_ggml_tensor * time_mix_ln = nullptr;
2962
+ struct lm_ggml_tensor * time_mix_ln_b = nullptr;
2963
+ struct lm_ggml_tensor * time_mix_output = nullptr;
2964
+
2965
+ struct lm_ggml_tensor * channel_mix_lerp_k = nullptr;
2966
+ struct lm_ggml_tensor * channel_mix_lerp_r = nullptr;
2967
+
2968
+ struct lm_ggml_tensor * channel_mix_key = nullptr;
2969
+ struct lm_ggml_tensor * channel_mix_receptance = nullptr;
2970
+ struct lm_ggml_tensor * channel_mix_value = nullptr;
2875
2971
 
2876
2972
  // long rope factors
2877
2973
  struct lm_ggml_tensor * rope_long = nullptr;
@@ -2879,13 +2975,17 @@ struct llama_layer {
2879
2975
  struct lm_ggml_tensor * rope_freqs = nullptr;
2880
2976
 
2881
2977
  // bitnet scale
2882
- struct lm_ggml_tensor * wq_scale;
2883
- struct lm_ggml_tensor * wk_scale;
2884
- struct lm_ggml_tensor * wv_scale;
2885
- struct lm_ggml_tensor * wo_scale;
2886
- struct lm_ggml_tensor * ffn_gate_scale;
2887
- struct lm_ggml_tensor * ffn_up_scale;
2888
- struct lm_ggml_tensor * ffn_down_scale;
2978
+ struct lm_ggml_tensor * wq_scale = nullptr;
2979
+ struct lm_ggml_tensor * wk_scale = nullptr;
2980
+ struct lm_ggml_tensor * wv_scale = nullptr;
2981
+ struct lm_ggml_tensor * wo_scale = nullptr;
2982
+ struct lm_ggml_tensor * ffn_gate_scale = nullptr;
2983
+ struct lm_ggml_tensor * ffn_up_scale = nullptr;
2984
+ struct lm_ggml_tensor * ffn_down_scale = nullptr;
2985
+
2986
+ struct llama_layer_posnet posnet;
2987
+
2988
+ struct llama_layer_convnext convnext;
2889
2989
  };
2890
2990
 
2891
2991
  // very similar to llama_batch,
@@ -3016,6 +3116,9 @@ struct llama_model {
3016
3116
  struct lm_ggml_tensor * cls_out = nullptr;
3017
3117
  struct lm_ggml_tensor * cls_out_b = nullptr;
3018
3118
 
3119
+ struct lm_ggml_tensor * conv1d = nullptr;
3120
+ struct lm_ggml_tensor * conv1d_b = nullptr;
3121
+
3019
3122
  std::vector<llama_layer> layers;
3020
3123
 
3021
3124
  // gguf metadata
@@ -3100,6 +3203,7 @@ struct llama_sbatch {
3100
3203
  // batch indices of the output
3101
3204
  std::vector<size_t> out_ids;
3102
3205
  std::vector<llama_sbatch_seq> seq;
3206
+
3103
3207
  const llama_batch * batch = nullptr;
3104
3208
 
3105
3209
  // buffers for the ubatch
@@ -3520,6 +3624,17 @@ static int llama_get_device_count(const llama_model & model) {
3520
3624
  return (int) model.devices.size();
3521
3625
  }
3522
3626
 
3627
+ static struct lm_ggml_tensor * llama_get_model_tensor(const struct llama_model * model, const char * name) {
3628
+ auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
3629
+ [name](const std::pair<std::string, struct lm_ggml_tensor *> & it) {
3630
+ return it.first == name;
3631
+ });
3632
+ if (it == model->tensors_by_name.end()) {
3633
+ return nullptr;
3634
+ }
3635
+ return it->second;
3636
+ }
3637
+
3523
3638
  template<typename F>
3524
3639
  static bool buft_supported(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_dev_t dev, F & fn) {
3525
3640
  lm_ggml_init_params params = {
@@ -3573,7 +3688,9 @@ static bool llama_kv_cache_init(
3573
3688
 
3574
3689
  const struct llama_hparams & hparams = model.hparams;
3575
3690
 
3576
- const int64_t n_layer = hparams.n_layer;
3691
+ const int32_t n_layer = hparams.n_layer;
3692
+
3693
+ LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", __func__, kv_size, offload, lm_ggml_type_name(type_k), lm_ggml_type_name(type_v), n_layer);
3577
3694
 
3578
3695
  cache.has_shift = false;
3579
3696
 
@@ -3614,10 +3731,12 @@ static bool llama_kv_cache_init(
3614
3731
  cache.k_l.reserve(n_layer);
3615
3732
  cache.v_l.reserve(n_layer);
3616
3733
 
3617
- for (int i = 0; i < (int) n_layer; i++) {
3734
+ for (int i = 0; i < n_layer; i++) {
3618
3735
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
3619
3736
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
3620
3737
 
3738
+ LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
3739
+
3621
3740
  lm_ggml_backend_buffer_type_t buft;
3622
3741
  if (offload) {
3623
3742
  auto * dev = model.dev_layer.at(i).dev;
@@ -5530,7 +5649,7 @@ static void llm_load_hparams(
5530
5649
  ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
5531
5650
 
5532
5651
  // get hparams kv
5533
- ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
5652
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
5534
5653
 
5535
5654
  // everything past this point is not vocab-related
5536
5655
  if (hparams.vocab_only) {
@@ -5543,6 +5662,16 @@ static void llm_load_hparams(
5543
5662
  ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
5544
5663
  ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
5545
5664
 
5665
+ if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) {
5666
+ ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
5667
+
5668
+ ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
5669
+ ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
5670
+
5671
+ ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
5672
+ ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
5673
+ }
5674
+
5546
5675
  LM_GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
5547
5676
  LM_GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
5548
5677
  if (hparams.n_expert > 0) {
@@ -5551,13 +5680,13 @@ static void llm_load_hparams(
5551
5680
  LM_GGML_ASSERT(hparams.n_expert_used == 0);
5552
5681
  }
5553
5682
 
5554
- // zero-out the per-layer hparams
5683
+ // zero-out the array hparams
5555
5684
  std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
5556
5685
  std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
5557
5686
  std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
5558
5687
 
5559
- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
5560
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
5688
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
5689
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
5561
5690
 
5562
5691
  // n_head_kv is optional, default to n_head
5563
5692
  hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -5606,7 +5735,7 @@ static void llm_load_hparams(
5606
5735
 
5607
5736
  ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
5608
5737
 
5609
- if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
5738
+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
5610
5739
  if (hparams.n_rot != hparams.n_embd_head_k) {
5611
5740
  throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
5612
5741
  }
@@ -5646,6 +5775,15 @@ static void llm_load_hparams(
5646
5775
  }
5647
5776
  }
5648
5777
  } break;
5778
+ case LLM_ARCH_DECI:
5779
+ {
5780
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
5781
+ switch (hparams.n_layer) {
5782
+ case 32: model.type = e_model::MODEL_7B; break;
5783
+ case 80: model.type = e_model::MODEL_70B; break;
5784
+ default: model.type = e_model::MODEL_UNKNOWN;
5785
+ }
5786
+ } break;
5649
5787
  case LLM_ARCH_MINICPM:
5650
5788
  {
5651
5789
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6302,6 +6440,13 @@ static void llm_load_hparams(
6302
6440
  default: model.type = e_model::MODEL_UNKNOWN;
6303
6441
  }
6304
6442
  } break;
6443
+ case LLM_ARCH_WAVTOKENIZER_DEC:
6444
+ {
6445
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
6446
+ ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
6447
+ ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
6448
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
6449
+ } break;
6305
6450
  default: (void)0;
6306
6451
  }
6307
6452
 
@@ -6331,7 +6476,7 @@ static void llm_load_vocab(
6331
6476
  ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
6332
6477
  ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
6333
6478
 
6334
- if (tokenizer_model == "no_vocab") {
6479
+ if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
6335
6480
  vocab.type = LLAMA_VOCAB_TYPE_NONE;
6336
6481
 
6337
6482
  // default special tokens
@@ -6469,7 +6614,8 @@ static void llm_load_vocab(
6469
6614
  } else if (
6470
6615
  tokenizer_pre == "llama3" ||
6471
6616
  tokenizer_pre == "llama-v3" ||
6472
- tokenizer_pre == "llama-bpe") {
6617
+ tokenizer_pre == "llama-bpe"||
6618
+ tokenizer_pre == "falcon3") {
6473
6619
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
6474
6620
  vocab.tokenizer_ignore_merges = true;
6475
6621
  vocab.tokenizer_add_bos = true;
@@ -6499,7 +6645,8 @@ static void llm_load_vocab(
6499
6645
  tokenizer_pre == "jina-v1-en" ||
6500
6646
  tokenizer_pre == "jina-v2-es" ||
6501
6647
  tokenizer_pre == "jina-v2-de" ||
6502
- tokenizer_pre == "jina-v2-code") {
6648
+ tokenizer_pre == "jina-v2-code" ||
6649
+ tokenizer_pre == "roberta-bpe") {
6503
6650
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
6504
6651
  } else if (
6505
6652
  tokenizer_pre == "refact") {
@@ -6569,6 +6716,9 @@ static void llm_load_vocab(
6569
6716
  } else if (
6570
6717
  tokenizer_pre == "minerva-7b") {
6571
6718
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
6719
+ } else if (
6720
+ tokenizer_pre == "megrez") {
6721
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
6572
6722
  } else {
6573
6723
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
6574
6724
  }
@@ -7310,6 +7460,22 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
7310
7460
  {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT_ID}},
7311
7461
  // this tensor is loaded for T5, but never used
7312
7462
  {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_NONE}},
7463
+ {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, LM_GGML_OP_IM2COL}},
7464
+ {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
7465
+ {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
7466
+ {LLM_TENSOR_POS_NET_NORM2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
7467
+ {LLM_TENSOR_POS_NET_CONV1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_IM2COL}},
7468
+ {LLM_TENSOR_POS_NET_CONV2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_IM2COL}},
7469
+ {LLM_TENSOR_POS_NET_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
7470
+ {LLM_TENSOR_POS_NET_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
7471
+ {LLM_TENSOR_POS_NET_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
7472
+ {LLM_TENSOR_POS_NET_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
7473
+ {LLM_TENSOR_POS_NET_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
7474
+ {LLM_TENSOR_CONVNEXT_DW, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_IM2COL}},
7475
+ {LLM_TENSOR_CONVNEXT_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
7476
+ {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
7477
+ {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
7478
+ {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
7313
7479
  };
7314
7480
 
7315
7481
  // checks if the weight tensor can be used with the specified buffer type and device
@@ -7414,6 +7580,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, lm_ggml_tensor
7414
7580
  lm_ggml_tensor * state = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, S, n_seqs, S, H);
7415
7581
  op_tensor = lm_ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
7416
7582
  } break;
7583
+ case LM_GGML_OP_IM2COL:
7584
+ {
7585
+ const int n_embd = hparams.n_embd;
7586
+ lm_ggml_tensor * b = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
7587
+ op_tensor = lm_ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, LM_GGML_TYPE_F16);
7588
+ } break;
7417
7589
  default:
7418
7590
  LM_GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, lm_ggml_op_name(op), w->name);
7419
7591
  }
@@ -7544,7 +7716,8 @@ static bool llm_load_tensors(
7544
7716
  model.main_gpu = main_gpu;
7545
7717
  model.n_gpu_layers = n_gpu_layers;
7546
7718
 
7547
- const int n_layer = hparams.n_layer;
7719
+ const int n_layer = hparams.n_layer;
7720
+
7548
7721
  bool use_mmap_buffer = true;
7549
7722
 
7550
7723
  // build a list of buffer types for the CPU and GPU devices
@@ -7819,6 +7992,68 @@ static bool llm_load_tensors(
7819
7992
  }
7820
7993
  }
7821
7994
  } break;
7995
+ case LLM_ARCH_DECI:
7996
+ {
7997
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7998
+
7999
+ // output
8000
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
8001
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
8002
+
8003
+ // if output is NULL, init from the input tok embed
8004
+ if (model.output == NULL) {
8005
+ model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
8006
+ }
8007
+
8008
+ for (int i = 0; i < n_layer; ++i) {
8009
+ auto & layer = model.layers[i];
8010
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
8011
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
8012
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
8013
+ const int64_t n_ff = hparams.n_ff(i);
8014
+ const int64_t n_head = hparams.n_head(i);
8015
+ const int64_t n_head_kv = hparams.n_head_kv(i);
8016
+
8017
+ if (n_head_kv == 0 && n_head > 0) {
8018
+ // linear attention for DeciLMCausalModel
8019
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
8020
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
8021
+ }
8022
+ else if (n_head_kv > 0) {
8023
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
8024
+
8025
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
8026
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
8027
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
8028
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
8029
+ }
8030
+
8031
+ // optional bias tensors
8032
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
8033
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
8034
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
8035
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
8036
+
8037
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
8038
+
8039
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
8040
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
8041
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
8042
+ }
8043
+ else {
8044
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
8045
+ }
8046
+
8047
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
8048
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
8049
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
8050
+
8051
+ // optional MLP bias
8052
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
8053
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
8054
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
8055
+ }
8056
+ } break;
7822
8057
  case LLM_ARCH_MINICPM3:
7823
8058
  {
7824
8059
  const int64_t n_embd_head_qk_rope = hparams.n_rot;
@@ -9347,9 +9582,9 @@ static bool llm_load_tensors(
9347
9582
  } break;
9348
9583
  case LLM_ARCH_CHAMELEON:
9349
9584
  {
9350
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
9585
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
9351
9586
 
9352
- // output
9587
+ // output
9353
9588
  model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
9354
9589
  model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
9355
9590
  // if output is NULL, init from the input tok embed
@@ -9378,6 +9613,109 @@ static bool llm_load_tensors(
9378
9613
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
9379
9614
  }
9380
9615
  } break;
9616
+ case LLM_ARCH_WAVTOKENIZER_DEC:
9617
+ {
9618
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
9619
+
9620
+ model.conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
9621
+ model.conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
9622
+
9623
+ // posnet
9624
+ {
9625
+ const int64_t n_embd = hparams.posnet.n_embd;
9626
+
9627
+ for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
9628
+ auto & layer = model.layers[i].posnet;
9629
+
9630
+ // posnet:
9631
+ //
9632
+ // - resnet
9633
+ // - resnet
9634
+ // - attn
9635
+ // - resnet
9636
+ // - resnet
9637
+ // - norm
9638
+ //
9639
+ switch (i) {
9640
+ case 0:
9641
+ case 1:
9642
+ case 3:
9643
+ case 4:
9644
+ {
9645
+ layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
9646
+ layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
9647
+
9648
+ layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
9649
+ layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
9650
+
9651
+ layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
9652
+ layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
9653
+
9654
+ layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
9655
+ layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
9656
+ } break;
9657
+ case 2:
9658
+ {
9659
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
9660
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
9661
+
9662
+ layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
9663
+ layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
9664
+
9665
+ layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
9666
+ layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
9667
+
9668
+ layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
9669
+ layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
9670
+
9671
+ layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
9672
+ layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
9673
+ } break;
9674
+ case 5:
9675
+ {
9676
+ layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
9677
+ layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
9678
+ } break;
9679
+ default: LM_GGML_ABORT("unknown posnet layer");
9680
+ };
9681
+ }
9682
+ }
9683
+
9684
+ LM_GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
9685
+
9686
+ model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
9687
+ model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
9688
+
9689
+ // convnext
9690
+ {
9691
+ const int64_t n_embd = hparams.convnext.n_embd;
9692
+
9693
+ for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
9694
+ auto & layer = model.layers[i].convnext;
9695
+
9696
+ layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
9697
+ layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
9698
+
9699
+ layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
9700
+ layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
9701
+
9702
+ layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
9703
+ layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
9704
+
9705
+ layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
9706
+ layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
9707
+
9708
+ layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
9709
+ }
9710
+
9711
+ // output
9712
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
9713
+ model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
9714
+ }
9715
+
9716
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
9717
+ model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
9718
+ } break;
9381
9719
  default:
9382
9720
  throw std::runtime_error("unknown architecture");
9383
9721
  }
@@ -9597,6 +9935,7 @@ enum llm_ffn_gate_type {
9597
9935
  enum llm_norm_type {
9598
9936
  LLM_NORM,
9599
9937
  LLM_NORM_RMS,
9938
+ LLM_NORM_GROUP,
9600
9939
  };
9601
9940
 
9602
9941
  static struct lm_ggml_tensor * llm_build_inp_embd(
@@ -9617,7 +9956,7 @@ static struct lm_ggml_tensor * llm_build_inp_embd(
9617
9956
 
9618
9957
  inpL = lm_ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
9619
9958
  } else {
9620
- lctx.inp_embd = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, n_embd, batch.n_tokens);
9959
+ lctx.inp_embd = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, n_embd, batch.n_tokens);
9621
9960
  inpL = lctx.inp_embd;
9622
9961
  lm_ggml_set_input(lctx.inp_embd);
9623
9962
  }
@@ -9738,8 +10077,14 @@ static struct lm_ggml_tensor * llm_build_norm(
9738
10077
  const llm_build_cb & cb,
9739
10078
  int il) {
9740
10079
  switch (type) {
9741
- case LLM_NORM: cur = lm_ggml_norm (ctx, cur, hparams.f_norm_eps); break;
9742
- case LLM_NORM_RMS: cur = lm_ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break;
10080
+ case LLM_NORM: cur = lm_ggml_norm (ctx, cur, hparams.f_norm_eps); break;
10081
+ case LLM_NORM_RMS: cur = lm_ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break;
10082
+ case LLM_NORM_GROUP:
10083
+ {
10084
+ cur = lm_ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
10085
+ cur = lm_ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps);
10086
+ cur = lm_ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
10087
+ } break;
9743
10088
  }
9744
10089
 
9745
10090
  if (mw || mb) {
@@ -11078,6 +11423,167 @@ struct llm_build_context {
11078
11423
  return gf;
11079
11424
  }
11080
11425
 
11426
+ struct lm_ggml_cgraph * build_deci() {
11427
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
11428
+
11429
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
11430
+ int32_t n_tokens = this->n_tokens;
11431
+
11432
+ const int64_t n_embd_head = hparams.n_embd_head_v;
11433
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
11434
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
11435
+
11436
+ struct lm_ggml_tensor * cur;
11437
+ struct lm_ggml_tensor * inpL;
11438
+
11439
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
11440
+
11441
+ // inp_pos - contains the positions
11442
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
11443
+
11444
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
11445
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
11446
+
11447
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
11448
+ for (int il = 0; il < n_layer; ++il) {
11449
+ struct lm_ggml_tensor * inpSA = inpL;
11450
+ const int64_t n_head_kv = hparams.n_head_kv(il);
11451
+ const int64_t n_head = hparams.n_head(il);
11452
+
11453
+ if (n_head == 0) {
11454
+ // attention-free layer of Llama-3_1-Nemotron-51B
11455
+ cur = inpL;
11456
+ } else {
11457
+ // norm
11458
+ cur = llm_build_norm(ctx0, inpL, hparams,
11459
+ model.layers[il].attn_norm, NULL,
11460
+ LLM_NORM_RMS, cb, il);
11461
+ cb(cur, "attn_norm", il);
11462
+ }
11463
+
11464
+ if (n_head > 0 && n_head_kv == 0) {
11465
+ // "linear attention" of Llama-3_1-Nemotron-51B
11466
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
11467
+ cb(cur, "wo", il);
11468
+ } else if (n_head > 0) {
11469
+ // self-attention
11470
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
11471
+ struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
11472
+
11473
+ // compute Q and K and RoPE them
11474
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
11475
+ cb(Qcur, "Qcur", il);
11476
+ if (model.layers[il].bq) {
11477
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
11478
+ cb(Qcur, "Qcur", il);
11479
+ }
11480
+
11481
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
11482
+ cb(Kcur, "Kcur", il);
11483
+ if (model.layers[il].bk) {
11484
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
11485
+ cb(Kcur, "Kcur", il);
11486
+ }
11487
+
11488
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
11489
+ cb(Vcur, "Vcur", il);
11490
+ if (model.layers[il].bv) {
11491
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
11492
+ cb(Vcur, "Vcur", il);
11493
+ }
11494
+
11495
+ Qcur = lm_ggml_rope_ext(
11496
+ ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
11497
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11498
+ ext_factor, attn_factor, beta_fast, beta_slow
11499
+ );
11500
+ cb(Qcur, "Qcur", il);
11501
+
11502
+ Kcur = lm_ggml_rope_ext(
11503
+ ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
11504
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11505
+ ext_factor, attn_factor, beta_fast, beta_slow
11506
+ );
11507
+ cb(Kcur, "Kcur", il);
11508
+
11509
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
11510
+ model.layers[il].wo, model.layers[il].bo,
11511
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
11512
+ }
11513
+
11514
+ if (il == n_layer - 1) {
11515
+ // skip computing output for unused tokens
11516
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
11517
+ n_tokens = n_outputs;
11518
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
11519
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
11520
+ }
11521
+
11522
+ // For Granite architecture
11523
+ if (hparams.f_residual_scale) {
11524
+ cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
11525
+ }
11526
+
11527
+ // modified to support attention-free layer of Llama-3_1-Nemotron-51B
11528
+ struct lm_ggml_tensor * ffn_inp = cur;
11529
+ if (n_head > 0) {
11530
+ ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
11531
+ cb(ffn_inp, "ffn_inp", il);
11532
+ }
11533
+
11534
+ // feed-forward network
11535
+ if (model.layers[il].ffn_gate_inp == nullptr) {
11536
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
11537
+ model.layers[il].ffn_norm, NULL,
11538
+ LLM_NORM_RMS, cb, il);
11539
+ cb(cur, "ffn_norm", il);
11540
+
11541
+ cur = llm_build_ffn(ctx0, lctx, cur,
11542
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
11543
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
11544
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
11545
+ NULL,
11546
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
11547
+ cb(cur, "ffn_out", il);
11548
+ }
11549
+
11550
+ // For Granite architecture
11551
+ if (hparams.f_residual_scale) {
11552
+ cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
11553
+ }
11554
+
11555
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
11556
+ cb(cur, "ffn_out", il);
11557
+
11558
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
11559
+ cb(cur, "l_out", il);
11560
+
11561
+ // input for next layer
11562
+ inpL = cur;
11563
+ }
11564
+
11565
+ cur = inpL;
11566
+
11567
+ cur = llm_build_norm(ctx0, cur, hparams,
11568
+ model.output_norm, NULL,
11569
+ LLM_NORM_RMS, cb, -1);
11570
+ cb(cur, "result_norm", -1);
11571
+
11572
+ // lm_head
11573
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
11574
+
11575
+ // For Granite architecture
11576
+ if (hparams.f_logit_scale) {
11577
+ cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
11578
+ }
11579
+
11580
+ cb(cur, "result_output", -1);
11581
+
11582
+ lm_ggml_build_forward_expand(gf, cur);
11583
+
11584
+ return gf;
11585
+ }
11586
+
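The build_deci() graph mirrors build_llama() but lets the head counts vary per layer, which is how Llama-3_1-Nemotron-51B mixes full-attention, attention-free, and "linear attention" layers. A hypothetical helper summarizing the per-layer dispatch above (illustrative only, not part of llama.cpp):

    #include <cstdint>

    enum class deci_layer_kind { attention_free, linear_attention, full_attention };

    // n_head and n_head_kv come from hparams.n_head(il) / hparams.n_head_kv(il)
    static deci_layer_kind classify_deci_layer(int64_t n_head, int64_t n_head_kv) {
        if (n_head == 0)    return deci_layer_kind::attention_free;   // cur = inpL, no attn norm, no residual add
        if (n_head_kv == 0) return deci_layer_kind::linear_attention; // only the W_o projection is applied
        return deci_layer_kind::full_attention;                       // regular RoPE + KV-cache attention
    }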
11081
11587
  struct lm_ggml_cgraph * build_baichuan() {
11082
11588
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
11083
11589
 
@@ -13107,7 +13613,13 @@ struct llm_build_context {
13107
13613
  struct lm_ggml_tensor * inp_pos = build_inp_pos();
13108
13614
 
13109
13615
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
13110
- struct lm_ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
13616
+ struct lm_ggml_tensor * KQ_mask = nullptr;
13617
+ if (hparams.n_swa == 0) {
13618
+ // Phi-4 doesn't use sliding window attention
13619
+ KQ_mask = build_inp_KQ_mask();
13620
+ } else {
13621
+ KQ_mask = build_inp_KQ_mask_swa();
13622
+ }
13111
13623
 
13112
13624
  for (int il = 0; il < n_layer; ++il) {
13113
13625
  auto residual = inpL;
@@ -13165,7 +13677,7 @@ struct llm_build_context {
13165
13677
 
13166
13678
  cur = llm_build_kv(ctx0, lctx, kv_self, gf,
13167
13679
  model.layers[il].wo, model.layers[il].bo,
13168
- Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
13680
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
13169
13681
  }
13170
13682
 
13171
13683
  if (il == n_layer - 1) {
@@ -15865,7 +16377,7 @@ struct llm_build_context {
15865
16377
  return gf;
15866
16378
  }
15867
16379
 
15868
- struct lm_ggml_cgraph * build_t5_encoder() {
16380
+ struct lm_ggml_cgraph * build_t5_enc() {
15869
16381
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
15870
16382
 
15871
16383
  // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -15997,7 +16509,7 @@ struct llm_build_context {
15997
16509
  return gf;
15998
16510
  }
15999
16511
 
16000
- struct lm_ggml_cgraph * build_t5_decoder() {
16512
+ struct lm_ggml_cgraph * build_t5_dec() {
16001
16513
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
16002
16514
 
16003
16515
  // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -16946,6 +17458,158 @@ struct llm_build_context {
16946
17458
 
16947
17459
  return gf;
16948
17460
  }
17461
+
17462
+ struct lm_ggml_cgraph * build_wavtokenizer_dec() {
17463
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
17464
+
17465
+ struct lm_ggml_tensor * cur;
17466
+ struct lm_ggml_tensor * inpL;
17467
+
17468
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
17469
+
17470
+ cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, inpL));
17471
+
17472
+ cur = lm_ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
17473
+ cur = lm_ggml_add(ctx0, cur, model.conv1d_b);
17474
+
17475
+ // posnet
17476
+ for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
17477
+ const auto & layer = model.layers[il].posnet;
17478
+
17479
+ inpL = cur;
17480
+
17481
+ switch (il) {
17482
+ case 0:
17483
+ case 1:
17484
+ case 3:
17485
+ case 4:
17486
+ {
17487
+ cur = llm_build_norm(ctx0, cur, hparams,
17488
+ layer.norm1,
17489
+ layer.norm1_b,
17490
+ LLM_NORM_GROUP, cb, 0);
17491
+
17492
+ cur = lm_ggml_mul(ctx0, lm_ggml_sigmoid(ctx0, cur), cur);
17493
+
17494
+ cur = lm_ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
17495
+ cur = lm_ggml_add(ctx0, cur, layer.conv1_b);
17496
+
17497
+ cur = llm_build_norm(ctx0, cur, hparams,
17498
+ layer.norm2,
17499
+ layer.norm2_b,
17500
+ LLM_NORM_GROUP, cb, 0);
17501
+
17502
+ cur = lm_ggml_mul(ctx0, lm_ggml_sigmoid(ctx0, cur), cur);
17503
+
17504
+ cur = lm_ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
17505
+ cur = lm_ggml_add(ctx0, cur, layer.conv2_b);
17506
+
17507
+ cur = lm_ggml_add(ctx0, cur, inpL);
17508
+ } break;
17509
+ case 2:
17510
+ {
17511
+ cur = llm_build_norm(ctx0, cur, hparams,
17512
+ layer.attn_norm,
17513
+ layer.attn_norm_b,
17514
+ LLM_NORM_GROUP, cb, 0);
17515
+
17516
+ struct lm_ggml_tensor * q;
17517
+ struct lm_ggml_tensor * k;
17518
+ struct lm_ggml_tensor * v;
17519
+
17520
+ q = lm_ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
17521
+ k = lm_ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
17522
+ v = lm_ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
17523
+
17524
+ q = lm_ggml_add(ctx0, q, layer.attn_q_b);
17525
+ k = lm_ggml_add(ctx0, k, layer.attn_k_b);
17526
+ v = lm_ggml_add(ctx0, v, layer.attn_v_b);
17527
+
17528
+ q = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, q));
17529
+ k = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, k));
17530
+
17531
+ struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
17532
+
17533
+ kq = lm_ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
17534
+
17535
+ cur = lm_ggml_mul_mat(ctx0, kq, v);
17536
+
17537
+ cur = lm_ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
17538
+ cur = lm_ggml_add(ctx0, cur, layer.attn_o_b);
17539
+
17540
+ cur = lm_ggml_add(ctx0, cur, inpL);
17541
+ } break;
17542
+ case 5:
17543
+ {
17544
+ cur = llm_build_norm(ctx0, cur, hparams,
17545
+ layer.norm,
17546
+ layer.norm_b,
17547
+ LLM_NORM_GROUP, cb, 0);
17548
+ } break;
17549
+ default: LM_GGML_ABORT("unknown posnet layer");
17550
+ };
17551
+ }
17552
+
17553
+ cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
17554
+
17555
+ cur = llm_build_norm(ctx0, cur, hparams,
17556
+ model.tok_norm,
17557
+ model.tok_norm_b,
17558
+ LLM_NORM, cb, -1);
17559
+
17560
+ cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
17561
+
17562
+ inpL = cur;
17563
+
17564
+ // convnext
17565
+ for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
17566
+ const auto & layer = model.layers[il].convnext;
17567
+
17568
+ cur = inpL;
17569
+
17570
+ cur = lm_ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
17571
+ cur = lm_ggml_add(ctx0, cur, layer.dw_b);
17572
+
17573
+ cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
17574
+
17575
+ cur = llm_build_norm(ctx0, cur, hparams,
17576
+ layer.norm,
17577
+ layer.norm_b,
17578
+ LLM_NORM, cb, -1);
17579
+
17580
+ cur = llm_build_ffn(ctx0, lctx, cur,
17581
+ layer.pw1, layer.pw1_b, NULL,
17582
+ NULL, NULL, NULL,
17583
+ layer.pw2, layer.pw2_b, NULL,
17584
+ NULL,
17585
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
17586
+
17587
+ cur = lm_ggml_mul(ctx0, cur, layer.gamma);
17588
+
17589
+ cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
17590
+
17591
+ inpL = lm_ggml_add(ctx0, cur, inpL);
17592
+ }
17593
+
17594
+ cur = inpL;
17595
+
17596
+ cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
17597
+
17598
+ cur = llm_build_norm(ctx0, cur, hparams,
17599
+ model.output_norm,
17600
+ model.output_norm_b,
17601
+ LLM_NORM, cb, -1);
17602
+
17603
+ // lm_head
17604
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
17605
+
17606
+ cur = lm_ggml_add(ctx0, cur, model.output_b);
17607
+ cb(cur, "result_embd", -1);
17608
+
17609
+ lm_ggml_build_forward_expand(gf, cur);
17610
+
17611
+ return gf;
17612
+ }
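build_wavtokenizer_dec() is a convolutional decoder rather than a transformer: token embeddings pass through a 1D convolution, a stack of PosNet residual/attention blocks (using the new LLM_NORM_GROUP), a token-wise layer norm, a stack of ConvNeXt blocks, and an output head that produces result_embd instead of logits. A comment-only outline of one ConvNeXt block as built above (illustrative pseudocode, not actual ggml calls):

    // x_in : [n_embd, n_tokens] activations entering the block
    // x    = depthwise_conv1d(x_in, dw) + dw_b       // token mixing along the time axis
    // x    = layer_norm(transpose(x), norm, norm_b)  // normalize over channels
    // x    = pw2 * gelu(pw1 * x + pw1_b) + pw2_b     // pointwise feed-forward, n_embd -> n_ff -> n_embd
    // x    = gamma * x                               // learned per-channel scale
    // out  = transpose(x) + x_in                     // residual connection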
16949
17613
  };
16950
17614
 
16951
17615
  static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -17034,6 +17698,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
17034
17698
  {
17035
17699
  result = llm.build_llama();
17036
17700
  } break;
17701
+ case LLM_ARCH_DECI:
17702
+ {
17703
+ result = llm.build_deci();
17704
+ } break;
17037
17705
  case LLM_ARCH_BAICHUAN:
17038
17706
  {
17039
17707
  result = llm.build_baichuan();
@@ -17192,14 +17860,14 @@ static struct lm_ggml_cgraph * llama_build_graph(
17192
17860
  case LLM_ARCH_T5:
17193
17861
  {
17194
17862
  if (lctx.is_encoding) {
17195
- result = llm.build_t5_encoder();
17863
+ result = llm.build_t5_enc();
17196
17864
  } else {
17197
- result = llm.build_t5_decoder();
17865
+ result = llm.build_t5_dec();
17198
17866
  }
17199
17867
  } break;
17200
17868
  case LLM_ARCH_T5ENCODER:
17201
17869
  {
17202
- result = llm.build_t5_encoder();
17870
+ result = llm.build_t5_enc();
17203
17871
  } break;
17204
17872
  case LLM_ARCH_JAIS:
17205
17873
  {
@@ -17221,6 +17889,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
17221
17889
  {
17222
17890
  result = llm.build_chameleon();
17223
17891
  } break;
17892
+ case LLM_ARCH_WAVTOKENIZER_DEC:
17893
+ {
17894
+ result = llm.build_wavtokenizer_dec();
17895
+ } break;
17224
17896
  default:
17225
17897
  LM_GGML_ABORT("fatal error");
17226
17898
  }
@@ -17312,30 +17984,35 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
17312
17984
  }
17313
17985
 
17314
17986
  if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
17315
- LM_GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
17316
- const int64_t n_tokens = ubatch.n_tokens;
17987
+ //LM_GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
17317
17988
 
17318
- LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
17319
- int32_t * data = (int32_t *) lctx.inp_out_ids->data;
17989
+ if (!lctx.inp_out_ids) {
17990
+ LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__);
17991
+ } else {
17992
+ const int64_t n_tokens = ubatch.n_tokens;
17320
17993
 
17321
- if (lctx.n_outputs == n_tokens) {
17322
- for (int i = 0; i < n_tokens; ++i) {
17323
- data[i] = i;
17324
- }
17325
- } else if (ubatch.output) {
17326
- int32_t n_outputs = 0;
17327
- for (int i = 0; i < n_tokens; ++i) {
17328
- if (ubatch.output[i]) {
17329
- data[n_outputs++] = i;
17994
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
17995
+ int32_t * data = (int32_t *) lctx.inp_out_ids->data;
17996
+
17997
+ if (lctx.n_outputs == n_tokens) {
17998
+ for (int i = 0; i < n_tokens; ++i) {
17999
+ data[i] = i;
17330
18000
  }
18001
+ } else if (ubatch.output) {
18002
+ int32_t n_outputs = 0;
18003
+ for (int i = 0; i < n_tokens; ++i) {
18004
+ if (ubatch.output[i]) {
18005
+ data[n_outputs++] = i;
18006
+ }
18007
+ }
18008
+ // the graph needs to have been passed the correct number of outputs
18009
+ LM_GGML_ASSERT(lctx.n_outputs == n_outputs);
18010
+ } else if (lctx.n_outputs == 1) {
18011
+ // only keep last output
18012
+ data[0] = n_tokens - 1;
18013
+ } else {
18014
+ LM_GGML_ASSERT(lctx.n_outputs == 0);
17331
18015
  }
17332
- // the graph needs to have been passed the correct number of outputs
17333
- LM_GGML_ASSERT(lctx.n_outputs == n_outputs);
17334
- } else if (lctx.n_outputs == 1) {
17335
- // only keep last output
17336
- data[0] = n_tokens - 1;
17337
- } else {
17338
- LM_GGML_ASSERT(lctx.n_outputs == 0);
17339
18016
  }
17340
18017
  }
17341
18018
 
@@ -18006,6 +18683,7 @@ static int llama_decode_internal(
18006
18683
  embd = nullptr; // do not extract embeddings when not needed
18007
18684
  LM_GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
18008
18685
  }
18686
+
18009
18687
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (lm_ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
18010
18688
 
18011
18689
  lm_ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
@@ -20394,10 +21072,12 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
20394
21072
  case LLM_ARCH_T5ENCODER:
20395
21073
  case LLM_ARCH_JAIS:
20396
21074
  case LLM_ARCH_RWKV6:
21075
+ case LLM_ARCH_WAVTOKENIZER_DEC:
20397
21076
  return LLAMA_ROPE_TYPE_NONE;
20398
21077
 
20399
21078
  // use what we call a normal RoPE, operating on pairs of consecutive head values
20400
21079
  case LLM_ARCH_LLAMA:
21080
+ case LLM_ARCH_DECI:
20401
21081
  case LLM_ARCH_BAICHUAN:
20402
21082
  case LLM_ARCH_STARCODER:
20403
21083
  case LLM_ARCH_PLAMO:
@@ -20511,17 +21191,6 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
20511
21191
  return model->n_elements;
20512
21192
  }
20513
21193
 
20514
- struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
20515
- auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
20516
- [name](const std::pair<std::string, struct lm_ggml_tensor *> & it) {
20517
- return it.first == name;
20518
- });
20519
- if (it == model->tensors_by_name.end()) {
20520
- return nullptr;
20521
- }
20522
- return it->second;
20523
- }
20524
-
20525
21194
  bool llama_model_has_encoder(const struct llama_model * model) {
20526
21195
  switch (model->arch) {
20527
21196
  case LLM_ARCH_T5: return true;
@@ -22230,6 +22899,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
22230
22899
  }
22231
22900
  } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
22232
22901
  return LLM_CHAT_TEMPLATE_PHI_3;
22902
+ } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
22903
+ return LLM_CHAT_TEMPLATE_FALCON_3;
22233
22904
  } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
22234
22905
  return LLM_CHAT_TEMPLATE_ZEPHYR;
22235
22906
  } else if (tmpl_contains("bos_token + message['role']")) {
@@ -22276,6 +22947,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
22276
22947
  return LLM_CHAT_TEMPLATE_GRANITE;
22277
22948
  } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
22278
22949
  return LLM_CHAT_TEMPLATE_GIGACHAT;
22950
+ } else if (tmpl_contains("<|role_start|>")) {
22951
+ return LLM_CHAT_TEMPLATE_MEGREZ;
22279
22952
  }
22280
22953
  return LLM_CHAT_TEMPLATE_UNKNOWN;
22281
22954
  }
@@ -22382,6 +23055,15 @@ static int32_t llama_chat_apply_template_internal(
22382
23055
  if (add_ass) {
22383
23056
  ss << "<|assistant|>\n";
22384
23057
  }
23058
+ } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
23059
+ // Falcon 3
23060
+ for (auto message : chat) {
23061
+ std::string role(message->role);
23062
+ ss << "<|" << role << "|>\n" << message->content << "\n";
23063
+ }
23064
+ if (add_ass) {
23065
+ ss << "<|assistant|>\n";
23066
+ }
22385
23067
  } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
22386
23068
  // zephyr template
22387
23069
  for (auto message : chat) {
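For reference, the new LLM_CHAT_TEMPLATE_FALCON_3 branch above renders each message as "<|role|>\n" followed by the content and a newline, then appends "<|assistant|>\n" when add_ass is set. A chat of {system: "You are helpful.", user: "Hi"} with add_ass = true would therefore produce (derived from the branch above):

    <|system|>
    You are helpful.
    <|user|>
    Hi
    <|assistant|>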
@@ -22625,6 +23307,16 @@ static int32_t llama_chat_apply_template_internal(
22625
23307
  if (add_ass) {
22626
23308
  ss << "assistant<|role_sep|>";
22627
23309
  }
23310
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
23311
+ // Megrez template
23312
+ for (auto message : chat) {
23313
+ std::string role(message->role);
23314
+ ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
23315
+ }
23316
+
23317
+ if (add_ass) {
23318
+ ss << "<|role_start|>assistant<|role_end|>";
23319
+ }
22628
23320
  } else {
22629
23321
  // template not supported
22630
23322
  return -1;
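Similarly, the LLM_CHAT_TEMPLATE_MEGREZ branch above wraps each role in <|role_start|>...<|role_end|> and terminates each turn with <|turn_end|>. A single user message "Hi" with add_ass = true would render as (derived from the branch above):

    <|role_start|>user<|role_end|>Hi<|turn_end|><|role_start|>assistant<|role_end|>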
@@ -22644,15 +23336,15 @@ int32_t llama_chat_apply_template(
22644
23336
  std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
22645
23337
  if (tmpl == nullptr) {
22646
23338
  LM_GGML_ASSERT(model != nullptr);
22647
- // load template from model
22648
- std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
22649
- std::string template_key = "tokenizer.chat_template";
22650
- int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
22651
- if (res < 0) {
23339
+
23340
+ // load template from model, if available
23341
+ const auto & it = model->lm_gguf_kv.find("tokenizer.chat_template");
23342
+ if (it != model->lm_gguf_kv.end() && it->second.size() > 0) {
23343
+ curr_tmpl = it->second;
23344
+ }
23345
+ else {
22652
23346
  // worst case: there is no information about template, we will use chatml by default
22653
- curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
22654
- } else {
22655
- curr_tmpl = std::string(model_template.data(), model_template.size());
23347
+ curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
22656
23348
  }
22657
23349
  }
22658
23350
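The template lookup now reads the already-parsed GGUF metadata map (model->lm_gguf_kv) instead of copying through a fixed 2048-byte buffer via llama_model_meta_val_str, so long templates are no longer truncated to that buffer size and a missing template still falls back to chatml. A minimal sketch of the lookup pattern, assuming a std::map<std::string, std::string> in place of the model's KV store (illustrative only):

    #include <map>
    #include <string>

    static std::string pick_chat_template(const std::map<std::string, std::string> & kv) {
        const auto it = kv.find("tokenizer.chat_template");
        // fall back to the built-in "chatml" handling when no template is stored
        return (it != kv.end() && !it->second.empty()) ? it->second : "chatml";
    }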