cui-llama.rn 1.3.3 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/android/src/main/CMakeLists.txt +5 -7
  2. package/android/src/main/java/com/rnllama/LlamaContext.java +4 -4
  3. package/android/src/main/jni.cpp +9 -9
  4. package/cpp/common.cpp +28 -44
  5. package/cpp/common.h +35 -14
  6. package/cpp/ggml-alloc.c +0 -1
  7. package/cpp/ggml-backend-impl.h +38 -20
  8. package/cpp/ggml-backend-reg.cpp +246 -92
  9. package/cpp/ggml-backend.h +1 -0
  10. package/cpp/ggml-common.h +42 -48
  11. package/cpp/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +642 -223
  12. package/cpp/ggml-cpu-aarch64.h +2 -26
  13. package/cpp/ggml-cpu-traits.cpp +36 -0
  14. package/cpp/ggml-cpu-traits.h +38 -0
  15. package/cpp/ggml-cpu.c +14122 -13971
  16. package/cpp/ggml-cpu.cpp +627 -715
  17. package/cpp/ggml-cpu.h +0 -17
  18. package/cpp/ggml-impl.h +22 -6
  19. package/cpp/ggml-metal.m +482 -24
  20. package/cpp/ggml-quants.c +0 -9
  21. package/cpp/ggml-threading.h +4 -2
  22. package/cpp/ggml.c +284 -178
  23. package/cpp/ggml.h +73 -25
  24. package/cpp/llama-grammar.cpp +15 -15
  25. package/cpp/llama-grammar.h +2 -5
  26. package/cpp/llama-sampling.cpp +35 -90
  27. package/cpp/llama-vocab.cpp +7 -2
  28. package/cpp/llama-vocab.h +1 -1
  29. package/cpp/llama.cpp +1782 -586
  30. package/cpp/llama.h +20 -19
  31. package/cpp/sampling.cpp +11 -16
  32. package/cpp/sgemm.cpp +265 -258
  33. package/cpp/sgemm.h +2 -2
  34. package/cpp/speculative.cpp +4 -0
  35. package/cpp/unicode.cpp +51 -51
  36. package/cpp/unicode.h +9 -10
  37. package/lib/commonjs/index.js +38 -1
  38. package/lib/commonjs/index.js.map +1 -1
  39. package/lib/module/index.js +36 -0
  40. package/lib/module/index.js.map +1 -1
  41. package/lib/typescript/NativeRNLlama.d.ts +2 -3
  42. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  43. package/lib/typescript/index.d.ts +36 -2
  44. package/lib/typescript/index.d.ts.map +1 -1
  45. package/package.json +1 -1
  46. package/src/NativeRNLlama.ts +3 -3
  47. package/src/index.ts +46 -2
  48. package/cpp/amx/amx.cpp +0 -196
  49. package/cpp/amx/amx.h +0 -20
  50. package/cpp/amx/common.h +0 -101
  51. package/cpp/amx/mmq.cpp +0 -2524
  52. package/cpp/amx/mmq.h +0 -16
  53. package/cpp/ggml-aarch64.c +0 -129
  54. package/cpp/ggml-aarch64.h +0 -19
package/cpp/llama.cpp CHANGED
@@ -157,6 +157,7 @@ static std::string format(const char * fmt, ...) {

  enum llm_arch {
  LLM_ARCH_LLAMA,
+ LLM_ARCH_DECI,
  LLM_ARCH_FALCON,
  LLM_ARCH_BAICHUAN,
  LLM_ARCH_GROK,
@@ -174,6 +175,7 @@ enum llm_arch {
  LLM_ARCH_QWEN,
  LLM_ARCH_QWEN2,
  LLM_ARCH_QWEN2MOE,
+ LLM_ARCH_QWEN2VL,
  LLM_ARCH_PHI2,
  LLM_ARCH_PHI3,
  LLM_ARCH_PLAMO,
@@ -194,6 +196,7 @@ enum llm_arch {
  LLM_ARCH_OLMOE,
  LLM_ARCH_OPENELM,
  LLM_ARCH_ARCTIC,
+ LLM_ARCH_DEEPSEEK,
  LLM_ARCH_DEEPSEEK2,
  LLM_ARCH_CHATGLM,
  LLM_ARCH_BITNET,
@@ -206,61 +209,66 @@ enum llm_arch {
  LLM_ARCH_GRANITE,
  LLM_ARCH_GRANITE_MOE,
  LLM_ARCH_CHAMELEON,
+ LLM_ARCH_WAVTOKENIZER_DEC,
  LLM_ARCH_UNKNOWN,
  };

  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
- { LLM_ARCH_LLAMA, "llama" },
- { LLM_ARCH_FALCON, "falcon" },
- { LLM_ARCH_GROK, "grok" },
- { LLM_ARCH_GPT2, "gpt2" },
- { LLM_ARCH_GPTJ, "gptj" },
- { LLM_ARCH_GPTNEOX, "gptneox" },
- { LLM_ARCH_MPT, "mpt" },
- { LLM_ARCH_BAICHUAN, "baichuan" },
- { LLM_ARCH_STARCODER, "starcoder" },
- { LLM_ARCH_REFACT, "refact" },
- { LLM_ARCH_BERT, "bert" },
- { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
- { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
- { LLM_ARCH_BLOOM, "bloom" },
- { LLM_ARCH_STABLELM, "stablelm" },
- { LLM_ARCH_QWEN, "qwen" },
- { LLM_ARCH_QWEN2, "qwen2" },
- { LLM_ARCH_QWEN2MOE, "qwen2moe" },
- { LLM_ARCH_PHI2, "phi2" },
- { LLM_ARCH_PHI3, "phi3" },
- { LLM_ARCH_PLAMO, "plamo" },
- { LLM_ARCH_CODESHELL, "codeshell" },
- { LLM_ARCH_ORION, "orion" },
- { LLM_ARCH_INTERNLM2, "internlm2" },
- { LLM_ARCH_MINICPM, "minicpm" },
- { LLM_ARCH_MINICPM3, "minicpm3" },
- { LLM_ARCH_GEMMA, "gemma" },
- { LLM_ARCH_GEMMA2, "gemma2" },
- { LLM_ARCH_STARCODER2, "starcoder2" },
- { LLM_ARCH_MAMBA, "mamba" },
- { LLM_ARCH_XVERSE, "xverse" },
- { LLM_ARCH_COMMAND_R, "command-r" },
- { LLM_ARCH_DBRX, "dbrx" },
- { LLM_ARCH_OLMO, "olmo" },
- { LLM_ARCH_OLMO2, "olmo2" },
- { LLM_ARCH_OLMOE, "olmoe" },
- { LLM_ARCH_OPENELM, "openelm" },
- { LLM_ARCH_ARCTIC, "arctic" },
- { LLM_ARCH_DEEPSEEK2, "deepseek2" },
- { LLM_ARCH_CHATGLM, "chatglm" },
- { LLM_ARCH_BITNET, "bitnet" },
- { LLM_ARCH_T5, "t5" },
- { LLM_ARCH_T5ENCODER, "t5encoder" },
- { LLM_ARCH_JAIS, "jais" },
- { LLM_ARCH_NEMOTRON, "nemotron" },
- { LLM_ARCH_EXAONE, "exaone" },
- { LLM_ARCH_RWKV6, "rwkv6" },
- { LLM_ARCH_GRANITE, "granite" },
- { LLM_ARCH_GRANITE_MOE, "granitemoe" },
- { LLM_ARCH_CHAMELEON, "chameleon" },
- { LLM_ARCH_UNKNOWN, "(unknown)" },
+ { LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_DECI, "deci" },
+ { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GROK, "grok" },
+ { LLM_ARCH_GPT2, "gpt2" },
+ { LLM_ARCH_GPTJ, "gptj" },
+ { LLM_ARCH_GPTNEOX, "gptneox" },
+ { LLM_ARCH_MPT, "mpt" },
+ { LLM_ARCH_BAICHUAN, "baichuan" },
+ { LLM_ARCH_STARCODER, "starcoder" },
+ { LLM_ARCH_REFACT, "refact" },
+ { LLM_ARCH_BERT, "bert" },
+ { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+ { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+ { LLM_ARCH_BLOOM, "bloom" },
+ { LLM_ARCH_STABLELM, "stablelm" },
+ { LLM_ARCH_QWEN, "qwen" },
+ { LLM_ARCH_QWEN2, "qwen2" },
+ { LLM_ARCH_QWEN2MOE, "qwen2moe" },
+ { LLM_ARCH_QWEN2VL, "qwen2vl" },
+ { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PHI3, "phi3" },
+ { LLM_ARCH_PLAMO, "plamo" },
+ { LLM_ARCH_CODESHELL, "codeshell" },
+ { LLM_ARCH_ORION, "orion" },
+ { LLM_ARCH_INTERNLM2, "internlm2" },
+ { LLM_ARCH_MINICPM, "minicpm" },
+ { LLM_ARCH_MINICPM3, "minicpm3" },
+ { LLM_ARCH_GEMMA, "gemma" },
+ { LLM_ARCH_GEMMA2, "gemma2" },
+ { LLM_ARCH_STARCODER2, "starcoder2" },
+ { LLM_ARCH_MAMBA, "mamba" },
+ { LLM_ARCH_XVERSE, "xverse" },
+ { LLM_ARCH_COMMAND_R, "command-r" },
+ { LLM_ARCH_DBRX, "dbrx" },
+ { LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_OLMO2, "olmo2" },
+ { LLM_ARCH_OLMOE, "olmoe" },
+ { LLM_ARCH_OPENELM, "openelm" },
+ { LLM_ARCH_ARCTIC, "arctic" },
+ { LLM_ARCH_DEEPSEEK, "deepseek" },
+ { LLM_ARCH_DEEPSEEK2, "deepseek2" },
+ { LLM_ARCH_CHATGLM, "chatglm" },
+ { LLM_ARCH_BITNET, "bitnet" },
+ { LLM_ARCH_T5, "t5" },
+ { LLM_ARCH_T5ENCODER, "t5encoder" },
+ { LLM_ARCH_JAIS, "jais" },
+ { LLM_ARCH_NEMOTRON, "nemotron" },
+ { LLM_ARCH_EXAONE, "exaone" },
+ { LLM_ARCH_RWKV6, "rwkv6" },
+ { LLM_ARCH_GRANITE, "granite" },
+ { LLM_ARCH_GRANITE_MOE, "granitemoe" },
+ { LLM_ARCH_CHAMELEON, "chameleon" },
+ { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+ { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

  enum llm_kv {
@@ -280,6 +288,7 @@ enum llm_kv {
  LLM_KV_VOCAB_SIZE,
  LLM_KV_CONTEXT_LENGTH,
  LLM_KV_EMBEDDING_LENGTH,
+ LLM_KV_FEATURES_LENGTH,
  LLM_KV_BLOCK_COUNT,
  LLM_KV_LEADING_DENSE_BLOCK_COUNT,
  LLM_KV_FEED_FORWARD_LENGTH,
@@ -311,6 +320,8 @@ enum llm_kv {
  LLM_KV_ATTENTION_VALUE_LENGTH,
  LLM_KV_ATTENTION_LAYERNORM_EPS,
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+ LLM_KV_ATTENTION_GROUPNORM_EPS,
+ LLM_KV_ATTENTION_GROUPNORM_GROUPS,
  LLM_KV_ATTENTION_CAUSAL,
  LLM_KV_ATTENTION_Q_LORA_RANK,
  LLM_KV_ATTENTION_KV_LORA_RANK,
@@ -319,6 +330,7 @@ enum llm_kv {
  LLM_KV_ATTENTION_SCALE,

  LLM_KV_ROPE_DIMENSION_COUNT,
+ LLM_KV_ROPE_DIMENSION_SECTIONS,
  LLM_KV_ROPE_FREQ_BASE,
  LLM_KV_ROPE_SCALE_LINEAR,
  LLM_KV_ROPE_SCALING_TYPE,
@@ -373,6 +385,12 @@ enum llm_kv {
  LLM_KV_ADAPTER_TYPE,
  LLM_KV_ADAPTER_LORA_ALPHA,

+ LLM_KV_POSNET_EMBEDDING_LENGTH,
+ LLM_KV_POSNET_BLOCK_COUNT,
+
+ LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
+ LLM_KV_CONVNEXT_BLOCK_COUNT,
+
  // deprecated:
  LLM_KV_TOKENIZER_PREFIX_ID,
  LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -396,6 +414,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
  { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
  { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+ { LLM_KV_FEATURES_LENGTH, "%s.features_length" },
  { LLM_KV_BLOCK_COUNT, "%s.block_count" },
  { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
  { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
@@ -427,6 +446,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+ { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
  { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
  { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
  { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
@@ -435,6 +456,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },

  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+ { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
  { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
  { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
  { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
@@ -456,6 +478,12 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

  { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },

+ { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" },
+ { LLM_KV_POSNET_BLOCK_COUNT, "%s.posnet.block_count" },
+
+ { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
+ { LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" },
+
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
  { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -614,6 +642,22 @@ enum llm_tensor {
  LLM_TENSOR_ENC_OUTPUT_NORM,
  LLM_TENSOR_CLS,
  LLM_TENSOR_CLS_OUT,
+ LLM_TENSOR_CONV1D,
+ LLM_TENSOR_CONVNEXT_DW,
+ LLM_TENSOR_CONVNEXT_NORM,
+ LLM_TENSOR_CONVNEXT_PW1,
+ LLM_TENSOR_CONVNEXT_PW2,
+ LLM_TENSOR_CONVNEXT_GAMMA,
+ LLM_TENSOR_POS_NET_CONV1,
+ LLM_TENSOR_POS_NET_CONV2,
+ LLM_TENSOR_POS_NET_NORM,
+ LLM_TENSOR_POS_NET_NORM1,
+ LLM_TENSOR_POS_NET_NORM2,
+ LLM_TENSOR_POS_NET_ATTN_NORM,
+ LLM_TENSOR_POS_NET_ATTN_Q,
+ LLM_TENSOR_POS_NET_ATTN_K,
+ LLM_TENSOR_POS_NET_ATTN_V,
+ LLM_TENSOR_POS_NET_ATTN_OUT,
  };

  static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -643,6 +687,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  },
  },
+ {
+ LLM_ARCH_DECI,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
  {
  LLM_ARCH_BAICHUAN,
  {
@@ -909,6 +979,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_QWEN2VL,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_QWEN2MOE,
  {
@@ -1047,6 +1134,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  { LLM_TENSOR_OUTPUT, "output" },
  { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+ { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
@@ -1297,6 +1386,33 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  },
  },
+ {
+ LLM_ARCH_DEEPSEEK,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ },
+ },
  {
  LLM_ARCH_DEEPSEEK2,
  {
@@ -1552,6 +1668,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  },
  },
+ {
+ LLM_ARCH_WAVTOKENIZER_DEC,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_CONV1D, "conv1d" },
+ { LLM_TENSOR_CONVNEXT_DW, "convnext.%d.dw" },
+ { LLM_TENSOR_CONVNEXT_NORM, "convnext.%d.norm" },
+ { LLM_TENSOR_CONVNEXT_PW1, "convnext.%d.pw1" },
+ { LLM_TENSOR_CONVNEXT_PW2, "convnext.%d.pw2" },
+ { LLM_TENSOR_CONVNEXT_GAMMA, "convnext.%d.gamma" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_POS_NET_CONV1, "posnet.%d.conv1" },
+ { LLM_TENSOR_POS_NET_CONV2, "posnet.%d.conv2" },
+ { LLM_TENSOR_POS_NET_NORM, "posnet.%d.norm" },
+ { LLM_TENSOR_POS_NET_NORM1, "posnet.%d.norm1" },
+ { LLM_TENSOR_POS_NET_NORM2, "posnet.%d.norm2" },
+ { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" },
+ { LLM_TENSOR_POS_NET_ATTN_Q, "posnet.%d.attn_q" },
+ { LLM_TENSOR_POS_NET_ATTN_K, "posnet.%d.attn_k" },
+ { LLM_TENSOR_POS_NET_ATTN_V, "posnet.%d.attn_v" },
+ { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1560,6 +1701,73 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  },
  };

+ enum llm_chat_template {
+ LLM_CHAT_TEMPLATE_CHATML,
+ LLM_CHAT_TEMPLATE_LLAMA_2,
+ LLM_CHAT_TEMPLATE_LLAMA_2_SYS,
+ LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS,
+ LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP,
+ LLM_CHAT_TEMPLATE_MISTRAL_V1,
+ LLM_CHAT_TEMPLATE_MISTRAL_V3,
+ LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
+ LLM_CHAT_TEMPLATE_MISTRAL_V7,
+ LLM_CHAT_TEMPLATE_PHI_3,
+ LLM_CHAT_TEMPLATE_FALCON_3,
+ LLM_CHAT_TEMPLATE_ZEPHYR,
+ LLM_CHAT_TEMPLATE_MONARCH,
+ LLM_CHAT_TEMPLATE_GEMMA,
+ LLM_CHAT_TEMPLATE_ORION,
+ LLM_CHAT_TEMPLATE_OPENCHAT,
+ LLM_CHAT_TEMPLATE_VICUNA,
+ LLM_CHAT_TEMPLATE_VICUNA_ORCA,
+ LLM_CHAT_TEMPLATE_DEEPSEEK,
+ LLM_CHAT_TEMPLATE_DEEPSEEK_2,
+ LLM_CHAT_TEMPLATE_COMMAND_R,
+ LLM_CHAT_TEMPLATE_LLAMA_3,
+ LLM_CHAT_TEMPLATE_CHATGML_3,
+ LLM_CHAT_TEMPLATE_CHATGML_4,
+ LLM_CHAT_TEMPLATE_MINICPM,
+ LLM_CHAT_TEMPLATE_EXAONE_3,
+ LLM_CHAT_TEMPLATE_RWKV_WORLD,
+ LLM_CHAT_TEMPLATE_GRANITE,
+ LLM_CHAT_TEMPLATE_GIGACHAT,
+ LLM_CHAT_TEMPLATE_MEGREZ,
+ LLM_CHAT_TEMPLATE_UNKNOWN,
+ };
+
+ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
+ { "chatml", LLM_CHAT_TEMPLATE_CHATML },
+ { "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 },
+ { "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS },
+ { "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS },
+ { "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
+ { "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 },
+ { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
+ { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
+ { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
+ { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
+ { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
+ { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
+ { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
+ { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
+ { "orion", LLM_CHAT_TEMPLATE_ORION },
+ { "openchat", LLM_CHAT_TEMPLATE_OPENCHAT },
+ { "vicuna", LLM_CHAT_TEMPLATE_VICUNA },
+ { "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
+ { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
+ { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
+ { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
+ { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
+ { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
+ { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
+ { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
+ { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
+ { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
+ { "granite", LLM_CHAT_TEMPLATE_GRANITE },
+ { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
+ { "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
+ };
+
  static llm_arch llm_arch_from_string(const std::string & name) {
  for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
  if (kv.second == name) {
@@ -1633,9 +1841,10 @@ struct LLM_TN {
  //

  static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
- { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
- { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
- { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
+ { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
+ { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
+ { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
+ { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
  };

  static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
@@ -1741,7 +1950,7 @@ private:
  DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
  NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
  if (!bufLen) {
- ret = format("Win32 error code: %s", error_code);
+ ret = format("Win32 error code: %lx", error_code);
  } else {
  ret = lpMsgBuf;
  LocalFree(lpMsgBuf);
@@ -2079,7 +2288,7 @@ struct llama_mmap {
  HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");

  // may fail on pre-Windows 8 systems
- pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
+ pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");

  if (pPrefetchVirtualMemory) {
  // advise the kernel to preload the mapped memory
@@ -2378,15 +2587,26 @@ static const size_t kiB = 1024;
  static const size_t MiB = 1024*kiB;
  static const size_t GiB = 1024*MiB;

+ struct llama_hparams_posnet {
+ uint32_t n_embd;
+ uint32_t n_layer;
+ };
+
+ struct llama_hparams_convnext {
+ uint32_t n_embd;
+ uint32_t n_layer;
+ };
+
  struct llama_hparams {
  bool vocab_only;
  bool rope_finetuned;
  bool use_par_res;
  bool swin_norm;

- uint32_t n_vocab;
+ uint32_t n_vocab = 0;
  uint32_t n_ctx_train; // context size the model was trained on
  uint32_t n_embd;
+ uint32_t n_embd_features = 0;
  uint32_t n_layer;
  uint32_t n_rot;
  uint32_t n_swa = 0; // sliding window attention (SWA)
@@ -2397,6 +2617,10 @@ struct llama_hparams {
  uint32_t n_vocab_type = 0; // for BERT-style token types
  uint32_t n_rel_attn_bkts = 0;

+ // for WavTokenizer
+ struct llama_hparams_posnet posnet;
+ struct llama_hparams_convnext convnext;
+
  std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
  std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
  std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -2411,6 +2635,9 @@ struct llama_hparams {

  float f_norm_eps;
  float f_norm_rms_eps;
+ float f_norm_group_eps;
+
+ uint32_t n_norm_groups;

  float f_attn_logit_softcapping = 50.0f;
  float f_final_logit_softcapping = 30.0f;
@@ -2421,11 +2648,12 @@ struct llama_hparams {
  uint32_t time_decay_extra_dim = 0;
  uint32_t wkv_head_size = 0;

- float rope_attn_factor = 1.0f;
- float rope_freq_base_train;
- float rope_freq_scale_train;
- uint32_t n_ctx_orig_yarn;
- float rope_yarn_log_mul;
+ float rope_attn_factor = 1.0f;
+ float rope_freq_base_train;
+ float rope_freq_scale_train;
+ uint32_t n_ctx_orig_yarn;
+ float rope_yarn_log_mul;
+ int rope_sections[4];

  // for State Space Models
  uint32_t ssm_d_conv = 0;
@@ -2455,63 +2683,6 @@ struct llama_hparams {
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
  enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;

- bool operator!=(const llama_hparams & other) const {
- if (this->vocab_only != other.vocab_only) return true;
- if (this->n_vocab != other.n_vocab) return true;
- if (this->n_ctx_train != other.n_ctx_train) return true;
- if (this->n_embd != other.n_embd) return true;
- if (this->n_layer != other.n_layer) return true;
- if (this->n_rot != other.n_rot) return true;
- if (this->n_swa != other.n_swa) return true;
- if (this->n_embd_head_k != other.n_embd_head_k) return true;
- if (this->n_embd_head_v != other.n_embd_head_v) return true;
- if (this->n_expert != other.n_expert) return true;
- if (this->n_expert_used != other.n_expert_used) return true;
-
- if (this->n_head_arr != other.n_head_arr) return true;
- if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
- if (this->n_ff_arr != other.n_ff_arr) return true;
-
- if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
- if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
- if (this->n_lora_q != other.n_lora_q) return true;
- if (this->n_lora_kv != other.n_lora_kv) return true;
- if (this->n_ff_exp != other.n_ff_exp) return true;
- if (this->n_ff_shexp != other.n_ff_shexp) return true;
- if (this->n_expert_shared != other.n_expert_shared) return true;
-
- if (this->rope_finetuned != other.rope_finetuned) return true;
- if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
-
- if (this->ssm_d_conv != other.ssm_d_conv) return true;
- if (this->ssm_d_inner != other.ssm_d_inner) return true;
- if (this->ssm_d_state != other.ssm_d_state) return true;
- if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
- if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
-
- if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
- if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true;
- if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
- if (this->wkv_head_size != other.wkv_head_size) return true;
-
- if (this->dec_start_token_id != other.dec_start_token_id) return true;
-
- const float EPSILON = 1e-9f;
-
- if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
- if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
- if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
- if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
- if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
- if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
- if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
- if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
- if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
- if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;
-
- return false;
- }
-
  uint32_t n_head(uint32_t il = 0) const {
  if (il < n_layer) {
  return n_head_arr[il];
@@ -2564,21 +2735,21 @@ struct llama_hparams {
  if (wkv_head_size != 0) {
  // for RWKV models
  return 2 * n_embd;
- } else {
- // TODO: maybe support other convolution strides than 1
- // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
- return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
  }
+
+ // TODO: maybe support other convolution strides than 1
+ // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+ return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
  }

  uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
  if (wkv_head_size != 0) {
  // corresponds to RWKV's wkv_states size
  return n_embd * wkv_head_size;
- } else {
- // corresponds to Mamba's ssm_states size
- return ssm_d_state * ssm_d_inner;
  }
+
+ // corresponds to Mamba's ssm_states size
+ return ssm_d_state * ssm_d_inner;
  }
  };

@@ -2616,142 +2787,187 @@ struct llama_cparams {
  void * cb_eval_user_data;
  };

- // TODO: separate into "llama_layer_enc" and "llama_layer_dec"
- struct llama_layer {
- llama_layer() {
- // initialize all pointers to NULL
- std::memset(this, 0, sizeof(*this));
- }
+ struct llama_layer_posnet {
+ // resnet
+ struct lm_ggml_tensor * norm1 = nullptr;
+ struct lm_ggml_tensor * norm1_b = nullptr;
+
+ struct lm_ggml_tensor * conv1 = nullptr;
+ struct lm_ggml_tensor * conv1_b = nullptr;
+
+ struct lm_ggml_tensor * norm2 = nullptr;
+ struct lm_ggml_tensor * norm2_b = nullptr;
+
+ struct lm_ggml_tensor * conv2 = nullptr;
+ struct lm_ggml_tensor * conv2_b = nullptr;
+
+ // attention
+ struct lm_ggml_tensor * attn_norm = nullptr;
+ struct lm_ggml_tensor * attn_norm_b = nullptr;
+
+ struct lm_ggml_tensor * attn_q = nullptr;
+ struct lm_ggml_tensor * attn_q_b = nullptr;
+
+ struct lm_ggml_tensor * attn_k = nullptr;
+ struct lm_ggml_tensor * attn_k_b = nullptr;
+
+ struct lm_ggml_tensor * attn_v = nullptr;
+ struct lm_ggml_tensor * attn_v_b = nullptr;
+
+ struct lm_ggml_tensor * attn_o = nullptr;
+ struct lm_ggml_tensor * attn_o_b = nullptr;
+
+ // normalize
+ struct lm_ggml_tensor * norm = nullptr;
+ struct lm_ggml_tensor * norm_b = nullptr;
+ };
+
+ struct llama_layer_convnext {
+ struct lm_ggml_tensor * dw = nullptr;
+ struct lm_ggml_tensor * dw_b = nullptr;

+ struct lm_ggml_tensor * norm = nullptr;
+ struct lm_ggml_tensor * norm_b = nullptr;
+
+ struct lm_ggml_tensor * pw1 = nullptr;
+ struct lm_ggml_tensor * pw1_b = nullptr;
+
+ struct lm_ggml_tensor * pw2 = nullptr;
+ struct lm_ggml_tensor * pw2_b = nullptr;
+
+ struct lm_ggml_tensor * gamma = nullptr;
+ };
+
+ struct llama_layer {
  // normalization
- struct lm_ggml_tensor * attn_norm;
- struct lm_ggml_tensor * attn_norm_b;
- struct lm_ggml_tensor * attn_norm_2;
- struct lm_ggml_tensor * attn_norm_2_b;
- struct lm_ggml_tensor * attn_q_norm;
- struct lm_ggml_tensor * attn_q_norm_b;
- struct lm_ggml_tensor * attn_k_norm;
- struct lm_ggml_tensor * attn_k_norm_b;
- struct lm_ggml_tensor * attn_out_norm;
- struct lm_ggml_tensor * attn_out_norm_b;
- struct lm_ggml_tensor * attn_q_a_norm;
- struct lm_ggml_tensor * attn_kv_a_norm;
- struct lm_ggml_tensor * attn_sub_norm;
- struct lm_ggml_tensor * attn_post_norm;
- struct lm_ggml_tensor * ffn_sub_norm;
- struct lm_ggml_tensor * attn_norm_cross;
- struct lm_ggml_tensor * attn_norm_enc;
+ struct lm_ggml_tensor * attn_norm = nullptr;
+ struct lm_ggml_tensor * attn_norm_b = nullptr;
+ struct lm_ggml_tensor * attn_norm_2 = nullptr;
+ struct lm_ggml_tensor * attn_norm_2_b = nullptr;
+ struct lm_ggml_tensor * attn_q_norm = nullptr;
+ struct lm_ggml_tensor * attn_q_norm_b = nullptr;
+ struct lm_ggml_tensor * attn_k_norm = nullptr;
+ struct lm_ggml_tensor * attn_k_norm_b = nullptr;
+ struct lm_ggml_tensor * attn_out_norm = nullptr;
+ struct lm_ggml_tensor * attn_out_norm_b = nullptr;
+ struct lm_ggml_tensor * attn_q_a_norm = nullptr;
+ struct lm_ggml_tensor * attn_kv_a_norm = nullptr;
+ struct lm_ggml_tensor * attn_sub_norm = nullptr;
+ struct lm_ggml_tensor * attn_post_norm = nullptr;
+ struct lm_ggml_tensor * ffn_sub_norm = nullptr;
+ struct lm_ggml_tensor * attn_norm_cross = nullptr;
+ struct lm_ggml_tensor * attn_norm_enc = nullptr;

  // attention
- struct lm_ggml_tensor * wq;
- struct lm_ggml_tensor * wk;
- struct lm_ggml_tensor * wv;
- struct lm_ggml_tensor * wo;
- struct lm_ggml_tensor * wqkv;
- struct lm_ggml_tensor * wq_a;
- struct lm_ggml_tensor * wq_b;
- struct lm_ggml_tensor * wkv_a_mqa;
- struct lm_ggml_tensor * wkv_b;
- struct lm_ggml_tensor * wq_cross;
- struct lm_ggml_tensor * wk_cross;
- struct lm_ggml_tensor * wv_cross;
- struct lm_ggml_tensor * wo_cross;
- struct lm_ggml_tensor * wq_enc;
- struct lm_ggml_tensor * wk_enc;
- struct lm_ggml_tensor * wv_enc;
- struct lm_ggml_tensor * wo_enc;
+ struct lm_ggml_tensor * wq = nullptr;
+ struct lm_ggml_tensor * wk = nullptr;
+ struct lm_ggml_tensor * wv = nullptr;
+ struct lm_ggml_tensor * wo = nullptr;
+ struct lm_ggml_tensor * wqkv = nullptr;
+ struct lm_ggml_tensor * wq_a = nullptr;
+ struct lm_ggml_tensor * wq_b = nullptr;
+ struct lm_ggml_tensor * wkv_a_mqa = nullptr;
+ struct lm_ggml_tensor * wkv_b = nullptr;
+ struct lm_ggml_tensor * wq_cross = nullptr;
+ struct lm_ggml_tensor * wk_cross = nullptr;
+ struct lm_ggml_tensor * wv_cross = nullptr;
+ struct lm_ggml_tensor * wo_cross = nullptr;
+ struct lm_ggml_tensor * wq_enc = nullptr;
+ struct lm_ggml_tensor * wk_enc = nullptr;
+ struct lm_ggml_tensor * wv_enc = nullptr;
+ struct lm_ggml_tensor * wo_enc = nullptr;

  // attention bias
- struct lm_ggml_tensor * bq;
- struct lm_ggml_tensor * bk;
- struct lm_ggml_tensor * bv;
- struct lm_ggml_tensor * bo;
- struct lm_ggml_tensor * bqkv;
+ struct lm_ggml_tensor * bq = nullptr;
+ struct lm_ggml_tensor * bk = nullptr;
+ struct lm_ggml_tensor * bv = nullptr;
+ struct lm_ggml_tensor * bo = nullptr;
+ struct lm_ggml_tensor * bqkv = nullptr;

  // relative position bias
- struct lm_ggml_tensor * attn_rel_b;
- struct lm_ggml_tensor * attn_rel_b_enc;
- struct lm_ggml_tensor * attn_rel_b_cross;
+ struct lm_ggml_tensor * attn_rel_b = nullptr;
+ struct lm_ggml_tensor * attn_rel_b_enc = nullptr;
+ struct lm_ggml_tensor * attn_rel_b_cross = nullptr;

  // normalization
- struct lm_ggml_tensor * ffn_norm;
- struct lm_ggml_tensor * ffn_norm_b;
- struct lm_ggml_tensor * ffn_post_norm;
- struct lm_ggml_tensor * layer_out_norm;
- struct lm_ggml_tensor * layer_out_norm_b;
- struct lm_ggml_tensor * ffn_norm_exps;
- struct lm_ggml_tensor * ffn_norm_enc;
+ struct lm_ggml_tensor * ffn_norm = nullptr;
+ struct lm_ggml_tensor * ffn_norm_b = nullptr;
+ struct lm_ggml_tensor * ffn_post_norm = nullptr;
+ struct lm_ggml_tensor * layer_out_norm = nullptr;
+ struct lm_ggml_tensor * layer_out_norm_b = nullptr;
+ struct lm_ggml_tensor * ffn_norm_exps = nullptr;
+ struct lm_ggml_tensor * ffn_norm_enc = nullptr;

  // ff
- struct lm_ggml_tensor * ffn_gate; // w1
- struct lm_ggml_tensor * ffn_down; // w2
- struct lm_ggml_tensor * ffn_up; // w3
- struct lm_ggml_tensor * ffn_gate_enc;
- struct lm_ggml_tensor * ffn_down_enc;
- struct lm_ggml_tensor * ffn_up_enc;
+ struct lm_ggml_tensor * ffn_gate = nullptr; // w1
+ struct lm_ggml_tensor * ffn_down = nullptr; // w2
+ struct lm_ggml_tensor * ffn_up = nullptr; // w3
+ struct lm_ggml_tensor * ffn_gate_enc = nullptr;
+ struct lm_ggml_tensor * ffn_down_enc = nullptr;
+ struct lm_ggml_tensor * ffn_up_enc = nullptr;

  // ff MoE
- struct lm_ggml_tensor * ffn_gate_inp;
- struct lm_ggml_tensor * ffn_gate_exps;
- struct lm_ggml_tensor * ffn_down_exps;
- struct lm_ggml_tensor * ffn_up_exps ;
+ struct lm_ggml_tensor * ffn_gate_inp = nullptr;
+ struct lm_ggml_tensor * ffn_gate_exps = nullptr;
+ struct lm_ggml_tensor * ffn_down_exps = nullptr;
+ struct lm_ggml_tensor * ffn_up_exps = nullptr;

  // ff shared expert (shexp)
- struct lm_ggml_tensor * ffn_gate_inp_shexp;
- struct lm_ggml_tensor * ffn_gate_shexp;
- struct lm_ggml_tensor * ffn_down_shexp;
- struct lm_ggml_tensor * ffn_up_shexp;
+ struct lm_ggml_tensor * ffn_gate_inp_shexp = nullptr;
+ struct lm_ggml_tensor * ffn_gate_shexp = nullptr;
+ struct lm_ggml_tensor * ffn_down_shexp = nullptr;
+ struct lm_ggml_tensor * ffn_up_shexp = nullptr;

  // ff bias
- struct lm_ggml_tensor * ffn_gate_b;
- struct lm_ggml_tensor * ffn_down_b; // b2
- struct lm_ggml_tensor * ffn_up_b; // b3
- struct lm_ggml_tensor * ffn_act;
+ struct lm_ggml_tensor * ffn_gate_b = nullptr;
+ struct lm_ggml_tensor * ffn_down_b = nullptr; // b2
+ struct lm_ggml_tensor * ffn_up_b = nullptr; // b3
+ struct lm_ggml_tensor * ffn_act = nullptr;

  // mamba proj
- struct lm_ggml_tensor * ssm_in;
- struct lm_ggml_tensor * ssm_x;
- struct lm_ggml_tensor * ssm_dt;
- struct lm_ggml_tensor * ssm_out;
+ struct lm_ggml_tensor * ssm_in = nullptr;
+ struct lm_ggml_tensor * ssm_x = nullptr;
+ struct lm_ggml_tensor * ssm_dt = nullptr;
+ struct lm_ggml_tensor * ssm_out = nullptr;

  // mamba
- struct lm_ggml_tensor * ssm_conv1d;
- struct lm_ggml_tensor * ssm_a;
- struct lm_ggml_tensor * ssm_d;
+ struct lm_ggml_tensor * ssm_conv1d = nullptr;
+ struct lm_ggml_tensor * ssm_a = nullptr;
+ struct lm_ggml_tensor * ssm_d = nullptr;

  // mamba bias
- struct lm_ggml_tensor * ssm_conv1d_b;
- struct lm_ggml_tensor * ssm_dt_b;
+ struct lm_ggml_tensor * ssm_conv1d_b = nullptr;
+ struct lm_ggml_tensor * ssm_dt_b = nullptr;

  // rwkv
- struct lm_ggml_tensor * time_mix_w1;
- struct lm_ggml_tensor * time_mix_w2;
- struct lm_ggml_tensor * time_mix_lerp_x;
- struct lm_ggml_tensor * time_mix_lerp_w;
- struct lm_ggml_tensor * time_mix_lerp_k;
- struct lm_ggml_tensor * time_mix_lerp_v;
- struct lm_ggml_tensor * time_mix_lerp_r;
- struct lm_ggml_tensor * time_mix_lerp_g;
-
- struct lm_ggml_tensor * time_mix_first;
- struct lm_ggml_tensor * time_mix_decay;
- struct lm_ggml_tensor * time_mix_decay_w1;
- struct lm_ggml_tensor * time_mix_decay_w2;
- struct lm_ggml_tensor * time_mix_key;
- struct lm_ggml_tensor * time_mix_value;
- struct lm_ggml_tensor * time_mix_receptance;
- struct lm_ggml_tensor * time_mix_gate;
-
- struct lm_ggml_tensor * time_mix_ln;
- struct lm_ggml_tensor * time_mix_ln_b;
- struct lm_ggml_tensor * time_mix_output;
-
- struct lm_ggml_tensor * channel_mix_lerp_k;
- struct lm_ggml_tensor * channel_mix_lerp_r;
-
- struct lm_ggml_tensor * channel_mix_key;
- struct lm_ggml_tensor * channel_mix_receptance;
- struct lm_ggml_tensor * channel_mix_value;
+ struct lm_ggml_tensor * time_mix_w1 = nullptr;
+ struct lm_ggml_tensor * time_mix_w2 = nullptr;
+ struct lm_ggml_tensor * time_mix_lerp_x = nullptr;
+ struct lm_ggml_tensor * time_mix_lerp_w = nullptr;
+ struct lm_ggml_tensor * time_mix_lerp_k = nullptr;
+ struct lm_ggml_tensor * time_mix_lerp_v = nullptr;
+ struct lm_ggml_tensor * time_mix_lerp_r = nullptr;
+ struct lm_ggml_tensor * time_mix_lerp_g = nullptr;
+
+ struct lm_ggml_tensor * time_mix_first = nullptr;
+ struct lm_ggml_tensor * time_mix_decay = nullptr;
+ struct lm_ggml_tensor * time_mix_decay_w1 = nullptr;
+ struct lm_ggml_tensor * time_mix_decay_w2 = nullptr;
+ struct lm_ggml_tensor * time_mix_key = nullptr;
+ struct lm_ggml_tensor * time_mix_value = nullptr;
+ struct lm_ggml_tensor * time_mix_receptance = nullptr;
+ struct lm_ggml_tensor * time_mix_gate = nullptr;
+
+ struct lm_ggml_tensor * time_mix_ln = nullptr;
+ struct lm_ggml_tensor * time_mix_ln_b = nullptr;
+ struct lm_ggml_tensor * time_mix_output = nullptr;
+
+ struct lm_ggml_tensor * channel_mix_lerp_k = nullptr;
+ struct lm_ggml_tensor * channel_mix_lerp_r = nullptr;
+
+ struct lm_ggml_tensor * channel_mix_key = nullptr;
+ struct lm_ggml_tensor * channel_mix_receptance = nullptr;
+ struct lm_ggml_tensor * channel_mix_value = nullptr;

  // long rope factors
  struct lm_ggml_tensor * rope_long = nullptr;
@@ -2759,13 +2975,17 @@ struct llama_layer {
  struct lm_ggml_tensor * rope_freqs = nullptr;

  // bitnet scale
- struct lm_ggml_tensor * wq_scale;
- struct lm_ggml_tensor * wk_scale;
- struct lm_ggml_tensor * wv_scale;
- struct lm_ggml_tensor * wo_scale;
- struct lm_ggml_tensor * ffn_gate_scale;
- struct lm_ggml_tensor * ffn_up_scale;
- struct lm_ggml_tensor * ffn_down_scale;
+ struct lm_ggml_tensor * wq_scale = nullptr;
+ struct lm_ggml_tensor * wk_scale = nullptr;
+ struct lm_ggml_tensor * wv_scale = nullptr;
+ struct lm_ggml_tensor * wo_scale = nullptr;
+ struct lm_ggml_tensor * ffn_gate_scale = nullptr;
+ struct lm_ggml_tensor * ffn_up_scale = nullptr;
+ struct lm_ggml_tensor * ffn_down_scale = nullptr;
+
+ struct llama_layer_posnet posnet;
+
+ struct llama_layer_convnext convnext;
  };

  // very similar to llama_batch,
@@ -2896,6 +3116,9 @@ struct llama_model {
  struct lm_ggml_tensor * cls_out = nullptr;
  struct lm_ggml_tensor * cls_out_b = nullptr;

+ struct lm_ggml_tensor * conv1d = nullptr;
+ struct lm_ggml_tensor * conv1d_b = nullptr;
+
  std::vector<llama_layer> layers;

  // gguf metadata
@@ -2980,6 +3203,7 @@ struct llama_sbatch {
  // batch indices of the output
  std::vector<size_t> out_ids;
  std::vector<llama_sbatch_seq> seq;
+
  const llama_batch * batch = nullptr;

  // buffers for the ubatch
@@ -3325,6 +3549,11 @@ struct llama_context {
  // whether we are computing encoder output or decoder output
  bool is_encoding = false;

+ // TODO: find a better way to accommodate mutli-dimension position encoding methods
+ // number of position id each token get, 1 for each token in most cases.
+ // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
+ int n_pos_per_token = 1;
+
  // output of the encoder part of the encoder-decoder models
  std::vector<float> embd_enc;
  std::vector<std::set<llama_seq_id>> seq_ids_enc;
@@ -3395,6 +3624,17 @@ static int llama_get_device_count(const llama_model & model) {
  return (int) model.devices.size();
  }

+ static struct lm_ggml_tensor * llama_get_model_tensor(const struct llama_model * model, const char * name) {
+ auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
+ [name](const std::pair<std::string, struct lm_ggml_tensor *> & it) {
+ return it.first == name;
+ });
+ if (it == model->tensors_by_name.end()) {
+ return nullptr;
+ }
+ return it->second;
+ }
+
  template<typename F>
  static bool buft_supported(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_dev_t dev, F & fn) {
  lm_ggml_init_params params = {
@@ -3448,7 +3688,9 @@ static bool llama_kv_cache_init(

  const struct llama_hparams & hparams = model.hparams;

- const int64_t n_layer = hparams.n_layer;
+ const int32_t n_layer = hparams.n_layer;
+
+ LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", __func__, kv_size, offload, lm_ggml_type_name(type_k), lm_ggml_type_name(type_v), n_layer);

  cache.has_shift = false;

@@ -3489,10 +3731,12 @@ static bool llama_kv_cache_init(
  cache.k_l.reserve(n_layer);
  cache.v_l.reserve(n_layer);

- for (int i = 0; i < (int) n_layer; i++) {
+ for (int i = 0; i < n_layer; i++) {
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();

+ LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
+
  lm_ggml_backend_buffer_type_t buft;
  if (offload) {
  auto * dev = model.dev_layer.at(i).dev;
@@ -4525,9 +4769,6 @@ struct llama_model_loader {
  case LM_GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
  case LM_GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
  case LM_GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
- case LM_GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
- case LM_GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
- case LM_GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
  default:
  {
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, lm_ggml_type_name(type_max));
@@ -5291,9 +5532,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
- case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";

  default: return "unknown, may not work";
  }
@@ -5411,7 +5649,7 @@ static void llm_load_hparams(
  ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);

  // get hparams kv
- ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+ ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);

  // everything past this point is not vocab-related
  if (hparams.vocab_only) {
@@ -5424,6 +5662,16 @@ static void llm_load_hparams(
  ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
  ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);

+ if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) {
+ ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
+
+ ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
+ ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
+
+ ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
+ ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
+ }
+
  LM_GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
  LM_GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
  if (hparams.n_expert > 0) {
@@ -5432,13 +5680,13 @@ static void llm_load_hparams(
  LM_GGML_ASSERT(hparams.n_expert_used == 0);
  }

- // zero-out the per-layer hparams
+ // zero-out the array hparams
  std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
  std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
  std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);

- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);

  // n_head_kv is optional, default to n_head
  hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -5487,7 +5735,7 @@ static void llm_load_hparams(

  ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

- if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
  if (hparams.n_rot != hparams.n_embd_head_k) {
  throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
  }
@@ -5527,11 +5775,24 @@ static void llm_load_hparams(
  }
  }
  } break;
+ case LLM_ARCH_DECI:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 80: model.type = e_model::MODEL_70B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_MINICPM:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);

  switch (hparams.n_layer) {
+ case 52: model.type = e_model::MODEL_1B; break;
  case 40: model.type = e_model::MODEL_2B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
@@ -5696,6 +5957,13 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_QWEN2VL:
+ {
+ std::array<int, 4> section_dims;
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
+ std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
+ }
+ // fall through
  case LLM_ARCH_QWEN2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6006,6 +6274,19 @@ static void llm_load_hparams(
  model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_DEEPSEEK:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+
+ switch (hparams.n_layer) {
+ case 28: model.type = e_model::MODEL_20B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_DEEPSEEK2:
  {
  bool is_lite = (hparams.n_layer == 27);
@@ -6159,6 +6440,13 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_WAVTOKENIZER_DEC:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
+ ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ } break;
  default: (void)0;
  }

@@ -6188,7 +6476,7 @@ static void llm_load_vocab(
  ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
  ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);

- if (tokenizer_model == "no_vocab") {
+ if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
  vocab.type = LLAMA_VOCAB_TYPE_NONE;

  // default special tokens
@@ -6326,7 +6614,8 @@ static void llm_load_vocab(
  } else if (
  tokenizer_pre == "llama3" ||
  tokenizer_pre == "llama-v3" ||
- tokenizer_pre == "llama-bpe") {
+ tokenizer_pre == "llama-bpe"||
+ tokenizer_pre == "falcon3") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
  vocab.tokenizer_ignore_merges = true;
  vocab.tokenizer_add_bos = true;
@@ -6352,10 +6641,12 @@ static void llm_load_vocab(
  tokenizer_pre == "phi-2" ||
  tokenizer_pre == "jina-es" ||
  tokenizer_pre == "jina-de" ||
+ tokenizer_pre == "gigachat" ||
  tokenizer_pre == "jina-v1-en" ||
  tokenizer_pre == "jina-v2-es" ||
  tokenizer_pre == "jina-v2-de" ||
- tokenizer_pre == "jina-v2-code") {
+ tokenizer_pre == "jina-v2-code" ||
+ tokenizer_pre == "roberta-bpe") {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
  } else if (
  tokenizer_pre == "refact") {
@@ -6422,6 +6713,12 @@ static void llm_load_vocab(
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
  vocab.tokenizer_add_bos = true;
  vocab.tokenizer_clean_spaces = false;
+ } else if (
+ tokenizer_pre == "minerva-7b") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
+ } else if (
+ tokenizer_pre == "megrez") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
@@ -7000,6 +7297,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {

  LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);

+ if (model.arch == LLM_ARCH_DEEPSEEK) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ }
+
  if (model.arch == LLM_ARCH_DEEPSEEK2) {
  LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -7015,7 +7319,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  }

- if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
+ if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
  LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
  LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
  LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7156,6 +7460,22 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
  {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT_ID}},
  // this tensor is loaded for T5, but never used
  {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_NONE}},
+ {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, LM_GGML_OP_IM2COL}},
+ {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
+ {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
+ {LLM_TENSOR_POS_NET_NORM2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
+ {LLM_TENSOR_POS_NET_CONV1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_IM2COL}},
+ {LLM_TENSOR_POS_NET_CONV2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_IM2COL}},
+ {LLM_TENSOR_POS_NET_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
+ {LLM_TENSOR_POS_NET_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_POS_NET_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_POS_NET_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_POS_NET_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CONVNEXT_DW, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_IM2COL}},
+ {LLM_TENSOR_CONVNEXT_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
+ {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
  };
 
  // checks if the weight tensor can be used with the specified buffer type and device
@@ -7260,6 +7580,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, lm_ggml_tensor
  lm_ggml_tensor * state = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, S, n_seqs, S, H);
  op_tensor = lm_ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
  } break;
+ case LM_GGML_OP_IM2COL:
+ {
+ const int n_embd = hparams.n_embd;
+ lm_ggml_tensor * b = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
+ op_tensor = lm_ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, LM_GGML_TYPE_F16);
+ } break;
  default:
  LM_GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, lm_ggml_op_name(op), w->name);
  }
@@ -7390,7 +7716,8 @@ static bool llm_load_tensors(
  model.main_gpu = main_gpu;
  model.n_gpu_layers = n_gpu_layers;
 
- const int n_layer = hparams.n_layer;
+ const int n_layer = hparams.n_layer;
+
  bool use_mmap_buffer = true;
 
  // build a list of buffer types for the CPU and GPU devices
@@ -7640,7 +7967,13 @@ static bool llm_load_tensors(
 
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ }
+ else {
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ }
 
  if (n_expert == 0) {
  layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
@@ -7659,6 +7992,68 @@ static bool llm_load_tensors(
7659
7992
  }
7660
7993
  }
7661
7994
  } break;
7995
+ case LLM_ARCH_DECI:
7996
+ {
7997
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
7998
+
7999
+ // output
8000
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
8001
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
8002
+
8003
+ // if output is NULL, init from the input tok embed
8004
+ if (model.output == NULL) {
8005
+ model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
8006
+ }
8007
+
8008
+ for (int i = 0; i < n_layer; ++i) {
8009
+ auto & layer = model.layers[i];
8010
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
8011
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
8012
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
8013
+ const int64_t n_ff = hparams.n_ff(i);
8014
+ const int64_t n_head = hparams.n_head(i);
8015
+ const int64_t n_head_kv = hparams.n_head_kv(i);
8016
+
8017
+ if (n_head_kv == 0 && n_head > 0) {
8018
+ // linear attention for DeciLMCausalModel
8019
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
8020
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
8021
+ }
8022
+ else if (n_head_kv > 0) {
8023
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
8024
+
8025
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
8026
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
8027
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
8028
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
8029
+ }
8030
+
8031
+ // optional bias tensors
8032
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
8033
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
8034
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
8035
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
8036
+
8037
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
8038
+
8039
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
8040
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
8041
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
8042
+ }
8043
+ else {
8044
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
8045
+ }
8046
+
8047
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
8048
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
8049
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
8050
+
8051
+ // optional MLP bias
8052
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
8053
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
8054
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
8055
+ }
8056
+ } break;
7662
8057
  case LLM_ARCH_MINICPM3:
7663
8058
  {
7664
8059
  const int64_t n_embd_head_qk_rope = hparams.n_rot;
@@ -8107,6 +8502,7 @@ static bool llm_load_tensors(
8107
8502
  }
8108
8503
  } break;
8109
8504
  case LLM_ARCH_QWEN2:
8505
+ case LLM_ARCH_QWEN2VL:
8110
8506
  {
8111
8507
  model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
8112
8508
 
@@ -8767,15 +9163,8 @@ static bool llm_load_tensors(
8767
9163
  layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
8768
9164
  }
8769
9165
  } break;
8770
- case LLM_ARCH_DEEPSEEK2:
9166
+ case LLM_ARCH_DEEPSEEK:
8771
9167
  {
8772
- const bool is_lite = (hparams.n_layer == 27);
8773
-
8774
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
8775
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
8776
-
8777
- const int64_t q_lora_rank = hparams.n_lora_q;
8778
- const int64_t kv_lora_rank = hparams.n_lora_kv;
8779
9168
 
8780
9169
  const int64_t n_ff_exp = hparams.n_ff_exp;
8781
9170
  const int64_t n_expert_shared = hparams.n_expert_shared;
@@ -8790,23 +9179,11 @@ static bool llm_load_tensors(
8790
9179
  auto & layer = model.layers[i];
8791
9180
 
8792
9181
  layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
8793
- if (!is_lite) {
8794
- layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
8795
- }
8796
-
8797
- layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
8798
-
8799
- if (!is_lite) {
8800
- layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
8801
- layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
8802
- } else {
8803
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
8804
- }
8805
-
8806
- layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
8807
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
8808
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
8809
9182
 
9183
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
9184
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
9185
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
9186
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
8810
9187
  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
8811
9188
 
8812
9189
  if (i < (int) hparams.n_layer_dense_lead) {
@@ -8835,12 +9212,80 @@ static bool llm_load_tensors(
8835
9212
  }
8836
9213
  }
8837
9214
  } break;
8838
- case LLM_ARCH_BITNET:
9215
+ case LLM_ARCH_DEEPSEEK2:
8839
9216
  {
8840
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
9217
+ const bool is_lite = (hparams.n_layer == 27);
8841
9218
 
8842
- // output
8843
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
9219
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
9220
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
9221
+
9222
+ const int64_t q_lora_rank = hparams.n_lora_q;
9223
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
9224
+
9225
+ const int64_t n_ff_exp = hparams.n_ff_exp;
9226
+ const int64_t n_expert_shared = hparams.n_expert_shared;
9227
+
9228
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
9229
+
9230
+ // output
9231
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
9232
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
9233
+
9234
+ for (int i = 0; i < n_layer; ++i) {
9235
+ auto & layer = model.layers[i];
9236
+
9237
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
9238
+ if (!is_lite) {
9239
+ layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
9240
+ }
9241
+
9242
+ layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
9243
+
9244
+ if (!is_lite) {
9245
+ layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
9246
+ layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
9247
+ } else {
9248
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
9249
+ }
9250
+
9251
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
9252
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
9253
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
9254
+
9255
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
9256
+
9257
+ if (i < (int) hparams.n_layer_dense_lead) {
9258
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
9259
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
9260
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
9261
+ } else {
9262
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
9263
+
9264
+ if (n_expert == 0) {
9265
+ throw std::runtime_error("n_expert must be > 0");
9266
+ }
9267
+ if (n_expert_used == 0) {
9268
+ throw std::runtime_error("n_expert_used must be > 0");
9269
+ }
9270
+
9271
+ // MoE branch
9272
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
9273
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
9274
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
9275
+
9276
+ // Shared expert branch
9277
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
9278
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
9279
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
9280
+ }
9281
+ }
9282
+ } break;
9283
+ case LLM_ARCH_BITNET:
9284
+ {
9285
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
9286
+
9287
+ // output
9288
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
8844
9289
 
8845
9290
  for (int i = 0; i < n_layer; ++i) {
8846
9291
  auto & layer = model.layers[i];
@@ -9137,9 +9582,9 @@ static bool llm_load_tensors(
9137
9582
  } break;
9138
9583
  case LLM_ARCH_CHAMELEON:
9139
9584
  {
9140
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
9585
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
9141
9586
 
9142
- // output
9587
+ // output
9143
9588
  model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
9144
9589
  model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
9145
9590
  // if output is NULL, init from the input tok embed
@@ -9168,6 +9613,109 @@ static bool llm_load_tensors(
9168
9613
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
9169
9614
  }
9170
9615
  } break;
9616
+ case LLM_ARCH_WAVTOKENIZER_DEC:
9617
+ {
9618
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
9619
+
9620
+ model.conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
9621
+ model.conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
9622
+
9623
+ // posnet
9624
+ {
9625
+ const int64_t n_embd = hparams.posnet.n_embd;
9626
+
9627
+ for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
9628
+ auto & layer = model.layers[i].posnet;
9629
+
9630
+ // posnet:
9631
+ //
9632
+ // - resnet
9633
+ // - resnet
9634
+ // - attn
9635
+ // - resnet
9636
+ // - resnet
9637
+ // - norm
9638
+ //
9639
+ switch (i) {
9640
+ case 0:
9641
+ case 1:
9642
+ case 3:
9643
+ case 4:
9644
+ {
9645
+ layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
9646
+ layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
9647
+
9648
+ layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
9649
+ layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
9650
+
9651
+ layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
9652
+ layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
9653
+
9654
+ layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
9655
+ layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
9656
+ } break;
9657
+ case 2:
9658
+ {
9659
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
9660
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
9661
+
9662
+ layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
9663
+ layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
9664
+
9665
+ layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
9666
+ layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
9667
+
9668
+ layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
9669
+ layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
9670
+
9671
+ layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
9672
+ layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
9673
+ } break;
9674
+ case 5:
9675
+ {
9676
+ layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
9677
+ layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
9678
+ } break;
9679
+ default: LM_GGML_ABORT("unknown posnet layer");
9680
+ };
9681
+ }
9682
+ }
9683
+
9684
+ LM_GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
9685
+
9686
+ model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
9687
+ model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
9688
+
9689
+ // convnext
9690
+ {
9691
+ const int64_t n_embd = hparams.convnext.n_embd;
9692
+
9693
+ for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
9694
+ auto & layer = model.layers[i].convnext;
9695
+
9696
+ layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
9697
+ layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
9698
+
9699
+ layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
9700
+ layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
9701
+
9702
+ layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
9703
+ layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
9704
+
9705
+ layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
9706
+ layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
9707
+
9708
+ layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
9709
+ }
9710
+
9711
+ // output
9712
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
9713
+ model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
9714
+ }
9715
+
9716
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
9717
+ model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
9718
+ } break;
9171
9719
  default:
9172
9720
  throw std::runtime_error("unknown architecture");
9173
9721
  }
@@ -9387,6 +9935,7 @@ enum llm_ffn_gate_type {
9387
9935
  enum llm_norm_type {
9388
9936
  LLM_NORM,
9389
9937
  LLM_NORM_RMS,
9938
+ LLM_NORM_GROUP,
9390
9939
  };
9391
9940
 
9392
9941
  static struct lm_ggml_tensor * llm_build_inp_embd(
@@ -9407,7 +9956,7 @@ static struct lm_ggml_tensor * llm_build_inp_embd(
9407
9956
 
9408
9957
  inpL = lm_ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
9409
9958
  } else {
9410
- lctx.inp_embd = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, n_embd, batch.n_tokens);
9959
+ lctx.inp_embd = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, n_embd, batch.n_tokens);
9411
9960
  inpL = lctx.inp_embd;
9412
9961
  lm_ggml_set_input(lctx.inp_embd);
9413
9962
  }
@@ -9528,8 +10077,14 @@ static struct lm_ggml_tensor * llm_build_norm(
9528
10077
  const llm_build_cb & cb,
9529
10078
  int il) {
9530
10079
  switch (type) {
9531
- case LLM_NORM: cur = lm_ggml_norm (ctx, cur, hparams.f_norm_eps); break;
9532
- case LLM_NORM_RMS: cur = lm_ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break;
10080
+ case LLM_NORM: cur = lm_ggml_norm (ctx, cur, hparams.f_norm_eps); break;
10081
+ case LLM_NORM_RMS: cur = lm_ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break;
10082
+ case LLM_NORM_GROUP:
10083
+ {
10084
+ cur = lm_ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
10085
+ cur = lm_ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps);
10086
+ cur = lm_ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
10087
+ } break;
9533
10088
  }
9534
10089
 
9535
10090
  if (mw || mb) {
@@ -10868,6 +11423,167 @@ struct llm_build_context {
10868
11423
  return gf;
10869
11424
  }
10870
11425
 
11426
+ struct lm_ggml_cgraph * build_deci() {
11427
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
11428
+
11429
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
11430
+ int32_t n_tokens = this->n_tokens;
11431
+
11432
+ const int64_t n_embd_head = hparams.n_embd_head_v;
11433
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
11434
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
11435
+
11436
+ struct lm_ggml_tensor * cur;
11437
+ struct lm_ggml_tensor * inpL;
11438
+
11439
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
11440
+
11441
+ // inp_pos - contains the positions
11442
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
11443
+
11444
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
11445
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
11446
+
11447
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
11448
+ for (int il = 0; il < n_layer; ++il) {
11449
+ struct lm_ggml_tensor * inpSA = inpL;
11450
+ const int64_t n_head_kv = hparams.n_head_kv(il);
11451
+ const int64_t n_head = hparams.n_head(il);
11452
+
11453
+ if (n_head == 0) {
11454
+ // attention-free layer of Llama-3_1-Nemotron-51B
11455
+ cur = inpL;
11456
+ } else {
11457
+ // norm
11458
+ cur = llm_build_norm(ctx0, inpL, hparams,
11459
+ model.layers[il].attn_norm, NULL,
11460
+ LLM_NORM_RMS, cb, il);
11461
+ cb(cur, "attn_norm", il);
11462
+ }
11463
+
11464
+ if (n_head > 0 && n_head_kv == 0) {
11465
+ // "linear attention" of Llama-3_1-Nemotron-51B
11466
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
11467
+ cb(cur, "wo", il);
11468
+ } else if (n_head > 0) {
11469
+ // self-attention
11470
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
11471
+ struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
11472
+
11473
+ // compute Q and K and RoPE them
11474
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
11475
+ cb(Qcur, "Qcur", il);
11476
+ if (model.layers[il].bq) {
11477
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
11478
+ cb(Qcur, "Qcur", il);
11479
+ }
11480
+
11481
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
11482
+ cb(Kcur, "Kcur", il);
11483
+ if (model.layers[il].bk) {
11484
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
11485
+ cb(Kcur, "Kcur", il);
11486
+ }
11487
+
11488
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
11489
+ cb(Vcur, "Vcur", il);
11490
+ if (model.layers[il].bv) {
11491
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
11492
+ cb(Vcur, "Vcur", il);
11493
+ }
11494
+
11495
+ Qcur = lm_ggml_rope_ext(
11496
+ ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
11497
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11498
+ ext_factor, attn_factor, beta_fast, beta_slow
11499
+ );
11500
+ cb(Qcur, "Qcur", il);
11501
+
11502
+ Kcur = lm_ggml_rope_ext(
11503
+ ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
11504
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
11505
+ ext_factor, attn_factor, beta_fast, beta_slow
11506
+ );
11507
+ cb(Kcur, "Kcur", il);
11508
+
11509
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
11510
+ model.layers[il].wo, model.layers[il].bo,
11511
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
11512
+ }
11513
+
11514
+ if (il == n_layer - 1) {
11515
+ // skip computing output for unused tokens
11516
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
11517
+ n_tokens = n_outputs;
11518
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
11519
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
11520
+ }
11521
+
11522
+ // For Granite architecture
11523
+ if (hparams.f_residual_scale) {
11524
+ cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
11525
+ }
11526
+
11527
+ // modified to support attention-free layer of Llama-3_1-Nemotron-51B
11528
+ struct lm_ggml_tensor * ffn_inp = cur;
11529
+ if (n_head > 0) {
11530
+ ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
11531
+ cb(ffn_inp, "ffn_inp", il);
11532
+ }
11533
+
11534
+ // feed-forward network
11535
+ if (model.layers[il].ffn_gate_inp == nullptr) {
11536
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
11537
+ model.layers[il].ffn_norm, NULL,
11538
+ LLM_NORM_RMS, cb, il);
11539
+ cb(cur, "ffn_norm", il);
11540
+
11541
+ cur = llm_build_ffn(ctx0, lctx, cur,
11542
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
11543
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
11544
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
11545
+ NULL,
11546
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
11547
+ cb(cur, "ffn_out", il);
11548
+ }
11549
+
11550
+ // For Granite architecture
11551
+ if (hparams.f_residual_scale) {
11552
+ cur = lm_ggml_scale(ctx0, cur, hparams.f_residual_scale);
11553
+ }
11554
+
11555
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
11556
+ cb(cur, "ffn_out", il);
11557
+
11558
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
11559
+ cb(cur, "l_out", il);
11560
+
11561
+ // input for next layer
11562
+ inpL = cur;
11563
+ }
11564
+
11565
+ cur = inpL;
11566
+
11567
+ cur = llm_build_norm(ctx0, cur, hparams,
11568
+ model.output_norm, NULL,
11569
+ LLM_NORM_RMS, cb, -1);
11570
+ cb(cur, "result_norm", -1);
11571
+
11572
+ // lm_head
11573
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
11574
+
11575
+ // For Granite architecture
11576
+ if (hparams.f_logit_scale) {
11577
+ cur = lm_ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
11578
+ }
11579
+
11580
+ cb(cur, "result_output", -1);
11581
+
11582
+ lm_ggml_build_forward_expand(gf, cur);
11583
+
11584
+ return gf;
11585
+ }
11586
+
10871
11587
  struct lm_ggml_cgraph * build_baichuan() {
10872
11588
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
10873
11589
 
@@ -12496,12 +13212,8 @@ struct llm_build_context {
12496
13212
  return gf;
12497
13213
  }
12498
13214
 
12499
- struct lm_ggml_cgraph * build_qwen2moe() {
13215
+ struct lm_ggml_cgraph * build_qwen2vl() {
12500
13216
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
12501
-
12502
- // mutable variable, needed during the last layer of the computation to skip unused tokens
12503
- int32_t n_tokens = this->n_tokens;
12504
-
12505
13217
  const int64_t n_embd_head = hparams.n_embd_head_v;
12506
13218
  LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
12507
13219
  LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -12512,10 +13224,15 @@ struct llm_build_context {
12512
13224
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
12513
13225
 
12514
13226
  // inp_pos - contains the positions
12515
- struct lm_ggml_tensor * inp_pos = build_inp_pos();
13227
+ lctx.inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens * 4);
13228
+ cb(lctx.inp_pos, "inp_pos", -1);
13229
+ lm_ggml_set_input(lctx.inp_pos);
13230
+ struct lm_ggml_tensor * inp_pos = lctx.inp_pos;
12516
13231
 
12517
13232
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
12518
13233
  struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
13234
+ int sections[4];
13235
+ std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
12519
13236
 
12520
13237
  for (int il = 0; il < n_layer; ++il) {
12521
13238
  struct lm_ggml_tensor * inpSA = inpL;
@@ -12526,7 +13243,7 @@ struct llm_build_context {
12526
13243
  LLM_NORM_RMS, cb, il);
12527
13244
  cb(cur, "attn_norm", il);
12528
13245
 
12529
- // self_attention
13246
+ // self-attention
12530
13247
  {
12531
13248
  // compute Q and K and RoPE them
12532
13249
  struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
@@ -12544,8 +13261,125 @@ struct llm_build_context {
12544
13261
  Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
12545
13262
  cb(Vcur, "Vcur", il);
12546
13263
 
12547
- Qcur = lm_ggml_rope_ext(
12548
- ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
13264
+ Qcur = lm_ggml_rope_multi(
13265
+ ctx0,
13266
+ lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
13267
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
13268
+ ext_factor, attn_factor, beta_fast, beta_slow
13269
+ );
13270
+ cb(Qcur, "Qcur", il);
13271
+
13272
+ Kcur = lm_ggml_rope_multi(
13273
+ ctx0,
13274
+ lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
13275
+ n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
13276
+ ext_factor, attn_factor, beta_fast, beta_slow
13277
+ );
13278
+ cb(Kcur, "Kcur", il);
13279
+
13280
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
13281
+ model.layers[il].wo, model.layers[il].bo,
13282
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
13283
+ }
13284
+
13285
+ if (il == n_layer - 1) {
13286
+ // skip computing output for unused tokens
13287
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
13288
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
13289
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
13290
+ }
13291
+
13292
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
13293
+ cb(ffn_inp, "ffn_inp", il);
13294
+
13295
+ // feed-forward network
13296
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
13297
+ model.layers[il].ffn_norm, NULL,
13298
+ LLM_NORM_RMS, cb, il);
13299
+ cb(cur, "ffn_norm", il);
13300
+
13301
+ cur = llm_build_ffn(ctx0, lctx, cur,
13302
+ model.layers[il].ffn_up, NULL, NULL,
13303
+ model.layers[il].ffn_gate, NULL, NULL,
13304
+ model.layers[il].ffn_down, NULL, NULL,
13305
+ NULL,
13306
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
13307
+ cb(cur, "ffn_out", il);
13308
+
13309
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
13310
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
13311
+ cb(cur, "l_out", il);
13312
+
13313
+ // input for next layer
13314
+ inpL = cur;
13315
+ }
13316
+
13317
+ cur = inpL;
13318
+
13319
+ cur = llm_build_norm(ctx0, cur, hparams,
13320
+ model.output_norm, NULL,
13321
+ LLM_NORM_RMS, cb, -1);
13322
+ cb(cur, "result_norm", -1);
13323
+
13324
+ // lm_head
13325
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
13326
+ cb(cur, "result_output", -1);
13327
+
13328
+ lm_ggml_build_forward_expand(gf, cur);
13329
+
13330
+ return gf;
13331
+ }
13332
+
13333
+ struct lm_ggml_cgraph * build_qwen2moe() {
13334
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
13335
+
13336
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
13337
+ int32_t n_tokens = this->n_tokens;
13338
+
13339
+ const int64_t n_embd_head = hparams.n_embd_head_v;
13340
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13341
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
13342
+
13343
+ struct lm_ggml_tensor * cur;
13344
+ struct lm_ggml_tensor * inpL;
13345
+
13346
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
13347
+
13348
+ // inp_pos - contains the positions
13349
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
13350
+
13351
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
13352
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
13353
+
13354
+ for (int il = 0; il < n_layer; ++il) {
13355
+ struct lm_ggml_tensor * inpSA = inpL;
13356
+
13357
+ // norm
13358
+ cur = llm_build_norm(ctx0, inpL, hparams,
13359
+ model.layers[il].attn_norm, NULL,
13360
+ LLM_NORM_RMS, cb, il);
13361
+ cb(cur, "attn_norm", il);
13362
+
13363
+ // self_attention
13364
+ {
13365
+ // compute Q and K and RoPE them
13366
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
13367
+ cb(Qcur, "Qcur", il);
13368
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
13369
+ cb(Qcur, "Qcur", il);
13370
+
13371
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
13372
+ cb(Kcur, "Kcur", il);
13373
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
13374
+ cb(Kcur, "Kcur", il);
13375
+
13376
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
13377
+ cb(Vcur, "Vcur", il);
13378
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
13379
+ cb(Vcur, "Vcur", il);
13380
+
13381
+ Qcur = lm_ggml_rope_ext(
13382
+ ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
12549
13383
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
12550
13384
  ext_factor, attn_factor, beta_fast, beta_slow
12551
13385
  );
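
The hunk above has build_qwen2vl() switch from lm_ggml_rope_ext to lm_ggml_rope_multi: positions become four int32 components per token (hence the n_tokens * 4 inp_pos allocation and the lctx.n_pos_per_token = 4 set later in llama_build_graph), and sections[4] splits the rotary dimensions between those components. A minimal caller-side sketch of packing such positions is shown below; the component meanings (temporal, image row, image column, spare) and the block-per-component layout are illustrative assumptions, not a documented contract of this package.

// Sketch only: pack hypothetical 4-component M-RoPE positions into the flat
// int32 layout suggested by the n_tokens * 4 allocation above (layout assumed).
#include <cstddef>
#include <cstdint>
#include <vector>

struct mrope_pos { int32_t t, h, w, extra; }; // hypothetical per-token position

static std::vector<int32_t> pack_mrope_positions(const std::vector<mrope_pos> & pos) {
    const size_t n_tokens = pos.size();
    std::vector<int32_t> flat(4 * n_tokens);
    for (size_t i = 0; i < n_tokens; ++i) {
        flat[0 * n_tokens + i] = pos[i].t;      // temporal / text index
        flat[1 * n_tokens + i] = pos[i].h;      // image row
        flat[2 * n_tokens + i] = pos[i].w;      // image column
        flat[3 * n_tokens + i] = pos[i].extra;  // spare component, typically 0
    }
    return flat; // would back a 1-D tensor of length n_tokens * 4, like inp_pos
}
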
@@ -12779,7 +13613,13 @@ struct llm_build_context {
12779
13613
  struct lm_ggml_tensor * inp_pos = build_inp_pos();
12780
13614
 
12781
13615
  // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
12782
- struct lm_ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
13616
+ struct lm_ggml_tensor * KQ_mask = nullptr;
13617
+ if (hparams.n_swa == 0) {
13618
+ // Phi-4 doesn't use sliding window attention
13619
+ KQ_mask = build_inp_KQ_mask();
13620
+ } else {
13621
+ KQ_mask = build_inp_KQ_mask_swa();
13622
+ }
12783
13623
 
12784
13624
  for (int il = 0; il < n_layer; ++il) {
12785
13625
  auto residual = inpL;
@@ -12837,7 +13677,7 @@ struct llm_build_context {
12837
13677
 
12838
13678
  cur = llm_build_kv(ctx0, lctx, kv_self, gf,
12839
13679
  model.layers[il].wo, model.layers[il].bo,
12840
- Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
13680
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
12841
13681
  }
12842
13682
 
12843
13683
  if (il == n_layer - 1) {
@@ -13447,153 +14287,6 @@ struct llm_build_context {
13447
14287
  return gf;
13448
14288
  }
13449
14289
 
13450
- // ref: https://arxiv.org/abs/2203.03466
13451
- // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
13452
- // based on the original build_llama() function
13453
- struct lm_ggml_cgraph * build_minicpm() {
13454
- struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
13455
-
13456
- const int64_t n_embd_head = hparams.n_embd_head_v;
13457
- LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13458
- LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
13459
-
13460
- const int64_t n_embd = hparams.n_embd;
13461
- //TODO: if the model varies, these parameters need to be read from the model
13462
- const int64_t n_embd_base = 256;
13463
- const float scale_embd = 12.0f;
13464
- const float scale_depth = 1.4f;
13465
-
13466
- struct lm_ggml_tensor * cur;
13467
- struct lm_ggml_tensor * inpL;
13468
-
13469
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
13470
-
13471
- // scale the input embeddings
13472
- inpL = lm_ggml_scale(ctx0, inpL, scale_embd);
13473
- cb(inpL, "inp_scaled", -1);
13474
-
13475
- // inp_pos - contains the positions
13476
- struct lm_ggml_tensor * inp_pos = build_inp_pos();
13477
-
13478
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
13479
- struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
13480
-
13481
- for (int il = 0; il < n_layer; ++il) {
13482
- struct lm_ggml_tensor * inpSA = inpL;
13483
-
13484
- // norm
13485
- cur = llm_build_norm(ctx0, inpL, hparams,
13486
- model.layers[il].attn_norm, NULL,
13487
- LLM_NORM_RMS, cb, il);
13488
- cb(cur, "attn_norm", il);
13489
-
13490
- // self-attention
13491
- {
13492
- // compute Q and K and RoPE them
13493
- struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
13494
- cb(Qcur, "Qcur", il);
13495
- if (model.layers[il].bq) {
13496
- Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
13497
- cb(Qcur, "Qcur", il);
13498
- }
13499
-
13500
- struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
13501
- cb(Kcur, "Kcur", il);
13502
- if (model.layers[il].bk) {
13503
- Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
13504
- cb(Kcur, "Kcur", il);
13505
- }
13506
-
13507
- struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
13508
- cb(Vcur, "Vcur", il);
13509
- if (model.layers[il].bv) {
13510
- Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
13511
- cb(Vcur, "Vcur", il);
13512
- }
13513
-
13514
- Qcur = lm_ggml_rope_ext(
13515
- ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
13516
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13517
- ext_factor, attn_factor, beta_fast, beta_slow
13518
- );
13519
- cb(Qcur, "Qcur", il);
13520
-
13521
- Kcur = lm_ggml_rope_ext(
13522
- ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
13523
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13524
- ext_factor, attn_factor, beta_fast, beta_slow
13525
- );
13526
- cb(Kcur, "Kcur", il);
13527
-
13528
- cur = llm_build_kv(ctx0, lctx, kv_self, gf,
13529
- model.layers[il].wo, model.layers[il].bo,
13530
- Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
13531
- }
13532
-
13533
- if (il == n_layer - 1) {
13534
- // skip computing output for unused tokens
13535
- struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
13536
- cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
13537
- inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
13538
- }
13539
-
13540
- // scale_res - scale the hidden states for residual connection
13541
- const float scale_res = scale_depth/sqrtf(float(n_layer));
13542
- cur = lm_ggml_scale(ctx0, cur, scale_res);
13543
- cb(cur, "hidden_scaled", -1);
13544
-
13545
- struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
13546
- cb(ffn_inp, "ffn_inp", il);
13547
-
13548
- // feed-forward network
13549
- {
13550
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
13551
- model.layers[il].ffn_norm, NULL,
13552
- LLM_NORM_RMS, cb, il);
13553
- cb(cur, "ffn_norm", il);
13554
-
13555
- cur = llm_build_ffn(ctx0, lctx, cur,
13556
- model.layers[il].ffn_up, NULL, NULL,
13557
- model.layers[il].ffn_gate, NULL, NULL,
13558
- model.layers[il].ffn_down, NULL, NULL,
13559
- NULL,
13560
- LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
13561
- cb(cur, "ffn_out", il);
13562
- }
13563
-
13564
- // scale the hidden states for residual connection
13565
- cur = lm_ggml_scale(ctx0, cur, scale_res);
13566
- cb(cur, "hidden_scaled_ffn", -1);
13567
-
13568
- cur = lm_ggml_add(ctx0, cur, ffn_inp);
13569
- cur = lctx.cvec.apply_to(ctx0, cur, il);
13570
- cb(cur, "l_out", il);
13571
-
13572
- // input for next layer
13573
- inpL = cur;
13574
- }
13575
-
13576
- cur = inpL;
13577
-
13578
- cur = llm_build_norm(ctx0, cur, hparams,
13579
- model.output_norm, NULL,
13580
- LLM_NORM_RMS, cb, -1);
13581
- cb(cur, "result_norm", -1);
13582
-
13583
- // lm_head scaling
13584
- const float scale_lmhead = float(n_embd_base)/float(n_embd);
13585
- cur = lm_ggml_scale(ctx0, cur, scale_lmhead);
13586
- cb(cur, "lmhead_scaling", -1);
13587
-
13588
- // lm_head
13589
- cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
13590
- cb(cur, "result_output", -1);
13591
-
13592
- lm_ggml_build_forward_expand(gf, cur);
13593
-
13594
- return gf;
13595
- }
13596
-
13597
14290
  struct lm_ggml_cgraph * build_minicpm3() {
13598
14291
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
13599
14292
 
@@ -15061,22 +15754,169 @@ struct llm_build_context {
15061
15754
  cb(Vcur, "Vcur", il);
15062
15755
 
15063
15756
  Qcur = lm_ggml_rope_ext(
15064
- ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
15757
+ ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
15758
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15759
+ ext_factor, attn_factor, beta_fast, beta_slow
15760
+ );
15761
+ cb(Qcur, "Qcur", il);
15762
+
15763
+ Kcur = lm_ggml_rope_ext(
15764
+ ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
15765
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15766
+ ext_factor, attn_factor, beta_fast, beta_slow
15767
+ );
15768
+ cb(Kcur, "Kcur", il);
15769
+
15770
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
15771
+ model.layers[il].wo, NULL,
15772
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
15773
+ }
15774
+
15775
+ if (il == n_layer - 1) {
15776
+ // skip computing output for unused tokens
15777
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
15778
+ n_tokens = n_outputs;
15779
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
15780
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
15781
+ }
15782
+
15783
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
15784
+ cb(ffn_inp, "ffn_inp", il);
15785
+
15786
+ // feed-forward network
15787
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
15788
+ model.layers[il].ffn_norm, NULL,
15789
+ LLM_NORM_RMS, cb, il);
15790
+ cb(cur, "ffn_norm", il);
15791
+
15792
+ cur = llm_build_ffn(ctx0, lctx, cur,
15793
+ model.layers[il].ffn_up, NULL, NULL,
15794
+ model.layers[il].ffn_gate, NULL, NULL,
15795
+ model.layers[il].ffn_down, NULL, NULL,
15796
+ NULL,
15797
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
15798
+ cb(cur, "ffn_out", il);
15799
+
15800
+ struct lm_ggml_tensor * ffn_out = lm_ggml_add(ctx0, cur, ffn_inp);
15801
+ cb(ffn_out, "ffn_out", il);
15802
+
15803
+ // MoE
15804
+ cur = llm_build_norm(ctx0, inpSA, hparams,
15805
+ model.layers[il].ffn_norm_exps, NULL,
15806
+ LLM_NORM_RMS, cb, il);
15807
+ cb(cur, "ffn_norm_exps", il);
15808
+
15809
+ cur = llm_build_moe_ffn(ctx0, lctx, cur,
15810
+ model.layers[il].ffn_gate_inp,
15811
+ model.layers[il].ffn_up_exps,
15812
+ model.layers[il].ffn_gate_exps,
15813
+ model.layers[il].ffn_down_exps,
15814
+ n_expert, n_expert_used,
15815
+ LLM_FFN_SILU, true,
15816
+ false, 0.0,
15817
+ cb, il);
15818
+ cb(cur, "ffn_moe_out", il);
15819
+
15820
+ cur = lm_ggml_add(ctx0, cur, ffn_out);
15821
+ cb(cur, "ffn_out", il);
15822
+
15823
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
15824
+ cb(cur, "l_out", il);
15825
+
15826
+ // input for next layer
15827
+ inpL = cur;
15828
+ }
15829
+
15830
+ cur = inpL;
15831
+
15832
+ cur = llm_build_norm(ctx0, cur, hparams,
15833
+ model.output_norm, NULL,
15834
+ LLM_NORM_RMS, cb, -1);
15835
+ cb(cur, "result_norm", -1);
15836
+
15837
+ // lm_head
15838
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
15839
+ cb(cur, "result_output", -1);
15840
+
15841
+ lm_ggml_build_forward_expand(gf, cur);
15842
+
15843
+ return gf;
15844
+ }
15845
+
15846
+ struct lm_ggml_cgraph * build_deepseek() {
15847
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
15848
+
15849
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
15850
+ int32_t n_tokens = this->n_tokens;
15851
+
15852
+ const int64_t n_embd_head = hparams.n_embd_head_v;
15853
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
15854
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
15855
+
15856
+ struct lm_ggml_tensor * cur;
15857
+ struct lm_ggml_tensor * inpL;
15858
+
15859
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
15860
+
15861
+ // inp_pos - contains the positions
15862
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
15863
+
15864
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
15865
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
15866
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
15867
+ for (int il = 0; il < n_layer; ++il) {
15868
+ struct lm_ggml_tensor * inpSA = inpL;
15869
+
15870
+ // norm
15871
+ cur = llm_build_norm(ctx0, inpL, hparams,
15872
+ model.layers[il].attn_norm, NULL,
15873
+ LLM_NORM_RMS, cb, il);
15874
+ cb(cur, "attn_norm", il);
15875
+
15876
+ // self-attention
15877
+ {
15878
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
15879
+ struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
15880
+
15881
+ // compute Q and K and RoPE them
15882
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
15883
+ cb(Qcur, "Qcur", il);
15884
+ if (model.layers[il].bq) {
15885
+ Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
15886
+ cb(Qcur, "Qcur", il);
15887
+ }
15888
+
15889
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
15890
+ cb(Kcur, "Kcur", il);
15891
+ if (model.layers[il].bk) {
15892
+ Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
15893
+ cb(Kcur, "Kcur", il);
15894
+ }
15895
+
15896
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
15897
+ cb(Vcur, "Vcur", il);
15898
+ if (model.layers[il].bv) {
15899
+ Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
15900
+ cb(Vcur, "Vcur", il);
15901
+ }
15902
+
15903
+ Qcur = lm_ggml_rope_ext(
15904
+ ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
15065
15905
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15066
15906
  ext_factor, attn_factor, beta_fast, beta_slow
15067
15907
  );
15068
15908
  cb(Qcur, "Qcur", il);
15069
15909
 
15070
15910
  Kcur = lm_ggml_rope_ext(
15071
- ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
15911
+ ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
15072
15912
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
15073
15913
  ext_factor, attn_factor, beta_fast, beta_slow
15074
15914
  );
15075
15915
  cb(Kcur, "Kcur", il);
15076
15916
 
15077
15917
  cur = llm_build_kv(ctx0, lctx, kv_self, gf,
15078
- model.layers[il].wo, NULL,
15079
- Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
15918
+ model.layers[il].wo, model.layers[il].bo,
15919
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
15080
15920
  }
15081
15921
 
15082
15922
  if (il == n_layer - 1) {
@@ -15087,46 +15927,53 @@ struct llm_build_context {
15087
15927
  inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
15088
15928
  }
15089
15929
 
15930
+
15090
15931
  struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
15091
15932
  cb(ffn_inp, "ffn_inp", il);
15092
15933
 
15093
- // feed-forward network
15094
15934
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
15095
15935
  model.layers[il].ffn_norm, NULL,
15096
15936
  LLM_NORM_RMS, cb, il);
15097
15937
  cb(cur, "ffn_norm", il);
15098
15938
 
15099
- cur = llm_build_ffn(ctx0, lctx, cur,
15100
- model.layers[il].ffn_up, NULL, NULL,
15101
- model.layers[il].ffn_gate, NULL, NULL,
15102
- model.layers[il].ffn_down, NULL, NULL,
15103
- NULL,
15104
- LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
15105
- cb(cur, "ffn_out", il);
15106
-
15107
- struct lm_ggml_tensor * ffn_out = lm_ggml_add(ctx0, cur, ffn_inp);
15108
- cb(ffn_out, "ffn_out", il);
15109
-
15110
- // MoE
15111
- cur = llm_build_norm(ctx0, inpSA, hparams,
15112
- model.layers[il].ffn_norm_exps, NULL,
15113
- LLM_NORM_RMS, cb, il);
15114
- cb(cur, "ffn_norm_exps", il);
15939
+ if ((uint32_t) il < hparams.n_layer_dense_lead) {
15940
+ cur = llm_build_ffn(ctx0, lctx, cur,
15941
+ model.layers[il].ffn_up, NULL, NULL,
15942
+ model.layers[il].ffn_gate, NULL, NULL,
15943
+ model.layers[il].ffn_down, NULL, NULL,
15944
+ NULL,
15945
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
15946
+ cb(cur, "ffn_out", il);
15947
+ } else {
15948
+ // MoE branch
15949
+ lm_ggml_tensor * moe_out =
15950
+ llm_build_moe_ffn(ctx0, lctx, cur,
15951
+ model.layers[il].ffn_gate_inp,
15952
+ model.layers[il].ffn_up_exps,
15953
+ model.layers[il].ffn_gate_exps,
15954
+ model.layers[il].ffn_down_exps,
15955
+ n_expert, n_expert_used,
15956
+ LLM_FFN_SILU, false,
15957
+ false, hparams.expert_weights_scale,
15958
+ cb, il);
15959
+ cb(moe_out, "ffn_moe_out", il);
15115
15960
 
15116
- cur = llm_build_moe_ffn(ctx0, lctx, cur,
15117
- model.layers[il].ffn_gate_inp,
15118
- model.layers[il].ffn_up_exps,
15119
- model.layers[il].ffn_gate_exps,
15120
- model.layers[il].ffn_down_exps,
15121
- n_expert, n_expert_used,
15122
- LLM_FFN_SILU, true,
15123
- false, 0.0,
15124
- cb, il);
15125
- cb(cur, "ffn_moe_out", il);
15961
+ // FFN shared expert
15962
+ {
15963
+ lm_ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
15964
+ model.layers[il].ffn_up_shexp, NULL, NULL,
15965
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
15966
+ model.layers[il].ffn_down_shexp, NULL, NULL,
15967
+ NULL,
15968
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
15969
+ cb(ffn_shexp, "ffn_shexp", il);
15126
15970
 
15127
- cur = lm_ggml_add(ctx0, cur, ffn_out);
15128
- cb(cur, "ffn_out", il);
15971
+ cur = lm_ggml_add(ctx0, moe_out, ffn_shexp);
15972
+ cb(cur, "ffn_out", il);
15973
+ }
15974
+ }
15129
15975
 
15976
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
15130
15977
  cur = lctx.cvec.apply_to(ctx0, cur, il);
15131
15978
  cb(cur, "l_out", il);
15132
15979
 
@@ -15143,6 +15990,7 @@ struct llm_build_context {
15143
15990
 
15144
15991
  // lm_head
15145
15992
  cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
15993
+
15146
15994
  cb(cur, "result_output", -1);
15147
15995
 
15148
15996
  lm_ggml_build_forward_expand(gf, cur);
@@ -15529,7 +16377,7 @@ struct llm_build_context {
15529
16377
  return gf;
15530
16378
  }
15531
16379
 
15532
- struct lm_ggml_cgraph * build_t5_encoder() {
16380
+ struct lm_ggml_cgraph * build_t5_enc() {
15533
16381
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
15534
16382
 
15535
16383
  // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -15661,7 +16509,7 @@ struct llm_build_context {
15661
16509
  return gf;
15662
16510
  }
15663
16511
 
15664
- struct lm_ggml_cgraph * build_t5_decoder() {
16512
+ struct lm_ggml_cgraph * build_t5_dec() {
15665
16513
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
15666
16514
 
15667
16515
  // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -16610,6 +17458,158 @@ struct llm_build_context {
16610
17458
 
16611
17459
  return gf;
16612
17460
  }
17461
+
17462
+ struct lm_ggml_cgraph * build_wavtokenizer_dec() {
17463
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
17464
+
17465
+ struct lm_ggml_tensor * cur;
17466
+ struct lm_ggml_tensor * inpL;
17467
+
17468
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
17469
+
17470
+ cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, inpL));
17471
+
17472
+ cur = lm_ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
17473
+ cur = lm_ggml_add(ctx0, cur, model.conv1d_b);
17474
+
17475
+ // posnet
17476
+ for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
17477
+ const auto & layer = model.layers[il].posnet;
17478
+
17479
+ inpL = cur;
17480
+
17481
+ switch (il) {
17482
+ case 0:
17483
+ case 1:
17484
+ case 3:
17485
+ case 4:
17486
+ {
17487
+ cur = llm_build_norm(ctx0, cur, hparams,
17488
+ layer.norm1,
17489
+ layer.norm1_b,
17490
+ LLM_NORM_GROUP, cb, 0);
17491
+
17492
+ cur = lm_ggml_mul(ctx0, lm_ggml_sigmoid(ctx0, cur), cur);
17493
+
17494
+ cur = lm_ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
17495
+ cur = lm_ggml_add(ctx0, cur, layer.conv1_b);
17496
+
17497
+ cur = llm_build_norm(ctx0, cur, hparams,
17498
+ layer.norm2,
17499
+ layer.norm2_b,
17500
+ LLM_NORM_GROUP, cb, 0);
17501
+
17502
+ cur = lm_ggml_mul(ctx0, lm_ggml_sigmoid(ctx0, cur), cur);
17503
+
17504
+ cur = lm_ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
17505
+ cur = lm_ggml_add(ctx0, cur, layer.conv2_b);
17506
+
17507
+ cur = lm_ggml_add(ctx0, cur, inpL);
17508
+ } break;
17509
+ case 2:
17510
+ {
17511
+ cur = llm_build_norm(ctx0, cur, hparams,
17512
+ layer.attn_norm,
17513
+ layer.attn_norm_b,
17514
+ LLM_NORM_GROUP, cb, 0);
17515
+
17516
+ struct lm_ggml_tensor * q;
17517
+ struct lm_ggml_tensor * k;
17518
+ struct lm_ggml_tensor * v;
17519
+
17520
+ q = lm_ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
17521
+ k = lm_ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
17522
+ v = lm_ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
17523
+
17524
+ q = lm_ggml_add(ctx0, q, layer.attn_q_b);
17525
+ k = lm_ggml_add(ctx0, k, layer.attn_k_b);
17526
+ v = lm_ggml_add(ctx0, v, layer.attn_v_b);
17527
+
17528
+ q = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, q));
17529
+ k = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, k));
17530
+
17531
+ struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
17532
+
17533
+ kq = lm_ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
17534
+
17535
+ cur = lm_ggml_mul_mat(ctx0, kq, v);
17536
+
17537
+ cur = lm_ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
17538
+ cur = lm_ggml_add(ctx0, cur, layer.attn_o_b);
17539
+
17540
+ cur = lm_ggml_add(ctx0, cur, inpL);
17541
+ } break;
17542
+ case 5:
17543
+ {
17544
+ cur = llm_build_norm(ctx0, cur, hparams,
17545
+ layer.norm,
17546
+ layer.norm_b,
17547
+ LLM_NORM_GROUP, cb, 0);
17548
+ } break;
17549
+ default: LM_GGML_ABORT("unknown posnet layer");
17550
+ };
17551
+ }
17552
+
17553
+ cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
17554
+
17555
+ cur = llm_build_norm(ctx0, cur, hparams,
17556
+ model.tok_norm,
17557
+ model.tok_norm_b,
17558
+ LLM_NORM, cb, -1);
17559
+
17560
+ cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
17561
+
17562
+ inpL = cur;
17563
+
17564
+ // convnext
17565
+ for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
17566
+ const auto & layer = model.layers[il].convnext;
17567
+
17568
+ cur = inpL;
17569
+
17570
+ cur = lm_ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
17571
+ cur = lm_ggml_add(ctx0, cur, layer.dw_b);
17572
+
17573
+ cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
17574
+
17575
+ cur = llm_build_norm(ctx0, cur, hparams,
17576
+ layer.norm,
17577
+ layer.norm_b,
17578
+ LLM_NORM, cb, -1);
17579
+
17580
+ cur = llm_build_ffn(ctx0, lctx, cur,
17581
+ layer.pw1, layer.pw1_b, NULL,
17582
+ NULL, NULL, NULL,
17583
+ layer.pw2, layer.pw2_b, NULL,
17584
+ NULL,
17585
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
17586
+
17587
+ cur = lm_ggml_mul(ctx0, cur, layer.gamma);
17588
+
17589
+ cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
17590
+
17591
+ inpL = lm_ggml_add(ctx0, cur, inpL);
17592
+ }
17593
+
17594
+ cur = inpL;
17595
+
17596
+ cur = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, cur));
17597
+
17598
+ cur = llm_build_norm(ctx0, cur, hparams,
17599
+ model.output_norm,
17600
+ model.output_norm_b,
17601
+ LLM_NORM, cb, -1);
17602
+
17603
+ // lm_head
17604
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
17605
+
17606
+ cur = lm_ggml_add(ctx0, cur, model.output_b);
17607
+ cb(cur, "result_embd", -1);
17608
+
17609
+ lm_ggml_build_forward_expand(gf, cur);
17610
+
17611
+ return gf;
17612
+ }
16613
17613
  };
16614
17614
 
16615
17615
  static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
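
Aside (illustrative, not part of the diff): the posnet attention case in the hunk above is a plain scaled-dot-product attention, with 1D convolutions producing Q/K/V and lm_ggml_soft_max_ext applying the 1/sqrt(n_embd)-scaled softmax. Below is a minimal standalone sketch of that core pattern against the package's lm_ggml-prefixed API; the toy shapes, the fill helper, and the assumption that lm_ggml_graph_compute_with_ctx is declared in the bundled ggml-cpu.h are mine, not taken from the diff.

    // illustrative sketch only -- not part of the package diff
    #include "ggml.h"      // bundled as package/cpp/ggml.h (lm_ggml_* symbols)
    #include "ggml-cpu.h"  // assumed home of lm_ggml_graph_compute_with_ctx after the CPU split
    #include <cmath>
    #include <cstdio>

    int main() {
        const int d = 8, n_q = 4, n_kv = 6;   // toy sizes: channels, query positions, key/value positions

        struct lm_ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
        struct lm_ggml_context * ctx = lm_ggml_init(params);

        // ne[0] is the contiguous dimension, as in the decoder graph above
        struct lm_ggml_tensor * q = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, d,    n_q);  // [d, n_q]
        struct lm_ggml_tensor * k = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, d,    n_kv); // [d, n_kv]
        struct lm_ggml_tensor * v = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, n_kv, d);    // [n_kv, d]

        auto fill = [](struct lm_ggml_tensor * t, float val) {
            float * p = (float *) t->data;
            for (int64_t i = 0; i < lm_ggml_nelements(t); ++i) p[i] = val;
        };
        fill(q, 0.1f); fill(k, 0.2f); fill(v, 0.3f);

        // kq[i_kv, i_q] = sum_c k[c, i_kv] * q[c, i_q]
        struct lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx, k, q);                               // [n_kv, n_q]
        // softmax over the kv dimension, scaled by 1/sqrt(d) as in the posnet block
        kq = lm_ggml_soft_max_ext(ctx, kq, nullptr, 1.0f/sqrtf((float) d), 0.0f);
        // out[i_q, c] = sum_kv kq[i_kv, i_q] * v[i_kv, c]
        struct lm_ggml_tensor * out = lm_ggml_mul_mat(ctx, kq, v);                             // [n_q, d]

        struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx);
        lm_ggml_build_forward_expand(gf, out);
        lm_ggml_graph_compute_with_ctx(ctx, gf, /*n_threads*/ 1);

        printf("out[0][0] = %f\n", ((float *) out->data)[0]);
        lm_ggml_free(ctx);
        return 0;
    }
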
@@ -16692,11 +17692,16 @@ static struct lm_ggml_cgraph * llama_build_graph(
16692
17692
 
16693
17693
  switch (model.arch) {
16694
17694
  case LLM_ARCH_LLAMA:
17695
+ case LLM_ARCH_MINICPM:
16695
17696
  case LLM_ARCH_GRANITE:
16696
17697
  case LLM_ARCH_GRANITE_MOE:
16697
17698
  {
16698
17699
  result = llm.build_llama();
16699
17700
  } break;
17701
+ case LLM_ARCH_DECI:
17702
+ {
17703
+ result = llm.build_deci();
17704
+ } break;
16700
17705
  case LLM_ARCH_BAICHUAN:
16701
17706
  {
16702
17707
  result = llm.build_baichuan();
@@ -16743,6 +17748,11 @@ static struct lm_ggml_cgraph * llama_build_graph(
16743
17748
  {
16744
17749
  result = llm.build_qwen2();
16745
17750
  } break;
17751
+ case LLM_ARCH_QWEN2VL:
17752
+ {
17753
+ lctx.n_pos_per_token = 4;
17754
+ result = llm.build_qwen2vl();
17755
+ } break;
16746
17756
  case LLM_ARCH_QWEN2MOE:
16747
17757
  {
16748
17758
  result = llm.build_qwen2moe();
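
Aside (illustrative, not part of the diff): the new LLM_ARCH_QWEN2VL case sets n_pos_per_token to 4, so the position data copied into inp_pos (see the llama_set_inputs hunk further down) must supply n_tokens * 4 llama_pos values rather than n_tokens. The per-dimension layout of those values is defined by the upstream Qwen2-VL example and is not shown in this diff, so the sketch below covers sizing only; the helper name is hypothetical.

    #include <cstdint>
    #include <vector>

    typedef int32_t llama_pos; // as in llama.h

    // hypothetical helper: size the position buffer for a batch given the per-token position count
    // (1 for regular RoPE architectures, 4 for LLM_ARCH_QWEN2VL / M-RoPE per the hunk above)
    static std::vector<llama_pos> make_pos_buffer(int32_t n_tokens, int32_t n_pos_per_token) {
        return std::vector<llama_pos>((size_t) n_tokens * n_pos_per_token, 0);
    }

    int main() {
        auto pos = make_pos_buffer(8, 4);   // 32 entries for an 8-token Qwen2-VL batch
        return pos.size() == 32 ? 0 : 1;
    }
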
@@ -16775,10 +17785,6 @@ static struct lm_ggml_cgraph * llama_build_graph(
16775
17785
  {
16776
17786
  result = llm.build_internlm2();
16777
17787
  } break;
16778
- case LLM_ARCH_MINICPM:
16779
- {
16780
- result = llm.build_minicpm();
16781
- } break;
16782
17788
  case LLM_ARCH_MINICPM3:
16783
17789
  {
16784
17790
  result = llm.build_minicpm3();
@@ -16835,6 +17841,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
16835
17841
  {
16836
17842
  result = llm.build_arctic();
16837
17843
  } break;
17844
+ case LLM_ARCH_DEEPSEEK:
17845
+ {
17846
+ result = llm.build_deepseek();
17847
+ } break;
16838
17848
  case LLM_ARCH_DEEPSEEK2:
16839
17849
  {
16840
17850
  result = llm.build_deepseek2();
@@ -16850,14 +17860,14 @@ static struct lm_ggml_cgraph * llama_build_graph(
16850
17860
  case LLM_ARCH_T5:
16851
17861
  {
16852
17862
  if (lctx.is_encoding) {
16853
- result = llm.build_t5_encoder();
17863
+ result = llm.build_t5_enc();
16854
17864
  } else {
16855
- result = llm.build_t5_decoder();
17865
+ result = llm.build_t5_dec();
16856
17866
  }
16857
17867
  } break;
16858
17868
  case LLM_ARCH_T5ENCODER:
16859
17869
  {
16860
- result = llm.build_t5_encoder();
17870
+ result = llm.build_t5_enc();
16861
17871
  } break;
16862
17872
  case LLM_ARCH_JAIS:
16863
17873
  {
@@ -16879,6 +17889,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
16879
17889
  {
16880
17890
  result = llm.build_chameleon();
16881
17891
  } break;
17892
+ case LLM_ARCH_WAVTOKENIZER_DEC:
17893
+ {
17894
+ result = llm.build_wavtokenizer_dec();
17895
+ } break;
16882
17896
  default:
16883
17897
  LM_GGML_ABORT("fatal error");
16884
17898
  }
@@ -16965,35 +17979,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
16965
17979
 
16966
17980
  if (ubatch.pos && lctx.inp_pos) {
16967
17981
  const int64_t n_tokens = ubatch.n_tokens;
16968
-
16969
- lm_ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*lm_ggml_element_size(lctx.inp_pos));
17982
+ auto n_pos = lctx.n_pos_per_token;
17983
+ lm_ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*lm_ggml_element_size(lctx.inp_pos));
16970
17984
  }
16971
17985
 
16972
17986
  if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
16973
- LM_GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
16974
- const int64_t n_tokens = ubatch.n_tokens;
17987
+ //LM_GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
17988
+
17989
+ if (!lctx.inp_out_ids) {
17990
+ LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__);
17991
+ } else {
17992
+ const int64_t n_tokens = ubatch.n_tokens;
16975
17993
 
16976
- LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
16977
- int32_t * data = (int32_t *) lctx.inp_out_ids->data;
17994
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
17995
+ int32_t * data = (int32_t *) lctx.inp_out_ids->data;
16978
17996
 
16979
- if (lctx.n_outputs == n_tokens) {
16980
- for (int i = 0; i < n_tokens; ++i) {
16981
- data[i] = i;
16982
- }
16983
- } else if (ubatch.output) {
16984
- int32_t n_outputs = 0;
16985
- for (int i = 0; i < n_tokens; ++i) {
16986
- if (ubatch.output[i]) {
16987
- data[n_outputs++] = i;
17997
+ if (lctx.n_outputs == n_tokens) {
17998
+ for (int i = 0; i < n_tokens; ++i) {
17999
+ data[i] = i;
16988
18000
  }
18001
+ } else if (ubatch.output) {
18002
+ int32_t n_outputs = 0;
18003
+ for (int i = 0; i < n_tokens; ++i) {
18004
+ if (ubatch.output[i]) {
18005
+ data[n_outputs++] = i;
18006
+ }
18007
+ }
18008
+ // the graph needs to have been passed the correct number of outputs
18009
+ LM_GGML_ASSERT(lctx.n_outputs == n_outputs);
18010
+ } else if (lctx.n_outputs == 1) {
18011
+ // only keep last output
18012
+ data[0] = n_tokens - 1;
18013
+ } else {
18014
+ LM_GGML_ASSERT(lctx.n_outputs == 0);
16989
18015
  }
16990
- // the graph needs to have been passed the correct number of outputs
16991
- LM_GGML_ASSERT(lctx.n_outputs == n_outputs);
16992
- } else if (lctx.n_outputs == 1) {
16993
- // only keep last output
16994
- data[0] = n_tokens - 1;
16995
- } else {
16996
- LM_GGML_ASSERT(lctx.n_outputs == 0);
16997
18016
  }
16998
18017
  }
16999
18018
 
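
Aside (illustrative, not part of the diff): the relaxed inp_out_ids handling above reduces to one selection rule, applied only when the tensor exists: emit every row when all tokens are outputs, only the flagged rows when per-token output flags are given, the last row when a single output is requested, and none otherwise. A standalone sketch of that rule in plain C++ (the function and variable names are mine):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // mirrors the inp_out_ids filling logic above: which token rows should produce logits/embeddings
    static std::vector<int32_t> select_output_rows(int32_t n_tokens, int32_t n_outputs, const int8_t * output_flags) {
        std::vector<int32_t> ids;
        if (n_outputs == n_tokens) {
            for (int32_t i = 0; i < n_tokens; ++i) ids.push_back(i);  // every token is an output
        } else if (output_flags) {
            for (int32_t i = 0; i < n_tokens; ++i) {
                if (output_flags[i]) ids.push_back(i);                // only explicitly flagged tokens
            }
            assert((int32_t) ids.size() == n_outputs);                // graph must have been built for this count
        } else if (n_outputs == 1) {
            ids.push_back(n_tokens - 1);                              // default: keep only the last token
        } else {
            assert(n_outputs == 0);                                   // no outputs requested
        }
        return ids;
    }

    int main() {
        const int8_t flags[4] = { 0, 1, 0, 1 };
        auto ids = select_output_rows(4, 2, flags);                   // -> {1, 3}
        return ids.size() == 2 ? 0 : 1;
    }
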
@@ -17664,6 +18683,7 @@ static int llama_decode_internal(
17664
18683
  embd = nullptr; // do not extract embeddings when not needed
17665
18684
  LM_GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
17666
18685
  }
18686
+
17667
18687
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (lm_ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
17668
18688
 
17669
18689
  lm_ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
@@ -18451,10 +19471,6 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
18451
19471
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
18452
19472
  new_type = LM_GGML_TYPE_IQ3_S;
18453
19473
  }
18454
- else if (new_type == LM_GGML_TYPE_Q4_0_4_4 || new_type == LM_GGML_TYPE_Q4_0_4_8 ||
18455
- new_type == LM_GGML_TYPE_Q4_0_8_8) {
18456
- new_type = LM_GGML_TYPE_Q4_0;
18457
- }
18458
19474
  else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
18459
19475
  new_type = LM_GGML_TYPE_Q4_K;
18460
19476
  }
@@ -18777,9 +19793,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
18777
19793
  case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = LM_GGML_TYPE_IQ4_XS; break;
18778
19794
  case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = LM_GGML_TYPE_IQ3_S; break;
18779
19795
  case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = LM_GGML_TYPE_IQ3_S; break;
18780
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = LM_GGML_TYPE_Q4_0_4_4; break;
18781
- case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = LM_GGML_TYPE_Q4_0_4_8; break;
18782
- case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = LM_GGML_TYPE_Q4_0_8_8; break;
18783
19796
 
18784
19797
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
18785
19798
  }
@@ -19118,14 +20131,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
19118
20131
  f32_data = (float *) f32_conv_buf.data();
19119
20132
  }
19120
20133
 
19121
- int chunk_size_multiplier = 1;
19122
- if (new_type == LM_GGML_TYPE_Q4_0_4_4 || new_type == LM_GGML_TYPE_Q4_0_4_8 || new_type == LM_GGML_TYPE_Q4_0_8_8) {
19123
- if ((new_type == LM_GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = LM_GGML_TYPE_Q4_0;
19124
- else if (tensor->ne[1] % 4 != 0) new_type = LM_GGML_TYPE_Q4_0;
19125
- if (new_type == LM_GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
19126
- else if (new_type == LM_GGML_TYPE_Q4_0_4_4 || new_type == LM_GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
19127
- }
19128
-
19129
20134
  LLAMA_LOG_INFO("converting to %s .. ", lm_ggml_type_name(new_type));
19130
20135
  fflush(stdout);
19131
20136
 
@@ -19138,8 +20143,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
19138
20143
  const int64_t nrows = tensor->ne[1];
19139
20144
 
19140
20145
  static const int64_t min_chunk_size = 32 * 512;
19141
- const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
19142
- chunk_size_multiplier;
20146
+ const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
19143
20147
 
19144
20148
  const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
19145
20149
  const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
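
Aside (illustrative, not part of the diff): with the Q4_0_4_x multiplier removed, the quantization chunk size is simply the row length, rounded up to at least min_chunk_size in whole rows. A quick standalone check of that arithmetic with assumed example dimensions:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t min_chunk_size = 32 * 512;   // 16384 values, as above
        const int64_t n_per_row      = 4096;       // assumed: a 4096-wide weight row
        const int64_t nrows          = 32000;      // assumed row count

        // same expression as the new line above: pad short rows up to whole-row multiples of min_chunk_size
        const int64_t chunk_size = (n_per_row >= min_chunk_size
            ? n_per_row
            : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));   // 4096 * 4 = 16384

        const int64_t nelements_matrix = n_per_row * nrows;
        const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;  // 8000 chunks

        printf("chunk_size = %lld, nchunk = %lld\n", (long long) chunk_size, (long long) nchunk);
        return 0;
    }
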
@@ -20068,10 +21072,12 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
20068
21072
  case LLM_ARCH_T5ENCODER:
20069
21073
  case LLM_ARCH_JAIS:
20070
21074
  case LLM_ARCH_RWKV6:
21075
+ case LLM_ARCH_WAVTOKENIZER_DEC:
20071
21076
  return LLAMA_ROPE_TYPE_NONE;
20072
21077
 
20073
21078
  // use what we call a normal RoPE, operating on pairs of consecutive head values
20074
21079
  case LLM_ARCH_LLAMA:
21080
+ case LLM_ARCH_DECI:
20075
21081
  case LLM_ARCH_BAICHUAN:
20076
21082
  case LLM_ARCH_STARCODER:
20077
21083
  case LLM_ARCH_PLAMO:
@@ -20082,6 +21088,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
20082
21088
  case LLM_ARCH_COMMAND_R:
20083
21089
  case LLM_ARCH_OLMO:
20084
21090
  case LLM_ARCH_ARCTIC:
21091
+ case LLM_ARCH_DEEPSEEK:
20085
21092
  case LLM_ARCH_DEEPSEEK2:
20086
21093
  case LLM_ARCH_CHATGLM:
20087
21094
  case LLM_ARCH_GRANITE:
@@ -20115,6 +21122,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
20115
21122
  case LLM_ARCH_MINICPM3:
20116
21123
  return LLAMA_ROPE_TYPE_NEOX;
20117
21124
 
21125
+ case LLM_ARCH_QWEN2VL:
21126
+ return LLAMA_ROPE_TYPE_MROPE;
21127
+
20118
21128
  // all model arches should be listed explicitly here
20119
21129
  case LLM_ARCH_UNKNOWN:
20120
21130
  LM_GGML_ABORT("unknown architecture");
@@ -20181,17 +21191,6 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
20181
21191
  return model->n_elements;
20182
21192
  }
20183
21193
 
20184
- struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
20185
- auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
20186
- [name](const std::pair<std::string, struct lm_ggml_tensor *> & it) {
20187
- return it.first == name;
20188
- });
20189
- if (it == model->tensors_by_name.end()) {
20190
- return nullptr;
20191
- }
20192
- return it->second;
20193
- }
20194
-
20195
21194
  bool llama_model_has_encoder(const struct llama_model * model) {
20196
21195
  switch (model->arch) {
20197
21196
  case LLM_ARCH_T5: return true;
@@ -21683,7 +22682,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
21683
22682
  throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
21684
22683
  }
21685
22684
  } else if ((size_t) i >= ctx->output_ids.size()) {
21686
- throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
22685
+ throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
21687
22686
  } else {
21688
22687
  j = ctx->output_ids[i];
21689
22688
  }
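
Aside (illustrative, not part of the diff): the format-string change above swaps %lu for %zu, the conversion specifier that matches size_t on every platform. A two-line reminder:

    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<int> output_ids(512);
        // %zu always matches size_t; %lu only happens to work where size_t is unsigned long
        printf("out of range [0, %zu)\n", output_ids.size());
        return 0;
    }
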
@@ -21854,18 +22853,115 @@ int32_t llama_detokenize(
21854
22853
  // chat templates
21855
22854
  //
21856
22855
 
22856
+ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
22857
+ if (LLM_CHAT_TEMPLATES.find(tmpl) != LLM_CHAT_TEMPLATES.end()) {
22858
+ return LLM_CHAT_TEMPLATES.at(tmpl);
22859
+ }
22860
+ auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
22861
+ return tmpl.find(haystack) != std::string::npos;
22862
+ };
22863
+ if (tmpl_contains("<|im_start|>")) {
22864
+ return LLM_CHAT_TEMPLATE_CHATML;
22865
+ } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
22866
+ if (tmpl_contains("[SYSTEM_PROMPT]")) {
22867
+ return LLM_CHAT_TEMPLATE_MISTRAL_V7;
22868
+ } else if (
22869
+ // catches official 'v1' template
22870
+ tmpl_contains("' [INST] ' + system_message")
22871
+ // catches official 'v3' and 'v3-tekken' templates
22872
+ || tmpl_contains("[AVAILABLE_TOOLS]")
22873
+ ) {
22874
+ // Official mistral 'v1', 'v3' and 'v3-tekken' templates
22875
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
22876
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
22877
+ if (tmpl_contains(" [INST]")) {
22878
+ return LLM_CHAT_TEMPLATE_MISTRAL_V1;
22879
+ } else if (tmpl_contains("\"[INST]\"")) {
22880
+ return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
22881
+ }
22882
+ return LLM_CHAT_TEMPLATE_MISTRAL_V3;
22883
+ } else {
22884
+ // llama2 template and its variants
22885
+ // [variant] support system message
22886
+ // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
22887
+ bool support_system_message = tmpl_contains("<<SYS>>");
22888
+ bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
22889
+ bool strip_message = tmpl_contains("content.strip()");
22890
+ if (strip_message) {
22891
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
22892
+ } else if (add_bos_inside_history) {
22893
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
22894
+ } else if (support_system_message) {
22895
+ return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
22896
+ } else {
22897
+ return LLM_CHAT_TEMPLATE_LLAMA_2;
22898
+ }
22899
+ }
22900
+ } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
22901
+ return LLM_CHAT_TEMPLATE_PHI_3;
22902
+ } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
22903
+ return LLM_CHAT_TEMPLATE_FALCON_3;
22904
+ } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
22905
+ return LLM_CHAT_TEMPLATE_ZEPHYR;
22906
+ } else if (tmpl_contains("bos_token + message['role']")) {
22907
+ return LLM_CHAT_TEMPLATE_MONARCH;
22908
+ } else if (tmpl_contains("<start_of_turn>")) {
22909
+ return LLM_CHAT_TEMPLATE_GEMMA;
22910
+ } else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
22911
+ // OrionStarAI/Orion-14B-Chat
22912
+ return LLM_CHAT_TEMPLATE_ORION;
22913
+ } else if (tmpl_contains("GPT4 Correct ")) {
22914
+ // openchat/openchat-3.5-0106
22915
+ return LLM_CHAT_TEMPLATE_OPENCHAT;
22916
+ } else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
22917
+ // eachadea/vicuna-13b-1.1 (and Orca variant)
22918
+ if (tmpl_contains("SYSTEM: ")) {
22919
+ return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
22920
+ }
22921
+ return LLM_CHAT_TEMPLATE_VICUNA;
22922
+ } else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
22923
+ // deepseek-ai/deepseek-coder-33b-instruct
22924
+ return LLM_CHAT_TEMPLATE_DEEPSEEK;
22925
+ } else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
22926
+ // CohereForAI/c4ai-command-r-plus
22927
+ return LLM_CHAT_TEMPLATE_COMMAND_R;
22928
+ } else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
22929
+ return LLM_CHAT_TEMPLATE_LLAMA_3;
22930
+ } else if (tmpl_contains("[gMASK]sop")) {
22931
+ // chatglm3-6b
22932
+ return LLM_CHAT_TEMPLATE_CHATGML_3;
22933
+ } else if (tmpl_contains("[gMASK]<sop>")) {
22934
+ return LLM_CHAT_TEMPLATE_CHATGML_4;
22935
+ } else if (tmpl_contains(LU8("<用户>"))) {
22936
+ // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
22937
+ return LLM_CHAT_TEMPLATE_MINICPM;
22938
+ } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
22939
+ return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
22940
+ } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
22941
+ // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
22942
+ // EXAONE-3.0-7.8B-Instruct
22943
+ return LLM_CHAT_TEMPLATE_EXAONE_3;
22944
+ } else if (tmpl_contains("rwkv-world")) {
22945
+ return LLM_CHAT_TEMPLATE_RWKV_WORLD;
22946
+ } else if (tmpl_contains("<|start_of_role|>")) {
22947
+ return LLM_CHAT_TEMPLATE_GRANITE;
22948
+ } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
22949
+ return LLM_CHAT_TEMPLATE_GIGACHAT;
22950
+ } else if (tmpl_contains("<|role_start|>")) {
22951
+ return LLM_CHAT_TEMPLATE_MEGREZ;
22952
+ }
22953
+ return LLM_CHAT_TEMPLATE_UNKNOWN;
22954
+ }
22955
+
21857
22956
  // Simple version of "llama_apply_chat_template" that only works with strings
21858
22957
  // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
21859
22958
  static int32_t llama_chat_apply_template_internal(
21860
- const std::string & tmpl,
22959
+ const llm_chat_template tmpl,
21861
22960
  const std::vector<const llama_chat_message *> & chat,
21862
22961
  std::string & dest, bool add_ass) {
21863
22962
  // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
21864
22963
  std::stringstream ss;
21865
- auto tmpl_contains = [&tmpl](std::string haystack) -> bool {
21866
- return tmpl.find(haystack) != std::string::npos;
21867
- };
21868
- if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
22964
+ if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
21869
22965
  // chatml template
21870
22966
  for (auto message : chat) {
21871
22967
  ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -21873,16 +22969,59 @@ static int32_t llama_chat_apply_template_internal(
21873
22969
  if (add_ass) {
21874
22970
  ss << "<|im_start|>assistant\n";
21875
22971
  }
21876
- } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) {
22972
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
22973
+ // Official mistral 'v7' template
22974
+ // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
22975
+ for (auto message : chat) {
22976
+ std::string role(message->role);
22977
+ std::string content(message->content);
22978
+ if (role == "system") {
22979
+ ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
22980
+ } else if (role == "user") {
22981
+ ss << "[INST] " << content << "[/INST]";
22982
+ }
22983
+ else {
22984
+ ss << " " << content << "</s>";
22985
+ }
22986
+ }
22987
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
22988
+ || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
22989
+ || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
22990
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
22991
+ // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
22992
+ std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
22993
+ std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
22994
+ bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
22995
+ bool is_inside_turn = false;
22996
+ for (auto message : chat) {
22997
+ if (!is_inside_turn) {
22998
+ ss << leading_space << "[INST]" << trailing_space;
22999
+ is_inside_turn = true;
23000
+ }
23001
+ std::string role(message->role);
23002
+ std::string content(message->content);
23003
+ if (role == "system") {
23004
+ ss << content << "\n\n";
23005
+ } else if (role == "user") {
23006
+ ss << content << leading_space << "[/INST]";
23007
+ } else {
23008
+ ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
23009
+ is_inside_turn = false;
23010
+ }
23011
+ }
23012
+ } else if (
23013
+ tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
23014
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
23015
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
23016
+ || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
21877
23017
  // llama2 template and its variants
21878
23018
  // [variant] support system message
21879
- bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "mistral";
21880
- // [variant] space before + after response
21881
- bool space_around_response = tmpl_contains("' ' + eos_token");
23019
+ // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
23020
+ bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
21882
23021
  // [variant] add BOS inside history
21883
- bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
23022
+ bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
21884
23023
  // [variant] trim spaces from the input message
21885
- bool strip_message = tmpl_contains("content.strip()");
23024
+ bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
21886
23025
  // construct the prompt
21887
23026
  bool is_inside_turn = true; // skip BOS at the beginning
21888
23027
  ss << "[INST] ";
@@ -21903,12 +23042,11 @@ static int32_t llama_chat_apply_template_internal(
21903
23042
  } else if (role == "user") {
21904
23043
  ss << content << " [/INST]";
21905
23044
  } else {
21906
- ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
23045
+ ss << content << "</s>";
21907
23046
  is_inside_turn = false;
21908
23047
  }
21909
23048
  }
21910
- // llama2 templates seem to not care about "add_generation_prompt"
21911
- } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
23049
+ } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
21912
23050
  // Phi 3
21913
23051
  for (auto message : chat) {
21914
23052
  std::string role(message->role);
@@ -21917,7 +23055,16 @@ static int32_t llama_chat_apply_template_internal(
21917
23055
  if (add_ass) {
21918
23056
  ss << "<|assistant|>\n";
21919
23057
  }
21920
- } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) {
23058
+ } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
23059
+ // Falcon 3
23060
+ for (auto message : chat) {
23061
+ std::string role(message->role);
23062
+ ss << "<|" << role << "|>\n" << message->content << "\n";
23063
+ }
23064
+ if (add_ass) {
23065
+ ss << "<|assistant|>\n";
23066
+ }
23067
+ } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
21921
23068
  // zephyr template
21922
23069
  for (auto message : chat) {
21923
23070
  ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -21925,7 +23072,7 @@ static int32_t llama_chat_apply_template_internal(
21925
23072
  if (add_ass) {
21926
23073
  ss << "<|assistant|>\n";
21927
23074
  }
21928
- } else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) {
23075
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
21929
23076
  // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
21930
23077
  for (auto message : chat) {
21931
23078
  std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -21934,7 +23081,7 @@ static int32_t llama_chat_apply_template_internal(
21934
23081
  if (add_ass) {
21935
23082
  ss << "<s>assistant\n";
21936
23083
  }
21937
- } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("<start_of_turn>")) {
23084
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
21938
23085
  // google/gemma-7b-it
21939
23086
  std::string system_prompt = "";
21940
23087
  for (auto message : chat) {
@@ -21956,7 +23103,7 @@ static int32_t llama_chat_apply_template_internal(
21956
23103
  if (add_ass) {
21957
23104
  ss << "<start_of_turn>model\n";
21958
23105
  }
21959
- } else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
23106
+ } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
21960
23107
  // OrionStarAI/Orion-14B-Chat
21961
23108
  std::string system_prompt = "";
21962
23109
  for (auto message : chat) {
@@ -21976,7 +23123,7 @@ static int32_t llama_chat_apply_template_internal(
21976
23123
  ss << message->content << "</s>";
21977
23124
  }
21978
23125
  }
21979
- } else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) {
23126
+ } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
21980
23127
  // openchat/openchat-3.5-0106,
21981
23128
  for (auto message : chat) {
21982
23129
  std::string role(message->role);
@@ -21990,13 +23137,13 @@ static int32_t llama_chat_apply_template_internal(
21990
23137
  if (add_ass) {
21991
23138
  ss << "GPT4 Correct Assistant:";
21992
23139
  }
21993
- } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) {
23140
+ } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
21994
23141
  // eachadea/vicuna-13b-1.1 (and Orca variant)
21995
23142
  for (auto message : chat) {
21996
23143
  std::string role(message->role);
21997
23144
  if (role == "system") {
21998
23145
  // Orca-Vicuna variant uses a system prefix
21999
- if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) {
23146
+ if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
22000
23147
  ss << "SYSTEM: " << message->content << "\n";
22001
23148
  } else {
22002
23149
  ss << message->content << "\n\n";
@@ -22010,7 +23157,7 @@ static int32_t llama_chat_apply_template_internal(
22010
23157
  if (add_ass) {
22011
23158
  ss << "ASSISTANT:";
22012
23159
  }
22013
- } else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) {
23160
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
22014
23161
  // deepseek-ai/deepseek-coder-33b-instruct
22015
23162
  for (auto message : chat) {
22016
23163
  std::string role(message->role);
@@ -22025,7 +23172,7 @@ static int32_t llama_chat_apply_template_internal(
22025
23172
  if (add_ass) {
22026
23173
  ss << "### Response:\n";
22027
23174
  }
22028
- } else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) {
23175
+ } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
22029
23176
  // CohereForAI/c4ai-command-r-plus
22030
23177
  for (auto message : chat) {
22031
23178
  std::string role(message->role);
@@ -22040,7 +23187,7 @@ static int32_t llama_chat_apply_template_internal(
22040
23187
  if (add_ass) {
22041
23188
  ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
22042
23189
  }
22043
- } else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) {
23190
+ } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
22044
23191
  // Llama 3
22045
23192
  for (auto message : chat) {
22046
23193
  std::string role(message->role);
@@ -22049,7 +23196,7 @@ static int32_t llama_chat_apply_template_internal(
22049
23196
  if (add_ass) {
22050
23197
  ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
22051
23198
  }
22052
- } else if (tmpl == "chatglm3" || tmpl_contains("[gMASK]sop")) {
23199
+ } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
22053
23200
  // chatglm3-6b
22054
23201
  ss << "[gMASK]" << "sop";
22055
23202
  for (auto message : chat) {
@@ -22059,7 +23206,7 @@ static int32_t llama_chat_apply_template_internal(
22059
23206
  if (add_ass) {
22060
23207
  ss << "<|assistant|>";
22061
23208
  }
22062
- } else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
23209
+ } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
22063
23210
  ss << "[gMASK]" << "<sop>";
22064
23211
  for (auto message : chat) {
22065
23212
  std::string role(message->role);
@@ -22068,7 +23215,7 @@ static int32_t llama_chat_apply_template_internal(
22068
23215
  if (add_ass) {
22069
23216
  ss << "<|assistant|>";
22070
23217
  }
22071
- } else if (tmpl == "minicpm" || tmpl_contains(LU8("<用户>"))) {
23218
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
22072
23219
  // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
22073
23220
  for (auto message : chat) {
22074
23221
  std::string role(message->role);
@@ -22080,7 +23227,7 @@ static int32_t llama_chat_apply_template_internal(
22080
23227
  ss << trim(message->content);
22081
23228
  }
22082
23229
  }
22083
- } else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
23230
+ } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
22084
23231
  // DeepSeek-V2
22085
23232
  for (auto message : chat) {
22086
23233
  std::string role(message->role);
@@ -22095,7 +23242,7 @@ static int32_t llama_chat_apply_template_internal(
22095
23242
  if (add_ass) {
22096
23243
  ss << "Assistant:";
22097
23244
  }
22098
- } else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) {
23245
+ } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
22099
23246
  // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
22100
23247
  // EXAONE-3.0-7.8B-Instruct
22101
23248
  for (auto message : chat) {
@@ -22111,7 +23258,7 @@ static int32_t llama_chat_apply_template_internal(
22111
23258
  if (add_ass) {
22112
23259
  ss << "[|assistant|]";
22113
23260
  }
22114
- } else if (tmpl == "rwkv-world" || tmpl_contains("rwkv-world")) {
23261
+ } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
22115
23262
  // this template requires the model to have "\n\n" as EOT token
22116
23263
  for (auto message : chat) {
22117
23264
  std::string role(message->role);
@@ -22121,7 +23268,7 @@ static int32_t llama_chat_apply_template_internal(
22121
23268
  ss << message->content << "\n\n";
22122
23269
  }
22123
23270
  }
22124
- } else if (tmpl == "granite" || tmpl_contains("<|start_of_role|>")) {
23271
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
22125
23272
  // IBM Granite template
22126
23273
  for (const auto & message : chat) {
22127
23274
  std::string role(message->role);
@@ -22134,6 +23281,42 @@ static int32_t llama_chat_apply_template_internal(
22134
23281
  if (add_ass) {
22135
23282
  ss << "<|start_of_role|>assistant<|end_of_role|>\n";
22136
23283
  }
23284
+ } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
23285
+ // GigaChat template
23286
+ bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
23287
+
23288
+ // Handle system message if present
23289
+ if (has_system) {
23290
+ ss << "<s>" << chat[0]->content << "<|message_sep|>";
23291
+ } else {
23292
+ ss << "<s>";
23293
+ }
23294
+
23295
+ // Process remaining messages
23296
+ for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
23297
+ std::string role(chat[i]->role);
23298
+ if (role == "user") {
23299
+ ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
23300
+ << "available functions<|role_sep|>[]<|message_sep|>";
23301
+ } else if (role == "assistant") {
23302
+ ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
23303
+ }
23304
+ }
23305
+
23306
+ // Add generation prompt if needed
23307
+ if (add_ass) {
23308
+ ss << "assistant<|role_sep|>";
23309
+ }
23310
+ } else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
23311
+ // Megrez template
23312
+ for (auto message : chat) {
23313
+ std::string role(message->role);
23314
+ ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
23315
+ }
23316
+
23317
+ if (add_ass) {
23318
+ ss << "<|role_start|>assistant<|role_end|>";
23319
+ }
22137
23320
  } else {
22138
23321
  // template not supported
22139
23322
  return -1;
@@ -22153,15 +23336,15 @@ int32_t llama_chat_apply_template(
22153
23336
  std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
22154
23337
  if (tmpl == nullptr) {
22155
23338
  LM_GGML_ASSERT(model != nullptr);
22156
- // load template from model
22157
- std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
22158
- std::string template_key = "tokenizer.chat_template";
22159
- int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
22160
- if (res < 0) {
23339
+
23340
+ // load template from model, if available
23341
+ const auto & it = model->lm_gguf_kv.find("tokenizer.chat_template");
23342
+ if (it != model->lm_gguf_kv.end() && it->second.size() > 0) {
23343
+ curr_tmpl = it->second;
23344
+ }
23345
+ else {
22161
23346
  // worst case: there is no information about template, we will use chatml by default
22162
- curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
22163
- } else {
22164
- curr_tmpl = std::string(model_template.data(), model_template.size());
23347
+ curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
22165
23348
  }
22166
23349
  }
22167
23350
 
@@ -22173,7 +23356,11 @@ int32_t llama_chat_apply_template(
22173
23356
  }
22174
23357
 
22175
23358
  std::string formatted_chat;
22176
- int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
23359
+ llm_chat_template detected_tmpl = llama_chat_detect_template(curr_tmpl);
23360
+ if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
23361
+ return -1;
23362
+ }
23363
+ int32_t res = llama_chat_apply_template_internal(detected_tmpl, chat_vec, formatted_chat, add_ass);
22177
23364
  if (res < 0) {
22178
23365
  return res;
22179
23366
  }
@@ -22183,6 +23370,15 @@ int32_t llama_chat_apply_template(
22183
23370
  return res;
22184
23371
  }
22185
23372
 
23373
+ int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
23374
+ auto it = LLM_CHAT_TEMPLATES.begin();
23375
+ for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
23376
+ output[i] = it->first.c_str();
23377
+ std::advance(it, 1);
23378
+ }
23379
+ return (int32_t) LLM_CHAT_TEMPLATES.size();
23380
+ }
23381
+
22186
23382
  //
22187
23383
  // sampling
22188
23384
  //
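
Aside (illustrative, not part of the diff): both the new template-detection path and llama_chat_builtin_templates are reachable through the public API, assuming the signatures as they appear in the bundled llama.h of this release. In the sketch below, passing a null model together with an explicit template name such as "chatml" keeps the example independent of model metadata; the conversation content is made up.

    #include "llama.h"   // bundled header: package/cpp/llama.h
    #include <cstdio>
    #include <vector>

    int main() {
        // enumerate the built-in template names: first call gets the count, second fills the array
        int32_t n_tmpl = llama_chat_builtin_templates(nullptr, 0);
        std::vector<const char *> names(n_tmpl);
        llama_chat_builtin_templates(names.data(), names.size());
        for (const char * name : names) {
            printf("builtin chat template: %s\n", name);
        }

        // apply a named template without loading a model (the model may be null when tmpl is given)
        const llama_chat_message chat[] = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!"                       },
        };
        std::vector<char> buf(2048);
        int32_t n = llama_chat_apply_template(
            /*model  */ nullptr,
            /*tmpl   */ "chatml",
            chat, sizeof(chat)/sizeof(chat[0]),
            /*add_ass*/ true,
            buf.data(), (int32_t) buf.size());
        if (n > 0 && n <= (int32_t) buf.size()) {
            printf("%.*s", n, buf.data());
        }
        return 0;
    }
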