@fugood/llama.node 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/CMakeLists.txt +0 -1
  3. package/src/llama.cpp/common/CMakeLists.txt +4 -5
  4. package/src/llama.cpp/common/arg.cpp +44 -0
  5. package/src/llama.cpp/common/common.cpp +22 -6
  6. package/src/llama.cpp/common/common.h +15 -1
  7. package/src/llama.cpp/ggml/CMakeLists.txt +10 -2
  8. package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  9. package/src/llama.cpp/ggml/include/ggml.h +104 -10
  10. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  11. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  12. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +749 -163
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +12 -9
  18. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +88 -9
  19. package/src/llama.cpp/include/llama.h +13 -47
  20. package/src/llama.cpp/src/llama-arch.cpp +298 -3
  21. package/src/llama.cpp/src/llama-arch.h +22 -1
  22. package/src/llama.cpp/src/llama-batch.cpp +103 -71
  23. package/src/llama.cpp/src/llama-batch.h +31 -18
  24. package/src/llama.cpp/src/llama-chat.cpp +59 -1
  25. package/src/llama.cpp/src/llama-chat.h +3 -0
  26. package/src/llama.cpp/src/llama-context.cpp +134 -95
  27. package/src/llama.cpp/src/llama-context.h +13 -16
  28. package/src/llama.cpp/src/llama-cparams.h +3 -2
  29. package/src/llama.cpp/src/llama-graph.cpp +279 -180
  30. package/src/llama.cpp/src/llama-graph.h +183 -122
  31. package/src/llama.cpp/src/llama-hparams.cpp +47 -1
  32. package/src/llama.cpp/src/llama-hparams.h +12 -1
  33. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
  34. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
  35. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
  36. package/src/llama.cpp/src/llama-kv-cache-unified.h +143 -47
  37. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  38. package/src/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
  39. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  40. package/src/llama.cpp/src/llama-memory-recurrent.cpp +21 -11
  41. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  42. package/src/llama.cpp/src/llama-memory.h +3 -0
  43. package/src/llama.cpp/src/llama-model.cpp +3373 -743
  44. package/src/llama.cpp/src/llama-model.h +20 -4
  45. package/src/llama.cpp/src/llama-quant.cpp +2 -2
  46. package/src/llama.cpp/src/llama-vocab.cpp +376 -10
  47. package/src/llama.cpp/src/llama-vocab.h +43 -0
  48. package/src/llama.cpp/src/unicode.cpp +207 -0
  49. package/src/llama.cpp/src/unicode.h +2 -0
  50. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/src/llama.cpp/src/llama-arch.cpp
@@ -34,6 +34,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_PHI3, "phi3" },
  { LLM_ARCH_PHIMOE, "phimoe" },
  { LLM_ARCH_PLAMO, "plamo" },
+ { LLM_ARCH_PLAMO2, "plamo2" },
  { LLM_ARCH_CODESHELL, "codeshell" },
  { LLM_ARCH_ORION, "orion" },
  { LLM_ARCH_INTERNLM2, "internlm2" },
@@ -45,6 +46,9 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_GEMMA3N, "gemma3n" },
  { LLM_ARCH_STARCODER2, "starcoder2" },
  { LLM_ARCH_MAMBA, "mamba" },
+ { LLM_ARCH_MAMBA2, "mamba2" },
+ { LLM_ARCH_JAMBA, "jamba" },
+ { LLM_ARCH_FALCON_H1, "falcon-h1" },
  { LLM_ARCH_XVERSE, "xverse" },
  { LLM_ARCH_COMMAND_R, "command-r" },
  { LLM_ARCH_COHERE2, "cohere2" },
@@ -64,12 +68,14 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_JAIS, "jais" },
  { LLM_ARCH_NEMOTRON, "nemotron" },
  { LLM_ARCH_EXAONE, "exaone" },
+ { LLM_ARCH_EXAONE4, "exaone4" },
  { LLM_ARCH_RWKV6, "rwkv6" },
  { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
  { LLM_ARCH_RWKV7, "rwkv7" },
  { LLM_ARCH_ARWKV7, "arwkv7" },
  { LLM_ARCH_GRANITE, "granite" },
  { LLM_ARCH_GRANITE_MOE, "granitemoe" },
+ { LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
  { LLM_ARCH_CHAMELEON, "chameleon" },
  { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
  { LLM_ARCH_PLM, "plm" },
@@ -77,6 +83,11 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_DOTS1, "dots1" },
  { LLM_ARCH_ARCEE, "arcee" },
  { LLM_ARCH_ERNIE4_5, "ernie4_5" },
+ { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
+ { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
+ { LLM_ARCH_SMOLLM3, "smollm3" },
+ { LLM_ARCH_LFM2, "lfm2" },
+ { LLM_ARCH_DREAM, "dream" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -149,7 +160,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
  { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
  { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
- { LLM_KV_ATTENTION_LAYER_INDICES, "%s.attention.layer_indices" },

  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
  { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -170,6 +180,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
  { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
  { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+ { LLM_KV_SSM_GROUP_COUNT, "%s.ssm.group_count" },
  { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },

  { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
@@ -182,6 +193,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

  { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },

+ { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
+
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
  { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -775,6 +788,36 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_PLAMO2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+ { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+ { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+ { LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" },
+ { LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" },
+ { LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ },
+ },
  {
  LLM_ARCH_CODESHELL,
  {
@@ -1004,6 +1047,77 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
  },
  },
+ {
+ LLM_ARCH_MAMBA2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+ { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+ { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+ },
+ },
+ {
+ LLM_ARCH_JAMBA,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+ { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+ { LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" },
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+ { LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" },
+ { LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" },
+ { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
+ {
+ LLM_ARCH_FALCON_H1,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+ { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+ { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_XVERSE,
  {
@@ -1397,6 +1511,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_EXAONE4,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ }
+ },
  {
  LLM_ARCH_RWKV6,
  {
@@ -1564,6 +1698,43 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
  },
  },
+ {
+ LLM_ARCH_GRANITE_HYBRID,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ // mamba(2) ssm layers
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+ { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+ { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+ // attention layers
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ // dense FFN
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ // moe FFN
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ // shared expert
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ },
+ },
  {
  LLM_ARCH_CHAMELEON,
  {
@@ -1676,12 +1847,115 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_ERNIE4_5_MOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+ },
+ },
+ {
+ LLM_ARCH_HUNYUAN_MOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
+ {
+ LLM_ARCH_SMOLLM3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_LFM2,
+ {
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" },
+ { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
+ { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ }
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  },
  },
+ {
+ LLM_ARCH_DREAM,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  };

  static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1760,7 +2034,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  {LLM_TENSOR_FFN_ACT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
  {LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
  {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
+ {LLM_TENSOR_SSM_DT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SSM_B_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
  {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
  {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
  {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
  {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -1839,6 +2117,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
+ {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  };

  LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
@@ -1894,6 +2175,7 @@ const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) {
  bool llm_arch_is_recurrent(const llm_arch & arch) {
  switch (arch) {
  case LLM_ARCH_MAMBA:
+ case LLM_ARCH_MAMBA2:
  case LLM_ARCH_RWKV6:
  case LLM_ARCH_RWKV6QWEN2:
  case LLM_ARCH_RWKV7:
@@ -1905,9 +2187,22 @@ bool llm_arch_is_recurrent(const llm_arch & arch) {
  }

  bool llm_arch_is_hybrid(const llm_arch & arch) {
- // TODO: There are currently no hybrid models! Once there are, this will be
- // the place to identify them
  switch (arch) {
+ case LLM_ARCH_JAMBA:
+ case LLM_ARCH_FALCON_H1:
+ case LLM_ARCH_PLAMO2:
+ case LLM_ARCH_GRANITE_HYBRID:
+ case LLM_ARCH_LFM2:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ bool llm_arch_is_diffusion(const llm_arch & arch) {
+ switch (arch) {
+ case LLM_ARCH_DREAM:
+ return true;
  default:
  return false;
  }
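
Note on the hunk above (illustration, not part of the diff): the three classification helpers — llm_arch_is_recurrent, llm_arch_is_hybrid, and the new llm_arch_is_diffusion — give callers a single place to ask what kind of state an architecture needs. A minimal sketch of how such helpers could drive memory selection is below; memory_kind and choose_memory_kind are hypothetical names, and the actual routing inside llama.cpp may differ.

    // Sketch only, assuming the declarations from llama-arch.h shown in this diff.
    #include "llama-arch.h"

    enum class memory_kind { kv_unified, recurrent, hybrid, none };

    static memory_kind choose_memory_kind(llm_arch arch) {
        if (llm_arch_is_hybrid(arch)) {
            return memory_kind::hybrid;    // e.g. LLM_ARCH_JAMBA, LLM_ARCH_FALCON_H1, LLM_ARCH_LFM2
        }
        if (llm_arch_is_recurrent(arch)) {
            return memory_kind::recurrent; // e.g. LLM_ARCH_MAMBA2, LLM_ARCH_RWKV7
        }
        if (llm_arch_is_diffusion(arch)) {
            return memory_kind::none;      // e.g. LLM_ARCH_DREAM
        }
        return memory_kind::kv_unified;    // default: attention with a regular KV cache
    }
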
package/src/llama.cpp/src/llama-arch.h
@@ -38,6 +38,7 @@ enum llm_arch {
  LLM_ARCH_PHI3,
  LLM_ARCH_PHIMOE,
  LLM_ARCH_PLAMO,
+ LLM_ARCH_PLAMO2,
  LLM_ARCH_CODESHELL,
  LLM_ARCH_ORION,
  LLM_ARCH_INTERNLM2,
@@ -49,6 +50,9 @@ enum llm_arch {
  LLM_ARCH_GEMMA3N,
  LLM_ARCH_STARCODER2,
  LLM_ARCH_MAMBA,
+ LLM_ARCH_MAMBA2,
+ LLM_ARCH_JAMBA,
+ LLM_ARCH_FALCON_H1,
  LLM_ARCH_XVERSE,
  LLM_ARCH_COMMAND_R,
  LLM_ARCH_COHERE2,
@@ -68,12 +72,14 @@ enum llm_arch {
  LLM_ARCH_JAIS,
  LLM_ARCH_NEMOTRON,
  LLM_ARCH_EXAONE,
+ LLM_ARCH_EXAONE4,
  LLM_ARCH_RWKV6,
  LLM_ARCH_RWKV6QWEN2,
  LLM_ARCH_RWKV7,
  LLM_ARCH_ARWKV7,
  LLM_ARCH_GRANITE,
  LLM_ARCH_GRANITE_MOE,
+ LLM_ARCH_GRANITE_HYBRID,
  LLM_ARCH_CHAMELEON,
  LLM_ARCH_WAVTOKENIZER_DEC,
  LLM_ARCH_PLM,
@@ -81,6 +87,11 @@ enum llm_arch {
  LLM_ARCH_DOTS1,
  LLM_ARCH_ARCEE,
  LLM_ARCH_ERNIE4_5,
+ LLM_ARCH_ERNIE4_5_MOE,
+ LLM_ARCH_HUNYUAN_MOE,
+ LLM_ARCH_SMOLLM3,
+ LLM_ARCH_LFM2,
+ LLM_ARCH_DREAM,
  LLM_ARCH_UNKNOWN,
  };

@@ -153,7 +164,6 @@ enum llm_kv {
  LLM_KV_ATTENTION_SCALE,
  LLM_KV_ATTENTION_KEY_LENGTH_MLA,
  LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
- LLM_KV_ATTENTION_LAYER_INDICES,

  LLM_KV_ROPE_DIMENSION_COUNT,
  LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -174,6 +184,7 @@ enum llm_kv {
  LLM_KV_SSM_CONV_KERNEL,
  LLM_KV_SSM_STATE_SIZE,
  LLM_KV_SSM_TIME_STEP_RANK,
+ LLM_KV_SSM_GROUP_COUNT,
  LLM_KV_SSM_DT_B_C_RMS,

  LLM_KV_WKV_HEAD_SIZE,
@@ -221,6 +232,8 @@ enum llm_kv {

  LLM_KV_CLASSIFIER_OUTPUT_LABELS,

+ LLM_KV_SHORTCONV_L_CACHE,
+
  // deprecated:
  LLM_KV_TOKENIZER_PREFIX_ID,
  LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -291,8 +304,12 @@ enum llm_tensor {
  LLM_TENSOR_SSM_CONV1D,
  LLM_TENSOR_SSM_X,
  LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_DT_NORM,
  LLM_TENSOR_SSM_A,
+ LLM_TENSOR_SSM_B_NORM,
+ LLM_TENSOR_SSM_C_NORM,
  LLM_TENSOR_SSM_D,
+ LLM_TENSOR_SSM_NORM,
  LLM_TENSOR_SSM_OUT,
  LLM_TENSOR_TIME_MIX_W0,
  LLM_TENSOR_TIME_MIX_W1,
@@ -386,6 +403,9 @@ enum llm_tensor {
  LLM_TENSOR_POS_NET_ATTN_K,
  LLM_TENSOR_POS_NET_ATTN_V,
  LLM_TENSOR_POS_NET_ATTN_OUT,
+ LLM_TENSOR_SHORTCONV_CONV,
+ LLM_TENSOR_SHORTCONV_INPROJ,
+ LLM_TENSOR_SHORTCONV_OUTPROJ,
  };

  enum llm_tensor_layer {
@@ -462,3 +482,4 @@ const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);

  bool llm_arch_is_recurrent(const llm_arch & arch);
  bool llm_arch_is_hybrid (const llm_arch & arch);
+ bool llm_arch_is_diffusion(const llm_arch & arch);
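
Aside on how the name tables above are consumed (illustration only, not part of the diff): the "%s" patterns in LLM_KV_NAMES are expanded with the architecture name, and the "blk.%d" patterns in LLM_TENSOR_NAMES are expanded with the layer index, yielding the concrete GGUF key and tensor names. A minimal, self-contained sketch of that expansion using two of the new entries:

    // Expands two of the format strings added in this diff; plain snprintf,
    // no llama.cpp internals required.
    #include <cstdio>

    int main() {
        char kv[128];
        char tn[128];
        // "%s.ssm.group_count" + arch "mamba2"  -> "mamba2.ssm.group_count"
        std::snprintf(kv, sizeof(kv), "%s.ssm.group_count", "mamba2");
        // "blk.%d.shortconv.in_proj" + layer 3  -> "blk.3.shortconv.in_proj"
        std::snprintf(tn, sizeof(tn), "blk.%d.shortconv.in_proj", 3);
        std::printf("%s\n%s\n", kv, tn);
        return 0;
    }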