@fugood/llama.node 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/common/CMakeLists.txt +4 -5
  3. package/src/llama.cpp/common/arg.cpp +37 -0
  4. package/src/llama.cpp/common/common.cpp +22 -6
  5. package/src/llama.cpp/common/common.h +14 -1
  6. package/src/llama.cpp/ggml/CMakeLists.txt +3 -0
  7. package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  8. package/src/llama.cpp/ggml/include/ggml.h +13 -0
  9. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
  10. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  11. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +23 -8
  12. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +39 -0
  14. package/src/llama.cpp/include/llama.h +13 -48
  15. package/src/llama.cpp/src/llama-arch.cpp +222 -15
  16. package/src/llama.cpp/src/llama-arch.h +16 -1
  17. package/src/llama.cpp/src/llama-batch.cpp +76 -70
  18. package/src/llama.cpp/src/llama-batch.h +24 -18
  19. package/src/llama.cpp/src/llama-chat.cpp +44 -1
  20. package/src/llama.cpp/src/llama-chat.h +2 -0
  21. package/src/llama.cpp/src/llama-context.cpp +134 -95
  22. package/src/llama.cpp/src/llama-context.h +13 -16
  23. package/src/llama.cpp/src/llama-cparams.h +3 -2
  24. package/src/llama.cpp/src/llama-graph.cpp +239 -154
  25. package/src/llama.cpp/src/llama-graph.h +162 -126
  26. package/src/llama.cpp/src/llama-hparams.cpp +45 -0
  27. package/src/llama.cpp/src/llama-hparams.h +11 -1
  28. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
  29. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
  30. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
  31. package/src/llama.cpp/src/llama-kv-cache-unified.h +89 -31
  32. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
  33. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -9
  34. package/src/llama.cpp/src/llama-model.cpp +2309 -665
  35. package/src/llama.cpp/src/llama-model.h +18 -4
  36. package/src/llama.cpp/src/llama-quant.cpp +2 -2
  37. package/src/llama.cpp/src/llama-vocab.cpp +368 -9
  38. package/src/llama.cpp/src/llama-vocab.h +43 -0
  39. package/src/llama.cpp/src/unicode.cpp +207 -0
  40. package/src/llama.cpp/src/unicode.h +2 -0
@@ -4015,6 +4015,9 @@ static void ggml_compute_forward_rms_norm_f32(
 
  const float scale = 1.0f/sqrtf(mean + eps);
 
+ // if you hit this, likely you got an inf somewhere earlier
+ assert(scale > 0.0f);
+
  ggml_vec_scale_f32(ne00, y, scale);
  }
  }
@@ -4643,9 +4646,11 @@ static void ggml_compute_forward_scale_f32(
  GGML_ASSERT(ggml_is_contiguous(dst));
  GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
- // scale factor
- float v;
- memcpy(&v, dst->op_params, sizeof(float));
+ float s; // scale factor
+ float b; // bias
+
+ memcpy(&s, (float *) dst->op_params + 0, sizeof(float));
+ memcpy(&b, (float *) dst->op_params + 1, sizeof(float));
 
  const int ith = params->ith;
  const int nth = params->nth;
@@ -4664,12 +4669,22 @@ static void ggml_compute_forward_scale_f32(
 
  const size_t nb1 = dst->nb[1];
 
- for (int i1 = ir0; i1 < ir1; i1++) {
- if (dst->data != src0->data) {
- // src0 is same shape as dst => same indices
- memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
+ if (b == 0.0f) {
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ if (dst->data != src0->data) {
+ // src0 is same shape as dst => same indices
+ // TODO: add x parameter to ggml_vec_scale_f32 and remove this memcpy
+ memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
+ }
+ ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), s);
+ }
+ } else {
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ ggml_vec_mad1_f32(nc,
+ (float *) ((char *) dst->data + i1*nb1),
+ (float *) ((char *) src0->data + i1*nb1),
+ s, b);
  }
- ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v);
  }
  }
 
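The updated scale op now reads two floats from op_params (scale s, then bias b) and computes dst = src0*s + b row by row; when b is zero it keeps the existing ggml_vec_scale_f32 path. A minimal scalar sketch of the per-row semantics (the helper name below is illustrative, not part of ggml):

    // illustrative only: per-row behaviour of the updated scale op
    static void scale_row_sketch(const int nc, float * dst, const float * src, const float s, const float b) {
        for (int i = 0; i < nc; ++i) {
            dst[i] = src[i]*s + b; // b == 0.0f reduces to the old pure-scale behaviour
        }
    }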
@@ -221,6 +221,9 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
  for (int i = np; i < n; ++i) {
  sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
  }
+
+ // if you hit this, you are likely running outside the FP range
+ assert(!isnan(sumf) && !isinf(sumf));
  #else
  for (int i = 0; i < n; ++i) {
  sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
@@ -351,6 +351,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
  #endif
  }
 
+ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
+ #if defined(GGML_USE_ACCELERATE)
+ vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
+ #elif defined(GGML_SIMD)
+ #if defined(__ARM_FEATURE_SVE)
+ // scalar ; TODO: Write SVE code
+ for (int i = 0; i < n; ++i) {
+ y[i] = x[i]*s + b;
+ }
+ #else
+ const int np = (n & ~(GGML_F32_STEP - 1));
+
+ GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
+ GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
+
+ GGML_F32_VEC ay[GGML_F32_ARR];
+
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
+ for (int j = 0; j < GGML_F32_ARR; j++) {
+ ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+ ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
+
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+ }
+ }
+
+ // leftovers
+ for (int i = np; i < n; ++i) {
+ y[i] = x[i]*s + b;
+ }
+ #endif
+ #else
+ // scalar
+ for (int i = 0; i < n; ++i) {
+ y[i] = x[i]*s + b;
+ }
+ #endif
+ }
+
  //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
  inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
  #if defined(GGML_USE_ACCELERATE)
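ggml_vec_mad1_f32 is the vector helper backing the bias path above: y[i] = x[i]*s + b, dispatching to Accelerate's vDSP_vsmsa or the generic SIMD kernels when available. A hedged usage sketch based on the scalar fallback shown in the hunk:

    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4];
    ggml_vec_mad1_f32(4, y, x, 2.0f, 0.5f); // y becomes {2.5f, 4.5f, 6.5f, 8.5f}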
@@ -71,53 +71,13 @@ extern "C" {
  typedef int32_t llama_seq_id;
 
  enum llama_vocab_type {
- LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
- LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
- LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
- LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
- LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
- LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
- };
-
- // pre-tokenization types
- enum llama_vocab_pre_type {
- LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
- LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
- LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
- LLAMA_VOCAB_PRE_TYPE_MPT = 5,
- LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
- LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
- LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
- LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
- LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
- LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
- LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
- LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
- LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
- LLAMA_VOCAB_PRE_TYPE_PORO = 15,
- LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
- LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
- LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
- LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
- LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
- LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
- LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
- LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
- LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
- LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
- LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
- LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
- LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
- LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
- LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
- LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
- LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
- LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
- LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
- LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
+ LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+ LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+ LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
+ LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
+ LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
+ LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
+ LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
  };
 
  enum llama_rope_type {
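The public enum gains LLAMA_VOCAB_TYPE_PLAMO2, while llama_vocab_pre_type leaves the public header (presumably moving into the internal vocab implementation, consistent with the llama-vocab.h changes listed above). A hedged sketch of querying the new type, assuming a loaded llama_model * model:

    const struct llama_vocab * vocab = llama_model_get_vocab(model);
    if (llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_PLAMO2) {
        // PLaMo-2 models tokenize via the new Aho-Corasick based tokenizer
    }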
@@ -375,6 +335,9 @@ extern "C" {
  bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
  // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
  // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+ bool kv_unified; // use a unified buffer across the input sequences when computing the attention
+ // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14363
  };
 
  // model quantization parameters
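kv_unified is a new field of llama_context_params. A hedged sketch of disabling the unified KV buffer for multi-sequence decoding, assuming a loaded llama_model * model (check this release's default value before relying on it):

    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_seq_max  = 4;      // several parallel sequences
    cparams.kv_unified = false;  // per the comment above: may help when sequences share little prefix
    struct llama_context * ctx = llama_init_from_model(model, cparams);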
@@ -765,7 +728,7 @@ extern "C" {
  // - lazily on next llama_decode()
  // p0 < 0 : [0, p1]
  // p1 < 0 : [p0, inf)
- DEPRECATED(void llama_kv_self_seq_div(
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
  struct llama_context * ctx,
  llama_seq_id seq_id,
  llama_pos p0,
@@ -1045,6 +1008,7 @@ extern "C" {
  LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
  LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
  LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
+ LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
 
  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
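llama_vocab_mask joins the existing special-token getters (BOS/EOS/PAD). A hedged sketch; the "not present" return value is assumed to be LLAMA_TOKEN_NULL, mirroring the other getters:

    const struct llama_vocab * vocab = llama_model_get_vocab(model);
    const llama_token mask_tok = llama_vocab_mask(vocab);
    if (mask_tok == LLAMA_TOKEN_NULL) {
        // this vocabulary does not define a [MASK] token
    }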
@@ -1430,6 +1394,7 @@ extern "C" {
 
  int32_t n_p_eval;
  int32_t n_eval;
+ int32_t n_reused; // number of times a ggml compute graph had been reused
  };
 
  struct llama_perf_sampler_data {
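The counter is read through the existing llama_perf_context accessor; n_reused reports how many times a ggml compute graph was reused instead of rebuilt. A hedged sketch, assuming an active llama_context * ctx:

    const struct llama_perf_context_data pd = llama_perf_context(ctx);
    printf("p_eval: %d, eval: %d, graph reuses: %d\n", pd.n_p_eval, pd.n_eval, pd.n_reused);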
@@ -34,6 +34,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_PHI3, "phi3" },
  { LLM_ARCH_PHIMOE, "phimoe" },
  { LLM_ARCH_PLAMO, "plamo" },
+ { LLM_ARCH_PLAMO2, "plamo2" },
  { LLM_ARCH_CODESHELL, "codeshell" },
  { LLM_ARCH_ORION, "orion" },
  { LLM_ARCH_INTERNLM2, "internlm2" },
@@ -46,6 +47,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_STARCODER2, "starcoder2" },
  { LLM_ARCH_MAMBA, "mamba" },
  { LLM_ARCH_MAMBA2, "mamba2" },
+ { LLM_ARCH_JAMBA, "jamba" },
  { LLM_ARCH_FALCON_H1, "falcon-h1" },
  { LLM_ARCH_XVERSE, "xverse" },
  { LLM_ARCH_COMMAND_R, "command-r" },
@@ -66,12 +68,14 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_JAIS, "jais" },
  { LLM_ARCH_NEMOTRON, "nemotron" },
  { LLM_ARCH_EXAONE, "exaone" },
+ { LLM_ARCH_EXAONE4, "exaone4" },
  { LLM_ARCH_RWKV6, "rwkv6" },
  { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
  { LLM_ARCH_RWKV7, "rwkv7" },
  { LLM_ARCH_ARWKV7, "arwkv7" },
  { LLM_ARCH_GRANITE, "granite" },
  { LLM_ARCH_GRANITE_MOE, "granitemoe" },
+ { LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
  { LLM_ARCH_CHAMELEON, "chameleon" },
  { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
  { LLM_ARCH_PLM, "plm" },
@@ -79,8 +83,11 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_DOTS1, "dots1" },
  { LLM_ARCH_ARCEE, "arcee" },
  { LLM_ARCH_ERNIE4_5, "ernie4_5" },
+ { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
  { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
  { LLM_ARCH_SMOLLM3, "smollm3" },
+ { LLM_ARCH_LFM2, "lfm2" },
+ { LLM_ARCH_DREAM, "dream" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };
 
@@ -153,7 +160,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
  { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
  { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
- { LLM_KV_ATTENTION_LAYER_INDICES, "%s.attention.layer_indices" },
 
  { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
  { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -187,6 +193,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 
  { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
 
+ { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
+
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
  { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -780,6 +788,36 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_PLAMO2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+ { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+ { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+ { LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" },
+ { LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" },
+ { LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ },
+ },
  {
  LLM_ARCH_CODESHELL,
  {
@@ -1025,6 +1063,37 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
  },
  },
+ {
+ LLM_ARCH_JAMBA,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+ { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+ { LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" },
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+ { LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" },
+ { LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" },
+ { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
  {
  LLM_ARCH_FALCON_H1,
  {
@@ -1442,6 +1511,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_EXAONE4,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ }
+ },
  {
  LLM_ARCH_RWKV6,
  {
@@ -1609,6 +1698,43 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
  },
  },
+ {
+ LLM_ARCH_GRANITE_HYBRID,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ // mamba(2) ssm layers
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+ { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+ { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+ // attention layers
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ // dense FFN
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ // moe FFN
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ // shared expert
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ },
+ },
  {
  LLM_ARCH_CHAMELEON,
  {
@@ -1721,6 +1847,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_ERNIE4_5_MOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+ },
+ },
  {
  LLM_ARCH_HUNYUAN_MOE,
  {
@@ -1744,6 +1895,44 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  },
  },
+ {
+ LLM_ARCH_SMOLLM3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_LFM2,
+ {
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" },
+ { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
+ { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ }
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -1751,20 +1940,20 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  },
  },
  {
- LLM_ARCH_SMOLLM3,
+ LLM_ARCH_DREAM,
  {
- { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
- { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
- { LLM_TENSOR_OUTPUT, "output" },
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
- { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
- { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
- { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
- { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
  };
@@ -1845,6 +2034,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  {LLM_TENSOR_FFN_ACT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
  {LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
  {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
+ {LLM_TENSOR_SSM_DT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SSM_B_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
  {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
  {LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
  {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -1925,6 +2117,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
  {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
+ {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
  };
 
  LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
@@ -1992,9 +2187,21 @@ bool llm_arch_is_recurrent(const llm_arch & arch) {
  }
 
  bool llm_arch_is_hybrid(const llm_arch & arch) {
- // List all mamba-attention hybrid models here
  switch (arch) {
+ case LLM_ARCH_JAMBA:
  case LLM_ARCH_FALCON_H1:
+ case LLM_ARCH_PLAMO2:
+ case LLM_ARCH_GRANITE_HYBRID:
+ case LLM_ARCH_LFM2:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ bool llm_arch_is_diffusion(const llm_arch & arch) {
+ switch (arch) {
+ case LLM_ARCH_DREAM:
  return true;
  default:
  return false;
@@ -38,6 +38,7 @@ enum llm_arch {
  LLM_ARCH_PHI3,
  LLM_ARCH_PHIMOE,
  LLM_ARCH_PLAMO,
+ LLM_ARCH_PLAMO2,
  LLM_ARCH_CODESHELL,
  LLM_ARCH_ORION,
  LLM_ARCH_INTERNLM2,
@@ -50,6 +51,7 @@ enum llm_arch {
  LLM_ARCH_STARCODER2,
  LLM_ARCH_MAMBA,
  LLM_ARCH_MAMBA2,
+ LLM_ARCH_JAMBA,
  LLM_ARCH_FALCON_H1,
  LLM_ARCH_XVERSE,
  LLM_ARCH_COMMAND_R,
@@ -70,12 +72,14 @@ enum llm_arch {
  LLM_ARCH_JAIS,
  LLM_ARCH_NEMOTRON,
  LLM_ARCH_EXAONE,
+ LLM_ARCH_EXAONE4,
  LLM_ARCH_RWKV6,
  LLM_ARCH_RWKV6QWEN2,
  LLM_ARCH_RWKV7,
  LLM_ARCH_ARWKV7,
  LLM_ARCH_GRANITE,
  LLM_ARCH_GRANITE_MOE,
+ LLM_ARCH_GRANITE_HYBRID,
  LLM_ARCH_CHAMELEON,
  LLM_ARCH_WAVTOKENIZER_DEC,
  LLM_ARCH_PLM,
@@ -83,8 +87,11 @@ enum llm_arch {
  LLM_ARCH_DOTS1,
  LLM_ARCH_ARCEE,
  LLM_ARCH_ERNIE4_5,
+ LLM_ARCH_ERNIE4_5_MOE,
  LLM_ARCH_HUNYUAN_MOE,
  LLM_ARCH_SMOLLM3,
+ LLM_ARCH_LFM2,
+ LLM_ARCH_DREAM,
  LLM_ARCH_UNKNOWN,
  };
 
@@ -157,7 +164,6 @@ enum llm_kv {
  LLM_KV_ATTENTION_SCALE,
  LLM_KV_ATTENTION_KEY_LENGTH_MLA,
  LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
- LLM_KV_ATTENTION_LAYER_INDICES,
 
  LLM_KV_ROPE_DIMENSION_COUNT,
  LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -226,6 +232,8 @@ enum llm_kv {
 
  LLM_KV_CLASSIFIER_OUTPUT_LABELS,
 
+ LLM_KV_SHORTCONV_L_CACHE,
+
  // deprecated:
  LLM_KV_TOKENIZER_PREFIX_ID,
  LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -296,7 +304,10 @@ enum llm_tensor {
  LLM_TENSOR_SSM_CONV1D,
  LLM_TENSOR_SSM_X,
  LLM_TENSOR_SSM_DT,
+ LLM_TENSOR_SSM_DT_NORM,
  LLM_TENSOR_SSM_A,
+ LLM_TENSOR_SSM_B_NORM,
+ LLM_TENSOR_SSM_C_NORM,
  LLM_TENSOR_SSM_D,
  LLM_TENSOR_SSM_NORM,
  LLM_TENSOR_SSM_OUT,
@@ -392,6 +403,9 @@ enum llm_tensor {
  LLM_TENSOR_POS_NET_ATTN_K,
  LLM_TENSOR_POS_NET_ATTN_V,
  LLM_TENSOR_POS_NET_ATTN_OUT,
+ LLM_TENSOR_SHORTCONV_CONV,
+ LLM_TENSOR_SHORTCONV_INPROJ,
+ LLM_TENSOR_SHORTCONV_OUTPROJ,
  };
 
  enum llm_tensor_layer {
@@ -468,3 +482,4 @@ const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
 
  bool llm_arch_is_recurrent(const llm_arch & arch);
  bool llm_arch_is_hybrid (const llm_arch & arch);
+ bool llm_arch_is_diffusion(const llm_arch & arch);