@fugood/llama.node 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/common/CMakeLists.txt +4 -5
- package/src/llama.cpp/common/arg.cpp +37 -0
- package/src/llama.cpp/common/common.cpp +22 -6
- package/src/llama.cpp/common/common.h +14 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/src/llama.cpp/ggml/include/ggml.h +13 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +23 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +39 -0
- package/src/llama.cpp/include/llama.h +13 -48
- package/src/llama.cpp/src/llama-arch.cpp +222 -15
- package/src/llama.cpp/src/llama-arch.h +16 -1
- package/src/llama.cpp/src/llama-batch.cpp +76 -70
- package/src/llama.cpp/src/llama-batch.h +24 -18
- package/src/llama.cpp/src/llama-chat.cpp +44 -1
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +134 -95
- package/src/llama.cpp/src/llama-context.h +13 -16
- package/src/llama.cpp/src/llama-cparams.h +3 -2
- package/src/llama.cpp/src/llama-graph.cpp +239 -154
- package/src/llama.cpp/src/llama-graph.h +162 -126
- package/src/llama.cpp/src/llama-hparams.cpp +45 -0
- package/src/llama.cpp/src/llama-hparams.h +11 -1
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
- package/src/llama.cpp/src/llama-kv-cache-unified.h +89 -31
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -9
- package/src/llama.cpp/src/llama-model.cpp +2309 -665
- package/src/llama.cpp/src/llama-model.h +18 -4
- package/src/llama.cpp/src/llama-quant.cpp +2 -2
- package/src/llama.cpp/src/llama-vocab.cpp +368 -9
- package/src/llama.cpp/src/llama-vocab.h +43 -0
- package/src/llama.cpp/src/unicode.cpp +207 -0
- package/src/llama.cpp/src/unicode.h +2 -0
--- a/package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
+++ b/package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
@@ -4015,6 +4015,9 @@ static void ggml_compute_forward_rms_norm_f32(
 
                 const float scale = 1.0f/sqrtf(mean + eps);
 
+                // if you hit this, likely you got an inf somewhere earlier
+                assert(scale > 0.0f);
+
                 ggml_vec_scale_f32(ne00, y, scale);
             }
         }
@@ -4643,9 +4646,11 @@ static void ggml_compute_forward_scale_f32(
     GGML_ASSERT(ggml_is_contiguous(dst));
     GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
-    // scale factor
-    float v;
-    memcpy(&v, dst->op_params, sizeof(float));
+    float s; // scale factor
+    float b; // bias
+
+    memcpy(&s, (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&b, (float *) dst->op_params + 1, sizeof(float));
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -4664,12 +4669,22 @@ static void ggml_compute_forward_scale_f32(
 
     const size_t nb1 = dst->nb[1];
 
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        if (dst->data != src0->data) {
-            // src0 is same shape as dst => same indices
-            memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
+    if (b == 0.0f) {
+        for (int i1 = ir0; i1 < ir1; i1++) {
+            if (dst->data != src0->data) {
+                // src0 is same shape as dst => same indices
+                // TODO: add x parameter to ggml_vec_scale_f32 and remove this memcpy
+                memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
+            }
+            ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), s);
+        }
+    } else {
+        for (int i1 = ir0; i1 < ir1; i1++) {
+            ggml_vec_mad1_f32(nc,
+                    (float *) ((char *) dst->data + i1*nb1),
+                    (float *) ((char *) src0->data + i1*nb1),
+                    s, b);
         }
-        ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v);
     }
 }
 
--- a/package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp
+++ b/package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp
@@ -221,6 +221,9 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
     for (int i = np; i < n; ++i) {
         sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
     }
+
+    // if you hit this, you are likely running outside the FP range
+    assert(!isnan(sumf) && !isinf(sumf));
 #else
     for (int i = 0; i < n; ++i) {
         sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
--- a/package/src/llama.cpp/ggml/src/ggml-cpu/vec.h
+++ b/package/src/llama.cpp/ggml/src/ggml-cpu/vec.h
@@ -351,6 +351,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
 #endif
 }
 
+inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
+#if defined(GGML_USE_ACCELERATE)
+    vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
+#elif defined(GGML_SIMD)
+    #if defined(__ARM_FEATURE_SVE)
+        // scalar ; TODO: Write SVE code
+        for (int i = 0; i < n; ++i) {
+            y[i] = x[i]*s + b;
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));
+
+        GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
+        GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
+
+        GGML_F32_VEC ay[GGML_F32_ARR];
+
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
+
+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
+        }
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] = x[i]*s + b;
+        }
+    #endif
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = x[i]*s + b;
+    }
+#endif
+}
+
 //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
 inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #if defined(GGML_USE_ACCELERATE)
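The new `ggml_vec_mad1_f32` computes `y[i] = x[i]*s + b` element-wise; the Accelerate and SIMD paths are faster versions of the same scalar fallback, and it is what the bias branch of `ggml_compute_forward_scale_f32` above calls. Below is a minimal standalone sketch of that scalar semantics for reference only; it does not use ggml's headers or macros, and the helper name `vec_mad1_ref` is illustrative.

```cpp
#include <cassert>
#include <cmath>
#include <vector>

// Scalar reference for the semantics of the new mad1 helper: y[i] = x[i]*s + b.
static void vec_mad1_ref(int n, float * y, const float * x, float s, float b) {
    for (int i = 0; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
}

int main() {
    const std::vector<float> x = {1.0f, 2.0f, 3.0f};
    std::vector<float> y(x.size());
    vec_mad1_ref((int) x.size(), y.data(), x.data(), /*s=*/2.0f, /*b=*/0.5f);
    assert(std::fabs(y[0] - 2.5f) < 1e-6f); // 1*2 + 0.5
    assert(std::fabs(y[2] - 6.5f) < 1e-6f); // 3*2 + 0.5
    return 0;
}
```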
--- a/package/src/llama.cpp/include/llama.h
+++ b/package/src/llama.cpp/include/llama.h
@@ -71,53 +71,13 @@ extern "C" {
     typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
-        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
-        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
-        LLAMA_VOCAB_TYPE_UGM  = 4, // T5 tokenizer based on Unigram
-        LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
-    };
-
-    // pre-tokenization types
-    enum llama_vocab_pre_type {
-        LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
-        LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
-        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
-        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
-        LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
-        LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
-        LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
-        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
-        LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
-        LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
-        LLAMA_VOCAB_PRE_TYPE_STABLELM2      = 10,
-        LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
-        LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
-        LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
-        LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
-        LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
-        LLAMA_VOCAB_PRE_TYPE_CHATGLM3       = 16,
-        LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
-        LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
-        LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
-        LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
-        LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
-        LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
-        LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
-        LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
-        LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
-        LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
-        LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
-        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
-        LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
-        LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
-        LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
-        LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
-        LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
-        LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
-        LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
-        LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
+        LLAMA_VOCAB_TYPE_NONE   = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM    = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+        LLAMA_VOCAB_TYPE_BPE    = 2, // GPT-2 tokenizer based on byte-level BPE
+        LLAMA_VOCAB_TYPE_WPM    = 3, // BERT tokenizer based on WordPiece
+        LLAMA_VOCAB_TYPE_UGM    = 4, // T5 tokenizer based on Unigram
+        LLAMA_VOCAB_TYPE_RWKV   = 5, // RWKV tokenizer based on greedy tokenization
+        LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
     };
 
     enum llama_rope_type {
@@ -375,6 +335,9 @@ extern "C" {
         bool swa_full;   // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
                          // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
                          // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+        bool kv_unified; // use a unified buffer across the input sequences when computing the attention
+                         // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+                         // ref: https://github.com/ggml-org/llama.cpp/pull/14363
     };
 
     // model quantization parameters
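The hunk above adds a `kv_unified` flag to `llama_context_params`. The sketch below shows how client code might opt in; it is illustrative only, and the surrounding entry points (`llama_context_default_params`, `llama_init_from_model`, `n_seq_max`) are the usual llama.h API assumed to be present, with only the `kv_unified` field coming from this diff.

```cpp
#include "llama.h"

// Illustrative sketch only: opt into the unified KV buffer when creating a context.
llama_context * make_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_seq_max  = 4;    // several sequences in one context
    cparams.kv_unified = true; // single KV buffer shared across the input sequences
    return llama_init_from_model(model, cparams);
}
```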
@@ -765,7 +728,7 @@ extern "C" {
     // - lazily on next llama_decode()
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
-    DEPRECATED(void llama_kv_self_seq_div(
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
@@ -1045,6 +1008,7 @@ extern "C" {
     LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
     LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
     LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
+    LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
 
     LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
     LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
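`llama_vocab_mask` exposes the mask token (e.g. `[MASK]` in WPM/BERT-style vocabs). A small usage sketch follows; it assumes, by analogy with the other special-token getters in llama.h, that the function returns `LLAMA_TOKEN_NULL` when the vocab defines no mask token.

```cpp
#include <cstdio>
#include "llama.h"

// Illustrative sketch only: look up the newly exposed mask token.
void print_mask_token(const llama_model * model) {
    const llama_vocab * vocab = llama_model_get_vocab(model);
    const llama_token   mask  = llama_vocab_mask(vocab);
    if (mask == LLAMA_TOKEN_NULL) {
        std::printf("vocab has no mask token\n");
    } else {
        std::printf("mask token id: %d\n", (int) mask);
    }
}
```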
@@ -1430,6 +1394,7 @@ extern "C" {
 
         int32_t n_p_eval;
         int32_t n_eval;
+        int32_t n_reused; // number of times a ggml compute graph had been reused
     };
 
     struct llama_perf_sampler_data {
--- a/package/src/llama.cpp/src/llama-arch.cpp
+++ b/package/src/llama.cpp/src/llama-arch.cpp
@@ -34,6 +34,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_PHI3, "phi3" },
     { LLM_ARCH_PHIMOE, "phimoe" },
     { LLM_ARCH_PLAMO, "plamo" },
+    { LLM_ARCH_PLAMO2, "plamo2" },
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },
     { LLM_ARCH_INTERNLM2, "internlm2" },
@@ -46,6 +47,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA, "mamba" },
     { LLM_ARCH_MAMBA2, "mamba2" },
+    { LLM_ARCH_JAMBA, "jamba" },
     { LLM_ARCH_FALCON_H1, "falcon-h1" },
     { LLM_ARCH_XVERSE, "xverse" },
     { LLM_ARCH_COMMAND_R, "command-r" },
@@ -66,12 +68,14 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_JAIS, "jais" },
     { LLM_ARCH_NEMOTRON, "nemotron" },
     { LLM_ARCH_EXAONE, "exaone" },
+    { LLM_ARCH_EXAONE4, "exaone4" },
     { LLM_ARCH_RWKV6, "rwkv6" },
     { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
     { LLM_ARCH_RWKV7, "rwkv7" },
     { LLM_ARCH_ARWKV7, "arwkv7" },
     { LLM_ARCH_GRANITE, "granite" },
     { LLM_ARCH_GRANITE_MOE, "granitemoe" },
+    { LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
     { LLM_ARCH_CHAMELEON, "chameleon" },
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
     { LLM_ARCH_PLM, "plm" },
@@ -79,8 +83,11 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DOTS1, "dots1" },
     { LLM_ARCH_ARCEE, "arcee" },
     { LLM_ARCH_ERNIE4_5, "ernie4_5" },
+    { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
     { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
     { LLM_ARCH_SMOLLM3, "smollm3" },
+    { LLM_ARCH_LFM2, "lfm2" },
+    { LLM_ARCH_DREAM, "dream" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -153,7 +160,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
-    { LLM_KV_ATTENTION_LAYER_INDICES, "%s.attention.layer_indices" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -187,6 +193,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 
     { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
 
+    { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
+
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -780,6 +788,36 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PLAMO2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
+            { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+            { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+            { LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" },
+            { LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" },
+            { LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_CODESHELL,
         {
@@ -1025,6 +1063,37 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
         },
     },
+    {
+        LLM_ARCH_JAMBA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
+            { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" },
+            { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" },
+            { LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" },
+            { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+            { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_FALCON_H1,
         {
@@ -1442,6 +1511,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_EXAONE4,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+        }
+    },
     {
         LLM_ARCH_RWKV6,
         {
@@ -1609,6 +1698,43 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
         },
     },
+    {
+        LLM_ARCH_GRANITE_HYBRID,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            // mamba(2) ssm layers
+            { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
+            { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
+            { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
+            // attention layers
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            // dense FFN
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            // moe FFN
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            // shared expert
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+        },
+    },
     {
         LLM_ARCH_CHAMELEON,
         {
@@ -1721,6 +1847,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_ERNIE4_5_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+        },
+    },
     {
         LLM_ARCH_HUNYUAN_MOE,
         {
@@ -1744,6 +1895,44 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_SMOLLM3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_LFM2,
+        {
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" },
+            { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
+            { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+        }
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1751,20 +1940,20 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         },
     },
     {
-        LLM_ARCH_SMOLLM3,
+        LLM_ARCH_DREAM,
         {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-            { LLM_TENSOR_OUTPUT, "output" },
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
 };
@@ -1845,6 +2034,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_FFN_ACT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
     {LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
+    {LLM_TENSOR_SSM_DT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_SSM_B_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -1925,6 +2117,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
+    {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
 };
 
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
@@ -1992,9 +2187,21 @@ bool llm_arch_is_recurrent(const llm_arch & arch) {
 }
 
 bool llm_arch_is_hybrid(const llm_arch & arch) {
-    // List all mamba-attention hybrid models here
     switch (arch) {
+        case LLM_ARCH_JAMBA:
         case LLM_ARCH_FALCON_H1:
+        case LLM_ARCH_PLAMO2:
+        case LLM_ARCH_GRANITE_HYBRID:
+        case LLM_ARCH_LFM2:
+            return true;
+        default:
+            return false;
+    }
+}
+
+bool llm_arch_is_diffusion(const llm_arch & arch) {
+    switch (arch) {
+        case LLM_ARCH_DREAM:
             return true;
         default:
             return false;
--- a/package/src/llama.cpp/src/llama-arch.h
+++ b/package/src/llama.cpp/src/llama-arch.h
@@ -38,6 +38,7 @@ enum llm_arch {
     LLM_ARCH_PHI3,
     LLM_ARCH_PHIMOE,
     LLM_ARCH_PLAMO,
+    LLM_ARCH_PLAMO2,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
     LLM_ARCH_INTERNLM2,
@@ -50,6 +51,7 @@ enum llm_arch {
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_MAMBA2,
+    LLM_ARCH_JAMBA,
     LLM_ARCH_FALCON_H1,
     LLM_ARCH_XVERSE,
     LLM_ARCH_COMMAND_R,
@@ -70,12 +72,14 @@ enum llm_arch {
     LLM_ARCH_JAIS,
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_EXAONE,
+    LLM_ARCH_EXAONE4,
     LLM_ARCH_RWKV6,
     LLM_ARCH_RWKV6QWEN2,
     LLM_ARCH_RWKV7,
     LLM_ARCH_ARWKV7,
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
+    LLM_ARCH_GRANITE_HYBRID,
     LLM_ARCH_CHAMELEON,
     LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_PLM,
@@ -83,8 +87,11 @@ enum llm_arch {
     LLM_ARCH_DOTS1,
     LLM_ARCH_ARCEE,
     LLM_ARCH_ERNIE4_5,
+    LLM_ARCH_ERNIE4_5_MOE,
     LLM_ARCH_HUNYUAN_MOE,
     LLM_ARCH_SMOLLM3,
+    LLM_ARCH_LFM2,
+    LLM_ARCH_DREAM,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -157,7 +164,6 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
-    LLM_KV_ATTENTION_LAYER_INDICES,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -226,6 +232,8 @@ enum llm_kv {
 
     LLM_KV_CLASSIFIER_OUTPUT_LABELS,
 
+    LLM_KV_SHORTCONV_L_CACHE,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -296,7 +304,10 @@ enum llm_tensor {
     LLM_TENSOR_SSM_CONV1D,
     LLM_TENSOR_SSM_X,
     LLM_TENSOR_SSM_DT,
+    LLM_TENSOR_SSM_DT_NORM,
     LLM_TENSOR_SSM_A,
+    LLM_TENSOR_SSM_B_NORM,
+    LLM_TENSOR_SSM_C_NORM,
     LLM_TENSOR_SSM_D,
     LLM_TENSOR_SSM_NORM,
     LLM_TENSOR_SSM_OUT,
@@ -392,6 +403,9 @@ enum llm_tensor {
     LLM_TENSOR_POS_NET_ATTN_K,
     LLM_TENSOR_POS_NET_ATTN_V,
     LLM_TENSOR_POS_NET_ATTN_OUT,
+    LLM_TENSOR_SHORTCONV_CONV,
+    LLM_TENSOR_SHORTCONV_INPROJ,
+    LLM_TENSOR_SHORTCONV_OUTPROJ,
 };
 
 enum llm_tensor_layer {
@@ -468,3 +482,4 @@ const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
 
 bool llm_arch_is_recurrent(const llm_arch & arch);
 bool llm_arch_is_hybrid (const llm_arch & arch);
+bool llm_arch_is_diffusion(const llm_arch & arch);