@fugood/llama.node 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/CMakeLists.txt +0 -1
- package/src/llama.cpp/common/arg.cpp +7 -0
- package/src/llama.cpp/common/common.h +1 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +7 -2
- package/src/llama.cpp/ggml/include/ggml.h +91 -10
- package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +726 -155
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +9 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +49 -9
- package/src/llama.cpp/include/llama.h +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +90 -2
- package/src/llama.cpp/src/llama-arch.h +6 -0
- package/src/llama.cpp/src/llama-batch.cpp +27 -1
- package/src/llama.cpp/src/llama-batch.h +8 -1
- package/src/llama.cpp/src/llama-chat.cpp +15 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +64 -50
- package/src/llama.cpp/src/llama-graph.h +41 -16
- package/src/llama.cpp/src/llama-hparams.cpp +2 -1
- package/src/llama.cpp/src/llama-hparams.h +1 -0
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
- package/src/llama.cpp/src/llama-kv-cache-unified.h +62 -24
- package/src/llama.cpp/src/llama-kv-cells.h +62 -10
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
- package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +15 -2
- package/src/llama.cpp/src/llama-memory.cpp +17 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model.cpp +1234 -248
- package/src/llama.cpp/src/llama-model.h +2 -0
- package/src/llama.cpp/src/llama-vocab.cpp +8 -1
- package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
--- package/src/llama.cpp/src/llama-model.cpp
+++ package/src/llama.cpp/src/llama-model.cpp
@@ -102,6 +102,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_57B_A14B: return "57B.A14B";
         case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
         case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
+        case LLM_TYPE_A13B: return "A13B";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
         case LLM_TYPE_235B_A22B: return "235B.A22B";
         case LLM_TYPE_E2B: return "E2B";
@@ -208,23 +209,27 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
             } break;
         case GGML_OP_SSM_CONV:
             {
-
-
+                const int64_t n_seq_tokens = 512;
+                const int64_t n_seqs = 3;
+                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
                 op_tensor = ggml_ssm_conv(ctx, conv_x, w);
             } break;
         case GGML_OP_SSM_SCAN:
            {
-                //
-                const int64_t d_state = w->ne[0];
-                const int64_t
+                // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
+                const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
+                const int64_t n_head = w->ne[1];
+                const int64_t head_dim = hparams.ssm_d_inner / n_head;
+                const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
                 const int64_t n_seq_tokens = 512;
-                const int64_t n_seqs =
-                ggml_tensor * s
-                ggml_tensor * x
-                ggml_tensor * dt
-                ggml_tensor * B
-                ggml_tensor * C
-
+                const int64_t n_seqs = 3;
+                ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
+                ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
+                ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
+                ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
+                ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
+                ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
+                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
             } break;
         case GGML_OP_RWKV_WKV6:
             {
@@ -1081,6 +1086,38 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_MAMBA2:
+            {
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 24:
+                        switch (hparams.n_embd) {
+                            case 768: type = LLM_TYPE_SMALL; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 48:
+                        switch (hparams.n_embd) {
+                            case 1024: type = LLM_TYPE_MEDIUM; break;
+                            case 1536: type = LLM_TYPE_LARGE; break;
+                            case 2048: type = LLM_TYPE_XL; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 64:
+                        switch (hparams.n_embd) {
+                            case 2560: type = LLM_TYPE_3B; break;
+                            case 4096: type = LLM_TYPE_7B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_XVERSE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1513,6 +1550,58 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_FALCON_H1:
+            {
+                // Common parameters
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                // SSM parameters
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+                std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
+
+                switch (hparams.n_layer) {
+                    case 36:
+                        type = LLM_TYPE_0_5B; break;
+                    case 24:
+                        type = LLM_TYPE_1_5B; break;
+                    case 66:
+                        type = LLM_TYPE_1B; break;
+                    case 32:
+                        type = LLM_TYPE_3B; break;
+                    case 44:
+                        type = LLM_TYPE_7B; break;
+                    case 72:
+                        type = LLM_TYPE_34B; break;
+                    default:
+                        type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_HUNYUAN_MOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_A13B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_SMOLLM3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                hparams.n_no_rope_layer_step = 4;
+
+                switch (hparams.n_layer) {
+                    case 36: type = LLM_TYPE_3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
    }

@@ -3120,6 +3209,54 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);

+                        // out_proj
+                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_MAMBA2:
+                {
+                    const int64_t d_conv = hparams.ssm_d_conv;
+                    const int64_t d_inner = hparams.ssm_d_inner;
+                    const int64_t d_state = hparams.ssm_d_state;
+                    const int64_t n_head = hparams.ssm_dt_rank;
+                    const int64_t n_group = hparams.ssm_n_group;
+                    const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
+
+                    // only an expansion factor of 2 is supported for now
+                    GGML_ASSERT(2 * n_embd == d_inner);
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    {
+                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
+                        if (output == NULL) {
+                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                        }
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        // norm
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
+
+                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
+                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
+
+                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
+
+                        // no "weight" suffix for these
+                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
+                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
+
+                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
+
                        // out_proj
                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
                    }
@@ -4385,6 +4522,149 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_FALCON_H1:
+                {
+                    // Common
+                    const int64_t hidden_size = hparams.n_embd; // hidden_size
+
+                    // mamba2 Mixer SSM params
+                    const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
+                    const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
+                    const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
+                    const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
+                    const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
+                    const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
+                    const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
+
+                    // attn params
+                    const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head
+                    const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
+
+                    // ffn params
+                    const int64_t ffn_intermediate_size = hparams.n_ff(0);
+
+                    // embeddings
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
+
+                    // output
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        /*SSM LAYERS*/
+                        // ssm in
+                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
+                        // ssm 1d conv
+                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
+                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
+                        // ssm_dt
+                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
+                        // no "weight" suffix for these
+                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
+                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
+                        // ssm_norm
+                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
+                        // out_proj
+                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
+
+                        /*ATTENTION LAYERS*/
+                        // attention layers (with optional bias)
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
+
+
+                        // feed forward (w/ optional biases)
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
+
+                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
+                    }
+                } break;
+            case LLM_ARCH_HUNYUAN_MOE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_SMOLLM3:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
@@ -4630,10 +4910,14 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
        LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
+    }
+
+    if (arch == LLM_ARCH_MAMBA || arch == LLM_ARCH_MAMBA2) {
        LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
        LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
        LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
        LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
+        LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);

        if (!classifier_labels.empty()) {
@@ -5582,12 +5866,10 @@ struct llm_build_falcon : public llm_graph_context {
                cur = build_lora_mm(model.layers[il].wqkv, cur);
                cb(cur, "wqkv", il);

-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
                ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                // using mode = 2 for neox mode
@@ -5864,12 +6146,10 @@ struct llm_build_dbrx : public llm_graph_context {
                cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
                cb(cur, "wqkv_clamped", il);

-                Qcur =
-                Kcur =
+                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                Qcur = ggml_rope_ext(
@@ -6380,12 +6660,10 @@ struct llm_build_neo_bert : public llm_graph_context {
                cur = build_lora_mm(model.layers[il].wqkv, cur);
                cb(cur, "wqkv", il);

-                Qcur =
-                Kcur =
+                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                // RoPE
@@ -6615,8 +6893,8 @@ struct llm_build_mpt : public llm_graph_context {
                    cb(cur, "wqkv_clamped", il);
                }

-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
+                ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
                ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

                cb(Qcur, "Qcur", il);
@@ -6636,6 +6914,12 @@ struct llm_build_mpt : public llm_graph_context {
                        model.layers[il].attn_k_norm_b,
                        LLM_NORM, il);
                    cb(Kcur, "Kcur", il);
+                } else {
+                    Qcur = ggml_cont(ctx0, Qcur);
+                    cb(Qcur, "Qcur", il);
+
+                    Kcur = ggml_cont(ctx0, Kcur);
+                    cb(Kcur, "Kcur", il);
                }

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -6890,12 +7174,10 @@ struct llm_build_qwen : public llm_graph_context {
                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                cb(cur, "bqkv", il);

-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
                ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));

-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                // using mode = 2 for neox mode
@@ -7660,21 +7942,21 @@ struct llm_build_phi2 : public llm_graph_context {
                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                cb(cur, "bqkv", il);

-                Qcur =
-                Kcur =
+                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
            } else {
                Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
                Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
                Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            }

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

            Qcur = ggml_rope_ext(
@@ -7798,21 +8080,21 @@ struct llm_build_phi3 : public llm_graph_context {
                cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
                cb(cur, "wqkv", il);

-                Qcur =
-                Kcur =
+                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
+                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
            } else {
                Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
                Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
                Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            }

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

            Qcur = ggml_rope_ext(
@@ -8168,12 +8450,10 @@ struct llm_build_codeshell : public llm_graph_context {
                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                cb(cur, "bqkv", il);

-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
                ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                Qcur = ggml_rope_ext(
@@ -8589,8 +8869,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
                    ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
            cb(k_pe, "k_pe", il);

-            // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
-            kv_compressed = ggml_cont(ctx0, kv_compressed);
            kv_compressed = build_norm(kv_compressed,
                    model.layers[il].attn_kv_a_norm, NULL,
                    LLM_NORM_RMS, il);
@@ -8617,12 +8895,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
            v_states = ggml_cont(ctx0, v_states);
            cb(v_states, "v_states", il);

-            v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
-                ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
-                0);
-            cb(v_states, "v_states", il);
-
-            q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
            q_pe = ggml_rope_ext(
                    ctx0, q_pe, inp_pos, rope_factors,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -8631,7 +8903,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
            cb(q_pe, "q_pe", il);

            // shared RoPE key
-            k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
            k_pe = ggml_rope_ext(
                    ctx0, k_pe, inp_pos, rope_factors,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9665,9 +9936,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
 };

 struct llm_build_mamba : public llm_graph_context {
-    const llama_model & model
-
-    llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
+    llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
        ggml_tensor * cur;
        ggml_tensor * inpL;

@@ -9685,7 +9954,11 @@ struct llm_build_mamba : public llm_graph_context {
                    LLM_NORM_RMS, il);
            cb(cur, "attn_norm", il);

-
+            if (model.arch == LLM_ARCH_MAMBA2) {
+                cur = build_mamba2_layer(rs_inp, gf, cur, model, ubatch, il);
+            } else {
+                cur = build_mamba_layer(rs_inp, gf, cur, model, ubatch, il);
+            }

            if (il == n_layer - 1 && inp_out_ids) {
                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
@@ -9719,11 +9992,11 @@ struct llm_build_mamba : public llm_graph_context {
            ggml_build_forward_expand(gf, cur);
        }

-    // TODO: split
    ggml_tensor * build_mamba_layer(
            llm_graph_input_rs * inp,
            ggml_cgraph * gf,
            ggml_tensor * cur,
+            const llama_model & model,
            const llama_ubatch & ubatch,
            int il) const {
        const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
@@ -9734,6 +10007,8 @@ struct llm_build_mamba : public llm_graph_context {
        const int64_t d_inner = hparams.ssm_d_inner;
        const int64_t d_state = hparams.ssm_d_state;
        const int64_t dt_rank = hparams.ssm_dt_rank;
+        const int64_t n_head = d_inner;
+        const int64_t head_dim = 1;
        const int64_t n_seqs = ubatch.n_seqs;
        // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
        const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
@@ -9749,15 +10024,8 @@ struct llm_build_mamba : public llm_graph_context {
        ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
        ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);

-
-        ggml_tensor * conv = build_rs(
-            inp, gf, conv_states_all,
-            hparams.n_embd_r(), n_seqs);
+        ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
        conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
-        ggml_tensor * ssm = build_rs(
-            inp, gf, ssm_states_all,
-            hparams.n_embd_s(), n_seqs);
-        ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);

        // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
        cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
@@ -9806,8 +10074,8 @@ struct llm_build_mamba : public llm_graph_context {
        ggml_tensor * x_db = build_lora_mm(model.layers[il].ssm_x, x);
        // split
        ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
-        ggml_tensor * B =
-        ggml_tensor * C =
+        ggml_tensor * B = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
+        ggml_tensor * C = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));

        // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
        if (ssm_dt_b_c_rms) {
@@ -9820,23 +10088,36 @@ struct llm_build_mamba : public llm_graph_context {
        dt = build_lora_mm(model.layers[il].ssm_dt, dt);
        dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);

-
-
-
-        ggml_tensor *
+        cur = x;
+        x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
+
+        ggml_tensor * A = model.layers[il].ssm_a;
+
+        // use the states and the indices provided by build_recurrent_state
+        // (this is necessary in order to properly use the states before they are overwritten,
+        // while avoiding to make unnecessary copies of the states)
+        auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+            ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
+
+            // Custom operator to optimize the parallel associative scan
+            // as described in the Annex D of the Mamba paper.
+            // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+            return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+        };
+
+        ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);

        // store last states
        ggml_build_forward_expand(gf,
            ggml_cpy(ctx0,
-                ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]),
+                ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
                ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));

-        ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[
+        ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0);

        // TODO: skip computing output earlier for unused tokens

-
-        y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
+        y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, model.layers[il].ssm_d));
        y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));

        // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
@@ -9845,40 +10126,169 @@ struct llm_build_mamba : public llm_graph_context {

        // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
-        //cb(cur, "mamba_out", il);
+        // cb(cur, "mamba_out", il);

        return cur;
    }
-};

-
-
-
+    ggml_tensor * build_mamba2_layer(
+            llm_graph_input_rs * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * cur,
+            const llama_model & model,
+            const llama_ubatch & ubatch,
+            int il) const {
+        const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);

-
+        const auto kv_head = mctx_cur->get_head();

-        const
+        const int64_t d_conv = hparams.ssm_d_conv;
+        const int64_t d_inner = hparams.ssm_d_inner;
+        const int64_t d_state = hparams.ssm_d_state;
+        const int64_t n_head = hparams.ssm_dt_rank;
+        const int64_t head_dim = d_inner / n_head;
+        const int64_t n_group = hparams.ssm_n_group;
+        const int64_t n_seqs = ubatch.n_seqs;

-
-        ggml_tensor * inpL;
+        const int64_t n_seq_tokens = ubatch.n_seq_tokens;

-
+        GGML_ASSERT(n_seqs != 0);
+        GGML_ASSERT(ubatch.equal_seqs);
+        GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);

-
-        ggml_tensor *
+        ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+        ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);

-
+        ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
+        conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);

-
+        // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+        cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);

-
-            // norm
-            cur = build_norm(inpL,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM, il);
-            cb(cur, "attn_norm", il);
+        // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads

-
+        // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
+        ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
+
+        // split the above in three
+        ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0);
+        ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt));
+        ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt));
+
+        // conv
+        {
+            // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
+            ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
+
+            // copy last (d_conv - 1) columns back into the state cache
+            ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
+
+            ggml_build_forward_expand(gf,
+                ggml_cpy(ctx0, last_conv,
+                    ggml_view_1d(ctx0, conv_states_all,
+                        (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
+                        kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
+
+            // 1D convolution
+            // The equivalent is to make a self-overlapping view of conv_x
+            // over d_conv columns at each stride in the 3rd dimension,
+            // then element-wise multiply that with the conv1d weight,
+            // then sum the elements of each row,
+            // (the last two steps are a dot product over rows (also doable with mul_mat))
+            // then permute away the ne[0] dimension,
+            // and then you're left with the resulting x tensor.
+            // For simultaneous sequences, all sequences need to have the same length.
+            xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
+
+            // bias
+            xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
+
+            xBC = ggml_silu(ctx0, xBC);
+        }
+
+        // ssm
+        {
+            // These correspond to V K Q in SSM/attention duality
+            ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0);
+            ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC));
+            ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC));
+
+            // {n_head, n_seq_tokens, n_seqs}
+            dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
+
+            ggml_tensor * A = model.layers[il].ssm_a;
+
+            // use the states and the indices provided by build_recurrent_state
+            // (this is necessary in order to properly use the states before they are overwritten,
+            // while avoiding to make unnecessary copies of the states)
+            auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+                ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
+
+                // TODO: use semistructured matrices to implement state-space duality
+                // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+                return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+            };
+
+            ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+
+            // store last states
+            ggml_build_forward_expand(gf,
+                ggml_cpy(ctx0,
+                    ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
+                    ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+
+            ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0);
+
+            // TODO: skip computing output earlier for unused tokens
+
+            y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
+            y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
+
+            // grouped RMS norm
+            y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
+            y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
+            y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
+
+            // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+            cur = build_lora_mm(model.layers[il].ssm_out, y);
+        }
+
+        // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+        cb(cur, "mamba_out", il);
+
+        return cur;
+    }
+};
+
+struct llm_build_command_r : public llm_graph_context {
+    llm_build_command_r(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        const float f_logit_scale = hparams.f_logit_scale;
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM, il);
+            cb(cur, "attn_norm", il);
+
+            ggml_tensor * ffn_inp = cur;

            // self-attention
            {
@@ -10557,10 +10967,10 @@ struct llm_build_openelm : public llm_graph_context {
|
|
|
10557
10967
|
|
|
10558
10968
|
cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
|
|
10559
10969
|
|
|
10560
|
-
ggml_tensor * Qcur =
|
|
10970
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0);
|
|
10561
10971
|
cb(Qcur, "Qcur", il);
|
|
10562
10972
|
|
|
10563
|
-
ggml_tensor * Kcur =
|
|
10973
|
+
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
|
|
10564
10974
|
cb(Kcur, "Kcur", il);
|
|
10565
10975
|
|
|
10566
10976
|
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
|
|
@@ -10682,12 +11092,10 @@ struct llm_build_gptneox : public llm_graph_context {
|
|
|
10682
11092
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
10683
11093
|
cb(cur, "bqkv", il);
|
|
10684
11094
|
|
|
10685
|
-
ggml_tensor * Qcur =
|
|
10686
|
-
ggml_tensor * Kcur =
|
|
11095
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
11096
|
+
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
10687
11097
|
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
10688
11098
|
|
|
10689
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
10690
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
10691
11099
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
10692
11100
|
|
|
10693
11101
|
Qcur = ggml_rope_ext(
|
|
@@ -11932,6 +12340,8 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
11932
12340
|
if (model.layers[il].bv) {
|
|
11933
12341
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
11934
12342
|
}
|
|
12343
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
12344
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
11935
12345
|
} else {
|
|
11936
12346
|
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
11937
12347
|
cb(cur, "wqkv", il);
|
|
@@ -11939,13 +12349,11 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
11939
12349
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
11940
12350
|
cb(cur, "bqkv", il);
|
|
11941
12351
|
}
|
|
11942
|
-
Qcur =
|
|
11943
|
-
Kcur =
|
|
12352
|
+
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
12353
|
+
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
11944
12354
|
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
11945
12355
|
}
|
|
11946
12356
|
|
|
11947
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
11948
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
11949
12357
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
11950
12358
|
|
|
11951
12359
|
//printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
|
|
@@ -12066,6 +12474,8 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
12066
12474
|
if (model.layers[il].bv) {
|
|
12067
12475
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
12068
12476
|
}
|
|
12477
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
12478
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
12069
12479
|
} else {
|
|
12070
12480
|
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
12071
12481
|
cb(cur, "wqkv", il);
|
|
@@ -12073,13 +12483,11 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
12073
12483
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
12074
12484
|
cb(cur, "bqkv", il);
|
|
12075
12485
|
}
|
|
12076
|
-
Qcur =
|
|
12077
|
-
Kcur =
|
|
12486
|
+
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
12487
|
+
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
12078
12488
|
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
12079
12489
|
}
|
|
12080
12490
|
|
|
12081
|
-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
12082
|
-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
12083
12491
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
12084
12492
|
|
|
12085
12493
|
Qcur = ggml_rope_ext(
|
|
@@ -14298,12 +14706,11 @@ struct llm_build_ernie4_5 : public llm_graph_context {
     }
 };

-struct
-
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+struct llm_build_falcon_h1 : public llm_graph_context {
+    const llama_model & model;

-
-
+    llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;

         ggml_tensor * cur;
         ggml_tensor * inpL;
@@ -14313,7 +14720,8 @@ struct llm_build_arcee : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-
+        // Build the inputs in the recurrent & kv cache
+        auto * inp = build_inp_mem_hybrid();

         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -14322,90 +14730,83 @@ struct llm_build_arcee : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

-            // norm
             cur = build_norm(inpL,
                     model.layers[il].attn_norm, NULL,
                     LLM_NORM_RMS, il);
             cb(cur, "attn_norm", il);

             // self-attention
-
-
-                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);

-
-
-                cb(Qcur, "Qcur", il);
-                if (model.layers[il].bq) {
-                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                    cb(Qcur, "Qcur", il);
-                }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);

-
-
-                if (model.layers[il].bk) {
-                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                    cb(Kcur, "Kcur", il);
-                }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);

-
-
-                if (model.layers[il].bv) {
-                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
-                    cb(Vcur, "Vcur", il);
-                }
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

-
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

-
-
-
-
-                );
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);

-
-
-
-
-
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );

-
-
-
+            cb(Qcur, "Qcur-post-rope", il);
+            cb(Kcur, "Kcur-post-rope", il);
+            cb(Vcur, "Vcur-post-rope", il);

-
-
-
-
-
+            ggml_tensor * attn_out = build_attn(inp, gf,
+                    model.layers[il].wo, NULL,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+            cb(attn_out, "attn_out", il);
+
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            // Mamba2 layer
+            cb(cur, "ssm_in", il);
+
+            ggml_tensor * ssm_out = build_mamba2_layer(inp, gf, cur, ubatch, il);
+            cb(ssm_out, "ssm_out", il);
+
+            // // Aggregation
+            cur = ggml_add(ctx0, attn_out, ssm_out);
+            inpSA = ggml_add(ctx0, cur, inpSA);
+            cb(cur, "layer_out", il);

             if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

-            ggml_tensor * ffn_inp =
+            ggml_tensor * ffn_inp = inpSA;
             cb(ffn_inp, "ffn_inp", il);

             // feed-forward network
-            // ARCEE uses relu^2 instead of silu
             cur = build_norm(ffn_inp,
                     model.layers[il].ffn_norm, NULL,
                     LLM_NORM_RMS, il);
             cb(cur, "ffn_norm", il);

             cur = build_ffn(cur,
-                    model.layers[il].ffn_up,
-
-                    model.layers[il].ffn_down,
+                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                     NULL,
-
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
             cb(cur, "ffn_out", il);

-            cur = ggml_add(ctx0, cur,
-            cb(cur, "ffn_out", il);
+            cur = ggml_add(ctx0, cur, inpSA);

             cur = build_cvec(cur, il);
             cb(cur, "l_out", il);
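Note: the llm_build_falcon_h1 loop above runs attention and a Mamba-2 mixer in parallel on the same RMS-normed input and sums the two branch outputs into the residual stream (cur = attn_out + ssm_out; inpSA = cur + inpSA). A self-contained scalar sketch of that dataflow, with norm, attn_branch and ssm_branch as hypothetical stand-ins for build_norm, build_attn and build_mamba2_layer:

    #include <functional>

    // One Falcon-H1-style hybrid block reduced to scalars: both branches read
    // the same normed input; their sum is added to the residual.
    static float falcon_h1_block(float inpL, float inpSA,
                                 const std::function<float(float)> & norm,
                                 const std::function<float(float)> & attn_branch,
                                 const std::function<float(float)> & ssm_branch) {
        const float attn_out = attn_branch(norm(inpL)); // attention branch
        const float ssm_out  = ssm_branch(norm(inpL));  // Mamba-2 branch
        return attn_out + ssm_out + inpSA;              // parallel sum + residual
    }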
@@ -14431,97 +14832,665 @@ struct llm_build_arcee : public llm_graph_context {

         ggml_build_forward_expand(gf, cur);
     }
-};

-
-
+    ggml_tensor * build_mamba2_layer(
+         llm_graph_input_mem_hybrid * inp,
+                ggml_cgraph * gf,
+                ggml_tensor * cur,
+         const llama_ubatch & ubatch,
+                        int il) const {
+        const auto * kv_state = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();

-
-        // Models that need specific instantiation should be handled in the
-        // switch statement
-        case LLM_ARCH_BERT:
-        case LLM_ARCH_JINA_BERT_V2:
-        case LLM_ARCH_NOMIC_BERT:
-        case LLM_ARCH_NOMIC_BERT_MOE:
-        case LLM_ARCH_NEO_BERT:
-        case LLM_ARCH_WAVTOKENIZER_DEC:
-            {
-                res = nullptr;
-            } break;
-        // Models that need standard caching should rely on recurrent/hybrid
-        // checks
-        default:
-            {
-                if (llm_arch_is_recurrent(arch)) {
-                    res = new llama_memory_recurrent(
-                            *this,
-                            nullptr,
-                            GGML_TYPE_F32,
-                            GGML_TYPE_F32,
-                            cparams.offload_kqv,
-                            std::max((uint32_t) 1, cparams.n_seq_max),
-                            cparams.n_seq_max);
-                } else if (llm_arch_is_hybrid(arch)) {
-                    const auto padding = llama_kv_cache_unified::get_padding(cparams);
+        const auto kv_head = kv_state->get_head();

-
+        const int64_t d_conv   = hparams.ssm_d_conv;
+        const int64_t d_inner  = hparams.ssm_d_inner;
+        const int64_t d_state  = hparams.ssm_d_state;
+        const int64_t n_head   = hparams.ssm_dt_rank;
+        const int64_t head_dim = d_inner / n_head;
+        const int64_t n_group  = hparams.ssm_n_group;
+        const int64_t n_seqs   = ubatch.n_seqs;

-
-                            /* model */ *this,
-                            /* attn_type_k */ params.type_k,
-                            /* attn_type_v */ params.type_v,
-                            /* attn_v_trans */ !cparams.flash_attn,
-                            /* attn_kv_size */ cparams.n_ctx,
-                            /* attn_n_pad */ padding,
-                            /* attn_n_swa */ hparams.n_swa,
-                            /* attn_swa_type */ hparams.swa_type,
-                            /* recurrent_type_k */ GGML_TYPE_F32,
-                            /* recurrent_type_v */ GGML_TYPE_F32,
-                            /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
-                            /* n_seq_max */ cparams.n_seq_max,
-                            /* offload */ cparams.offload_kqv);
-                } else {
-                    const auto padding = llama_kv_cache_unified::get_padding(cparams);
+        const int64_t n_seq_tokens = ubatch.n_seq_tokens;

-
+        GGML_ASSERT(n_seqs != 0);
+        GGML_ASSERT(ubatch.equal_seqs);
+        GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);

-
+        ggml_tensor * conv_states_all = kv_state->get_r_l(il);
+        ggml_tensor * ssm_states_all  = kv_state->get_s_l(il);

-
-
+        ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
+        conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);

-
-
-                            params.type_k,
-                            params.type_v,
-                            !cparams.flash_attn,
-                            cparams.offload_kqv,
-                            params.swa_full,
-                            cparams.n_ctx,
-                            cparams.n_seq_max,
-                            cparams.n_ubatch,
-                            padding);
-                } else {
-                    GGML_ASSERT(!hparams.is_swa_any());
+        // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+        cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);

-
-                            *this,
-                            nullptr,
-                            params.type_k,
-                            params.type_v,
-                            !cparams.flash_attn,
-                            cparams.offload_kqv,
-                            cparams.n_ctx,
-                            cparams.n_seq_max,
-                            padding,
-                            hparams.n_swa,
-                            hparams.swa_type);
-                }
-            }
-    }
-}
+        // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads

-
+        // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
+        ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
+        cb(zxBCdt, "zxBCdt", il);
+
+        // split the above in three
+        ggml_tensor * z   = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0);
+        ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt));
+        ggml_tensor * dt  = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt));
+
+        // conv
+        {
+            // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
+            ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
+
+            // copy last (d_conv - 1) columns back into the state cache
+            ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
+
+            ggml_build_forward_expand(gf,
+                ggml_cpy(ctx0, last_conv,
+                    ggml_view_1d(ctx0, conv_states_all,
+                        (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
+                        kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
+
+            // 1D convolution
+            // The equivalent is to make a self-overlapping view of conv_x
+            // over d_conv columns at each stride in the 3rd dimension,
+            // then element-wise multiply that with the conv1d weight,
+            // then sum the elements of each row,
+            // (the last two steps are a dot product over rows (also doable with mul_mat))
+            // then permute away the ne[0] dimension,
+            // and then you're left with the resulting x tensor.
+            // For simultaneous sequences, all sequences need to have the same length.
+            xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
+
+            // bias
+            xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
+
+            xBC = ggml_silu(ctx0, xBC);
+        }
+
+        // ssm
+        {
+            // These correspond to V K Q in SSM/attention duality
+            ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0);
+
+            ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC));
+
+            ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC));
+
+            // {n_head, n_seq_tokens, n_seqs}
+            dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
+
+            ggml_tensor * A = model.layers[il].ssm_a;
+
+            // use the states and the indices provided by build_rs
+            // (this is necessary in order to properly use the states before they are overwritten,
+            // while avoiding to make unnecessary copies of the states)
+            auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+                ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, kv_state->get_size());
+
+                // TODO: use semistructured matrices to implement state-space duality
+                // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+                return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+            };
+
+            ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+
+            // store last states
+            ggml_build_forward_expand(gf,
+                ggml_cpy(ctx0,
+                    ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
+                    ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+
+            ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0);
+
+            // TODO: skip computing output earlier for unused tokens
+
+            y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
+            y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
+
+            // grouped RMS norm
+            if (model.layers[il].ssm_norm) {
+                y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
+                y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
+            }
+
+            y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
+
+            // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+            cur = build_lora_mm(model.layers[il].ssm_out, y);
+        }
+
+        // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+        cb(cur, "mamba_out", il);
+        return cur;
+    }
+};
+
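Note: the comment block inside the conv section above describes ggml_ssm_conv as a dot product of the conv weights with a self-overlapping, stride-1 window over conv_x. A self-contained scalar sketch of that computation for a single channel (plain C++, for illustration only; names loosely mirror the diff):

    #include <cstddef>
    #include <vector>

    // x: one channel of conv_x, length (d_conv - 1 + n_seq_tokens), already
    // prefixed with the cached last (d_conv - 1) inputs; w: the d_conv weights
    // of this channel of ssm_conv1d.
    static std::vector<float> conv1d_channel(const std::vector<float> & x,
                                             const std::vector<float> & w) {
        const size_t d_conv = w.size();
        std::vector<float> y(x.size() - (d_conv - 1));
        for (size_t t = 0; t < y.size(); ++t) {
            float acc = 0.0f;
            for (size_t k = 0; k < d_conv; ++k) {
                acc += w[k] * x[t + k]; // self-overlapping window, stride 1
            }
            y[t] = acc;                 // dot product over the window
        }
        return y;
    }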
+struct llm_build_arcee : public llm_graph_context {
+    llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            // ARCEE uses relu^2 instead of silu
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    NULL, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
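Note: per the comment in llm_build_arcee above, ARCEE uses relu^2 instead of silu: build_ffn is called with LLM_FFN_RELU_SQR and LLM_FFN_SEQ, i.e. a sequential, non-gated MLP of the form down(relu(up(x))^2). A scalar sketch with toy weights standing in for the ffn_up/ffn_down matrices (illustration only):

    #include <algorithm>

    static float arcee_ffn(float x, float w_up, float w_down) {
        const float u = w_up * x;          // ffn_up projection
        const float r = std::max(u, 0.0f); // relu
        return w_down * (r * r);           // square, then ffn_down projection
    }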
+struct llm_build_hunyuan_moe : public llm_graph_context {
+    llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = build_norm(Kcur,
+                        model.layers[il].attn_k_norm, nullptr,
+                        LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_norm", il);
+
+                Qcur = build_norm(Qcur,
+                        model.layers[il].attn_q_norm, nullptr,
+                        LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_norm", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network (non-MoE)
+            ggml_tensor * cur_mlp = build_ffn(cur,
+                    model.layers[il].ffn_up_shexp,   NULL, NULL,
+                    model.layers[il].ffn_gate_shexp, NULL, NULL,
+                    model.layers[il].ffn_down_shexp, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur_mlp, "ffn_mlp", il);
+
+            // MoE branch
+            ggml_tensor * cur_moe = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU,
+                    true, // norm_topk_prob
+                    false,
+                    0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(cur_moe, "ffn_moe_out", il);
+
+            ggml_tensor * ffn_out = ggml_add(ctx0, cur_moe, cur_mlp);
+            cb(ffn_out, "ffn_out", il);
+
+            cur = ggml_add(ctx0, ffn_out, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
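Note: the Hunyuan-MoE FFN above sums an always-on shared SwiGLU branch (the *_shexp tensors) with a routed expert branch, and build_moe_ffn is invoked with softmax gating and norm_topk_prob = true. A self-contained toy of that gating step: softmax over the expert logits, keep the top-k probabilities, and renormalize them so the selected weights sum to 1 (illustration only, not the library routine):

    #include <algorithm>
    #include <cmath>
    #include <numeric>
    #include <utility>
    #include <vector>

    static std::vector<std::pair<int, float>> route_topk(const std::vector<float> & logits, int k) {
        // softmax over all experts (max-subtracted for numerical stability)
        std::vector<float> p(logits.size());
        const float m = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (size_t i = 0; i < logits.size(); ++i) { p[i] = std::exp(logits[i] - m); sum += p[i]; }
        for (float & v : p) { v /= sum; }

        // pick the k most probable experts
        std::vector<int> idx(logits.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                          [&](int a, int b) { return p[a] > p[b]; });

        // norm_topk_prob = true: renormalize the kept probabilities to sum to 1
        float topsum = 0.0f;
        for (int i = 0; i < k; ++i) { topsum += p[idx[i]]; }
        std::vector<std::pair<int, float>> out;
        for (int i = 0; i < k; ++i) { out.emplace_back(idx[i], p[idx[i]] / topsum); }
        return out;
    }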
+struct llm_build_smollm3 : public llm_graph_context {
+    llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                if (use_rope) {
+                    Qcur = ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, nullptr,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+
+                    Kcur = ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, nullptr,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
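Note: llm_build_smollm3 above skips RoPE on a fixed schedule ("NoPE"): use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0, so every n_no_rope_layer_step-th layer applies no positional rotation. A self-contained illustration with an assumed step of 4 (the actual value comes from the model hparams):

    #include <cstdio>

    int main() {
        const int n_layer = 8;
        const int n_no_rope_layer_step = 4; // assumed for this example
        for (int il = 0; il < n_layer; ++il) {
            const bool use_rope = (il + 1) % n_no_rope_layer_step != 0;
            std::printf("layer %d: %s\n", il, use_rope ? "rope" : "nope");
        }
        return 0; // layers 3 and 7 print "nope"
    }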
+llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
+    llama_memory_i * res;
+
+    switch (arch) {
+        // Models that need specific instantiation should be handled in the
+        // switch statement
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
+        case LLM_ARCH_NEO_BERT:
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            {
+                res = nullptr;
+            } break;
+        // Models that need standard caching should rely on recurrent/hybrid
+        // checks
+        default:
+            {
+                if (llm_arch_is_recurrent(arch)) {
+                    res = new llama_memory_recurrent(
+                            *this,
+                            nullptr,
+                            GGML_TYPE_F32,
+                            GGML_TYPE_F32,
+                            cparams.offload_kqv,
+                            std::max((uint32_t) 1, cparams.n_seq_max),
+                            cparams.n_seq_max);
+                } else if (llm_arch_is_hybrid(arch)) {
+                    const auto padding = llama_kv_cache_unified::get_padding(cparams);
+
+                    cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
+
+                    res = new llama_memory_hybrid(
+                            /* model */ *this,
+                            /* attn_type_k */ params.type_k,
+                            /* attn_type_v */ params.type_v,
+                            /* attn_v_trans */ !cparams.flash_attn,
+                            /* attn_kv_size */ cparams.n_ctx,
+                            /* attn_n_pad */ padding,
+                            /* attn_n_swa */ hparams.n_swa,
+                            /* attn_swa_type */ hparams.swa_type,
+                            /* recurrent_type_k */ GGML_TYPE_F32,
+                            /* recurrent_type_v */ GGML_TYPE_F32,
+                            /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
+                            /* n_seq_max */ cparams.n_seq_max,
+                            /* offload */ cparams.offload_kqv,
+                            /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
+                            /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
+                } else {
+                    const auto padding = llama_kv_cache_unified::get_padding(cparams);
+
+                    cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
+
+                    LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
+
+                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                        GGML_ASSERT(hparams.is_swa_any());
+
+                        res = new llama_kv_cache_unified_iswa(
+                                *this,
+                                params.type_k,
+                                params.type_v,
+                                !cparams.flash_attn,
+                                cparams.offload_kqv,
+                                params.swa_full,
+                                cparams.n_ctx,
+                                cparams.n_seq_max,
+                                cparams.n_ubatch,
+                                padding);
+                    } else {
+                        GGML_ASSERT(!hparams.is_swa_any());
+
+                        res = new llama_kv_cache_unified(
+                                *this,
+                                nullptr,
+                                params.type_k,
+                                params.type_v,
+                                !cparams.flash_attn,
+                                cparams.offload_kqv,
+                                cparams.n_ctx,
+                                cparams.n_seq_max,
+                                padding,
+                                hparams.n_swa,
+                                hparams.swa_type);
+                    }
+                }
+            }
+    }
+
+    return res;
 }

 llm_graph_result_ptr llama_model::build_graph(
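Note: create_memory above dispatches in a fixed order: encoder-style architectures get no KV memory at all, pure-recurrent architectures get llama_memory_recurrent, hybrid architectures get llama_memory_hybrid (Falcon-H1 passes pass-through layer filters so every layer gets both caches), and everything else gets a unified KV cache, in its iSWA variant when sliding-window attention is configured. A toy restatement of just that decision order, with illustrative stand-in types (the real function also pads cparams.n_ctx and forwards many parameters):

    enum class mem_kind { none, recurrent, hybrid, kv_unified_iswa, kv_unified };

    static mem_kind pick_memory(bool no_kv, bool is_recurrent, bool is_hybrid, bool has_swa) {
        if (no_kv)        return mem_kind::none;            // e.g. BERT-style encoders
        if (is_recurrent) return mem_kind::recurrent;       // e.g. Mamba / Mamba-2
        if (is_hybrid)    return mem_kind::hybrid;          // attention + recurrent state
        if (has_swa)      return mem_kind::kv_unified_iswa; // sliding-window attention
        return mem_kind::kv_unified;                        // default unified KV cache
    }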
@@ -14668,6 +15637,7 @@ llm_graph_result_ptr llama_model::build_graph(
                 llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
             } break;
         case LLM_ARCH_MAMBA:
+        case LLM_ARCH_MAMBA2:
             {
                 llm = std::make_unique<llm_build_mamba>(*this, params, gf);
             } break;
@@ -14812,6 +15782,18 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_ernie4_5>(*this, params, gf);
             } break;
+        case LLM_ARCH_HUNYUAN_MOE:
+            {
+                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params, gf);
+            } break;
+        case LLM_ARCH_SMOLLM3:
+            {
+                llm = std::make_unique<llm_build_smollm3>(*this, params, gf);
+            } break;
+        case LLM_ARCH_FALCON_H1:
+            {
+                llm = std::make_unique<llm_build_falcon_h1>(*this, params, gf);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -14928,6 +15910,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
         case LLM_ARCH_MAMBA:
+        case LLM_ARCH_MAMBA2:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_T5:
         case LLM_ARCH_T5ENCODER:
@@ -14962,12 +15945,14 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_BAILINGMOE:
        case LLM_ARCH_NEO_BERT:
+        case LLM_ARCH_SMOLLM3:
         case LLM_ARCH_ARCEE:
         case LLM_ARCH_ERNIE4_5:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
         case LLM_ARCH_FALCON:
+        case LLM_ARCH_FALCON_H1:
         case LLM_ARCH_GROK:
         case LLM_ARCH_DBRX:
         case LLM_ARCH_BERT:
@@ -14999,6 +15984,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_EXAONE:
         case LLM_ARCH_MINICPM3:
         case LLM_ARCH_DOTS1:
+        case LLM_ARCH_HUNYUAN_MOE:
             return LLAMA_ROPE_TYPE_NEOX;

         case LLM_ARCH_QWEN2VL: