@fugood/llama.node 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +12 -12
- package/src/llama.cpp/CMakeLists.txt +0 -1
- package/src/llama.cpp/common/arg.cpp +17 -0
- package/src/llama.cpp/common/chat.cpp +37 -20
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.h +4 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +7 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +181 -10
- package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +38 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1297 -211
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -9
- package/src/llama.cpp/include/llama.h +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +108 -2
- package/src/llama.cpp/src/llama-arch.h +7 -0
- package/src/llama.cpp/src/llama-batch.cpp +27 -1
- package/src/llama.cpp/src/llama-batch.h +8 -1
- package/src/llama.cpp/src/llama-chat.cpp +15 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +95 -81
- package/src/llama.cpp/src/llama-graph.h +43 -16
- package/src/llama.cpp/src/llama-hparams.cpp +2 -1
- package/src/llama.cpp/src/llama-hparams.h +1 -0
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
- package/src/llama.cpp/src/llama-kv-cache-unified.h +62 -24
- package/src/llama.cpp/src/llama-kv-cells.h +62 -10
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
- package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +34 -16
- package/src/llama.cpp/src/llama-memory.cpp +17 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model.cpp +1374 -210
- package/src/llama.cpp/src/llama-model.h +3 -0
- package/src/llama.cpp/src/llama-vocab.cpp +8 -1
- package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/src/llama.cpp/src/llama-model.cpp
@@ -47,6 +47,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_475M: return "475M";
         case LLM_TYPE_770M: return "770M";
         case LLM_TYPE_780M: return "780M";
+        case LLM_TYPE_0_3B: return "0.3B";
         case LLM_TYPE_0_5B: return "0.5B";
         case LLM_TYPE_0_6B: return "0.6B";
         case LLM_TYPE_1B: return "1B";
@@ -101,6 +102,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_57B_A14B: return "57B.A14B";
         case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
         case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
+        case LLM_TYPE_A13B: return "A13B";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
         case LLM_TYPE_235B_A22B: return "235B.A22B";
         case LLM_TYPE_E2B: return "E2B";
@@ -207,23 +209,27 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
                 } break;
             case GGML_OP_SSM_CONV:
                 {
-
-
+                    const int64_t n_seq_tokens = 512;
+                    const int64_t n_seqs = 3;
+                    ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
                     op_tensor = ggml_ssm_conv(ctx, conv_x, w);
                 } break;
             case GGML_OP_SSM_SCAN:
                 {
-                    //
-                    const int64_t d_state = w->ne[0];
-                    const int64_t
+                    // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
+                    const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
+                    const int64_t n_head = w->ne[1];
+                    const int64_t head_dim = hparams.ssm_d_inner / n_head;
+                    const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
                     const int64_t n_seq_tokens = 512;
-                    const int64_t n_seqs =
-                    ggml_tensor * s
-                    ggml_tensor * x
-                    ggml_tensor * dt
-                    ggml_tensor * B
-                    ggml_tensor * C
-
+                    const int64_t n_seqs = 3;
+                    ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
+                    ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
+                    ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
+                    ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
+                    ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
+                    ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
+                    op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
                 } break;
            case GGML_OP_RWKV_WKV6:
                {
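For readers following the Mamba-2 changes: the hunk above implies that ggml_ssm_scan now takes an extra `ids` tensor (sequence-to-state-row indices) and 4-D state/input operands. A minimal sketch of the operand shapes, assuming an initialized ggml_context `ctx` and the signature exactly as used in this diff; the concrete sizes are illustrative, not from the package:

    // Illustrative only: operand construction mirroring the dummy-op check above.
    const int64_t d_state = 128, head_dim = 64, n_head = 32, n_group = 1;
    const int64_t n_seq_tokens = 512, n_seqs = 3;
    ggml_tensor * A   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_head); // ssm_a; ne[0] == 1 marks Mamba-2
    ggml_tensor * s   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
    ggml_tensor * x   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
    ggml_tensor * dt  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
    ggml_tensor * B   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
    ggml_tensor * C   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
    ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);   // which cached state row each sequence reads
    ggml_tensor * y   = ggml_ssm_scan(ctx, s, x, dt, A, B, C, ids);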
@@ -1080,6 +1086,38 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_MAMBA2:
+            {
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 24:
+                        switch (hparams.n_embd) {
+                            case 768: type = LLM_TYPE_SMALL; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 48:
+                        switch (hparams.n_embd) {
+                            case 1024: type = LLM_TYPE_MEDIUM; break;
+                            case 1536: type = LLM_TYPE_LARGE; break;
+                            case 2048: type = LLM_TYPE_XL; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 64:
+                        switch (hparams.n_embd) {
+                            case 2560: type = LLM_TYPE_3B; break;
+                            case 4096: type = LLM_TYPE_7B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_XVERSE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1504,6 +1542,66 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_ERNIE4_5:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 18: type = LLM_TYPE_0_3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_FALCON_H1:
+            {
+                // Common parameters
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                // SSM parameters
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+                std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
+
+                switch (hparams.n_layer) {
+                    case 36:
+                        type = LLM_TYPE_0_5B; break;
+                    case 24:
+                        type = LLM_TYPE_1_5B; break;
+                    case 66:
+                        type = LLM_TYPE_1B; break;
+                    case 32:
+                        type = LLM_TYPE_3B; break;
+                    case 44:
+                        type = LLM_TYPE_7B; break;
+                    case 72:
+                        type = LLM_TYPE_34B; break;
+                    default:
+                        type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_HUNYUAN_MOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_A13B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_SMOLLM3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                hparams.n_no_rope_layer_step = 4;
+
+                switch (hparams.n_layer) {
+                    case 36: type = LLM_TYPE_3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
 
@@ -3111,6 +3209,54 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
                         layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
 
+                        // out_proj
+                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_MAMBA2:
+                {
+                    const int64_t d_conv = hparams.ssm_d_conv;
+                    const int64_t d_inner = hparams.ssm_d_inner;
+                    const int64_t d_state = hparams.ssm_d_state;
+                    const int64_t n_head = hparams.ssm_dt_rank;
+                    const int64_t n_group = hparams.ssm_n_group;
+                    const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
+
+                    // only an expansion factor of 2 is supported for now
+                    GGML_ASSERT(2 * n_embd == d_inner);
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    {
+                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
+                        if (output == NULL) {
+                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                        }
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        // norm
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
+
+                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
+                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0);
+
+                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0);
+
+                        // no "weight" suffix for these
+                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
+                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0);
+
+                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
+
                         // out_proj
                         layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
                     }
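As a sanity check on the in-projection width used above, here is the arithmetic worked through with hypothetical hyperparameters (the sizes are illustrative, not taken from the package):

    // Illustrative only: for a model with n_embd = 2048, the Mamba-2 block above implies
    //   d_inner   = 2*n_embd = 4096                      (expansion factor 2, per the GGML_ASSERT)
    //   d_in_proj = 2*d_inner + 2*n_group*d_state + n_head
    //             = 2*4096 + 2*1*128 + 128 = 8576        (with n_group = 1, d_state = 128, n_head = 128)
    // so ssm_in projects {n_embd, d_in_proj} = {2048, 8576}, which build_mamba2_layer
    // further below splits into z, xBC and dt at exactly these offsets.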
@@ -4344,6 +4490,183 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                         layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
 
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_ERNIE4_5:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        // optional bias tensors
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    }
+                } break;
+            case LLM_ARCH_FALCON_H1:
+                {
+                    // Common
+                    const int64_t hidden_size = hparams.n_embd; // hidden_size
+
+                    // mamba2 Mixer SSM params
+                    const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size
+                    const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups
+                    const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size
+                    const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand
+                    const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads
+                    const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size;
+                    const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads;
+
+                    // attn params
+                    const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head
+                    const int64_t attn_num_key_value_head = hparams.n_head_kv(0);
+
+                    // ffn params
+                    const int64_t ffn_intermediate_size = hparams.n_ff(0);
+
+                    // embeddings
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0);
+
+                    // output
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED);
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        /*SSM LAYERS*/
+                        // ssm in
+                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0);
+                        // ssm 1d conv
+                        layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0);
+                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED);
+                        // ssm_dt
+                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0);
+                        // no "weight" suffix for these
+                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
+                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
+                        // ssm_norm
+                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED);
+                        // out_proj
+                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);
+
+                        /*ATTENTION LAYERS*/
+                        // attention layers (with optional bias)
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0);
+
+
+                        // feed forward (w/ optional biases)
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0);
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0);
+
+                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED);
+                    }
+                } break;
+            case LLM_ARCH_HUNYUAN_MOE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_SMOLLM3:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     }
@@ -4587,10 +4910,14 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
         LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
         LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
+    }
+
+    if (arch == LLM_ARCH_MAMBA || arch == LLM_ARCH_MAMBA2) {
         LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
         LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
         LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
+        LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
         LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
 
         if (!classifier_labels.empty()) {
@@ -5539,12 +5866,10 @@ struct llm_build_falcon : public llm_graph_context {
                 cur = build_lora_mm(model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
 
-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
                 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 // using mode = 2 for neox mode
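This hunk and several like it below (dbrx, neo_bert, qwen, phi2, phi3, codeshell, gptneox, chatglm, glm4, openelm) apply the same refactor: instead of slicing the fused QKV output and then reshaping, Q and K are created directly as strided 3-D views. A rough sketch of the two formulations, under the assumption that the (truncated) deleted lines followed the old llama.cpp pattern of a 2-D slice plus reshape over a fused [Q | K | V] row layout:

    // Before (assumed old form): 2-D slice of the Q columns, then reshape
    ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0);
    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

    // After (as in the hunk): one strided 3-D view over the same bytes.
    // The result is non-contiguous, so downstream ops must tolerate such
    // views -- hence the MPT hunk further below adds ggml_cont() in the
    // branch that cannot keep them as views.
    Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens,
            n_embd_head*sizeof(float), cur->nb[1], 0);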
@@ -5821,12 +6146,10 @@ struct llm_build_dbrx : public llm_graph_context {
                 cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
                 cb(cur, "wqkv_clamped", il);
 
-                Qcur =
-                Kcur =
+                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
                 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 Qcur = ggml_rope_ext(
@@ -6337,12 +6660,10 @@ struct llm_build_neo_bert : public llm_graph_context {
                 cur = build_lora_mm(model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
 
-                Qcur =
-                Kcur =
+                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
                 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 // RoPE
@@ -6572,8 +6893,6 @@ struct llm_build_mpt : public llm_graph_context {
                     cb(cur, "wqkv_clamped", il);
                 }
 
-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
+                ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
                 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
 
                 cb(Qcur, "Qcur", il);
@@ -6593,6 +6914,12 @@ struct llm_build_mpt : public llm_graph_context {
                             model.layers[il].attn_k_norm_b,
                             LLM_NORM, il);
                     cb(Kcur, "Kcur", il);
+                } else {
+                    Qcur = ggml_cont(ctx0, Qcur);
+                    cb(Qcur, "Qcur", il);
+
+                    Kcur = ggml_cont(ctx0, Kcur);
+                    cb(Kcur, "Kcur", il);
                 }
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -6847,12 +7174,10 @@ struct llm_build_qwen : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);
 
-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
                 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 // using mode = 2 for neox mode
@@ -7617,21 +7942,21 @@ struct llm_build_phi2 : public llm_graph_context {
                     cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                     cb(cur, "bqkv", il);
 
-                    Qcur =
-                    Kcur =
+                    Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                    Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
                     Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
                 } else {
                     Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
                     Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
                     Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 }
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 Qcur = ggml_rope_ext(
@@ -7755,21 +8080,21 @@ struct llm_build_phi3 : public llm_graph_context {
                     cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
                     cb(cur, "wqkv", il);
 
-                    Qcur =
-                    Kcur =
+                    Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
+                    Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
                     Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
                 } else {
                     Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
                     Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
                     Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 }
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 Qcur = ggml_rope_ext(
@@ -8125,12 +8450,10 @@ struct llm_build_codeshell : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);
 
-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
                 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 Qcur = ggml_rope_ext(
@@ -8546,8 +8869,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
                         ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
                 cb(k_pe, "k_pe", il);
 
-                // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
-                kv_compressed = ggml_cont(ctx0, kv_compressed);
                 kv_compressed = build_norm(kv_compressed,
                         model.layers[il].attn_kv_a_norm, NULL,
                         LLM_NORM_RMS, il);
@@ -8574,12 +8895,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
                 v_states = ggml_cont(ctx0, v_states);
                 cb(v_states, "v_states", il);
 
-                v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
-                        ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
-                        0);
-                cb(v_states, "v_states", il);
-
-                q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
                 q_pe = ggml_rope_ext(
                         ctx0, q_pe, inp_pos, rope_factors,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -8588,7 +8903,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
                 cb(q_pe, "q_pe", il);
 
                 // shared RoPE key
-                k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
                 k_pe = ggml_rope_ext(
                         ctx0, k_pe, inp_pos, rope_factors,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9622,9 +9936,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
 };
 
 struct llm_build_mamba : public llm_graph_context {
-    const llama_model & model
-
-    llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
+    llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         ggml_tensor * cur;
         ggml_tensor * inpL;
 
@@ -9642,7 +9954,11 @@ struct llm_build_mamba : public llm_graph_context {
                     LLM_NORM_RMS, il);
             cb(cur, "attn_norm", il);
 
-
+            if (model.arch == LLM_ARCH_MAMBA2) {
+                cur = build_mamba2_layer(rs_inp, gf, cur, model, ubatch, il);
+            } else {
+                cur = build_mamba_layer(rs_inp, gf, cur, model, ubatch, il);
+            }
 
             if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
@@ -9676,11 +9992,11 @@ struct llm_build_mamba : public llm_graph_context {
         ggml_build_forward_expand(gf, cur);
     }
 
-    // TODO: split
     ggml_tensor * build_mamba_layer(
             llm_graph_input_rs * inp,
             ggml_cgraph * gf,
             ggml_tensor * cur,
+            const llama_model & model,
             const llama_ubatch & ubatch,
             int il) const {
         const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
@@ -9691,6 +10007,8 @@ struct llm_build_mamba : public llm_graph_context {
         const int64_t d_inner = hparams.ssm_d_inner;
         const int64_t d_state = hparams.ssm_d_state;
         const int64_t dt_rank = hparams.ssm_dt_rank;
+        const int64_t n_head = d_inner;
+        const int64_t head_dim = 1;
         const int64_t n_seqs = ubatch.n_seqs;
         // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
         const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
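A reading of the two added lines (interpretation, not code from the package): Mamba-1 has no multi-head structure, so to route it through the same ggml_ssm_scan kernel as Mamba-2, it is modeled as n_head == d_inner heads of size 1:

    // Mamba-1: x is {1, d_inner, n_seq_tokens, n_seqs}, ssm_a is {d_state, d_inner}
    // Mamba-2: x is {head_dim, n_head, n_seq_tokens, n_seqs}, ssm_a is {1, n_head}
    // This matches the weight_buft_supported hunk earlier, where ssm_a->ne[0] == 1
    // is the signal that distinguishes the two variants.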
const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
|
|
@@ -9706,15 +10024,8 @@ struct llm_build_mamba : public llm_graph_context {
|
|
|
9706
10024
|
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
|
9707
10025
|
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
|
9708
10026
|
|
|
9709
|
-
|
|
9710
|
-
ggml_tensor * conv = build_rs(
|
|
9711
|
-
inp, gf, conv_states_all,
|
|
9712
|
-
hparams.n_embd_r(), n_seqs);
|
|
10027
|
+
ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
|
|
9713
10028
|
conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
|
|
9714
|
-
ggml_tensor * ssm = build_rs(
|
|
9715
|
-
inp, gf, ssm_states_all,
|
|
9716
|
-
hparams.n_embd_s(), n_seqs);
|
|
9717
|
-
ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
|
|
9718
10029
|
|
|
9719
10030
|
// {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
|
|
9720
10031
|
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
|
|
@@ -9763,8 +10074,8 @@ struct llm_build_mamba : public llm_graph_context {
             ggml_tensor * x_db = build_lora_mm(model.layers[il].ssm_x, x);
             // split
             ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
-            ggml_tensor * B =
-            ggml_tensor * C =
+            ggml_tensor * B = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
+            ggml_tensor * C = ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state*x_db->nb[0], x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
 
             // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
             if (ssm_dt_b_c_rms) {
@@ -9777,32 +10088,174 @@ struct llm_build_mamba : public llm_graph_context {
             dt = build_lora_mm(model.layers[il].ssm_dt, dt);
             dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
 
-
-
-            // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
-            ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C);
+            cur = x;
+            x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
 
-
-
+            ggml_tensor * A = model.layers[il].ssm_a;
+
+            // use the states and the indices provided by build_recurrent_state
+            // (this is necessary in order to properly use the states before they are overwritten,
+            // while avoiding to make unnecessary copies of the states)
+            auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+                ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
+
+                // Custom operator to optimize the parallel associative scan
+                // as described in the Annex D of the Mamba paper.
+                // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+                return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+            };
+
+            ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+
+            // store last states
+            ggml_build_forward_expand(gf,
+                ggml_cpy(ctx0,
+                    ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
+                    ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+
+            ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0);
+
+            // TODO: skip computing output earlier for unused tokens
+
+            y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, model.layers[il].ssm_d));
+            y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
+
+            // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+            cur = build_lora_mm(model.layers[il].ssm_out, y);
+        }
+
+        // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+        // cb(cur, "mamba_out", il);
+
+        return cur;
+    }
+
+    ggml_tensor * build_mamba2_layer(
+            llm_graph_input_rs * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * cur,
+            const llama_model & model,
+            const llama_ubatch & ubatch,
+            int il) const {
+        const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
+
+        const auto kv_head = mctx_cur->get_head();
+
+        const int64_t d_conv = hparams.ssm_d_conv;
+        const int64_t d_inner = hparams.ssm_d_inner;
+        const int64_t d_state = hparams.ssm_d_state;
+        const int64_t n_head = hparams.ssm_dt_rank;
+        const int64_t head_dim = d_inner / n_head;
+        const int64_t n_group = hparams.ssm_n_group;
+        const int64_t n_seqs = ubatch.n_seqs;
+
+        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+        GGML_ASSERT(n_seqs != 0);
+        GGML_ASSERT(ubatch.equal_seqs);
+        GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+        ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+        ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
+
+        ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
+        conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
+
+        // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+        cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+        // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
+
+        // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
+        ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
+
+        // split the above in three
+        ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0);
+        ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt));
+        ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt));
+
+        // conv
+        {
+            // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
+            ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
+
+            // copy last (d_conv - 1) columns back into the state cache
+            ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
+
+            ggml_build_forward_expand(gf,
+                ggml_cpy(ctx0, last_conv,
+                    ggml_view_1d(ctx0, conv_states_all,
+                        (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
+                        kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
+
+            // 1D convolution
+            // The equivalent is to make a self-overlapping view of conv_x
+            // over d_conv columns at each stride in the 3rd dimension,
+            // then element-wise multiply that with the conv1d weight,
+            // then sum the elements of each row,
+            // (the last two steps are a dot product over rows (also doable with mul_mat))
+            // then permute away the ne[0] dimension,
+            // and then you're left with the resulting x tensor.
+            // For simultaneous sequences, all sequences need to have the same length.
+            xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
+
+            // bias
+            xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
+
+            xBC = ggml_silu(ctx0, xBC);
+        }
+
+        // ssm
+        {
+            // These correspond to V K Q in SSM/attention duality
+            ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0);
+            ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC));
+            ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC));
+
+            // {n_head, n_seq_tokens, n_seqs}
+            dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
+
+            ggml_tensor * A = model.layers[il].ssm_a;
+
+            // use the states and the indices provided by build_recurrent_state
+            // (this is necessary in order to properly use the states before they are overwritten,
+            // while avoiding to make unnecessary copies of the states)
+            auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+                ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
+
+                // TODO: use semistructured matrices to implement state-space duality
+                // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+                return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+            };
+
+            ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+
+            // store last states
+            ggml_build_forward_expand(gf,
                 ggml_cpy(ctx0,
-                    ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[
+                    ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
                     ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
 
-            ggml_tensor * y =
+            ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0);
 
             // TODO: skip computing output earlier for unused tokens
 
-            // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
             y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
             y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
 
+            // grouped RMS norm
+            y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
+            y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
+            y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
+
             // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
             cur = build_lora_mm(model.layers[il].ssm_out, y);
         }
 
         // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
         cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
-
+        cb(cur, "mamba_out", il);
 
         return cur;
     }
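One detail worth calling out in build_mamba2_layer above: the per-group RMS norm is implemented by reshaping rather than by a dedicated grouped-norm op. A hedged restatement of the trick, using the same calls as the hunk (shape comments are the editor's reading, not package text):

    // y starts as {d_inner, n_seq_tokens, n_seqs}; splitting ne[0] into
    // {d_inner/n_group, n_group} makes each group its own row, so a plain
    // row-wise RMS norm over ne[0] acts per group. ssm_norm is stored as
    // {d_inner / n_group, n_group} (see the load_tensors hunk), matching this layout.
    y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
    y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
    y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs); // back to flat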
@@ -10514,10 +10967,10 @@ struct llm_build_openelm : public llm_graph_context {
 
                 cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
 
-                ggml_tensor * Qcur =
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0);
                 cb(Qcur, "Qcur", il);
 
-                ggml_tensor * Kcur =
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
                 cb(Kcur, "Kcur", il);
 
                 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
@@ -10639,12 +11092,10 @@ struct llm_build_gptneox : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);
 
-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
                 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 Qcur = ggml_rope_ext(
@@ -11889,6 +12340,8 @@ struct llm_build_chatglm : public llm_graph_context {
                     if (model.layers[il].bv) {
                         Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                     }
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 } else {
                     cur = build_lora_mm(model.layers[il].wqkv, cur);
                     cb(cur, "wqkv", il);
@@ -11896,13 +12349,11 @@ struct llm_build_chatglm : public llm_graph_context {
                         cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                         cb(cur, "bqkv", il);
                     }
-                    Qcur =
-                    Kcur =
+                    Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                    Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
                     Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
                 }
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
@@ -12023,6 +12474,8 @@ struct llm_build_glm4 : public llm_graph_context {
                     if (model.layers[il].bv) {
                         Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                     }
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 } else {
                     cur = build_lora_mm(model.layers[il].wqkv, cur);
                     cb(cur, "wqkv", il);
@@ -12030,13 +12483,11 @@ struct llm_build_glm4 : public llm_graph_context {
                         cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                         cb(cur, "bqkv", il);
                     }
-                    Qcur =
-                    Kcur =
+                    Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                    Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
                     Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
                 }
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 Qcur = ggml_rope_ext(
@@ -14125,8 +14576,8 @@ struct llm_build_dots1 : public llm_graph_context {
     }
 };

-struct
-
+struct llm_build_ernie4_5 : public llm_graph_context {
+    llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;

         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -14142,25 +14593,19 @@ struct llm_build_arcee : public llm_graph_context {

         auto * inp_attn = build_attn_inp_kv_unified();

-        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
-
-        ggml_tensor * inp_out_ids = build_inp_out_ids();
-
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

             // norm
-
-
-
-
+            {
+                cur = build_norm(inpL,
+                        model.layers[il].attn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "attn_norm", il);
+            }

             // self-attention
             {
-                // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
-
-                // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
                 if (model.layers[il].bq) {
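
The new block normalizes with build_norm(..., LLM_NORM_RMS, il), i.e. root-mean-square normalization without mean subtraction. A scalar reference sketch of that formula (not the ggml kernel; eps stands in for the model's rms_norm_eps):

    #include <cmath>
    #include <vector>

    static std::vector<float> rms_norm(const std::vector<float> & x,
                                       const std::vector<float> & w, float eps) {
        double sum_sq = 0.0;
        for (float v : x) sum_sq += (double) v * v;   // mean of squares; no mean subtraction
        const float scale = 1.0f / std::sqrt((float) (sum_sq / x.size()) + eps);
        std::vector<float> y(x.size());
        for (size_t i = 0; i < x.size(); ++i) y[i] = x[i] * scale * w[i]; // per-channel gain w
        return y;
    }
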
@@ -14187,13 +14632,13 @@ struct llm_build_arcee : public llm_graph_context {
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                 Qcur = ggml_rope_ext(
-                        ctx0, Qcur, inp_pos,
+                        ctx0, Qcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                         );

                 Kcur = ggml_rope_ext(
-                        ctx0, Kcur, inp_pos,
+                        ctx0, Kcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                         );
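
The only change in this hunk is the new fourth argument to ggml_rope_ext: the frequency-factor tensor, passed as nullptr here. A scalar sketch of what that argument does to the per-pair rotation frequency, assuming the NORM (adjacent-pair) RoPE layout:

    #include <cmath>
    #include <vector>

    // Rotates one head of even dimension d in place. freq_factors == nullptr is
    // the plain path selected above; llama3-style scaling divides each pair's
    // frequency by its factor.
    static void rope_norm(std::vector<float> & x, int pos, float freq_base,
                          const std::vector<float> * freq_factors) {
        const int d = (int) x.size();
        for (int i = 0; i < d; i += 2) {
            float theta = std::pow(freq_base, -(float) i / d);  // per-pair frequency
            if (freq_factors) theta /= (*freq_factors)[i / 2];
            const float c = std::cos(pos * theta), s = std::sin(pos * theta);
            const float x0 = x[i], x1 = x[i + 1];
            x[i]     = x0 * c - x1 * s;
            x[i + 1] = x0 * s + x1 * c;
        }
    }
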
@@ -14203,12 +14648,13 @@ struct llm_build_arcee : public llm_graph_context {
                 cb(Vcur, "Vcur", il);

                 cur = build_attn(inp_attn, gf,
-                        model.layers[il].wo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr,
-                cb(cur, "attn_out", il);
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

-            if (il == n_layer - 1
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
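
Moving build_inp_out_ids() into the last-layer branch means rows are gathered only where logits are actually needed. Conceptually, the ggml_get_rows step does the following (a sketch over a flattened {n_embd, n_tokens} activation; names are illustrative):

    #include <vector>

    static std::vector<float> gather_rows(const std::vector<float> & act, int n_embd,
                                          const std::vector<int> & out_ids) {
        std::vector<float> out;
        out.reserve(out_ids.size() * (size_t) n_embd);
        for (int id : out_ids) // during decoding this is typically just the last token
            out.insert(out.end(), act.begin() + (size_t) id * n_embd,
                                  act.begin() + ((size_t) id + 1) * n_embd);
        return out;
    }
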
@@ -14217,22 +14663,22 @@ struct llm_build_arcee : public llm_graph_context {
             cb(ffn_inp, "ffn_inp", il);

             // feed-forward network
-
-
-
-
-
+            {
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);

-
-
-
-
-
-
-
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            }

             cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "ffn_out", il);

             cur = build_cvec(cur, il);
             cb(cur, "l_out", il);
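
The restored FFN block uses the parallel SiLU gate (LLM_FFN_SILU + LLM_FFN_PAR), i.e. down(silu(gate(x)) * up(x)). A one-lane scalar sketch of the elementwise part:

    #include <cmath>

    static inline float silu(float v) { return v / (1.0f + std::exp(-v)); }

    // One lane between the up/gate projections and ffn_down; the full block is
    // down(silu(gate(x)) * up(x)).
    static inline float ffn_silu_par(float gate_x, float up_x) {
        return silu(gate_x) * up_x;
    }
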
@@ -14260,104 +14706,800 @@ struct llm_build_arcee : public llm_graph_context {
     }
 };

-
-
+struct llm_build_falcon_h1 : public llm_graph_context {
+    const llama_model & model;

-
-
-        // switch statement
-        case LLM_ARCH_BERT:
-        case LLM_ARCH_JINA_BERT_V2:
-        case LLM_ARCH_NOMIC_BERT:
-        case LLM_ARCH_NOMIC_BERT_MOE:
-        case LLM_ARCH_NEO_BERT:
-        case LLM_ARCH_WAVTOKENIZER_DEC:
-            {
-                res = nullptr;
-            } break;
-        // Models that need standard caching should rely on recurrent/hybrid
-        // checks
-        default:
-            {
-                if (llm_arch_is_recurrent(arch)) {
-                    res = new llama_memory_recurrent(
-                            *this,
-                            nullptr,
-                            GGML_TYPE_F32,
-                            GGML_TYPE_F32,
-                            cparams.offload_kqv,
-                            std::max((uint32_t) 1, cparams.n_seq_max),
-                            cparams.n_seq_max);
-                } else if (llm_arch_is_hybrid(arch)) {
-                    const auto padding = llama_kv_cache_unified::get_padding(cparams);
+    llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;

-
+        ggml_tensor * cur;
+        ggml_tensor * inpL;

-
-                            /* model */ *this,
-                            /* attn_type_k */ params.type_k,
-                            /* attn_type_v */ params.type_v,
-                            /* attn_v_trans */ !cparams.flash_attn,
-                            /* attn_kv_size */ cparams.n_ctx,
-                            /* attn_n_pad */ padding,
-                            /* attn_n_swa */ hparams.n_swa,
-                            /* attn_swa_type */ hparams.swa_type,
-                            /* recurrent_type_k */ GGML_TYPE_F32,
-                            /* recurrent_type_v */ GGML_TYPE_F32,
-                            /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
-                            /* n_seq_max */ cparams.n_seq_max,
-                            /* offload */ cparams.offload_kqv);
-                } else {
-                    const auto padding = llama_kv_cache_unified::get_padding(cparams);
+        inpL = build_inp_embd(model.tok_embd);

-
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();

-
+        // Build the inputs in the recurrent & kv cache
+        auto * inp = build_inp_mem_hybrid();

-
-                    GGML_ASSERT(hparams.is_swa_any());
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

-
-                            *this,
-                            params.type_k,
-                            params.type_v,
-                            !cparams.flash_attn,
-                            cparams.offload_kqv,
-                            params.swa_full,
-                            cparams.n_ctx,
-                            cparams.n_seq_max,
-                            cparams.n_ubatch,
-                            padding);
-                    } else {
-                        GGML_ASSERT(!hparams.is_swa_any());
+        ggml_tensor * inp_out_ids = build_inp_out_ids();

-
-
-                            nullptr,
-                            params.type_k,
-                            params.type_v,
-                            !cparams.flash_attn,
-                            cparams.offload_kqv,
-                            cparams.n_ctx,
-                            cparams.n_seq_max,
-                            padding,
-                            hparams.n_swa,
-                            hparams.swa_type);
-                    }
-                }
-            }
-    }
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;

-
-
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);

-
-
-
-            llm_graph_type type) const {
-    std::unique_ptr<llm_graph_context> llm;
+            // self-attention
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);

-
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur-post-rope", il);
+            cb(Kcur, "Kcur-post-rope", il);
+            cb(Vcur, "Vcur-post-rope", il);
+
+            ggml_tensor * attn_out = build_attn(inp, gf,
+                    model.layers[il].wo, NULL,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+            cb(attn_out, "attn_out", il);
+
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            // Mamba2 layer
+            cb(cur, "ssm_in", il);
+
+            ggml_tensor * ssm_out = build_mamba2_layer(inp, gf, cur, ubatch, il);
+            cb(ssm_out, "ssm_out", il);
+
+            // // Aggregation
+            cur = ggml_add(ctx0, attn_out, ssm_out);
+            inpSA = ggml_add(ctx0, cur, inpSA);
+            cb(cur, "layer_out", il);
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = inpSA;
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, inpSA);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+
+    ggml_tensor * build_mamba2_layer(
+            llm_graph_input_mem_hybrid * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * cur,
+            const llama_ubatch & ubatch,
+            int il) const {
+        const auto * kv_state = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
+
+        const auto kv_head = kv_state->get_head();
+
+        const int64_t d_conv = hparams.ssm_d_conv;
+        const int64_t d_inner = hparams.ssm_d_inner;
+        const int64_t d_state = hparams.ssm_d_state;
+        const int64_t n_head = hparams.ssm_dt_rank;
+        const int64_t head_dim = d_inner / n_head;
+        const int64_t n_group = hparams.ssm_n_group;
+        const int64_t n_seqs = ubatch.n_seqs;
+
+        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+        GGML_ASSERT(n_seqs != 0);
+        GGML_ASSERT(ubatch.equal_seqs);
+        GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+        ggml_tensor * conv_states_all = kv_state->get_r_l(il);
+        ggml_tensor * ssm_states_all = kv_state->get_s_l(il);
+
+        ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs);
+        conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
+
+        // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+        cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+        // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
+
+        // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
+        ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
+        cb(zxBCdt, "zxBCdt", il);
+
+        // split the above in three
+        ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0);
+        ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt));
+        ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt));
+
+        // conv
+        {
+            // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
+            ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
+
+            // copy last (d_conv - 1) columns back into the state cache
+            ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
+
+            ggml_build_forward_expand(gf,
+                ggml_cpy(ctx0, last_conv,
+                    ggml_view_1d(ctx0, conv_states_all,
+                        (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
+                        kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
+
+            // 1D convolution
+            // The equivalent is to make a self-overlapping view of conv_x
+            // over d_conv columns at each stride in the 3rd dimension,
+            // then element-wise multiply that with the conv1d weight,
+            // then sum the elements of each row,
+            // (the last two steps are a dot product over rows (also doable with mul_mat))
+            // then permute away the ne[0] dimension,
+            // and then you're left with the resulting x tensor.
+            // For simultaneous sequences, all sequences need to have the same length.
+            xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
+
+            // bias
+            xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
+
+            xBC = ggml_silu(ctx0, xBC);
+        }
+
+        // ssm
+        {
+            // These correspond to V K Q in SSM/attention duality
+            ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0);
+
+            ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC));
+
+            ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC));
+
+            // {n_head, n_seq_tokens, n_seqs}
+            dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
+
+            ggml_tensor * A = model.layers[il].ssm_a;
+
+            // use the states and the indices provided by build_rs
+            // (this is necessary in order to properly use the states before they are overwritten,
+            // while avoiding to make unnecessary copies of the states)
+            auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+                ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, kv_state->get_size());
+
+                // TODO: use semistructured matrices to implement state-space duality
+                // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+                return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+            };
+
+            ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+
+            // store last states
+            ggml_build_forward_expand(gf,
+                ggml_cpy(ctx0,
+                    ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]),
+                    ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+
+            ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0);
+
+            // TODO: skip computing output earlier for unused tokens
+
+            y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
+            y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
+
+            // grouped RMS norm
+            if (model.layers[il].ssm_norm) {
+                y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
+                y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
+            }
+
+            y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
+
+            // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+            cur = build_lora_mm(model.layers[il].ssm_out, y);
+        }
+
+        // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+        cb(cur, "mamba_out", il);
+        return cur;
+    }
+};
+
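build_mamba2_layer carries a rolling convolution state between ubatches: the last (d_conv - 1) input columns are copied back into the cache after each graph, so the next ubatch can continue the causal 1-D convolution seamlessly. A single-channel sketch of that window, assuming w.size() == d_conv (illustrative, not the ggml kernel):

    #include <vector>

    struct ConvState {
        std::vector<float> tail; // cached last d_conv - 1 inputs, zero at start
        explicit ConvState(int d_conv) : tail(d_conv - 1, 0.0f) {}

        std::vector<float> step(const std::vector<float> & x, const std::vector<float> & w) {
            const int d_conv = (int) w.size();          // assume w.size() == tail.size() + 1
            std::vector<float> buf = tail;
            buf.insert(buf.end(), x.begin(), x.end());  // [cached columns | new tokens]
            std::vector<float> y(x.size());
            for (size_t t = 0; t < x.size(); ++t) {     // causal dot product over d_conv taps
                float acc = 0.0f;
                for (int k = 0; k < d_conv; ++k) acc += buf[t + k] * w[k];
                y[t] = acc;
            }
            tail.assign(buf.end() - (d_conv - 1), buf.end()); // roll the window forward
            return y;
        }
    };
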
+struct llm_build_arcee : public llm_graph_context {
+    llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            // ARCEE uses relu^2 instead of silu
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    NULL, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
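As the comment notes, ARCEE swaps SiLU gating for a sequential squared-ReLU FFN (LLM_FFN_RELU_SQR + LLM_FFN_SEQ): there is no gate branch, and the up projection is ReLU'd and squared before the down projection. One scalar lane of that activation:

    #include <algorithm>

    // One lane between ffn_up and ffn_down: relu(x)^2.
    static inline float relu_sqr(float up_x) {
        const float r = std::max(0.0f, up_x);
        return r * r;
    }
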
+struct llm_build_hunyuan_moe : public llm_graph_context {
+    llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = build_norm(Kcur,
+                        model.layers[il].attn_k_norm, nullptr,
+                        LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_norm", il);
+
+                Qcur = build_norm(Qcur,
+                        model.layers[il].attn_q_norm, nullptr,
+                        LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_norm", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network (non-MoE)
+            ggml_tensor * cur_mlp = build_ffn(cur,
+                    model.layers[il].ffn_up_shexp, NULL, NULL,
+                    model.layers[il].ffn_gate_shexp, NULL, NULL,
+                    model.layers[il].ffn_down_shexp, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur_mlp, "ffn_mlp", il);
+
+            // MoE branch
+            ggml_tensor * cur_moe = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU,
+                    true, // norm_topk_prob
+                    false,
+                    0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(cur_moe, "ffn_moe_out", il);
+
+            ggml_tensor * ffn_out = ggml_add(ctx0, cur_moe, cur_mlp);
+            cb(ffn_out, "ffn_out", il);
+
+            cur = ggml_add(ctx0, ffn_out, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
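llm_build_hunyuan_moe sums a dense shared expert with a softmax-routed top-k expert mixture, renormalizing the selected gate probabilities (norm_topk_prob = true above). A compact sketch of that combination with hypothetical expert/gate callbacks (illustrative only; build_moe_ffn does this on tensors):

    #include <algorithm>
    #include <cmath>
    #include <functional>
    #include <vector>

    static std::vector<float> moe_with_shared(
            const std::vector<float> & x,
            const std::function<std::vector<float>(const std::vector<float>&)> & shared,
            const std::vector<std::function<std::vector<float>(const std::vector<float>&)>> & experts,
            std::vector<float> gate_logits, int k) {
        // softmax over the router logits (LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX)
        float m = *std::max_element(gate_logits.begin(), gate_logits.end());
        float denom = 0.0f;
        for (float & g : gate_logits) { g = std::exp(g - m); denom += g; }
        for (float & g : gate_logits) g /= denom;
        // pick top-k experts and renormalize their probabilities (norm_topk_prob)
        std::vector<int> idx(experts.size());
        for (size_t i = 0; i < idx.size(); ++i) idx[i] = (int) i;
        std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                          [&](int a, int b) { return gate_logits[a] > gate_logits[b]; });
        float topk_sum = 0.0f;
        for (int i = 0; i < k; ++i) topk_sum += gate_logits[idx[i]];
        // dense shared expert plus the weighted expert mixture
        std::vector<float> y = shared(x);
        for (int i = 0; i < k; ++i) {
            std::vector<float> e = experts[idx[i]](x);
            for (size_t j = 0; j < y.size(); ++j) y[j] += gate_logits[idx[i]] / topk_sum * e[j];
        }
        return y;
    }
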
+struct llm_build_smollm3 : public llm_graph_context {
+    llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                if (use_rope) {
+                    Qcur = ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, nullptr,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+
+                    Kcur = ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, nullptr,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
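llm_build_smollm3 applies RoPE only when (il + 1) % n_no_rope_layer_step != 0, so every n-th layer is a NoPE (no positional encoding) layer. The predicate in isolation; e.g. with a step of 4, layers il = 3, 7, 11, ... skip RoPE:

    static bool layer_uses_rope(int il, int n_no_rope_layer_step) {
        return (il + 1) % n_no_rope_layer_step != 0; // step 4 -> layers 3, 7, 11, ... are NoPE
    }
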
+llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
+    llama_memory_i * res;
+
+    switch (arch) {
+        // Models that need specific instantiation should be handled in the
+        // switch statement
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
+        case LLM_ARCH_NEO_BERT:
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            {
+                res = nullptr;
+            } break;
+        // Models that need standard caching should rely on recurrent/hybrid
+        // checks
+        default:
+            {
+                if (llm_arch_is_recurrent(arch)) {
+                    res = new llama_memory_recurrent(
+                            *this,
+                            nullptr,
+                            GGML_TYPE_F32,
+                            GGML_TYPE_F32,
+                            cparams.offload_kqv,
+                            std::max((uint32_t) 1, cparams.n_seq_max),
+                            cparams.n_seq_max);
+                } else if (llm_arch_is_hybrid(arch)) {
+                    const auto padding = llama_kv_cache_unified::get_padding(cparams);
+
+                    cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
+
+                    res = new llama_memory_hybrid(
+                            /* model */ *this,
+                            /* attn_type_k */ params.type_k,
+                            /* attn_type_v */ params.type_v,
+                            /* attn_v_trans */ !cparams.flash_attn,
+                            /* attn_kv_size */ cparams.n_ctx,
+                            /* attn_n_pad */ padding,
+                            /* attn_n_swa */ hparams.n_swa,
+                            /* attn_swa_type */ hparams.swa_type,
+                            /* recurrent_type_k */ GGML_TYPE_F32,
+                            /* recurrent_type_v */ GGML_TYPE_F32,
+                            /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
+                            /* n_seq_max */ cparams.n_seq_max,
+                            /* offload */ cparams.offload_kqv,
+                            /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
+                            /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
+                } else {
+                    const auto padding = llama_kv_cache_unified::get_padding(cparams);
+
+                    cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
+
+                    LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
+
+                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                        GGML_ASSERT(hparams.is_swa_any());
+
+                        res = new llama_kv_cache_unified_iswa(
+                                *this,
+                                params.type_k,
+                                params.type_v,
+                                !cparams.flash_attn,
+                                cparams.offload_kqv,
+                                params.swa_full,
+                                cparams.n_ctx,
+                                cparams.n_seq_max,
+                                cparams.n_ubatch,
+                                padding);
+                    } else {
+                        GGML_ASSERT(!hparams.is_swa_any());
+
+                        res = new llama_kv_cache_unified(
+                                *this,
+                                nullptr,
+                                params.type_k,
+                                params.type_v,
+                                !cparams.flash_attn,
+                                cparams.offload_kqv,
+                                cparams.n_ctx,
+                                cparams.n_seq_max,
+                                padding,
+                                hparams.n_swa,
+                                hparams.swa_type);
+                    }
+                }
+            }
+    }
+
+    return res;
+}
+
+llm_graph_result_ptr llama_model::build_graph(
+        const llm_graph_params & params,
+        ggml_cgraph * gf,
+        llm_graph_type type) const {
+    std::unique_ptr<llm_graph_context> llm;
+
+    switch (arch) {
         case LLM_ARCH_LLAMA:
             {
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
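
Both new create_memory branches round the context length up to the KV-cache padding with GGML_PAD before allocating. The macro is the usual round-up-to-multiple; as plain arithmetic:

    #include <cstdint>

    static uint32_t pad_ctx(uint32_t n_ctx, uint32_t padding) {
        return ((n_ctx + padding - 1) / padding) * padding; // pad_ctx(4097, 256) == 4352
    }
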
@@ -14495,6 +15637,7 @@ llm_graph_result_ptr llama_model::build_graph(
                 llm = std::make_unique<llm_build_starcoder2>(*this, params, gf);
             } break;
         case LLM_ARCH_MAMBA:
+        case LLM_ARCH_MAMBA2:
             {
                 llm = std::make_unique<llm_build_mamba>(*this, params, gf);
             } break;
@@ -14635,6 +15778,22 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_arcee>(*this, params, gf);
             } break;
+        case LLM_ARCH_ERNIE4_5:
+            {
+                llm = std::make_unique<llm_build_ernie4_5>(*this, params, gf);
+            } break;
+        case LLM_ARCH_HUNYUAN_MOE:
+            {
+                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params, gf);
+            } break;
+        case LLM_ARCH_SMOLLM3:
+            {
+                llm = std::make_unique<llm_build_smollm3>(*this, params, gf);
+            } break;
+        case LLM_ARCH_FALCON_H1:
+            {
+                llm = std::make_unique<llm_build_falcon_h1>(*this, params, gf);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -14751,6 +15910,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
         case LLM_ARCH_MAMBA:
+        case LLM_ARCH_MAMBA2:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_T5:
         case LLM_ARCH_T5ENCODER:
@@ -14785,11 +15945,14 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_BAILINGMOE:
         case LLM_ARCH_NEO_BERT:
+        case LLM_ARCH_SMOLLM3:
         case LLM_ARCH_ARCEE:
+        case LLM_ARCH_ERNIE4_5:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
         case LLM_ARCH_FALCON:
+        case LLM_ARCH_FALCON_H1:
         case LLM_ARCH_GROK:
         case LLM_ARCH_DBRX:
         case LLM_ARCH_BERT:
@@ -14821,6 +15984,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_EXAONE:
         case LLM_ARCH_MINICPM3:
         case LLM_ARCH_DOTS1:
+        case LLM_ARCH_HUNYUAN_MOE:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL: