@fugood/llama.node 1.1.8 → 1.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +9 -0
- package/lib/index.js +9 -2
- package/lib/index.ts +57 -30
- package/lib/version.js +2 -2
- package/lib/version.ts +2 -2
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +15 -5
- package/src/LlamaCompletionWorker.cpp +12 -3
- package/src/LlamaCompletionWorker.h +3 -1
- package/src/LlamaContext.cpp +14 -1
- package/src/llama.cpp/common/arg.cpp +6 -4
- package/src/llama.cpp/common/chat.cpp +34 -3
- package/src/llama.cpp/common/common.cpp +0 -15
- package/src/llama.cpp/common/common.h +1 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +25 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/include/llama.h +1 -110
- package/src/llama.cpp/src/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/llama-arch.cpp +19 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +13 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +5 -192
- package/src/llama.cpp/src/llama-context.h +2 -7
- package/src/llama.cpp/src/llama-cparams.h +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +35 -57
- package/src/llama.cpp/src/llama-graph.h +36 -46
- package/src/llama.cpp/src/llama-hparams.cpp +25 -0
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +69 -52
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +28 -26
- package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +123 -474
- package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +34 -59
- package/src/llama.cpp/src/llama-kv-cells.h +21 -21
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +34 -33
- package/src/llama.cpp/src/llama-memory-hybrid.h +24 -28
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
- package/src/llama.cpp/src/llama-memory-recurrent.h +8 -12
- package/src/llama.cpp/src/llama-memory.h +11 -8
- package/src/llama.cpp/src/llama-model.cpp +396 -187
- package/src/llama.cpp/src/llama-model.h +1 -0
package/src/llama.cpp/src/llama-model.cpp

@@ -6,8 +6,8 @@
 #include "llama-cparams.h"
 #include "llama-model-loader.h"
 
-#include "llama-kv-cache
-#include "llama-kv-cache-
+#include "llama-kv-cache.h"
+#include "llama-kv-cache-iswa.h"
 #include "llama-memory-hybrid.h"
 #include "llama-memory-recurrent.h"
 
@@ -83,6 +83,7 @@ const char * llm_type_name(llm_type type) {
 case LLM_TYPE_32B: return "32B";
 case LLM_TYPE_34B: return "34B";
 case LLM_TYPE_35B: return "35B";
+case LLM_TYPE_36B: return "36B";
 case LLM_TYPE_40B: return "40B";
 case LLM_TYPE_65B: return "65B";
 case LLM_TYPE_70B: return "70B";
@@ -1114,6 +1115,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
 hparams.set_swa_pattern(5);
 
+hparams.n_layer_kv_from_start = 20;
 hparams.rope_freq_base_train_swa = 10000.0f;
 hparams.rope_freq_scale_train_swa = 1.0f;
 hparams.f_attention_scale = 1.0f;
@@ -1288,6 +1290,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 default: type = LLM_TYPE_UNKNOWN;
 }
 } break;
+case LLM_ARCH_SEED_OSS:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+switch (hparams.n_layer) {
+case 64: type = LLM_TYPE_36B; break;
+default: type = LLM_TYPE_UNKNOWN;
+}
+} break;
 case LLM_ARCH_OLMOE:
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1465,12 +1475,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 // Expert gating function (GLM-4.5 uses sigmoid)
 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
 if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
-hparams.expert_gating_func =
+hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
 }
 
 // NextN/MTP parameters
 ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
 
+// TODO: when MTP is implemented, this should probably be updated if needed
+hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
 switch (hparams.n_layer) {
 case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
 case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
@@ -3967,6 +3980,43 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
 }
 } break;
+case LLM_ARCH_SEED_OSS:
+{
+const uint32_t head_dim = hparams.n_embd_head_k;
+const int64_t n_qo_dim = n_head * head_dim;
+const int64_t n_kv_dim = n_head_kv * head_dim;
+
+tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+// output
+output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+// if output is NULL, init from the input tok embed
+if (output == NULL) {
+output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+}
+
+for (int i = 0; i < n_layer; ++i) {
+auto & layer = layers[i];
+
+layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
+layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
+layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
+layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
+
+layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
+layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
+layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
+
+layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+}
+} break;
+
 case LLM_ARCH_OLMOE:
 {
 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5474,8 +5524,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 } break;
 case LLM_ARCH_LFM2:
 {
-tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,
+tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+if (output == NULL) {
+output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+}
 
 for (int i = 0; i < n_layer; ++i) {
 auto & layer = layers[i];
@@ -5986,7 +6041,7 @@ struct llm_build_llama : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -6050,7 +6105,7 @@ struct llm_build_llama : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
 cb(cur, "attn_out", il);
 }
 
@@ -6146,7 +6201,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
 ggml_tensor * inp_attn_scale = nullptr;
 inp_attn_scale = build_inp_attn_scale();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv_iswa();
 
 const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -6224,7 +6279,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
 cb(cur, "attn_out", il);
 }
 
@@ -6325,7 +6380,7 @@ struct llm_build_deci : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -6401,7 +6456,7 @@ struct llm_build_deci : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -6481,7 +6536,7 @@ struct llm_build_baichuan : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -6533,7 +6588,7 @@ struct llm_build_baichuan : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -6603,7 +6658,7 @@ struct llm_build_xverse : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -6648,7 +6703,7 @@ struct llm_build_xverse : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -6717,7 +6772,7 @@ struct llm_build_falcon : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -6771,7 +6826,7 @@ struct llm_build_falcon : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -6841,7 +6896,7 @@ struct llm_build_grok : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -6901,7 +6956,7 @@ struct llm_build_grok : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -7001,7 +7056,7 @@ struct llm_build_dbrx : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -7050,7 +7105,7 @@ struct llm_build_dbrx : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -7125,7 +7180,7 @@ struct llm_build_starcoder : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
 cb(pos, "pos_embd", -1);
@@ -7164,7 +7219,7 @@ struct llm_build_starcoder : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -7230,7 +7285,7 @@ struct llm_build_refact : public llm_graph_context {
 
 inpL = build_inp_embd(model.tok_embd);
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -7263,7 +7318,7 @@ struct llm_build_refact : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -7426,7 +7481,7 @@ struct llm_build_bert : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 cb(cur, "kqv_out", il);
 }
 
@@ -7571,7 +7626,7 @@ struct llm_build_neo_bert : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, nullptr,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 cb(cur, "kqv_out", il);
 }
 
@@ -7632,7 +7687,7 @@ struct llm_build_bloom : public llm_graph_context {
 
 inpL = build_inp_embd(model.tok_embd);
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 inpL = build_norm(inpL,
 model.tok_norm,
@@ -7671,7 +7726,7 @@ struct llm_build_bloom : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -7739,7 +7794,7 @@ struct llm_build_mpt : public llm_graph_context {
 
 inpL = build_inp_embd(model.tok_embd);
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 if (model.pos_embd) {
 // inp_pos - contains the positions
@@ -7819,7 +7874,7 @@ struct llm_build_mpt : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -7889,7 +7944,7 @@ struct llm_build_stablelm : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -7965,7 +8020,7 @@ struct llm_build_stablelm : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -8041,7 +8096,7 @@ struct llm_build_qwen : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -8086,7 +8141,7 @@ struct llm_build_qwen : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -8156,7 +8211,7 @@ struct llm_build_qwen2 : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -8206,7 +8261,7 @@ struct llm_build_qwen2 : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -8320,8 +8375,9 @@ struct llm_build_dream : public llm_graph_context {
 cb(Kcur, "Kcur", il);
 cb(Vcur, "Vcur", il);
 
-cur = build_attn(inp_attn,
-
+cur = build_attn(inp_attn,
+model.layers[il].wo, model.layers[il].bo,
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -8420,8 +8476,9 @@ struct llm_build_llada : public llm_graph_context {
 cb(Kcur, "Kcur", il);
 cb(Vcur, "Vcur", il);
 
-cur = build_attn(inp_attn,
-
+cur = build_attn(inp_attn,
+model.layers[il].wo, NULL,
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -8481,7 +8538,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 int sections[4];
 std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
@@ -8534,7 +8591,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -8602,7 +8659,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -8661,7 +8718,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -8761,7 +8818,7 @@ struct llm_build_qwen3 : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -8814,7 +8871,7 @@ struct llm_build_qwen3 : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -8882,7 +8939,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -8935,7 +8992,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -9012,7 +9069,7 @@ struct llm_build_phi2 : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -9075,7 +9132,7 @@ struct llm_build_phi2 : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -9141,13 +9198,13 @@ struct llm_build_phi3 : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-using inp_attn_type = std::conditional_t<iswa,
+using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
 inp_attn_type * inp_attn = nullptr;
 
 if constexpr (iswa) {
-inp_attn =
+inp_attn = build_attn_inp_kv_iswa();
 } else {
-inp_attn =
+inp_attn = build_attn_inp_kv();
 }
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -9212,7 +9269,7 @@ struct llm_build_phi3 : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -9299,7 +9356,7 @@ struct llm_build_plamo : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -9346,7 +9403,7 @@ struct llm_build_plamo : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -9415,7 +9472,7 @@ struct llm_build_gpt2 : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
 cb(pos, "pos_embd", -1);
@@ -9454,7 +9511,7 @@ struct llm_build_gpt2 : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -9525,7 +9582,7 @@ struct llm_build_codeshell : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -9568,7 +9625,7 @@ struct llm_build_codeshell : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -9638,7 +9695,7 @@ struct llm_build_orion : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -9697,7 +9754,7 @@ struct llm_build_orion : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -9765,7 +9822,7 @@ struct llm_build_internlm2 : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -9824,7 +9881,7 @@ struct llm_build_internlm2 : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -9901,7 +9958,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -10012,7 +10069,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
+q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -10096,7 +10153,7 @@ struct llm_build_gemma : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -10142,7 +10199,7 @@ struct llm_build_gemma : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -10212,7 +10269,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv_iswa();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -10257,7 +10314,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -10346,7 +10403,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
 ggml_tensor * inp_pos = build_inp_pos();
 
 // TODO: is causal == true correct? might need some changes
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv_iswa();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -10399,7 +10456,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -10471,7 +10528,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
 const int64_t n_embd_altup;
 const int64_t n_altup;
 const int i_altup_act;
-const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
 const int n_layer_sparsity = 10; // number of layers using activation sparsity
 const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
 
@@ -10497,7 +10553,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
 ggml_tensor * inp_pos = build_inp_pos();
 
 // TODO: is causal == true correct? might need some changes
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv_iswa();
 
 // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
 ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
@@ -10521,8 +10577,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
 
 for (int il = 0; il < n_layer; ++il) {
 // this block is made to be closely resemble Gemma3p5DecoderLayer on python code
-const bool has_kv = (il < n_layer_kv);
-
 const float freq_base_l = model.get_rope_freq_base (cparams, il);
 const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
 
@@ -10542,7 +10596,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
 ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]
 
 // self-attention
-if (has_kv) {
+if (hparams.has_kv(il)) {
 // compute Q and K and RoPE them
 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
@@ -10580,9 +10634,9 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
 } else {
-//
+// reuse KV cache of earlier layers
 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
 cb(Qcur, "Qcur", il);
 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -10598,7 +10652,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
+Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
 }
 
 cur = build_norm(cur,
@@ -10904,7 +10958,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -10963,7 +11017,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -11390,7 +11444,9 @@ struct llm_build_jamba : public llm_graph_context_mamba {
 cb(Vcur, "Vcur", il);
 
 // No RoPE :)
-cur = build_attn(inp_hybrid->get_attn(),
+cur = build_attn(inp_hybrid->get_attn(),
+model.layers[il].wo, NULL,
+Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -11473,7 +11529,7 @@ struct llm_build_command_r : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -11548,7 +11604,7 @@ struct llm_build_command_r : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -11620,7 +11676,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv_iswa();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -11683,7 +11739,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -11755,7 +11811,7 @@ struct llm_build_olmo : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -11814,7 +11870,7 @@ struct llm_build_olmo : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, nullptr,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -11883,7 +11939,7 @@ struct llm_build_olmo2 : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -11934,7 +11990,7 @@ struct llm_build_olmo2 : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -12012,7 +12068,7 @@ struct llm_build_olmoe : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -12067,7 +12123,7 @@ struct llm_build_olmoe : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -12138,7 +12194,7 @@ struct llm_build_openelm : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -12200,7 +12256,7 @@ struct llm_build_openelm : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -12269,7 +12325,7 @@ struct llm_build_gptneox : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -12312,7 +12368,7 @@ struct llm_build_gptneox : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -12415,7 +12471,7 @@ struct llm_build_arctic : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -12462,7 +12518,7 @@ struct llm_build_arctic : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -12553,7 +12609,7 @@ struct llm_build_deepseek : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -12617,7 +12673,7 @@ struct llm_build_deepseek : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -12730,7 +12786,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -12845,7 +12901,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
 // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
 } else {
 ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
 cb(kv, "kv", il);
@@ -12879,7 +12935,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
 // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
 }
 }
 
@@ -12977,7 +13033,7 @@ struct llm_build_bitnet : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -13046,7 +13102,7 @@ struct llm_build_bitnet : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 NULL, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 
 cur = build_norm(cur,
 model.layers[il].attn_sub_norm, NULL,
@@ -13169,7 +13225,7 @@ struct llm_build_t5_enc : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo_enc, nullptr,
-Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
+Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -13241,7 +13297,7 @@ struct llm_build_t5_dec : public llm_graph_context {
 
 const int64_t n_outputs_enc = embd_enc->ne[1];
 
-auto * inp_attn_self =
+auto * inp_attn_self = build_attn_inp_kv();
 auto * inp_attn_cross = build_attn_inp_cross();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -13275,7 +13331,7 @@ struct llm_build_t5_dec : public llm_graph_context {
 
 cur = build_attn(inp_attn_self,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
+Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
 cb(cur, "kqv_out", il);
 }
 
@@ -13307,7 +13363,7 @@ struct llm_build_t5_dec : public llm_graph_context {
 
 cur = build_attn(inp_attn_cross,
 model.layers[il].wo_cross, nullptr,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
 cb(cur, "kqv_out", il);
 
 //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -13406,7 +13462,7 @@ struct llm_build_jais : public llm_graph_context {
 
 inpL = build_inp_embd(model.tok_embd);
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -13439,7 +13495,7 @@ struct llm_build_jais : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -13504,7 +13560,7 @@ struct llm_build_chatglm : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -13571,7 +13627,7 @@ struct llm_build_chatglm : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -13637,7 +13693,7 @@ struct llm_build_glm4 : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
-auto * inp_attn =
+auto * inp_attn = build_attn_inp_kv();
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -13704,7 +13760,7 @@ struct llm_build_glm4 : public llm_graph_context {
 
 cur = build_attn(inp_attn,
 model.layers[il].wo, NULL,
-Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 }
 
 if (il == n_layer - 1 && inp_out_ids) {
@@ -13787,7 +13843,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
|
|
|
13787
13843
|
// inp_pos - contains the positions
|
|
13788
13844
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
13789
13845
|
|
|
13790
|
-
auto * inp_attn =
|
|
13846
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
13791
13847
|
|
|
13792
13848
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13793
13849
|
|
|
@@ -13853,7 +13909,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
|
|
|
13853
13909
|
|
|
13854
13910
|
cur = build_attn(inp_attn,
|
|
13855
13911
|
model.layers[il].wo, NULL,
|
|
13856
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13912
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
13857
13913
|
}
|
|
13858
13914
|
|
|
13859
13915
|
if (il == n_transformer_layers - 1 && inp_out_ids) {
|
|
@@ -13947,7 +14003,7 @@ struct llm_build_nemotron : public llm_graph_context {
|
|
|
13947
14003
|
// inp_pos - contains the positions
|
|
13948
14004
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
13949
14005
|
|
|
13950
|
-
auto * inp_attn =
|
|
14006
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
13951
14007
|
|
|
13952
14008
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13953
14009
|
|
|
@@ -14007,7 +14063,7 @@ struct llm_build_nemotron : public llm_graph_context {
|
|
|
14007
14063
|
|
|
14008
14064
|
cur = build_attn(inp_attn,
|
|
14009
14065
|
model.layers[il].wo, model.layers[il].bo,
|
|
14010
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
14066
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
14011
14067
|
}
|
|
14012
14068
|
|
|
14013
14069
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -14076,7 +14132,7 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
14076
14132
|
// inp_pos - contains the positions
|
|
14077
14133
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
14078
14134
|
|
|
14079
|
-
auto * inp_attn =
|
|
14135
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
14080
14136
|
|
|
14081
14137
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
14082
14138
|
|
|
@@ -14138,7 +14194,7 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
14138
14194
|
|
|
14139
14195
|
cur = build_attn(inp_attn,
|
|
14140
14196
|
model.layers[il].wo, model.layers[il].bo,
|
|
14141
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
14197
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
14142
14198
|
}
|
|
14143
14199
|
|
|
14144
14200
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -14208,13 +14264,13 @@ struct llm_build_exaone4 : public llm_graph_context {
|
|
|
14208
14264
|
// inp_pos - contains the positions
|
|
14209
14265
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
14210
14266
|
|
|
14211
|
-
using inp_attn_type = std::conditional_t<iswa,
|
|
14267
|
+
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
|
|
14212
14268
|
inp_attn_type * inp_attn = nullptr;
|
|
14213
14269
|
|
|
14214
14270
|
if constexpr (iswa) {
|
|
14215
|
-
inp_attn =
|
|
14271
|
+
inp_attn = build_attn_inp_kv_iswa();
|
|
14216
14272
|
} else {
|
|
14217
|
-
inp_attn =
|
|
14273
|
+
inp_attn = build_attn_inp_kv();
|
|
14218
14274
|
}
|
|
14219
14275
|
|
|
14220
14276
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
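Note: llm_build_exaone4 above (and llm_build_smallthinker later in this file) is templated on a bool iswa and selects the attention-input type at compile time via std::conditional_t, so one builder body serves both the sliding-window and the regular KV-cache layout. A minimal, self-contained sketch of that selection pattern, using placeholder types rather than the real llm_graph_input_attn_kv classes:

    #include <cstdio>
    #include <type_traits>

    struct attn_input_iswa { static constexpr const char * name = "iswa";    };
    struct attn_input_full { static constexpr const char * name = "full kv"; };

    // Pick the attention-input type from the template parameter, as the builders above do.
    template <bool iswa>
    void build_graph() {
        using inp_attn_type = std::conditional_t<iswa, attn_input_iswa, attn_input_full>;
        std::printf("building with %s attention input\n", inp_attn_type::name);
    }

    int main() {
        build_graph<true>();   // prints: building with iswa attention input
        build_graph<false>();  // prints: building with full kv attention input
    }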
@@ -14269,7 +14325,7 @@ struct llm_build_exaone4 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
                 cb(cur, "attn_out", il);
             }
 
@@ -15097,7 +15153,7 @@ struct llm_build_granite : public llm_graph_context {
             inp_pos = build_inp_pos();
         }
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -15148,12 +15204,12 @@ struct llm_build_granite : public llm_graph_context {
     }
 
     ggml_tensor * build_attention_layer(
-              ggml_tensor * cur,
-              ggml_tensor * inp_pos,
-              llm_graph_input_attn_kv_unified * inp_attn,
-        const llama_model & model,
-        const int64_t n_embd_head,
-        const int il) {
+              ggml_tensor * cur,
+              ggml_tensor * inp_pos,
+              llm_graph_input_attn_kv * inp_attn,
+        const llama_model & model,
+        const int64_t n_embd_head,
+        const int il) {
 
         // compute Q and K and (optionally) RoPE them
         ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15204,7 +15260,7 @@ struct llm_build_granite : public llm_graph_context {
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         cur = build_attn(inp_attn,
                 model.layers[il].wo, model.layers[il].bo,
-                Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
         cb(cur, "attn_out", il);
         return cur;
     }
@@ -15367,12 +15423,12 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
     }
 
     ggml_tensor * build_attention_layer(
-              ggml_tensor * cur,
-              ggml_tensor * inp_pos,
-              llm_graph_input_attn_kv_unified * inp_attn,
-        const llama_model & model,
-        const int64_t n_embd_head,
-        const int il) {
+              ggml_tensor * cur,
+              ggml_tensor * inp_pos,
+              llm_graph_input_attn_kv * inp_attn,
+        const llama_model & model,
+        const int64_t n_embd_head,
+        const int il) {
 
         // compute Q and K and (optionally) RoPE them
         ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15423,7 +15479,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         cur = build_attn(inp_attn,
                 model.layers[il].wo, model.layers[il].bo,
-                Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
         cb(cur, "attn_out", il);
         return cur;
     }
@@ -15529,7 +15585,7 @@ struct llm_build_chameleon : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -15608,7 +15664,7 @@ struct llm_build_chameleon : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, nullptr,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -15860,7 +15916,7 @@ struct llm_build_plm : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -15964,7 +16020,7 @@ struct llm_build_plm : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
+                        q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -16025,7 +16081,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -16087,7 +16143,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -16174,7 +16230,7 @@ struct llm_build_dots1 : public llm_graph_context {
         // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -16227,7 +16283,7 @@ struct llm_build_dots1 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -16324,7 +16380,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
@@ -16382,7 +16438,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1) {
@@ -16454,7 +16510,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -16515,7 +16571,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
                 cb(cur, "attn_out", il);
             }
 
@@ -16668,7 +16724,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba {
 
             ggml_tensor * attn_out = build_attn(inp->get_attn(),
                     model.layers[il].wo, NULL,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
             cb(attn_out, "attn_out", il);
 
             cur = build_norm(inpL,
@@ -16828,7 +16884,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 
 private:
     ggml_tensor * build_plamo2_attn_layer(
-            llm_graph_input_attn_kv_unified * inp,
+            llm_graph_input_attn_kv * inp,
             ggml_tensor * inp_pos,
             ggml_tensor * cur,
             const llama_model & model,
@@ -16878,7 +16934,9 @@ private:
                 ext_factor, attn_factor, beta_fast, beta_slow
                 );
 
-        cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
+        cur = build_attn(inp,
+                model.layers[il].wo, NULL,
+                Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
         }
 
         cb(cur, "attn_out", il);
@@ -17061,7 +17119,7 @@ struct llm_build_arcee : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -17125,7 +17183,7 @@ struct llm_build_arcee : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }
 
@@ -17196,7 +17254,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
 
@@ -17270,7 +17328,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }
 
@@ -17357,7 +17415,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
 
@@ -17430,7 +17488,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }
 
@@ -17495,7 +17553,7 @@ struct llm_build_smollm3 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -17560,7 +17618,7 @@ struct llm_build_smollm3 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }
 
@@ -17627,7 +17685,7 @@ struct llm_build_openai_moe_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        auto * inp_attn = build_attn_inp_kv_iswa();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
@@ -17682,9 +17740,9 @@ struct llm_build_openai_moe_iswa : public llm_graph_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
-                cur =
+                cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr,
+                        Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);
 
                 cb(cur, "attn_out", il);
             }
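Note: this is the only call site in these hunks that passes a real tensor in the new slot (model.layers[il].attn_sinks) rather than nullptr. How build_attn consumes the sinks tensor is not part of this file's diff; the self-contained sketch below only illustrates the usual idea behind attention sinks, namely an extra per-head logit in the softmax that can absorb probability mass instead of forcing it onto real tokens:

    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> scores = {2.0f, 1.0f, 0.5f};  // per-token attention scores for one head
        const float sink = 1.5f;                         // hypothetical learned sink logit

        // The sink contributes to the softmax denominator but corresponds to no token.
        float denom = std::exp(sink);
        for (float s : scores) {
            denom += std::exp(s);
        }

        for (std::size_t i = 0; i < scores.size(); ++i) {
            std::printf("p[token %zu] = %.3f\n", i, std::exp(scores[i]) / denom);
        }
        std::printf("mass absorbed by sink = %.3f\n", std::exp(sink) / denom);
    }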
@@ -17781,8 +17839,7 @@ struct llm_build_lfm2 : public llm_graph_context {
         cb(cur, "model.embedding_norm", -1);
         res->t_embd = cur;
 
-
-        cur = build_lora_mm(model.tok_embd, cur);
+        cur = build_lora_mm(model.output, cur);
         cb(cur, "lm_head", -1);
 
         res->t_logits = cur;
@@ -17809,10 +17866,10 @@ struct llm_build_lfm2 : public llm_graph_context {
         return cur;
     }
 
-    ggml_tensor * build_attn_block(ggml_tensor * cur,
-                                   ggml_tensor * inp_pos,
-                                   llm_graph_input_attn_kv_unified * inp_attn,
-                                   int il) const {
+    ggml_tensor * build_attn_block(ggml_tensor * cur,
+                                   ggml_tensor * inp_pos,
+                                   llm_graph_input_attn_kv * inp_attn,
+                                   int il) const {
         GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
         auto const n_embd_head = hparams.n_embd_head_v;
         auto const n_head_kv = hparams.n_head_kv(il);
@@ -17847,7 +17904,7 @@ struct llm_build_lfm2 : public llm_graph_context {
                 );
 
         cur = build_attn(inp_attn, model.layers[il].wo, NULL,
-                q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                q, k, v, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 
         cb(cur, "model.layers.{}.self_attn.out_proj", il);
 
@@ -17924,6 +17981,137 @@ struct llm_build_lfm2 : public llm_graph_context {
     }
 };
 
+struct llm_build_seed_oss : public llm_graph_context {
+    llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = build_norm(ffn_inp,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_post_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 template <bool iswa>
 struct llm_build_smallthinker : public llm_graph_context{
     llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
@@ -17940,13 +18128,13 @@ struct llm_build_smallthinker : public llm_graph_context{
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
         inp_attn_type * inp_attn = nullptr;
 
         if constexpr (iswa) {
-            inp_attn = build_attn_inp_kv_unified_iswa();
+            inp_attn = build_attn_inp_kv_iswa();
         } else {
-            inp_attn = build_attn_inp_kv_unified();
+            inp_attn = build_attn_inp_kv();
         }
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -17991,7 +18179,7 @@ struct llm_build_smallthinker : public llm_graph_context{
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -18069,14 +18257,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
            if (llm_arch_is_recurrent(arch)) {
                res = new llama_memory_recurrent(
                        *this,
-                       nullptr,
                        GGML_TYPE_F32,
                        GGML_TYPE_F32,
                        cparams.offload_kqv,
                        std::max((uint32_t) 1, cparams.n_seq_max),
-                       cparams.n_seq_max);
+                       cparams.n_seq_max,
+                       nullptr);
            } else if (llm_arch_is_hybrid(arch)) {
-               const auto padding = llama_kv_cache_unified::get_padding(cparams);
+               const auto padding = llama_kv_cache::get_padding(cparams);
 
                cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
 
@@ -18098,7 +18286,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                        /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
                        /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
            } else {
-               const auto padding = llama_kv_cache_unified::get_padding(cparams);
+               const auto padding = llama_kv_cache::get_padding(cparams);
 
                uint32_t n_ctx_per_stream = cparams.n_ctx;
 
@@ -18115,10 +18303,22 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 
                LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
 
+               llama_memory_i::layer_reuse_cb reuse = nullptr;
+
+               if (arch == LLM_ARCH_GEMMA3N) {
+                   reuse = [&](int32_t il) {
+                       if (il >= (int32_t) hparams.n_layer_kv_from_start) {
+                           return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
+                       }
+
+                       return -1;
+                   };
+               }
+
                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
                    GGML_ASSERT(hparams.is_swa_any());
 
-                   res = new llama_kv_cache_unified_iswa(
+                   res = new llama_kv_cache_iswa(
                            *this,
                            params.type_k,
                            params.type_v,
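Note: the layer_reuse_cb added above tells the KV cache that, for LLM_ARCH_GEMMA3N, only the first hparams.n_layer_kv_from_start layers own KV cells; every later layer reuses the cells of one of the last two owning layers, choosing between them by whether the layer uses sliding-window attention. A small self-contained sketch of that mapping (the value of n_layer_kv_from_start and the SWA layout are made up for illustration):

    #include <cstdio>

    int main() {
        const int n_layer_kv_from_start = 20;              // assumed value for illustration
        auto is_swa = [](int il) { return il % 2 == 0; };  // assumed SWA layout for illustration

        auto reuse = [&](int il) {
            if (il >= n_layer_kv_from_start) {
                return n_layer_kv_from_start - (is_swa(il) ? 2 : 1);
            }
            return -1;  // layer keeps its own KV cells
        };

        const int test_layers[] = {5, 20, 21};
        for (int il : test_layers) {
            std::printf("layer %2d -> reuse %d\n", il, reuse(il));
        }
        // layer  5 -> reuse -1
        // layer 20 -> reuse 18   (SWA layer under the assumed layout)
        // layer 21 -> reuse 19
    }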
@@ -18129,13 +18329,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                            n_ctx_per_stream,
                            cparams.n_seq_max,
                            cparams.n_ubatch,
-                           padding);
+                           padding,
+                           nullptr,
+                           reuse);
                } else {
                    GGML_ASSERT(!hparams.is_swa_any());
 
-                   res = new llama_kv_cache_unified(
+                   res = new llama_kv_cache(
                            *this,
-                           nullptr,
                            params.type_k,
                            params.type_v,
                            !cparams.flash_attn,
@@ -18145,7 +18346,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                            cparams.n_seq_max,
                            padding,
                            hparams.n_swa,
-                           hparams.swa_type);
+                           hparams.swa_type,
+                           nullptr,
+                           nullptr);
                }
            }
    }
@@ -18462,6 +18665,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_bailingmoe>(*this, params);
            } break;
+        case LLM_ARCH_SEED_OSS:
+            {
+                llm = std::make_unique<llm_build_seed_oss>(*this, params);
+            } break;
        case LLM_ARCH_DOTS1:
            {
                llm = std::make_unique<llm_build_dots1>(*this, params);
@@ -18520,6 +18727,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
    return llm->res->get_gf();
 }
 
+
 //
 // interface implementation
 //
@@ -18714,6 +18922,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_LFM2:
        case LLM_ARCH_SMALLTHINKER:
        case LLM_ARCH_GLM4_MOE:
+        case LLM_ARCH_SEED_OSS:
            return LLAMA_ROPE_TYPE_NEOX;
 
        case LLM_ARCH_QWEN2VL: