@fugood/llama.node 1.1.8 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +4 -0
- package/lib/index.js +9 -2
- package/lib/index.ts +57 -30
- package/lib/version.js +2 -2
- package/lib/version.ts +2 -2
- package/package.json +14 -14
- package/src/LlamaContext.cpp +11 -0
- package/src/llama.cpp/common/arg.cpp +6 -4
- package/src/llama.cpp/common/chat.cpp +33 -1
- package/src/llama.cpp/common/common.cpp +0 -15
- package/src/llama.cpp/common/common.h +1 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +25 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/include/llama.h +1 -110
- package/src/llama.cpp/src/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/llama-arch.cpp +19 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +13 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +5 -192
- package/src/llama.cpp/src/llama-context.h +2 -7
- package/src/llama.cpp/src/llama-cparams.h +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +35 -57
- package/src/llama.cpp/src/llama-graph.h +36 -46
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +47 -47
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +26 -26
- package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +88 -441
- package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +20 -43
- package/src/llama.cpp/src/llama-kv-cells.h +21 -21
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +5 -5
- package/src/llama.cpp/src/llama-memory-hybrid.h +6 -6
- package/src/llama.cpp/src/llama-memory-recurrent.h +1 -1
- package/src/llama.cpp/src/llama-memory.h +3 -8
- package/src/llama.cpp/src/llama-model.cpp +369 -176
- package/src/llama.cpp/src/llama-model.h +1 -0
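The diff body below covers package/src/llama.cpp/src/llama-model.cpp, the largest change in this release. Two upstream llama.cpp changes drive most of it: the "unified" prefix is dropped from the KV-cache files and helpers (see the renames in the file list above), and every build_attn call site gains one extra tensor argument, passed as nullptr by all existing architectures. A minimal compilable sketch of the call-shape change follows; the "sinks" name for the new slot is an assumption, since the diff itself only shows an extra nullptr inserted between the kq_b and v_mla positions:

// sketch.cpp - stand-in types only; not the real llama.cpp API surface.
#include <cstdio>

struct ggml_tensor { const char * name; };

// 1.1.8 call shape: ..., kq_b, v_mla, scale, layer
static void build_attn_v118(ggml_tensor * q, ggml_tensor * k, ggml_tensor * v,
                            ggml_tensor * kq_b, ggml_tensor * v_mla,
                            float kq_scale, int il) {
    std::printf("v1.1.8: layer %d scale %.3f\n", il, kq_scale);
}

// 1.1.9 call shape: one extra tensor slot ("sinks" is an assumed name)
// between kq_b and v_mla; every architecture in this diff passes nullptr.
static void build_attn_v119(ggml_tensor * q, ggml_tensor * k, ggml_tensor * v,
                            ggml_tensor * kq_b, ggml_tensor * sinks,
                            ggml_tensor * v_mla, float kq_scale, int il) {
    std::printf("v1.1.9: layer %d scale %.3f sinks=%s\n", il, kq_scale,
                sinks ? sinks->name : "none");
}

int main() {
    ggml_tensor Qcur{"Qcur"}, Kcur{"Kcur"}, Vcur{"Vcur"};
    build_attn_v118(&Qcur, &Kcur, &Vcur, nullptr, nullptr, 1.0f, 0);
    build_attn_v119(&Qcur, &Kcur, &Vcur, nullptr, nullptr, nullptr, 1.0f, 0);
    return 0;
}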
package/src/llama.cpp/src/llama-model.cpp

@@ -6,8 +6,8 @@
 #include "llama-cparams.h"
 #include "llama-model-loader.h"
 
-#include "llama-kv-cache-unified.h"
-#include "llama-kv-cache-unified-iswa.h"
+#include "llama-kv-cache.h"
+#include "llama-kv-cache-iswa.h"
 #include "llama-memory-hybrid.h"
 #include "llama-memory-recurrent.h"
 
@@ -83,6 +83,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_32B: return "32B";
         case LLM_TYPE_34B: return "34B";
         case LLM_TYPE_35B: return "35B";
+        case LLM_TYPE_36B: return "36B";
         case LLM_TYPE_40B: return "40B";
         case LLM_TYPE_65B: return "65B";
         case LLM_TYPE_70B: return "70B";
@@ -1288,6 +1289,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_SEED_OSS:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 64: type = LLM_TYPE_36B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_OLMOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
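The two LLM_ARCH_SEED_OSS hunks (above and in load_tensors below) are the whole of the new Seed-OSS support in this file: read the RMS-norm epsilon, map the 64-layer checkpoint to the new LLM_TYPE_36B label, and load a standard attention + gated-FFN tensor layout. A tiny stand-alone sketch of the size mapping, with the enum stubbed:

#include <cstdint>
#include <cstdio>

enum llm_type { LLM_TYPE_UNKNOWN, LLM_TYPE_36B };

// Mirrors the switch added above: only the 64-layer Seed-OSS
// checkpoint is recognized; anything else stays UNKNOWN.
static llm_type seed_oss_type(uint32_t n_layer) {
    switch (n_layer) {
        case 64: return LLM_TYPE_36B;
        default: return LLM_TYPE_UNKNOWN;
    }
}

int main() {
    std::printf("%s\n", seed_oss_type(64) == LLM_TYPE_36B ? "36B" : "unknown");
    return 0;
}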
@@ -3967,6 +3976,43 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                    }
                } break;
+            case LLM_ARCH_SEED_OSS:
+                {
+                    const uint32_t head_dim = hparams.n_embd_head_k;
+                    const int64_t n_qo_dim = n_head * head_dim;
+                    const int64_t n_kv_dim = n_head_kv * head_dim;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
+
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                    }
+                } break;
+
            case LLM_ARCH_OLMOE:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5474,8 +5520,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                } break;
            case LLM_ARCH_LFM2:
                {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                    tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];
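From here on, every per-architecture graph builder is touched by the same two mechanical edits: the old *_unified input helpers (matching the kv-cache file renames above) become build_attn_inp_kv() / build_attn_inp_kv_iswa(), and each build_attn(...) call gains the extra nullptr argument. The SWA-capable builders (phi3, exaone4) pick the input type at compile time; a self-contained sketch of that std::conditional_t dispatch, with stub types standing in for the real graph inputs:

#include <cstdio>
#include <type_traits>

// Stub stand-ins for the renamed graph-input types.
struct llm_graph_input_attn_kv      { static constexpr const char * name = "attn_kv";      };
struct llm_graph_input_attn_kv_iswa { static constexpr const char * name = "attn_kv_iswa"; };

// Same pattern as llm_build_phi3 / llm_build_exaone4: the template
// parameter decides at compile time which KV input the graph uses.
template <bool iswa>
static void build_graph() {
    using inp_attn_type = std::conditional_t<iswa,
        llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
    std::printf("using %s\n", inp_attn_type::name);
}

int main() {
    build_graph<false>(); // using attn_kv
    build_graph<true>();  // using attn_kv_iswa
    return 0;
}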
@@ -5986,7 +6037,7 @@ struct llm_build_llama : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -6050,7 +6101,7 @@ struct llm_build_llama : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }
 
@@ -6146,7 +6197,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
         ggml_tensor * inp_attn_scale = nullptr;
         inp_attn_scale = build_inp_attn_scale();
 
-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        auto * inp_attn = build_attn_inp_kv_iswa();
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -6224,7 +6275,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }
 
@@ -6325,7 +6376,7 @@ struct llm_build_deci : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -6401,7 +6452,7 @@ struct llm_build_deci : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -6481,7 +6532,7 @@ struct llm_build_baichuan : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -6533,7 +6584,7 @@ struct llm_build_baichuan : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -6603,7 +6654,7 @@ struct llm_build_xverse : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -6648,7 +6699,7 @@ struct llm_build_xverse : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -6717,7 +6768,7 @@ struct llm_build_falcon : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -6771,7 +6822,7 @@ struct llm_build_falcon : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -6841,7 +6892,7 @@ struct llm_build_grok : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -6901,7 +6952,7 @@ struct llm_build_grok : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -7001,7 +7052,7 @@ struct llm_build_dbrx : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -7050,7 +7101,7 @@ struct llm_build_dbrx : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -7125,7 +7176,7 @@ struct llm_build_starcoder : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
         cb(pos, "pos_embd", -1);
@@ -7164,7 +7215,7 @@ struct llm_build_starcoder : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -7230,7 +7281,7 @@ struct llm_build_refact : public llm_graph_context {
 
         inpL = build_inp_embd(model.tok_embd);
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -7263,7 +7314,7 @@ struct llm_build_refact : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -7426,7 +7477,7 @@ struct llm_build_bert : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -7571,7 +7622,7 @@ struct llm_build_neo_bert : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, nullptr,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -7632,7 +7683,7 @@ struct llm_build_bloom : public llm_graph_context {
 
         inpL = build_inp_embd(model.tok_embd);
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         inpL = build_norm(inpL,
                 model.tok_norm,
@@ -7671,7 +7722,7 @@ struct llm_build_bloom : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -7739,7 +7790,7 @@ struct llm_build_mpt : public llm_graph_context {
 
         inpL = build_inp_embd(model.tok_embd);
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         if (model.pos_embd) {
             // inp_pos - contains the positions
@@ -7819,7 +7870,7 @@ struct llm_build_mpt : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -7889,7 +7940,7 @@ struct llm_build_stablelm : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -7965,7 +8016,7 @@ struct llm_build_stablelm : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -8041,7 +8092,7 @@ struct llm_build_qwen : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -8086,7 +8137,7 @@ struct llm_build_qwen : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -8156,7 +8207,7 @@ struct llm_build_qwen2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -8206,7 +8257,7 @@ struct llm_build_qwen2 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -8320,8 +8371,9 @@ struct llm_build_dream : public llm_graph_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
-                cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -8420,8 +8472,9 @@ struct llm_build_llada : public llm_graph_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
-                cur = build_attn(inp_attn, model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -8481,7 +8534,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         int sections[4];
         std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
@@ -8534,7 +8587,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -8602,7 +8655,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -8661,7 +8714,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -8761,7 +8814,7 @@ struct llm_build_qwen3 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -8814,7 +8867,7 @@ struct llm_build_qwen3 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -8882,7 +8935,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -8935,7 +8988,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -9012,7 +9065,7 @@ struct llm_build_phi2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -9075,7 +9128,7 @@ struct llm_build_phi2 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -9141,13 +9194,13 @@ struct llm_build_phi3 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
         inp_attn_type * inp_attn = nullptr;
 
         if constexpr (iswa) {
-            inp_attn = build_attn_inp_kv_unified_iswa();
+            inp_attn = build_attn_inp_kv_iswa();
         } else {
-            inp_attn = build_attn_inp_kv_unified();
+            inp_attn = build_attn_inp_kv();
         }
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -9212,7 +9265,7 @@ struct llm_build_phi3 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -9299,7 +9352,7 @@ struct llm_build_plamo : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -9346,7 +9399,7 @@ struct llm_build_plamo : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -9415,7 +9468,7 @@ struct llm_build_gpt2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
         cb(pos, "pos_embd", -1);
@@ -9454,7 +9507,7 @@ struct llm_build_gpt2 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -9525,7 +9578,7 @@ struct llm_build_codeshell : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -9568,7 +9621,7 @@ struct llm_build_codeshell : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -9638,7 +9691,7 @@ struct llm_build_orion : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -9697,7 +9750,7 @@ struct llm_build_orion : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -9765,7 +9818,7 @@ struct llm_build_internlm2 : public llm_graph_context {
         // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -9824,7 +9877,7 @@ struct llm_build_internlm2 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -9901,7 +9954,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -10012,7 +10065,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
+                        q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -10096,7 +10149,7 @@ struct llm_build_gemma : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -10142,7 +10195,7 @@ struct llm_build_gemma : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -10212,7 +10265,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        auto * inp_attn = build_attn_inp_kv_iswa();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -10257,7 +10310,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -10346,7 +10399,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
         ggml_tensor * inp_pos = build_inp_pos();
 
         // TODO: is causal == true correct? might need some changes
-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        auto * inp_attn = build_attn_inp_kv_iswa();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -10399,7 +10452,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -10497,7 +10550,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
         ggml_tensor * inp_pos = build_inp_pos();
 
         // TODO: is causal == true correct? might need some changes
-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        auto * inp_attn = build_attn_inp_kv_iswa();
 
         // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
         ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
@@ -10580,7 +10633,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
             } else {
                 // no KV layers
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -10598,7 +10651,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
+                        Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
             }
 
             cur = build_norm(cur,
@@ -10904,7 +10957,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -10963,7 +11016,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -11390,7 +11443,9 @@ struct llm_build_jamba : public llm_graph_context_mamba {
                 cb(Vcur, "Vcur", il);
 
                 // No RoPE :)
-                cur = build_attn(inp_hybrid->get_attn(), model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
+                cur = build_attn(inp_hybrid->get_attn(),
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -11473,7 +11528,7 @@ struct llm_build_command_r : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -11548,7 +11603,7 @@ struct llm_build_command_r : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -11620,7 +11675,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        auto * inp_attn = build_attn_inp_kv_iswa();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -11683,7 +11738,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -11755,7 +11810,7 @@ struct llm_build_olmo : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -11814,7 +11869,7 @@ struct llm_build_olmo : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, nullptr,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -11883,7 +11938,7 @@ struct llm_build_olmo2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -11934,7 +11989,7 @@ struct llm_build_olmo2 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -12012,7 +12067,7 @@ struct llm_build_olmoe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -12067,7 +12122,7 @@ struct llm_build_olmoe : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -12138,7 +12193,7 @@ struct llm_build_openelm : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -12200,7 +12255,7 @@ struct llm_build_openelm : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -12269,7 +12324,7 @@ struct llm_build_gptneox : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -12312,7 +12367,7 @@ struct llm_build_gptneox : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -12415,7 +12470,7 @@ struct llm_build_arctic : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -12462,7 +12517,7 @@ struct llm_build_arctic : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -12553,7 +12608,7 @@ struct llm_build_deepseek : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -12617,7 +12672,7 @@ struct llm_build_deepseek : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -12730,7 +12785,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -12845,7 +12900,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
                 // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
             } else {
                 ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
                 cb(kv, "kv", il);
@@ -12879,7 +12934,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
                 // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
             }
         }
 
@@ -12977,7 +13032,7 @@ struct llm_build_bitnet : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -13046,7 +13101,7 @@ struct llm_build_bitnet : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         NULL, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
 
                 cur = build_norm(cur,
                         model.layers[il].attn_sub_norm, NULL,
@@ -13169,7 +13224,7 @@ struct llm_build_t5_enc : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo_enc, nullptr,
-                        Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -13241,7 +13296,7 @@ struct llm_build_t5_dec : public llm_graph_context {
 
         const int64_t n_outputs_enc = embd_enc->ne[1];
 
-        auto * inp_attn_self = build_attn_inp_kv_unified();
+        auto * inp_attn_self = build_attn_inp_kv();
         auto * inp_attn_cross = build_attn_inp_cross();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -13275,7 +13330,7 @@ struct llm_build_t5_dec : public llm_graph_context {
 
                 cur = build_attn(inp_attn_self,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
                 cb(cur, "kqv_out", il);
             }
 
@@ -13307,7 +13362,7 @@ struct llm_build_t5_dec : public llm_graph_context {
 
                 cur = build_attn(inp_attn_cross,
                         model.layers[il].wo_cross, nullptr,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
                 cb(cur, "kqv_out", il);
 
                 //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -13406,7 +13461,7 @@ struct llm_build_jais : public llm_graph_context {
 
         inpL = build_inp_embd(model.tok_embd);
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -13439,7 +13494,7 @@ struct llm_build_jais : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -13504,7 +13559,7 @@ struct llm_build_chatglm : public llm_graph_context {
         // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -13571,7 +13626,7 @@ struct llm_build_chatglm : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -13637,7 +13692,7 @@ struct llm_build_glm4 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -13704,7 +13759,7 @@ struct llm_build_glm4 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -13787,7 +13842,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -13853,7 +13908,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_transformer_layers - 1 && inp_out_ids) {
@@ -13947,7 +14002,7 @@ struct llm_build_nemotron : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -14007,7 +14062,7 @@ struct llm_build_nemotron : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -14076,7 +14131,7 @@ struct llm_build_exaone : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -14138,7 +14193,7 @@ struct llm_build_exaone : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -14208,13 +14263,13 @@ struct llm_build_exaone4 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
         inp_attn_type * inp_attn = nullptr;
 
         if constexpr (iswa) {
-            inp_attn = build_attn_inp_kv_unified_iswa();
+            inp_attn = build_attn_inp_kv_iswa();
         } else {
-            inp_attn = build_attn_inp_kv_unified();
+            inp_attn = build_attn_inp_kv();
         }
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -14269,7 +14324,7 @@ struct llm_build_exaone4 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
                 cb(cur, "attn_out", il);
             }
 
@@ -15097,7 +15152,7 @@ struct llm_build_granite : public llm_graph_context {
             inp_pos = build_inp_pos();
         }
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -15148,12 +15203,12 @@ struct llm_build_granite : public llm_graph_context {
     }

     ggml_tensor * build_attention_layer(
-              ggml_tensor                     * cur,
-              ggml_tensor                     * inp_pos,
-              llm_graph_input_attn_kv_unified * inp_attn,
-        const llama_model                     & model,
-        const int64_t                           n_embd_head,
-        const int                               il) {
+              ggml_tensor             * cur,
+              ggml_tensor             * inp_pos,
+              llm_graph_input_attn_kv * inp_attn,
+        const llama_model             & model,
+        const int64_t                   n_embd_head,
+        const int                       il) {

         // compute Q and K and (optionally) RoPE them
         ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15204,7 +15259,7 @@ struct llm_build_granite : public llm_graph_context {
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         cur = build_attn(inp_attn,
                 model.layers[il].wo, model.layers[il].bo,
-                Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
         cb(cur, "attn_out", il);
         return cur;
     }
@@ -15367,12 +15422,12 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
     }

     ggml_tensor * build_attention_layer(
-              ggml_tensor                     * cur,
-              ggml_tensor                     * inp_pos,
-              llm_graph_input_attn_kv_unified * inp_attn,
-        const llama_model                     & model,
-        const int64_t                           n_embd_head,
-        const int                               il) {
+              ggml_tensor             * cur,
+              ggml_tensor             * inp_pos,
+              llm_graph_input_attn_kv * inp_attn,
+        const llama_model             & model,
+        const int64_t                   n_embd_head,
+        const int                       il) {

         // compute Q and K and (optionally) RoPE them
         ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15423,7 +15478,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         cur = build_attn(inp_attn,
                 model.layers[il].wo, model.layers[il].bo,
-                Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
         cb(cur, "attn_out", il);
         return cur;
     }
@@ -15529,7 +15584,7 @@ struct llm_build_chameleon : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -15608,7 +15663,7 @@ struct llm_build_chameleon : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, nullptr,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -15860,7 +15915,7 @@ struct llm_build_plm : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -15964,7 +16019,7 @@ struct llm_build_plm : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
+                        q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -16025,7 +16080,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -16087,7 +16142,7 @@ struct llm_build_bailingmoe : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
|
@@ -16174,7 +16229,7 @@ struct llm_build_dots1 : public llm_graph_context {
|
|
|
16174
16229
|
// inp_pos - contains the positions
|
|
16175
16230
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
16176
16231
|
|
|
16177
|
-
auto * inp_attn =
|
|
16232
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
16178
16233
|
|
|
16179
16234
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
16180
16235
|
|
|
@@ -16227,7 +16282,7 @@ struct llm_build_dots1 : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -16324,7 +16379,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
@@ -16382,7 +16437,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -16454,7 +16509,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -16515,7 +16570,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
                 cb(cur, "attn_out", il);
             }

@@ -16668,7 +16723,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba {

             ggml_tensor * attn_out = build_attn(inp->get_attn(),
                     model.layers[il].wo, NULL,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
             cb(attn_out, "attn_out", il);

             cur = build_norm(inpL,
@@ -16828,7 +16883,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {

 private:
     ggml_tensor * build_plamo2_attn_layer(
-            llm_graph_input_attn_kv_unified * inp,
+            llm_graph_input_attn_kv * inp,
             ggml_tensor * inp_pos,
             ggml_tensor * cur,
             const llama_model & model,
@@ -16878,7 +16933,9 @@ private:
                 ext_factor, attn_factor, beta_fast, beta_slow
             );

-            cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
+            cur = build_attn(inp,
+                    model.layers[il].wo, NULL,
+                    Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
         }

         cb(cur, "attn_out", il);
@@ -17061,7 +17118,7 @@ struct llm_build_arcee : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -17125,7 +17182,7 @@ struct llm_build_arcee : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }

@@ -17196,7 +17253,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         const float kq_scale = 1.0f / sqrtf(float(n_embd_head));

@@ -17270,7 +17327,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }

@@ -17357,7 +17414,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         const float kq_scale = 1.0f / sqrtf(float(n_embd_head));

@@ -17430,7 +17487,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }

@@ -17495,7 +17552,7 @@ struct llm_build_smollm3 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -17560,7 +17617,7 @@ struct llm_build_smollm3 : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }

@@ -17627,7 +17684,7 @@ struct llm_build_openai_moe_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        auto * inp_attn = build_attn_inp_kv_iswa();

         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
@@ -17682,9 +17739,9 @@ struct llm_build_openai_moe_iswa : public llm_graph_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);

-                cur = build_attn_with_sinks(inp_attn,
+                cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr,
+                        Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);

                 cb(cur, "attn_out", il);
             }
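Note: every build_attn call site in this file gains one extra tensor argument, and only the gpt-oss hunk above passes a real tensor (model.layers[il].attn_sinks); all other builders pass nullptr. The second removed line above is truncated in this rendering, so the old argument tail is partly inferred from the surrounding call sites. The updated call shape appears to be:

    // inferred call shape; the parameter names here are descriptive guesses,
    // not taken from a header shown in this diff
    cur = build_attn(inp_attn,
            wo, wo_b,          // output projection weight and optional bias
            Qcur, Kcur, Vcur,  // current-layer Q/K/V tensors
            kq_b,              // optional KQ bias      (nullptr when unused)
            sinks,             // new: attention sinks  (nullptr when unused)
            v_mla,             // optional MLA V tensor (nullptr when unused)
            kq_scale, il);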
@@ -17781,8 +17838,7 @@ struct llm_build_lfm2 : public llm_graph_context {
         cb(cur, "model.embedding_norm", -1);
         res->t_embd = cur;

-        // lm_head is tied with embeddings
-        cur = build_lora_mm(model.tok_embd, cur);
+        cur = build_lora_mm(model.output, cur);
         cb(cur, "lm_head", -1);

         res->t_logits = cur;
@@ -17809,10 +17865,10 @@ struct llm_build_lfm2 : public llm_graph_context {
         return cur;
     }

-    ggml_tensor * build_attn_block(ggml_tensor                     * cur,
-                                   ggml_tensor                     * inp_pos,
-                                   llm_graph_input_attn_kv_unified * inp_attn,
-                                   int                               il) const {
+    ggml_tensor * build_attn_block(ggml_tensor             * cur,
+                                   ggml_tensor             * inp_pos,
+                                   llm_graph_input_attn_kv * inp_attn,
+                                   int                       il) const {
         GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
         auto const n_embd_head = hparams.n_embd_head_v;
         auto const n_head_kv = hparams.n_head_kv(il);
@@ -17847,7 +17903,7 @@ struct llm_build_lfm2 : public llm_graph_context {
         );

         cur = build_attn(inp_attn, model.layers[il].wo, NULL,
-                q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                q, k, v, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);

         cb(cur, "model.layers.{}.self_attn.out_proj", il);

@@ -17924,6 +17980,137 @@ struct llm_build_lfm2 : public llm_graph_context {
     }
 };

+struct llm_build_seed_oss : public llm_graph_context {
+    llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = build_norm(ffn_inp,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_post_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 template <bool iswa>
 struct llm_build_smallthinker : public llm_graph_context{
     llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
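Note: the new llm_build_seed_oss builder is a conventional pre-norm decoder. Condensing the loop body above, each layer computes (a paraphrase of the graph, not additional package code):

    h = x + Attn(rms_norm(x))      // attn_norm -> Q/K/V (+ optional biases) -> RoPE -> build_attn -> wo/bo
    y = h + SwiGLU(rms_norm(h))    // attn_post_norm -> ffn_gate/ffn_up with SiLU -> ffn_down

The FFN branch is normalized by attn_post_norm rather than the separate ffn_norm tensor most builders in this file use.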
@@ -17940,13 +18127,13 @@ struct llm_build_smallthinker : public llm_graph_context{
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
         inp_attn_type * inp_attn = nullptr;

         if constexpr (iswa) {
-            inp_attn = build_attn_inp_kv_unified_iswa();
+            inp_attn = build_attn_inp_kv_iswa();
         } else {
-            inp_attn = build_attn_inp_kv_unified();
+            inp_attn = build_attn_inp_kv();
         }

         ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -17991,7 +18178,7 @@ struct llm_build_smallthinker : public llm_graph_context{

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -18076,7 +18263,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                 std::max((uint32_t) 1, cparams.n_seq_max),
                 cparams.n_seq_max);
     } else if (llm_arch_is_hybrid(arch)) {
-        const auto padding = llama_kv_cache_unified::get_padding(cparams);
+        const auto padding = llama_kv_cache::get_padding(cparams);

         cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);

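Note: get_padding reports the cache's size granularity and GGML_PAD rounds the context length up to a multiple of it. A worked example, assuming a padding of 256 (the actual value depends on the build configuration):

    // GGML_PAD(x, n) rounds x up to the next multiple of n:
    // GGML_PAD(1000, 256) -> 1024
    // GGML_PAD(1024, 256) -> 1024 (already aligned)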
@@ -18098,7 +18285,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
             /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
             /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
     } else {
-        const auto padding = llama_kv_cache_unified::get_padding(cparams);
+        const auto padding = llama_kv_cache::get_padding(cparams);

         uint32_t n_ctx_per_stream = cparams.n_ctx;

@@ -18118,7 +18305,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
             GGML_ASSERT(hparams.is_swa_any());

-            res = new llama_kv_cache_unified_iswa(
+            res = new llama_kv_cache_iswa(
                     *this,
                     params.type_k,
                     params.type_v,
|
         } else {
             GGML_ASSERT(!hparams.is_swa_any());

-            res = new llama_kv_cache_unified(
+            res = new llama_kv_cache(
                     *this,
                     nullptr,
                     params.type_k,
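Note: together with the previous hunk, memory creation now branches on whether any layer uses sliding-window attention; schematically (a condensation of the two hunks above):

    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
        res = new llama_kv_cache_iswa( /* ... */ );  // paired SWA / non-SWA caches
    } else {
        res = new llama_kv_cache( /* ... */ );       // single KV cache
    }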
@@ -18462,6 +18649,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_bailingmoe>(*this, params);
             } break;
+        case LLM_ARCH_SEED_OSS:
+            {
+                llm = std::make_unique<llm_build_seed_oss>(*this, params);
+            } break;
         case LLM_ARCH_DOTS1:
             {
                 llm = std::make_unique<llm_build_dots1>(*this, params);
@@ -18520,6 +18711,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     return llm->res->get_gf();
 }

+
 //
 // interface implementation
 //
@@ -18714,6 +18906,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_SMALLTHINKER:
         case LLM_ARCH_GLM4_MOE:
+        case LLM_ARCH_SEED_OSS:
             return LLAMA_ROPE_TYPE_NEOX;

         case LLM_ARCH_QWEN2VL: