@fugood/llama.node 1.1.7 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +4 -0
- package/lib/index.js +9 -2
- package/lib/index.ts +57 -30
- package/lib/version.js +2 -2
- package/lib/version.ts +2 -2
- package/package.json +14 -14
- package/src/LlamaContext.cpp +20 -0
- package/src/common.hpp +8 -1
- package/src/llama.cpp/common/arg.cpp +13 -4
- package/src/llama.cpp/common/chat.cpp +33 -2
- package/src/llama.cpp/common/common.cpp +0 -15
- package/src/llama.cpp/common/common.h +6 -4
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +25 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/include/llama.h +1 -110
- package/src/llama.cpp/src/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/llama-arch.cpp +19 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +13 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +5 -197
- package/src/llama.cpp/src/llama-context.h +2 -7
- package/src/llama.cpp/src/llama-cparams.h +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +35 -57
- package/src/llama.cpp/src/llama-graph.h +36 -46
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +47 -47
- package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +26 -26
- package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +88 -441
- package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +20 -43
- package/src/llama.cpp/src/llama-kv-cells.h +21 -21
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +5 -5
- package/src/llama.cpp/src/llama-memory-hybrid.h +6 -6
- package/src/llama.cpp/src/llama-memory-recurrent.h +1 -1
- package/src/llama.cpp/src/llama-memory.h +3 -8
- package/src/llama.cpp/src/llama-model.cpp +449 -246
- package/src/llama.cpp/src/llama-model.h +2 -0
@@ -6,8 +6,8 @@
 #include "llama-cparams.h"
 #include "llama-model-loader.h"
 
-#include "llama-kv-cache-unified.h"
-#include "llama-kv-cache-unified-iswa.h"
+#include "llama-kv-cache.h"
+#include "llama-kv-cache-iswa.h"
 #include "llama-memory-hybrid.h"
 #include "llama-memory-recurrent.h"
 
@@ -83,9 +83,11 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_32B: return "32B";
         case LLM_TYPE_34B: return "34B";
         case LLM_TYPE_35B: return "35B";
+        case LLM_TYPE_36B: return "36B";
         case LLM_TYPE_40B: return "40B";
         case LLM_TYPE_65B: return "65B";
         case LLM_TYPE_70B: return "70B";
+        case LLM_TYPE_120B: return "120B";
         case LLM_TYPE_142B: return "142B";
         case LLM_TYPE_236B: return "236B";
         case LLM_TYPE_290B: return "290B";
@@ -1287,6 +1289,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_SEED_OSS:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 64: type = LLM_TYPE_36B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_OLMOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1834,7 +1844,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                 hparams.set_swa_pattern(2);
 
-
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_20B; break;
+                    case 36: type = LLM_TYPE_120B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
             } break;
         case LLM_ARCH_LFM2:
             {
@@ -3962,6 +3976,43 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                     }
                 } break;
+            case LLM_ARCH_SEED_OSS:
+                {
+                    const uint32_t head_dim = hparams.n_embd_head_k;
+                    const int64_t n_qo_dim = n_head * head_dim;
+                    const int64_t n_kv_dim = n_head_kv * head_dim;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0);
+
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED);
+
+                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                    }
+                } break;
+
             case LLM_ARCH_OLMOE:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5469,8 +5520,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_LFM2:
                 {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                     tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -5981,7 +6037,7 @@ struct llm_build_llama : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -6045,7 +6101,7 @@ struct llm_build_llama : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }
 
@@ -6141,7 +6197,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
         ggml_tensor * inp_attn_scale = nullptr;
         inp_attn_scale = build_inp_attn_scale();
 
-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        auto * inp_attn = build_attn_inp_kv_iswa();
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -6219,7 +6275,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }
 
@@ -6320,7 +6376,7 @@ struct llm_build_deci : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
@@ -6396,7 +6452,7 @@ struct llm_build_deci : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -6476,7 +6532,7 @@ struct llm_build_baichuan : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -6528,7 +6584,7 @@ struct llm_build_baichuan : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -6598,7 +6654,7 @@ struct llm_build_xverse : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -6643,7 +6699,7 @@ struct llm_build_xverse : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -6712,7 +6768,7 @@ struct llm_build_falcon : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -6743,9 +6799,9 @@ struct llm_build_falcon : public llm_graph_context {
 
                 ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur =
+                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
-                Vcur =
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 // using mode = 2 for neox mode
                 Qcur = ggml_rope_ext(
@@ -6766,7 +6822,7 @@ struct llm_build_falcon : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -6836,7 +6892,7 @@ struct llm_build_grok : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -6896,7 +6952,7 @@ struct llm_build_grok : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -6996,7 +7052,7 @@ struct llm_build_dbrx : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -7023,9 +7079,9 @@ struct llm_build_dbrx : public llm_graph_context {
 
                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur =
+                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
-                Vcur =
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
@@ -7045,7 +7101,7 @@ struct llm_build_dbrx : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -7120,7 +7176,7 @@ struct llm_build_starcoder : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
         cb(pos, "pos_embd", -1);
@@ -7145,13 +7201,13 @@ struct llm_build_starcoder : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);
 
-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
-                ggml_tensor * Vcur =
+                ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
-                Qcur =
-                Kcur =
-                Vcur =
+                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
@@ -7159,7 +7215,7 @@ struct llm_build_starcoder : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -7225,7 +7281,7 @@ struct llm_build_refact : public llm_graph_context {
 
         inpL = build_inp_embd(model.tok_embd);
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -7258,7 +7314,7 @@ struct llm_build_refact : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -7367,13 +7423,15 @@ struct llm_build_bert : public llm_graph_context {
                     cb(cur, "bqkv", il);
                 }
 
-                Qcur =
-                Kcur =
-                Vcur =
+                Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+                Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             } else {
                 Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
                 Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
                 Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }
 
             if (model.layers[il].attn_q_norm) {
@@ -7381,6 +7439,10 @@ struct llm_build_bert : public llm_graph_context {
                         model.layers[il].attn_q_norm,
                         model.layers[il].attn_q_norm_b,
                         LLM_NORM, il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            } else {
+                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             }
 
             if (model.layers[il].attn_k_norm) {
@@ -7388,11 +7450,11 @@ struct llm_build_bert : public llm_graph_context {
                         model.layers[il].attn_k_norm,
                         model.layers[il].attn_k_norm_b,
                         LLM_NORM, il);
-            }
 
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            } else {
+                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            }
 
             // RoPE
             if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
@@ -7415,7 +7477,7 @@ struct llm_build_bert : public llm_graph_context {
 
             cur = build_attn(inp_attn,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             cb(cur, "kqv_out", il);
         }
 
@@ -7537,9 +7599,9 @@ struct llm_build_neo_bert : public llm_graph_context {
 
             Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
             Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-            Vcur =
+            Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
-            Vcur =
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
             // RoPE
             Qcur = ggml_rope_ext(
@@ -7560,7 +7622,7 @@ struct llm_build_neo_bert : public llm_graph_context {
 
             cur = build_attn(inp_attn,
                     model.layers[il].wo, nullptr,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             cb(cur, "kqv_out", il);
         }
 
@@ -7621,7 +7683,7 @@ struct llm_build_bloom : public llm_graph_context {
 
         inpL = build_inp_embd(model.tok_embd);
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         inpL = build_norm(inpL,
                 model.tok_norm,
@@ -7646,13 +7708,13 @@ struct llm_build_bloom : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);
 
-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
-                ggml_tensor * Vcur =
+                ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
-                Qcur =
-                Kcur =
-                Vcur =
+                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
@@ -7660,7 +7722,7 @@ struct llm_build_bloom : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -7728,7 +7790,7 @@ struct llm_build_mpt : public llm_graph_context {
 
        inpL = build_inp_embd(model.tok_embd);
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         if (model.pos_embd) {
             // inp_pos - contains the positions
@@ -7770,7 +7832,7 @@ struct llm_build_mpt : public llm_graph_context {
 
                 ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
                 ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur =
+                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
@@ -7789,17 +7851,18 @@ struct llm_build_mpt : public llm_graph_context {
                         model.layers[il].attn_k_norm_b,
                         LLM_NORM, il);
                 cb(Kcur, "Kcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
             } else {
-                Qcur =
+                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 cb(Qcur, "Qcur", il);
 
-                Kcur =
+                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 cb(Kcur, "Kcur", il);
             }
 
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
@@ -7807,7 +7870,7 @@ struct llm_build_mpt : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -7877,7 +7940,7 @@ struct llm_build_stablelm : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -7953,7 +8016,7 @@ struct llm_build_stablelm : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -8029,7 +8092,7 @@ struct llm_build_qwen : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -8051,9 +8114,9 @@ struct llm_build_qwen : public llm_graph_context {
 
                 ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur =
+                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd));
 
-                Vcur =
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 // using mode = 2 for neox mode
                 Qcur = ggml_rope_ext(
@@ -8074,7 +8137,7 @@ struct llm_build_qwen : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -8144,7 +8207,7 @@ struct llm_build_qwen2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -8194,7 +8257,7 @@ struct llm_build_qwen2 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -8308,8 +8371,9 @@ struct llm_build_dream : public llm_graph_context {
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);
 
-            cur = build_attn(inp_attn,
-                    model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
         }
 
         if (il == n_layer - 1 && inp_out_ids) {
@@ -8408,8 +8472,9 @@ struct llm_build_llada : public llm_graph_context {
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);
 
-            cur = build_attn(inp_attn,
-                    model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, NULL,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
         }
 
         if (il == n_layer - 1 && inp_out_ids) {
@@ -8469,7 +8534,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         int sections[4];
         std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
@@ -8522,7 +8587,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -8590,7 +8655,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -8649,7 +8714,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -8749,7 +8814,7 @@ struct llm_build_qwen3 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -8802,7 +8867,7 @@ struct llm_build_qwen3 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -8870,7 +8935,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -8923,7 +8988,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -9000,7 +9065,7 @@ struct llm_build_phi2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -9026,21 +9091,21 @@ struct llm_build_phi2 : public llm_graph_context {
 
                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur =
+                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             } else {
                 Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
                 Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
                 Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);
 
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9063,7 +9128,7 @@ struct llm_build_phi2 : public llm_graph_context {
 
             cur = build_attn(inp_attn,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
         }
 
         if (il == n_layer - 1 && inp_out_ids) {
@@ -9129,13 +9194,13 @@ struct llm_build_phi3 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
         inp_attn_type * inp_attn = nullptr;
 
         if constexpr (iswa) {
-            inp_attn = build_attn_inp_kv_unified_iswa();
+            inp_attn = build_attn_inp_kv_iswa();
         } else {
-            inp_attn = build_attn_inp_kv_unified();
+            inp_attn = build_attn_inp_kv();
         }
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -9164,21 +9229,21 @@ struct llm_build_phi3 : public llm_graph_context {
 
                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
-                Vcur =
+                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             } else {
                 Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
                 Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
                 Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);
 
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, rope_factors,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9200,7 +9265,7 @@ struct llm_build_phi3 : public llm_graph_context {
 
             cur = build_attn(inp_attn,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
         }
 
         if (il == n_layer - 1 && inp_out_ids) {
@@ -9287,7 +9352,7 @@ struct llm_build_plamo : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -9334,7 +9399,7 @@ struct llm_build_plamo : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -9403,7 +9468,7 @@ struct llm_build_gpt2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
         cb(pos, "pos_embd", -1);
@@ -9428,21 +9493,21 @@ struct llm_build_gpt2 : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);
 
-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
-                ggml_tensor * Vcur =
+                ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
-                Qcur =
-                Kcur =
-                Vcur =
+                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -9513,7 +9578,7 @@ struct llm_build_codeshell : public llm_graph_context {
         // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -9534,9 +9599,9 @@ struct llm_build_codeshell : public llm_graph_context {
 
                 ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur =
+                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
 
-                Vcur =
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
@@ -9556,7 +9621,7 @@ struct llm_build_codeshell : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -9626,7 +9691,7 @@ struct llm_build_orion : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -9685,7 +9750,7 @@ struct llm_build_orion : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -9753,7 +9818,7 @@ struct llm_build_internlm2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -9812,7 +9877,7 @@ struct llm_build_internlm2 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -9889,7 +9954,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -10000,7 +10065,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
+                        q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -10084,7 +10149,7 @@ struct llm_build_gemma : public llm_graph_context {
        // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -10130,7 +10195,7 @@ struct llm_build_gemma : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -10200,7 +10265,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        auto * inp_attn = build_attn_inp_kv_iswa();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -10245,7 +10310,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -10334,7 +10399,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
         ggml_tensor * inp_pos = build_inp_pos();
 
         // TODO: is causal == true correct? might need some changes
-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        auto * inp_attn = build_attn_inp_kv_iswa();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -10387,7 +10452,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -10485,7 +10550,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
         ggml_tensor * inp_pos = build_inp_pos();
 
         // TODO: is causal == true correct? might need some changes
-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        auto * inp_attn = build_attn_inp_kv_iswa();
 
         // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
         ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
@@ -10568,7 +10633,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
             } else {
                 // no KV layers
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -10586,7 +10651,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
+                        Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
             }
 
             cur = build_norm(cur,
@@ -10864,8 +10929,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
             ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
             all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
             cb(all_coefs, "all_coefs", il);
-            all_coefs =
-            all_coefs =
+            all_coefs = ggml_transpose(ctx0, all_coefs); // [n_tokens, n_altup]
+            all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
 
             innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
             ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
@@ -10892,7 +10957,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -10951,7 +11016,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
 
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
             if (il == n_layer - 1 && inp_out_ids) {
@@ -11378,7 +11443,9 @@ struct llm_build_jamba : public llm_graph_context_mamba {
|
|
|
11378
11443
|
cb(Vcur, "Vcur", il);
|
|
11379
11444
|
|
|
11380
11445
|
// No RoPE :)
|
|
11381
|
-
cur = build_attn(inp_hybrid->get_attn(),
|
|
11446
|
+
cur = build_attn(inp_hybrid->get_attn(),
|
|
11447
|
+
model.layers[il].wo, NULL,
|
|
11448
|
+
Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11382
11449
|
}
|
|
11383
11450
|
|
|
11384
11451
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -11461,7 +11528,7 @@ struct llm_build_command_r : public llm_graph_context {
|
|
|
11461
11528
|
// inp_pos - contains the positions
|
|
11462
11529
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
11463
11530
|
|
|
11464
|
-
auto * inp_attn =
|
|
11531
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
11465
11532
|
|
|
11466
11533
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11467
11534
|
|
|
@@ -11536,7 +11603,7 @@ struct llm_build_command_r : public llm_graph_context {
|
|
|
11536
11603
|
|
|
11537
11604
|
cur = build_attn(inp_attn,
|
|
11538
11605
|
model.layers[il].wo, model.layers[il].bo,
|
|
11539
|
-
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11606
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11540
11607
|
}
|
|
11541
11608
|
|
|
11542
11609
|
if (il == n_layer - 1 && inp_out_ids) {
|
|
@@ -11608,7 +11675,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        auto * inp_attn = build_attn_inp_kv_iswa();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -11671,7 +11738,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -11743,7 +11810,7 @@ struct llm_build_olmo : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -11802,7 +11869,7 @@ struct llm_build_olmo : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, nullptr,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -11871,7 +11938,7 @@ struct llm_build_olmo2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -11922,7 +11989,7 @@ struct llm_build_olmo2 : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -12000,7 +12067,7 @@ struct llm_build_olmoe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12055,7 +12122,7 @@ struct llm_build_olmoe : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -12126,7 +12193,7 @@ struct llm_build_openelm : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12188,7 +12255,7 @@ struct llm_build_openelm : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -12257,7 +12324,7 @@ struct llm_build_gptneox : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12278,9 +12345,9 @@ struct llm_build_gptneox : public llm_graph_context {

                 ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur =
+                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

-                Vcur =
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                 Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
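The GPT-NeoX hunk above — like the JAIS, ChatGLM, GLM-4 and PLaMo-2 hunks further down — folds the separate make-contiguous and reshape steps on the fused QKV views into a single `ggml_cont_3d` node (`ggml_cont_4d` in the PLaMo-2 Mamba path), which copies a possibly strided view into a freshly laid-out tensor of the requested shape in one graph operation. A minimal sketch, assuming only that ggml is on the include path; the dimensions are arbitrary:

```cpp
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx0 = ggml_init(ip);

    const int64_t n_embd_head = 4, n_head_kv = 2, n_tokens = 3;
    struct ggml_tensor * v = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_head*n_head_kv, n_tokens);

    // one node instead of the old two-step ggml_cont(...) + reshape sequence
    struct ggml_tensor * v3 = ggml_cont_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
    GGML_ASSERT(ggml_is_contiguous(v3));

    ggml_free(ctx0);
    return 0;
}
```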
@@ -12300,7 +12367,7 @@ struct llm_build_gptneox : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -12403,7 +12470,7 @@ struct llm_build_arctic : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12450,7 +12517,7 @@ struct llm_build_arctic : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -12541,7 +12608,7 @@ struct llm_build_deepseek : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -12605,7 +12672,7 @@ struct llm_build_deepseek : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -12718,7 +12785,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12833,7 +12900,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
                 // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
             } else {
                 ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
                 cb(kv, "kv", il);
@@ -12867,7 +12934,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
                 // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
             }
         }

@@ -12965,7 +13032,7 @@ struct llm_build_bitnet : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -13034,7 +13101,7 @@ struct llm_build_bitnet : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         NULL, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);

                 cur = build_norm(cur,
                         model.layers[il].attn_sub_norm, NULL,
@@ -13157,7 +13224,7 @@ struct llm_build_t5_enc : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo_enc, nullptr,
-                        Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
                 cb(cur, "kqv_out", il);
             }

@@ -13229,7 +13296,7 @@ struct llm_build_t5_dec : public llm_graph_context {

         const int64_t n_outputs_enc = embd_enc->ne[1];

-        auto * inp_attn_self = build_attn_inp_kv_unified();
+        auto * inp_attn_self = build_attn_inp_kv();
         auto * inp_attn_cross = build_attn_inp_cross();

         ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -13263,7 +13330,7 @@ struct llm_build_t5_dec : public llm_graph_context {

                 cur = build_attn(inp_attn_self,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, kq_b, nullptr, nullptr, 1.0f, il);
                 cb(cur, "kqv_out", il);
             }

@@ -13295,7 +13362,7 @@ struct llm_build_t5_dec : public llm_graph_context {

                 cur = build_attn(inp_attn_cross,
                         model.layers[il].wo_cross, nullptr,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
                 cb(cur, "kqv_out", il);

                 //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -13394,7 +13461,7 @@ struct llm_build_jais : public llm_graph_context {

         inpL = build_inp_embd(model.tok_embd);

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -13413,21 +13480,21 @@ struct llm_build_jais : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);

-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
-                ggml_tensor * Vcur =
+                ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd));
+                ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd));
+                ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));

                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);

-                Qcur =
-                Kcur =
-                Vcur =
+                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -13492,7 +13559,7 @@ struct llm_build_chatglm : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -13526,6 +13593,7 @@ struct llm_build_chatglm : public llm_graph_context {
                 }
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             } else {
                 cur = build_lora_mm(model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
@@ -13535,11 +13603,10 @@ struct llm_build_chatglm : public llm_graph_context {
                 }
                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur =
+                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }

-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
             //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
@@ -13559,7 +13626,7 @@ struct llm_build_chatglm : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -13625,7 +13692,7 @@ struct llm_build_glm4 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -13660,6 +13727,7 @@ struct llm_build_glm4 : public llm_graph_context {
                 }
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             } else {
                 cur = build_lora_mm(model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
@@ -13669,11 +13737,10 @@ struct llm_build_glm4 : public llm_graph_context {
                 }
                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur =
+                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }

-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -13692,7 +13759,7 @@ struct llm_build_glm4 : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -13775,7 +13842,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -13841,7 +13908,7 @@ struct llm_build_glm4_moe : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_transformer_layers - 1 && inp_out_ids) {
@@ -13935,7 +14002,7 @@ struct llm_build_nemotron : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -13995,7 +14062,7 @@ struct llm_build_nemotron : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -14064,7 +14131,7 @@ struct llm_build_exaone : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -14126,7 +14193,7 @@ struct llm_build_exaone : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -14196,13 +14263,13 @@ struct llm_build_exaone4 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
         inp_attn_type * inp_attn = nullptr;

         if constexpr (iswa) {
-            inp_attn = build_attn_inp_kv_unified_iswa();
+            inp_attn = build_attn_inp_kv_iswa();
         } else {
-            inp_attn = build_attn_inp_kv_unified();
+            inp_attn = build_attn_inp_kv();
         }

         ggml_tensor * inp_out_ids = build_inp_out_ids();
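`llm_build_exaone4` (and `llm_build_smallthinker` below) picks the attention-input type at compile time from the `iswa` template parameter; the hunk only renames the two candidate types and factory calls. A self-contained illustration of the `std::conditional_t` + `if constexpr` pattern, with stand-in types rather than the llama.cpp classes:

```cpp
#include <type_traits>

struct input_kv      {};  // stand-in for the full-context KV input
struct input_kv_iswa {};  // stand-in for the sliding-window variant

template <bool iswa>
void build_graph() {
    // resolved at compile time; the untaken branch is never instantiated
    using inp_t = std::conditional_t<iswa, input_kv_iswa, input_kv>;
    inp_t inp{};
    if constexpr (iswa) {
        static_assert(std::is_same_v<inp_t, input_kv_iswa>);
    } else {
        static_assert(std::is_same_v<inp_t, input_kv>);
    }
    (void) inp;
}

int main() {
    build_graph<true>();
    build_graph<false>();
}
```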
@@ -14257,7 +14324,7 @@ struct llm_build_exaone4 : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
                 cb(cur, "attn_out", il);
             }

@@ -15085,7 +15152,7 @@ struct llm_build_granite : public llm_graph_context {
             inp_pos = build_inp_pos();
         }

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -15136,12 +15203,12 @@ struct llm_build_granite : public llm_graph_context {
     }

     ggml_tensor * build_attention_layer(
-              ggml_tensor * cur,
-              ggml_tensor * inp_pos,
-              llm_graph_input_attn_kv_unified * inp_attn,
-              const llama_model & model,
-              const int64_t n_embd_head,
-              const int il) {
+              ggml_tensor * cur,
+              ggml_tensor * inp_pos,
+              llm_graph_input_attn_kv * inp_attn,
+              const llama_model & model,
+              const int64_t n_embd_head,
+              const int il) {

         // compute Q and K and (optionally) RoPE them
         ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15192,7 +15259,7 @@ struct llm_build_granite : public llm_graph_context {
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         cur = build_attn(inp_attn,
                 model.layers[il].wo, model.layers[il].bo,
-                Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
         cb(cur, "attn_out", il);
         return cur;
     }
@@ -15355,12 +15422,12 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
     }

     ggml_tensor * build_attention_layer(
-              ggml_tensor * cur,
-              ggml_tensor * inp_pos,
-              llm_graph_input_attn_kv_unified * inp_attn,
-              const llama_model & model,
-              const int64_t n_embd_head,
-              const int il) {
+              ggml_tensor * cur,
+              ggml_tensor * inp_pos,
+              llm_graph_input_attn_kv * inp_attn,
+              const llama_model & model,
+              const int64_t n_embd_head,
+              const int il) {

         // compute Q and K and (optionally) RoPE them
         ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15411,7 +15478,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         cur = build_attn(inp_attn,
                 model.layers[il].wo, model.layers[il].bo,
-                Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
         cb(cur, "attn_out", il);
         return cur;
     }
@@ -15517,7 +15584,7 @@ struct llm_build_chameleon : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -15596,7 +15663,7 @@ struct llm_build_chameleon : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, nullptr,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -15848,7 +15915,7 @@ struct llm_build_plm : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -15952,7 +16019,7 @@ struct llm_build_plm : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
+                        q_states, k_states, v_states, nullptr, nullptr, nullptr, kq_scale, il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -16013,7 +16080,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -16075,7 +16142,7 @@ struct llm_build_bailingmoe : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -16162,7 +16229,7 @@ struct llm_build_dots1 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -16215,7 +16282,7 @@ struct llm_build_dots1 : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -16312,7 +16379,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
@@ -16370,7 +16437,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -16442,7 +16509,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -16503,7 +16570,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
                 cb(cur, "attn_out", il);
             }

@@ -16656,7 +16723,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba {

             ggml_tensor * attn_out = build_attn(inp->get_attn(),
                     model.layers[il].wo, NULL,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
             cb(attn_out, "attn_out", il);

             cur = build_norm(inpL,
@@ -16816,7 +16883,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {

 private:
     ggml_tensor * build_plamo2_attn_layer(
-            llm_graph_input_attn_kv_unified * inp,
+            llm_graph_input_attn_kv * inp,
             ggml_tensor * inp_pos,
             ggml_tensor * cur,
             const llama_model & model,
@@ -16840,13 +16907,13 @@ private:

         ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
         ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
-        ggml_tensor * Vcur =
+        ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv));

         cb(Qcur, "Qcur", il);
         cb(Kcur, "Kcur", il);
         cb(Vcur, "Vcur", il);

-        Vcur =
+        Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);

         Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
         cb(Qcur, "Qcur_normed", il);
@@ -16866,7 +16933,9 @@ private:
             ext_factor, attn_factor, beta_fast, beta_slow
         );

-        cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
+        cur = build_attn(inp,
+                model.layers[il].wo, NULL,
+                Qcur, Kcur, Vcur, NULL, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
     }

     cb(cur, "attn_out", il);
@@ -16913,15 +16982,13 @@ private:
         cb(zx, "mamba_in_proj", il);
         // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
         zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
-        zx = ggml_cont(ctx0, zx);
-        zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
+        zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
         cb(zx, "mamba_in_proj_out", il);

         // split into z and x
         // => {head_dim * n_heads, n_seq_tokens, n_seqs}
         ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
-        x = ggml_cont(ctx0, x);
-        x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
+        x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
         // x = ggml_permute(ctx0, x, 0, 2, 1, 3);
         cb(x, "mamba_x_split", il);

@@ -17051,7 +17118,7 @@ struct llm_build_arcee : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -17115,7 +17182,7 @@ struct llm_build_arcee : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }

@@ -17186,7 +17253,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         const float kq_scale = 1.0f / sqrtf(float(n_embd_head));

@@ -17260,7 +17327,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }

@@ -17347,7 +17414,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         const float kq_scale = 1.0f / sqrtf(float(n_embd_head));

@@ -17420,7 +17487,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }

@@ -17485,7 +17552,7 @@ struct llm_build_smollm3 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified();
+        auto * inp_attn = build_attn_inp_kv();

         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -17550,7 +17617,7 @@ struct llm_build_smollm3 : public llm_graph_context {

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
                 cb(cur, "attn_out", il);
             }

@@ -17617,7 +17684,7 @@ struct llm_build_openai_moe_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        auto * inp_attn = build_attn_inp_kv_iswa();

         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
@@ -17672,9 +17739,9 @@ struct llm_build_openai_moe_iswa : public llm_graph_context {
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);

-            cur = build_attn_with_sinks(inp_attn,
+            cur = build_attn(inp_attn,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, 1.0f/sqrtf(float(n_rot)), il);
+                    Qcur, Kcur, Vcur, nullptr, model.layers[il].attn_sinks, nullptr, 1.0f/sqrtf(float(n_rot)), il);

             cb(cur, "attn_out", il);
         }
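This is the one call site that populates the new slot: gpt-oss stores a learned per-head sink logit (`attn_sinks`) that joins the attention softmax denominator without contributing a value vector, so some probability mass can always park on the sink instead of being forced onto real tokens. A toy numeric sketch of the effect for a single head, glossing over the per-head and flash-attention details:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> logits = {2.0f, 1.0f, 0.5f}; // scores against real tokens
    const float sink = 1.5f;                        // hypothetical learned sink logit

    float denom = std::exp(sink); // the sink only enlarges the denominator
    for (float l : logits) denom += std::exp(l);

    for (size_t i = 0; i < logits.size(); ++i) {
        std::printf("p[%zu] = %.4f\n", i, std::exp(logits[i]) / denom);
    }
    // mass absorbed by the sink; no V row is accumulated for it
    std::printf("sink mass = %.4f\n", std::exp(sink) / denom);
}
```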
@@ -17771,8 +17838,7 @@ struct llm_build_lfm2 : public llm_graph_context {
         cb(cur, "model.embedding_norm", -1);
         res->t_embd = cur;

-
-        cur = build_lora_mm(model.tok_embd, cur);
+        cur = build_lora_mm(model.output, cur);
         cb(cur, "lm_head", -1);

         res->t_logits = cur;
@@ -17799,10 +17865,10 @@ struct llm_build_lfm2 : public llm_graph_context {
         return cur;
     }

-    ggml_tensor * build_attn_block(ggml_tensor * cur,
-                                   ggml_tensor * inp_pos,
-                                   llm_graph_input_attn_kv_unified * inp_attn,
-                                   int il) const {
+    ggml_tensor * build_attn_block(ggml_tensor * cur,
+                                   ggml_tensor * inp_pos,
+                                   llm_graph_input_attn_kv * inp_attn,
+                                   int il) const {
         GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
         auto const n_embd_head = hparams.n_embd_head_v;
         auto const n_head_kv = hparams.n_head_kv(il);
@@ -17837,7 +17903,7 @@ struct llm_build_lfm2 : public llm_graph_context {
         );

         cur = build_attn(inp_attn, model.layers[il].wo, NULL,
-                q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                q, k, v, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);

         cb(cur, "model.layers.{}.self_attn.out_proj", il);

@@ -17914,6 +17980,137 @@ struct llm_build_lfm2 : public llm_graph_context {
     }
 };

+struct llm_build_seed_oss : public llm_graph_context {
+    llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = build_norm(ffn_inp,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_post_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 template <bool iswa>
 struct llm_build_smallthinker : public llm_graph_context{
     llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
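The new Seed-OSS builder is a conventional pre-norm decoder: RMSNorm, biased QKV attention with RoPE, residual add, then RMSNorm (`attn_post_norm`), SwiGLU feed-forward, residual add. The `LLM_FFN_SILU, LLM_FFN_PAR` pair selects the parallel SiLU-gated feed-forward, i.e. down(silu(gate(x)) · up(x)). A scalar stand-in for the three weight matrices makes the dataflow concrete:

```cpp
#include <cmath>
#include <cstdio>

static float silu(float x) { return x / (1.0f + std::exp(-x)); }

int main() {
    const float x = 0.7f;                                  // one activation
    const float w_gate = 1.3f, w_up = 0.9f, w_down = 1.1f; // hypothetical weights
    // down( silu(gate(x)) * up(x) ) -- the LLM_FFN_SILU / LLM_FFN_PAR shape
    const float y = w_down * (silu(w_gate * x) * (w_up * x));
    std::printf("ffn(%g) = %g\n", x, y);
}
```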
@@ -17930,13 +18127,13 @@ struct llm_build_smallthinker : public llm_graph_context{
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
         inp_attn_type * inp_attn = nullptr;

         if constexpr (iswa) {
-            inp_attn = build_attn_inp_kv_unified_iswa();
+            inp_attn = build_attn_inp_kv_iswa();
         } else {
-            inp_attn = build_attn_inp_kv_unified();
+            inp_attn = build_attn_inp_kv();
         }

         ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -17981,7 +18178,7 @@ struct llm_build_smallthinker : public llm_graph_context{

                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1 && inp_out_ids) {
@@ -18066,7 +18263,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     std::max((uint32_t) 1, cparams.n_seq_max),
                     cparams.n_seq_max);
         } else if (llm_arch_is_hybrid(arch)) {
-            const auto padding = llama_kv_cache_unified::get_padding(cparams);
+            const auto padding = llama_kv_cache::get_padding(cparams);

             cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);

@@ -18088,7 +18285,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
                     /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
         } else {
-            const auto padding = llama_kv_cache_unified::get_padding(cparams);
+            const auto padding = llama_kv_cache::get_padding(cparams);

             uint32_t n_ctx_per_stream = cparams.n_ctx;

@@ -18108,7 +18305,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
             if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
                 GGML_ASSERT(hparams.is_swa_any());

-                res = new llama_kv_cache_unified_iswa(
+                res = new llama_kv_cache_iswa(
                         *this,
                         params.type_k,
                         params.type_v,
@@ -18123,7 +18320,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
             } else {
                 GGML_ASSERT(!hparams.is_swa_any());

-                res = new llama_kv_cache_unified(
+                res = new llama_kv_cache(
                         *this,
                         nullptr,
                         params.type_k,
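After the rename the memory-selection logic is unchanged in shape: models with any sliding-window-attention layers get the `llama_kv_cache_iswa` wrapper, everything else the plain `llama_kv_cache`. A compressed stand-in sketch of the branch — constructor arguments elided, and the types here are placeholders for the real classes in llama-kv-cache.h and llama-kv-cache-iswa.h:

```cpp
#include <memory>

struct kv_cache_base { virtual ~kv_cache_base() = default; };
struct kv_cache      : kv_cache_base {};  // was llama_kv_cache_unified
struct kv_cache_iswa : kv_cache_base {};  // was llama_kv_cache_unified_iswa

// mirrors the swa_type != LLAMA_SWA_TYPE_NONE branch in create_memory()
std::unique_ptr<kv_cache_base> make_memory(bool swa_any) {
    if (swa_any) {
        return std::make_unique<kv_cache_iswa>(); // SWA layers + full-context layers
    }
    return std::make_unique<kv_cache>();          // single full-context cache
}

int main() {
    auto iswa = make_memory(true);
    auto full = make_memory(false);
}
```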
@@ -18452,6 +18649,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_bailingmoe>(*this, params);
             } break;
+        case LLM_ARCH_SEED_OSS:
+            {
+                llm = std::make_unique<llm_build_seed_oss>(*this, params);
+            } break;
         case LLM_ARCH_DOTS1:
             {
                 llm = std::make_unique<llm_build_dots1>(*this, params);
@@ -18510,6 +18711,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     return llm->res->get_gf();
 }

+
 //
 // interface implementation
 //
@@ -18704,6 +18906,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_SMALLTHINKER:
         case LLM_ARCH_GLM4_MOE:
+        case LLM_ARCH_SEED_OSS:
             return LLAMA_ROPE_TYPE_NEOX;

         case LLM_ARCH_QWEN2VL: