@fugood/llama.node 1.2.0-rc.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/common/chat.cpp +139 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +12 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
- package/src/llama.cpp/ggml/include/ggml.h +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +7 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +0 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +161 -0
- package/src/llama.cpp/src/llama-arch.cpp +1 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +5 -3
- package/src/llama.cpp/src/llama-graph.cpp +3 -2
- package/src/llama.cpp/src/llama-hparams.h +1 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +55 -15
- package/src/llama.cpp/src/llama-kv-cache.h +8 -0
- package/src/llama.cpp/src/llama-model.cpp +58 -96
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama.cpp +53 -10
|
@@ -1542,6 +1542,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1542
1542
|
hparams.dec_start_token_id = dec_start_token_id;
|
|
1543
1543
|
}
|
|
1544
1544
|
|
|
1545
|
+
hparams.dec_n_layer = hparams.n_layer;
|
|
1546
|
+
ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
|
|
1547
|
+
|
|
1545
1548
|
switch (hparams.n_layer) {
|
|
1546
1549
|
case 6: type = LLM_TYPE_60M; break; // t5-small
|
|
1547
1550
|
case 8: type = LLM_TYPE_80M; break; // flan-t5-small
|
|
@@ -4414,6 +4417,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
4414
4417
|
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
4415
4418
|
}
|
|
4416
4419
|
|
|
4420
|
+
// n_layer: number of encoder_layers
|
|
4421
|
+
// dec_n_layer: number of decoder_layers
|
|
4422
|
+
const int dec_n_layer = hparams.dec_n_layer;
|
|
4423
|
+
if (dec_n_layer > n_layer) {
|
|
4424
|
+
layers.resize(dec_n_layer);
|
|
4425
|
+
}
|
|
4426
|
+
|
|
4427
|
+
// load encoder layers
|
|
4417
4428
|
for (int i = 0; i < n_layer; ++i) {
|
|
4418
4429
|
auto & layer = layers[i];
|
|
4419
4430
|
|
|
@@ -4429,6 +4440,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
4429
4440
|
layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
|
|
4430
4441
|
layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4431
4442
|
layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4443
|
+
}
|
|
4444
|
+
|
|
4445
|
+
// load decoder layers
|
|
4446
|
+
for (int i = 0; i < dec_n_layer; ++i) {
|
|
4447
|
+
auto & layer = layers[i];
|
|
4432
4448
|
|
|
4433
4449
|
layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4434
4450
|
layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
|
|
@@ -6927,9 +6943,7 @@ struct llm_build_falcon : public llm_graph_context {
|
|
|
6927
6943
|
|
|
6928
6944
|
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
6929
6945
|
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
6930
|
-
ggml_tensor * Vcur =
|
|
6931
|
-
|
|
6932
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6946
|
+
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
6933
6947
|
|
|
6934
6948
|
// using mode = 2 for neox mode
|
|
6935
6949
|
Qcur = ggml_rope_ext(
|
|
@@ -7207,9 +7221,7 @@ struct llm_build_dbrx : public llm_graph_context {
|
|
|
7207
7221
|
|
|
7208
7222
|
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
7209
7223
|
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
7210
|
-
Vcur =
|
|
7211
|
-
|
|
7212
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7224
|
+
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
7213
7225
|
|
|
7214
7226
|
Qcur = ggml_rope_ext(
|
|
7215
7227
|
ctx0, Qcur, inp_pos, nullptr,
|
|
@@ -7329,13 +7341,9 @@ struct llm_build_starcoder : public llm_graph_context {
|
|
|
7329
7341
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
7330
7342
|
cb(cur, "bqkv", il);
|
|
7331
7343
|
|
|
7332
|
-
ggml_tensor * Qcur =
|
|
7333
|
-
ggml_tensor * Kcur =
|
|
7334
|
-
ggml_tensor * Vcur =
|
|
7335
|
-
|
|
7336
|
-
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7337
|
-
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7338
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7344
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
7345
|
+
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
7346
|
+
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
7339
7347
|
|
|
7340
7348
|
cb(Qcur, "Qcur", il);
|
|
7341
7349
|
cb(Kcur, "Kcur", il);
|
|
@@ -7551,14 +7559,16 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
7551
7559
|
cb(cur, "bqkv", il);
|
|
7552
7560
|
}
|
|
7553
7561
|
|
|
7554
|
-
Qcur =
|
|
7555
|
-
Kcur =
|
|
7556
|
-
Vcur =
|
|
7557
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7562
|
+
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
7563
|
+
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
7564
|
+
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
7558
7565
|
} else {
|
|
7559
7566
|
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
|
|
7560
7567
|
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
|
|
7561
7568
|
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
|
|
7569
|
+
|
|
7570
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7571
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7562
7572
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7563
7573
|
}
|
|
7564
7574
|
|
|
@@ -7569,8 +7579,6 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
7569
7579
|
LLM_NORM, il);
|
|
7570
7580
|
|
|
7571
7581
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7572
|
-
} else {
|
|
7573
|
-
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7574
7582
|
}
|
|
7575
7583
|
|
|
7576
7584
|
if (model.layers[il].attn_k_norm) {
|
|
@@ -7580,8 +7588,6 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
7580
7588
|
LLM_NORM, il);
|
|
7581
7589
|
|
|
7582
7590
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7583
|
-
} else {
|
|
7584
|
-
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7585
7591
|
}
|
|
7586
7592
|
|
|
7587
7593
|
// RoPE
|
|
@@ -7727,9 +7733,7 @@ struct llm_build_neo_bert : public llm_graph_context {
|
|
|
7727
7733
|
|
|
7728
7734
|
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
7729
7735
|
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
7730
|
-
Vcur =
|
|
7731
|
-
|
|
7732
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7736
|
+
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
7733
7737
|
|
|
7734
7738
|
// RoPE
|
|
7735
7739
|
Qcur = ggml_rope_ext(
|
|
@@ -7836,13 +7840,9 @@ struct llm_build_bloom : public llm_graph_context {
|
|
|
7836
7840
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
7837
7841
|
cb(cur, "bqkv", il);
|
|
7838
7842
|
|
|
7839
|
-
ggml_tensor * Qcur =
|
|
7840
|
-
ggml_tensor * Kcur =
|
|
7841
|
-
ggml_tensor * Vcur =
|
|
7842
|
-
|
|
7843
|
-
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7844
|
-
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7845
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7843
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
7844
|
+
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
7845
|
+
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
7846
7846
|
|
|
7847
7847
|
cb(Qcur, "Qcur", il);
|
|
7848
7848
|
cb(Kcur, "Kcur", il);
|
|
@@ -7958,13 +7958,9 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
7958
7958
|
cb(cur, "wqkv_clamped", il);
|
|
7959
7959
|
}
|
|
7960
7960
|
|
|
7961
|
-
ggml_tensor * Qcur =
|
|
7962
|
-
ggml_tensor * Kcur =
|
|
7963
|
-
ggml_tensor * Vcur =
|
|
7964
|
-
|
|
7965
|
-
cb(Qcur, "Qcur", il);
|
|
7966
|
-
cb(Kcur, "Kcur", il);
|
|
7967
|
-
cb(Vcur, "Vcur", il);
|
|
7961
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
7962
|
+
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
7963
|
+
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
7968
7964
|
|
|
7969
7965
|
// Q/K Layernorm
|
|
7970
7966
|
if (model.layers[il].attn_q_norm) {
|
|
@@ -7972,26 +7968,16 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
7972
7968
|
model.layers[il].attn_q_norm,
|
|
7973
7969
|
model.layers[il].attn_q_norm_b,
|
|
7974
7970
|
LLM_NORM, il);
|
|
7975
|
-
cb(Qcur, "Qcur", il);
|
|
7976
7971
|
|
|
7977
7972
|
Kcur = build_norm(Kcur,
|
|
7978
7973
|
model.layers[il].attn_k_norm,
|
|
7979
7974
|
model.layers[il].attn_k_norm_b,
|
|
7980
7975
|
LLM_NORM, il);
|
|
7981
|
-
cb(Kcur, "Kcur", il);
|
|
7982
7976
|
|
|
7983
7977
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7984
7978
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7985
|
-
} else {
|
|
7986
|
-
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7987
|
-
cb(Qcur, "Qcur", il);
|
|
7988
|
-
|
|
7989
|
-
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7990
|
-
cb(Kcur, "Kcur", il);
|
|
7991
7979
|
}
|
|
7992
7980
|
|
|
7993
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7994
|
-
|
|
7995
7981
|
cb(Qcur, "Qcur", il);
|
|
7996
7982
|
cb(Kcur, "Kcur", il);
|
|
7997
7983
|
cb(Vcur, "Vcur", il);
|
|
@@ -8240,11 +8226,9 @@ struct llm_build_qwen : public llm_graph_context {
|
|
|
8240
8226
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
8241
8227
|
cb(cur, "bqkv", il);
|
|
8242
8228
|
|
|
8243
|
-
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,
|
|
8229
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
8244
8230
|
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
8245
|
-
ggml_tensor * Vcur =
|
|
8246
|
-
|
|
8247
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
8231
|
+
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 2*sizeof(float)*(n_embd));
|
|
8248
8232
|
|
|
8249
8233
|
// using mode = 2 for neox mode
|
|
8250
8234
|
Qcur = ggml_rope_ext(
|
|
@@ -9219,21 +9203,17 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
9219
9203
|
|
|
9220
9204
|
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
9221
9205
|
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
9222
|
-
Vcur =
|
|
9223
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
9206
|
+
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
9224
9207
|
} else {
|
|
9225
9208
|
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
|
9226
9209
|
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
|
9227
9210
|
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
|
9211
|
+
|
|
9228
9212
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
9229
9213
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9230
9214
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
9231
9215
|
}
|
|
9232
9216
|
|
|
9233
|
-
cb(Qcur, "Qcur", il);
|
|
9234
|
-
cb(Kcur, "Kcur", il);
|
|
9235
|
-
cb(Vcur, "Vcur", il);
|
|
9236
|
-
|
|
9237
9217
|
Qcur = ggml_rope_ext(
|
|
9238
9218
|
ctx0, Qcur, inp_pos, nullptr,
|
|
9239
9219
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
@@ -9357,21 +9337,17 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
9357
9337
|
|
|
9358
9338
|
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
|
|
9359
9339
|
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
|
|
9360
|
-
Vcur =
|
|
9361
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
9340
|
+
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
|
|
9362
9341
|
} else {
|
|
9363
9342
|
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
|
9364
9343
|
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
|
9365
9344
|
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
|
9345
|
+
|
|
9366
9346
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
9367
9347
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9368
9348
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
9369
9349
|
}
|
|
9370
9350
|
|
|
9371
|
-
cb(Qcur, "Qcur", il);
|
|
9372
|
-
cb(Kcur, "Kcur", il);
|
|
9373
|
-
cb(Vcur, "Vcur", il);
|
|
9374
|
-
|
|
9375
9351
|
Qcur = ggml_rope_ext(
|
|
9376
9352
|
ctx0, Qcur, inp_pos, rope_factors,
|
|
9377
9353
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
@@ -9621,18 +9597,14 @@ struct llm_build_gpt2 : public llm_graph_context {
|
|
|
9621
9597
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
9622
9598
|
cb(cur, "bqkv", il);
|
|
9623
9599
|
|
|
9624
|
-
ggml_tensor * Qcur =
|
|
9625
|
-
ggml_tensor * Kcur =
|
|
9626
|
-
ggml_tensor * Vcur =
|
|
9600
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
9601
|
+
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
9602
|
+
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
9627
9603
|
|
|
9628
9604
|
cb(Qcur, "Qcur", il);
|
|
9629
9605
|
cb(Kcur, "Kcur", il);
|
|
9630
9606
|
cb(Vcur, "Vcur", il);
|
|
9631
9607
|
|
|
9632
|
-
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
9633
|
-
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9634
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
9635
|
-
|
|
9636
9608
|
cur = build_attn(inp_attn,
|
|
9637
9609
|
model.layers[il].wo, model.layers[il].bo,
|
|
9638
9610
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
@@ -9727,9 +9699,7 @@ struct llm_build_codeshell : public llm_graph_context {
|
|
|
9727
9699
|
|
|
9728
9700
|
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
9729
9701
|
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
9730
|
-
ggml_tensor * Vcur =
|
|
9731
|
-
|
|
9732
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
9702
|
+
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
9733
9703
|
|
|
9734
9704
|
Qcur = ggml_rope_ext(
|
|
9735
9705
|
ctx0, Qcur, inp_pos, nullptr,
|
|
@@ -12601,9 +12571,7 @@ struct llm_build_gptneox : public llm_graph_context {
|
|
|
12601
12571
|
|
|
12602
12572
|
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
12603
12573
|
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
12604
|
-
ggml_tensor * Vcur =
|
|
12605
|
-
|
|
12606
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
12574
|
+
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
12607
12575
|
|
|
12608
12576
|
Qcur = ggml_rope_ext(
|
|
12609
12577
|
ctx0, Qcur, inp_pos, nullptr,
|
|
@@ -13557,7 +13525,9 @@ struct llm_build_t5_dec : public llm_graph_context {
|
|
|
13557
13525
|
|
|
13558
13526
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13559
13527
|
|
|
13560
|
-
|
|
13528
|
+
const int64_t dec_n_layer = hparams.dec_n_layer;
|
|
13529
|
+
|
|
13530
|
+
for (int il = 0; il < dec_n_layer; ++il) {
|
|
13561
13531
|
ggml_tensor * inpSA = inpL;
|
|
13562
13532
|
|
|
13563
13533
|
// norm
|
|
@@ -13648,7 +13618,7 @@ struct llm_build_t5_dec : public llm_graph_context {
|
|
|
13648
13618
|
//cb(cur, "kqv_out", il);
|
|
13649
13619
|
}
|
|
13650
13620
|
|
|
13651
|
-
if (il ==
|
|
13621
|
+
if (il == dec_n_layer - 1 && inp_out_ids) {
|
|
13652
13622
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
13653
13623
|
inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
|
|
13654
13624
|
}
|
|
@@ -13669,8 +13639,8 @@ struct llm_build_t5_dec : public llm_graph_context {
|
|
|
13669
13639
|
model.layers[il].ffn_gate, NULL, NULL,
|
|
13670
13640
|
model.layers[il].ffn_down, NULL, NULL,
|
|
13671
13641
|
NULL,
|
|
13672
|
-
model.layers[il].
|
|
13673
|
-
model.layers[il].
|
|
13642
|
+
model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
|
|
13643
|
+
model.layers[il].ffn_gate ? LLM_FFN_PAR : LLM_FFN_SEQ,
|
|
13674
13644
|
il);
|
|
13675
13645
|
cb(cur, "ffn_out", il);
|
|
13676
13646
|
}
|
|
@@ -13736,18 +13706,14 @@ struct llm_build_jais : public llm_graph_context {
|
|
|
13736
13706
|
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
13737
13707
|
cb(cur, "bqkv", il);
|
|
13738
13708
|
|
|
13739
|
-
ggml_tensor * Qcur =
|
|
13740
|
-
ggml_tensor * Kcur =
|
|
13741
|
-
ggml_tensor * Vcur =
|
|
13709
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*cur->nb[0]*(n_embd));
|
|
13710
|
+
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd));
|
|
13711
|
+
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));
|
|
13742
13712
|
|
|
13743
13713
|
cb(Qcur, "Qcur", il);
|
|
13744
13714
|
cb(Kcur, "Kcur", il);
|
|
13745
13715
|
cb(Vcur, "Vcur", il);
|
|
13746
13716
|
|
|
13747
|
-
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
13748
|
-
Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
13749
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13750
|
-
|
|
13751
13717
|
cur = build_attn(inp_attn,
|
|
13752
13718
|
model.layers[il].wo, model.layers[il].bo,
|
|
13753
13719
|
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
|
|
@@ -13859,8 +13825,7 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
13859
13825
|
}
|
|
13860
13826
|
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
13861
13827
|
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
13862
|
-
Vcur =
|
|
13863
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13828
|
+
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
13864
13829
|
}
|
|
13865
13830
|
|
|
13866
13831
|
//printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
|
|
@@ -13993,8 +13958,7 @@ struct llm_build_glm4 : public llm_graph_context {
|
|
|
13993
13958
|
}
|
|
13994
13959
|
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
|
|
13995
13960
|
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
|
|
13996
|
-
Vcur =
|
|
13997
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
13961
|
+
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
|
|
13998
13962
|
}
|
|
13999
13963
|
|
|
14000
13964
|
Qcur = ggml_rope_ext(
|
|
@@ -17293,16 +17257,14 @@ private:
|
|
|
17293
17257
|
const int64_t k_offset = n_embd_head_q * n_head;
|
|
17294
17258
|
const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;
|
|
17295
17259
|
|
|
17296
|
-
ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head,
|
|
17260
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
|
|
17297
17261
|
ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
|
|
17298
|
-
ggml_tensor * Vcur =
|
|
17262
|
+
ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head_v, n_head_kv, n_tokens, n_embd_head_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));
|
|
17299
17263
|
|
|
17300
17264
|
cb(Qcur, "Qcur", il);
|
|
17301
17265
|
cb(Kcur, "Kcur", il);
|
|
17302
17266
|
cb(Vcur, "Vcur", il);
|
|
17303
17267
|
|
|
17304
|
-
Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
|
|
17305
|
-
|
|
17306
17268
|
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
17307
17269
|
cb(Qcur, "Qcur_normed", il);
|
|
17308
17270
|
|
|
@@ -920,7 +920,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
920
920
|
new_type = tensor->type;
|
|
921
921
|
new_data = tensor->data;
|
|
922
922
|
new_size = ggml_nbytes(tensor);
|
|
923
|
-
LLAMA_LOG_INFO("size = %8.3f
|
|
923
|
+
LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
|
924
924
|
} else {
|
|
925
925
|
const int64_t nelements = ggml_nelements(tensor);
|
|
926
926
|
|
|
@@ -1037,8 +1037,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
1037
1037
|
}
|
|
1038
1038
|
close_ofstream();
|
|
1039
1039
|
|
|
1040
|
-
LLAMA_LOG_INFO("%s: model size = %8.2f
|
|
1041
|
-
LLAMA_LOG_INFO("%s: quant size = %8.2f
|
|
1040
|
+
LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
|
|
1041
|
+
LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
|
|
1042
1042
|
|
|
1043
1043
|
if (qs.n_fallback > 0) {
|
|
1044
1044
|
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
|
|
@@ -59,6 +59,7 @@ bool llama_supports_mlock(void) {
|
|
|
59
59
|
|
|
60
60
|
bool llama_supports_gpu_offload(void) {
|
|
61
61
|
return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
|
|
62
|
+
ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
|
|
62
63
|
llama_supports_rpc();
|
|
63
64
|
}
|
|
64
65
|
|
|
@@ -83,7 +84,9 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
|
|
|
83
84
|
GGML_ASSERT(dev && "CPU backend is not loaded");
|
|
84
85
|
auto * reg = ggml_backend_dev_backend_reg(dev);
|
|
85
86
|
auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
|
|
86
|
-
numa_init_fn
|
|
87
|
+
if (numa_init_fn) {
|
|
88
|
+
numa_init_fn(numa);
|
|
89
|
+
}
|
|
87
90
|
}
|
|
88
91
|
}
|
|
89
92
|
|
|
@@ -182,8 +185,13 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|
|
182
185
|
model->devices.push_back(*dev);
|
|
183
186
|
}
|
|
184
187
|
} else {
|
|
188
|
+
// default device selection
|
|
189
|
+
|
|
190
|
+
// build list of available devices
|
|
191
|
+
std::vector<ggml_backend_dev_t> gpus;
|
|
192
|
+
std::vector<ggml_backend_dev_t> igpus;
|
|
185
193
|
std::vector<ggml_backend_dev_t> rpc_servers;
|
|
186
|
-
|
|
194
|
+
|
|
187
195
|
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
|
188
196
|
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
|
189
197
|
switch (ggml_backend_dev_type(dev)) {
|
|
@@ -192,19 +200,51 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|
|
192
200
|
// skip CPU backends since they are handled separately
|
|
193
201
|
break;
|
|
194
202
|
|
|
195
|
-
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
|
203
|
+
case GGML_BACKEND_DEVICE_TYPE_GPU: {
|
|
196
204
|
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
|
197
205
|
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
|
|
198
206
|
rpc_servers.push_back(dev);
|
|
199
207
|
} else {
|
|
200
|
-
|
|
208
|
+
// check if there is already a GPU with the same device id
|
|
209
|
+
ggml_backend_dev_props props;
|
|
210
|
+
ggml_backend_dev_get_props(dev, &props);
|
|
211
|
+
auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
|
|
212
|
+
ggml_backend_dev_props d_props;
|
|
213
|
+
ggml_backend_dev_get_props(d, &d_props);
|
|
214
|
+
if (props.device_id && d_props.device_id) {
|
|
215
|
+
return strcmp(props.device_id, d_props.device_id) == 0;
|
|
216
|
+
}
|
|
217
|
+
return false;
|
|
218
|
+
});
|
|
219
|
+
|
|
220
|
+
if (it != gpus.end()) {
|
|
221
|
+
LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
|
|
222
|
+
__func__,
|
|
223
|
+
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
|
|
224
|
+
props.device_id ? props.device_id : "unknown id",
|
|
225
|
+
ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
|
|
226
|
+
} else {
|
|
227
|
+
gpus.push_back(dev);
|
|
228
|
+
}
|
|
201
229
|
}
|
|
202
230
|
break;
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
case GGML_BACKEND_DEVICE_TYPE_IGPU:
|
|
234
|
+
igpus.push_back(dev);
|
|
235
|
+
break;
|
|
203
236
|
}
|
|
204
237
|
}
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
238
|
+
|
|
239
|
+
// add RPC servers at the front of the list to minimize network transfers
|
|
240
|
+
model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
|
|
241
|
+
|
|
242
|
+
// add GPUs
|
|
243
|
+
model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
|
|
244
|
+
|
|
245
|
+
// add integrated GPUs only if no other devices were found
|
|
246
|
+
if (model->devices.empty()) {
|
|
247
|
+
model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
|
|
208
248
|
}
|
|
209
249
|
}
|
|
210
250
|
|
|
@@ -225,9 +265,12 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|
|
225
265
|
}
|
|
226
266
|
|
|
227
267
|
for (auto * dev : model->devices) {
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__,
|
|
268
|
+
ggml_backend_dev_props props;
|
|
269
|
+
ggml_backend_dev_get_props(dev, &props);
|
|
270
|
+
LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
|
|
271
|
+
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
|
|
272
|
+
props.device_id ? props.device_id : "unknown id",
|
|
273
|
+
props.memory_free/1024/1024);
|
|
231
274
|
}
|
|
232
275
|
|
|
233
276
|
const int status = llama_model_load(path_model, splits, *model, params);
|