@fugood/llama.node 1.1.6 → 1.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +4 -0
- package/lib/index.js +6 -1
- package/lib/index.ts +6 -0
- package/lib/version.js +5 -0
- package/lib/version.ts +2 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +9 -9
- package/src/LlamaCompletionWorker.cpp +73 -20
- package/src/LlamaCompletionWorker.h +8 -0
- package/src/LlamaContext.cpp +9 -0
- package/src/common.hpp +8 -1
- package/src/llama.cpp/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +132 -41
- package/src/llama.cpp/common/chat-parser.cpp +9 -1
- package/src/llama.cpp/common/chat.cpp +311 -9
- package/src/llama.cpp/common/chat.h +4 -1
- package/src/llama.cpp/common/common.cpp +54 -0
- package/src/llama.cpp/common/common.h +46 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/src/llama.cpp/ggml/include/ggml.h +28 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/src/llama.cpp/include/llama.h +25 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +2 -4
- package/src/llama.cpp/src/llama-context.cpp +29 -22
- package/src/llama.cpp/src/llama-context.h +6 -5
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +89 -69
- package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-memory.h +2 -2
- package/src/llama.cpp/src/llama-model.cpp +81 -70
- package/src/llama.cpp/src/llama-model.h +2 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
package/src/llama.cpp/src/llama-model.cpp

```diff
@@ -86,6 +86,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_40B: return "40B";
         case LLM_TYPE_65B: return "65B";
         case LLM_TYPE_70B: return "70B";
+        case LLM_TYPE_120B: return "120B";
         case LLM_TYPE_142B: return "142B";
         case LLM_TYPE_236B: return "236B";
         case LLM_TYPE_290B: return "290B";
@@ -1095,6 +1096,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                 switch (hparams.n_layer) {
+                    case 18: type = LLM_TYPE_537M; break;
                     case 26: type = LLM_TYPE_1B; break;
                     case 34: type = LLM_TYPE_4B; break;
                     case 48: type = LLM_TYPE_12B; break;
@@ -1833,7 +1835,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                 hparams.set_swa_pattern(2);

-
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_20B; break;
+                    case 36: type = LLM_TYPE_120B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
             } break;
         case LLM_ARCH_LFM2:
             {
@@ -6742,9 +6748,9 @@ struct llm_build_falcon : public llm_graph_context {

             ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
             ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-            ggml_tensor * Vcur =
+            ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

-            Vcur =
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

             // using mode = 2 for neox mode
             Qcur = ggml_rope_ext(
```
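Every `Vcur` hunk in this file makes the same change: V is now taken as a plain 2D slice of the fused QKV row and materialized with a single `ggml_cont_3d` call, replacing the separate steps on the removed lines (whose right-hand sides are truncated in this diff rendering). A minimal, self-contained sketch of the row layout those view offsets assume, with toy sizes; this is illustrative code, not part of the package:

```cpp
#include "ggml.h"

int main() {
    // small scratch context for the example
    ggml_init_params params = { /*mem_size =*/ 16u*1024*1024, /*mem_buffer =*/ NULL, /*no_alloc =*/ false };
    ggml_context * ctx0 = ggml_init(params);

    const int64_t n_embd      = 64;                       // toy model width
    const int64_t n_head      = 8;
    const int64_t n_head_kv   = 2;                        // grouped-query attention
    const int64_t n_embd_head = n_embd / n_head;          // 8
    const int64_t n_embd_gqa  = n_embd_head * n_head_kv;  // 16
    const int64_t n_tokens    = 5;

    // fused QKV projection output: each row is [ Q: n_embd | K: n_embd_gqa | V: n_embd_gqa ]
    ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd + 2*n_embd_gqa, n_tokens);

    // V as a strided 2D view into each row, offset past the Q and K segments...
    ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens,
                                      cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

    // ...copied into a contiguous [n_embd_head, n_head_kv, n_tokens] tensor in one op
    Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

    ggml_free(ctx0);
    return 0;
}
```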
```diff
@@ -7022,9 +7028,9 @@ struct llm_build_dbrx : public llm_graph_context {

             Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
             Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-            Vcur =
+            Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

-            Vcur =
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
@@ -7144,13 +7150,13 @@ struct llm_build_starcoder : public llm_graph_context {
             cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
             cb(cur, "bqkv", il);

-            ggml_tensor * Qcur =
-            ggml_tensor * Kcur =
-            ggml_tensor * Vcur =
+            ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+            ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+            ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

-            Qcur =
-            Kcur =
-            Vcur =
+            Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
@@ -7366,13 +7372,15 @@ struct llm_build_bert : public llm_graph_context {
                     cb(cur, "bqkv", il);
                 }

-                Qcur =
-                Kcur =
-                Vcur =
+                Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+                Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             } else {
                 Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
                 Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
                 Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }

             if (model.layers[il].attn_q_norm) {
@@ -7380,6 +7388,10 @@ struct llm_build_bert : public llm_graph_context {
                         model.layers[il].attn_q_norm,
                         model.layers[il].attn_q_norm_b,
                         LLM_NORM, il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            } else {
+                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             }

             if (model.layers[il].attn_k_norm) {
@@ -7387,11 +7399,11 @@ struct llm_build_bert : public llm_graph_context {
                         model.layers[il].attn_k_norm,
                         model.layers[il].attn_k_norm_b,
                         LLM_NORM, il);
-            }

-
-
-
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            } else {
+                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            }

             // RoPE
             if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
```
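The bert hunks above also make the reshape-versus-cont rule explicit: outputs of `ggml_add` and `build_norm` are already contiguous, so the metadata-only `ggml_reshape_3d` is enough, while tensors that are still strided views into the fused QKV row need `ggml_cont_3d`, which inserts a copy while reshaping. A hypothetical helper capturing that rule (a sketch for illustration; the diff spells the two branches out inline):

```cpp
#include "ggml.h"

// Return t as a 3D tensor [ne0, ne1, ne2]: reshape when the data is already
// contiguous (no copy), otherwise materialize it with a fused copy + reshape.
static ggml_tensor * to_3d(ggml_context * ctx, ggml_tensor * t,
                           int64_t ne0, int64_t ne1, int64_t ne2) {
    return ggml_is_contiguous(t)
        ? ggml_reshape_3d(ctx, t, ne0, ne1, ne2)   // metadata-only
        : ggml_cont_3d(ctx, t, ne0, ne1, ne2);     // inserts a copy node
}
```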
```diff
@@ -7536,9 +7548,9 @@ struct llm_build_neo_bert : public llm_graph_context {

             Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
             Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-            Vcur =
+            Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

-            Vcur =
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

             // RoPE
             Qcur = ggml_rope_ext(
@@ -7645,13 +7657,13 @@ struct llm_build_bloom : public llm_graph_context {
             cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
             cb(cur, "bqkv", il);

-            ggml_tensor * Qcur =
-            ggml_tensor * Kcur =
-            ggml_tensor * Vcur =
+            ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+            ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+            ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

-            Qcur =
-            Kcur =
-            Vcur =
+            Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
@@ -7769,7 +7781,7 @@ struct llm_build_mpt : public llm_graph_context {

             ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
             ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
-            ggml_tensor * Vcur =
+            ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
@@ -7788,17 +7800,18 @@ struct llm_build_mpt : public llm_graph_context {
                         model.layers[il].attn_k_norm_b,
                         LLM_NORM, il);
                 cb(Kcur, "Kcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
             } else {
-                Qcur =
+                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 cb(Qcur, "Qcur", il);

-                Kcur =
+                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 cb(Kcur, "Kcur", il);
             }

-
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
@@ -8050,9 +8063,9 @@ struct llm_build_qwen : public llm_graph_context {

             ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
             ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-            ggml_tensor * Vcur =
+            ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd));

-            Vcur =
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

             // using mode = 2 for neox mode
             Qcur = ggml_rope_ext(
@@ -9025,21 +9038,21 @@ struct llm_build_phi2 : public llm_graph_context {

                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur =
+                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             } else {
                 Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
                 Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
                 Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }

             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);

-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9163,21 +9176,21 @@ struct llm_build_phi3 : public llm_graph_context {

                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
-                Vcur =
+                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             } else {
                 Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
                 Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
                 Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }

             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);

-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, rope_factors,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9427,17 +9440,17 @@ struct llm_build_gpt2 : public llm_graph_context {
             cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
             cb(cur, "bqkv", il);

-            ggml_tensor * Qcur =
-            ggml_tensor * Kcur =
-            ggml_tensor * Vcur =
+            ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+            ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+            ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);

-            Qcur =
-            Kcur =
-            Vcur =
+            Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

             cur = build_attn(inp_attn,
                     model.layers[il].wo, model.layers[il].bo,
@@ -9533,9 +9546,9 @@ struct llm_build_codeshell : public llm_graph_context {

             ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
             ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-            ggml_tensor * Vcur =
+            ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

-            Vcur =
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
@@ -10863,8 +10876,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
         ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
         all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
         cb(all_coefs, "all_coefs", il);
-        all_coefs =
-        all_coefs =
+        all_coefs = ggml_transpose(ctx0, all_coefs); // [n_tokens, n_altup]
+        all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]

         innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
         ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
```
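The gemma3n hunk applies the same view-plus-cont fusion to the altup correction coefficients, where the goal is a broadcast-compatible shape rather than attention heads. A sketch of the shape bookkeeping, reusing the identifiers from the hunk (illustrative only, not the package's code):

```cpp
#include "ggml.h"

// all_coefs: [n_altup, n_tokens]   innovation: [n_embd, n_tokens]
ggml_tensor * altup_correct_sketch(ggml_context * ctx0,
                                   ggml_tensor * all_coefs, ggml_tensor * innovation,
                                   int64_t n_embd, int64_t n_tokens, int64_t n_altup) {
    all_coefs = ggml_transpose(ctx0, all_coefs);                      // [n_tokens, n_altup], strided view
    all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup);  // [1, n_tokens, n_altup], contiguous

    // repeat innovation across the altup dimension, then let ggml_mul
    // broadcast all_coefs' leading dimension of 1 across n_embd
    innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
    return ggml_mul(ctx0, innovation, all_coefs);                     // [n_embd, n_tokens, n_altup]
}
```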
```diff
@@ -12277,9 +12290,9 @@ struct llm_build_gptneox : public llm_graph_context {

             ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
             ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-            ggml_tensor * Vcur =
+            ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

-            Vcur =
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
@@ -13412,17 +13425,17 @@ struct llm_build_jais : public llm_graph_context {
             cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
             cb(cur, "bqkv", il);

-            ggml_tensor * Qcur =
-            ggml_tensor * Kcur =
-            ggml_tensor * Vcur =
+            ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd));
+            ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd));
+            ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));

             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);

-            Qcur =
-            Kcur =
-            Vcur =
+            Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

             cur = build_attn(inp_attn,
                     model.layers[il].wo, model.layers[il].bo,
@@ -13525,6 +13538,7 @@ struct llm_build_chatglm : public llm_graph_context {
                 }
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             } else {
                 cur = build_lora_mm(model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
@@ -13534,11 +13548,10 @@ struct llm_build_chatglm : public llm_graph_context {
                 }
                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur =
+                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }

-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
             //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
@@ -13659,6 +13672,7 @@ struct llm_build_glm4 : public llm_graph_context {
                 }
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             } else {
                 cur = build_lora_mm(model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
@@ -13668,11 +13682,10 @@ struct llm_build_glm4 : public llm_graph_context {
                 }
                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur =
+                Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }

-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -16839,13 +16852,13 @@ private:

        ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
        ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
-       ggml_tensor * Vcur =
+       ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv));

        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);

-       Vcur =
+       Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);

        Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
        cb(Qcur, "Qcur_normed", il);
@@ -16912,15 +16925,13 @@ private:
        cb(zx, "mamba_in_proj", il);
        // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
        zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
-       zx =
-       zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
+       zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
        cb(zx, "mamba_in_proj_out", il);

        // split into z and x
        // => {head_dim * n_heads, n_seq_tokens, n_seqs}
        ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
-       x =
-       x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
+       x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
        // x = ggml_permute(ctx0, x, 0, 2, 1, 3);
        cb(x, "mamba_x_split", il);

```
package/src/llama.cpp/src/llama-model.h

```diff
@@ -39,6 +39,7 @@ enum llm_type {
    LLM_TYPE_410M,
    LLM_TYPE_450M,
    LLM_TYPE_475M,
+   LLM_TYPE_537M,
    LLM_TYPE_700M,
    LLM_TYPE_770M,
    LLM_TYPE_780M,
@@ -78,6 +79,7 @@ enum llm_type {
    LLM_TYPE_40B,
    LLM_TYPE_65B,
    LLM_TYPE_70B,
+   LLM_TYPE_120B,
    LLM_TYPE_142B,
    LLM_TYPE_236B,
    LLM_TYPE_290B,
```
package/src/llama.cpp/src/llama-quant.cpp

```diff
@@ -999,7 +999,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);

                 // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
-#if
+#if 0
                 if (new_type == GGML_TYPE_MXFP4) {
                     auto * x = f32_data_03;

```
package/src/llama.cpp/src/llama-vocab.cpp

```diff
@@ -2341,7 +2341,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

        // @ngxson : quick hack for gpt-oss, always render these tokens
        for (const auto & t : token_to_id) {
-           if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
+           if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
            }
        }
@@ -2388,6 +2388,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

        if (has_return && has_call && has_end) {
            special_eog_ids.erase(end_id);
+           id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
            LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
        }
    }
```