@fugood/llama.node 1.1.11 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +18 -1
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +166 -396
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +50 -30
- package/src/llama.cpp/common/chat.cpp +250 -1
- package/src/llama.cpp/common/chat.h +4 -0
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +15 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
- package/src/llama.cpp/ggml/include/ggml.h +56 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +21 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +25 -38
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +379 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
- package/src/llama.cpp/include/llama.h +5 -6
- package/src/llama.cpp/src/llama-adapter.cpp +33 -0
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +28 -4
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-context.cpp +65 -57
- package/src/llama.cpp/src/llama-context.h +1 -1
- package/src/llama.cpp/src/llama-graph.cpp +57 -11
- package/src/llama.cpp/src/llama-graph.h +8 -0
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +10 -3
- package/src/llama.cpp/src/llama-kv-cache.cpp +56 -38
- package/src/llama.cpp/src/llama-kv-cache.h +9 -0
- package/src/llama.cpp/src/llama-model.cpp +217 -97
- package/src/llama.cpp/src/llama-model.h +0 -1
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/llama.cpp/src/llama.cpp +53 -10
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
@@ -1110,7 +1110,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                 switch (hparams.n_layer) {
-                    case 18: type =
+                    case 18: type = LLM_TYPE_270M; break;
                     case 26: type = LLM_TYPE_1B; break;
                     case 34: type = LLM_TYPE_4B; break;
                     case 48: type = LLM_TYPE_12B; break;
@@ -1142,6 +1142,26 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GEMMA_EMBEDDING:
+            {
+                hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
+                hparams.set_swa_pattern(6);
+
+                hparams.causal_attn = false; // embeddings do not use causal attention
+                hparams.rope_freq_base_train_swa = 10000.0f;
+                hparams.rope_freq_scale_train_swa = 1.0f;
+
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_0_3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+                hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1522,6 +1542,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.dec_start_token_id = dec_start_token_id;
                 }

+                hparams.dec_n_layer = hparams.n_layer;
+                ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
+
                 switch (hparams.n_layer) {
                     case 6: type = LLM_TYPE_60M; break; // t5-small
                     case 8: type = LLM_TYPE_80M; break; // flan-t5-small
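The T5 hunk above, together with the load_tensors and t5_dec hunks further down, lets a checkpoint carry a decoder depth that differs from the encoder depth: `dec_n_layer` defaults to `n_layer` and is overridden only when the optional decoder block-count key is present. Below is a minimal, self-contained sketch of that fallback-and-resize logic; the `std::optional` lookup is a hypothetical stand-in for the GGUF metadata read, not the llama.cpp loader API.

```cpp
// Sketch of the decoder-depth fallback, assuming an optional metadata value.
#include <cstdint>
#include <cstdio>
#include <optional>
#include <vector>

struct layer_t { /* per-layer tensors would live here */ };

int main() {
    const uint32_t n_layer = 6;                        // encoder blocks (t5-small)
    const std::optional<uint32_t> dec_block_count = 8; // empty if the key is absent

    // default to a symmetric model, override only when the key exists
    const uint32_t dec_n_layer = dec_block_count.value_or(n_layer);

    // encoder and decoder layers share one vector, so it must be able to
    // hold the larger of the two counts before the decoder loop runs
    std::vector<layer_t> layers(n_layer);
    if (dec_n_layer > n_layer) {
        layers.resize(dec_n_layer);
    }

    std::printf("encoder: %u, decoder: %u, storage: %zu\n",
                n_layer, dec_n_layer, layers.size());
    return 0;
}
```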
@@ -3484,6 +3507,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
                 } break;
             case LLM_ARCH_GEMMA3:
+            case LLM_ARCH_GEMMA_EMBEDDING:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -4393,6 +4417,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                     }

+                    // n_layer: number of encoder_layers
+                    // dec_n_layer: number of decoder_layers
+                    const int dec_n_layer = hparams.dec_n_layer;
+                    if (dec_n_layer > n_layer) {
+                        layers.resize(dec_n_layer);
+                    }
+
+                    // load encoder layers
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];

@@ -4408,6 +4440,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
                         layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                         layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    }
+
+                    // load decoder layers
+                    for (int i = 0; i < dec_n_layer; ++i) {
+                        auto & layer = layers[i];

                         layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
                         layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
@@ -6906,9 +6943,7 @@ struct llm_build_falcon : public llm_graph_context {

                 ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur =
-
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

                 // using mode = 2 for neox mode
                 Qcur = ggml_rope_ext(
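This hunk, and the similar ones below for dbrx, starcoder, bert, bloom, mpt, qwen, phi2/phi3, gpt2, codeshell, gptneox, jais, chatglm and glm4, drop the remaining `ggml_cont_3d` copies and take `ggml_view_3d` views directly into the fused QKV projection output. The byte offsets in the new calls describe the row layout of that tensor: Q starts at 0, K at `n_embd` floats, and V at `n_embd + n_embd_gqa` floats. A small, self-contained sketch of that layout follows; the head counts are made up for illustration and not taken from any particular model.

```cpp
// Sketch of the fused-QKV row layout implied by the ggml_view_3d offsets above.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd_head = 64;
    const int64_t n_head      = 32;                       // query heads
    const int64_t n_head_kv   = 8;                        // K/V heads (GQA)
    const int64_t n_embd      = n_embd_head * n_head;     // 2048
    const int64_t n_embd_gqa  = n_embd_head * n_head_kv;  //  512

    // each row of the wqkv projection output is [ Q | K | V ] in float32,
    // so the three views share one buffer and differ only by a byte offset
    const size_t q_off = 0 * sizeof(float) * n_embd;
    const size_t k_off = 1 * sizeof(float) * n_embd;
    const size_t v_off = 1 * sizeof(float) * (n_embd + n_embd_gqa);
    const size_t row   =     sizeof(float) * (n_embd + 2 * n_embd_gqa);

    std::printf("Q @ %zu, K @ %zu, V @ %zu, row stride %zu bytes\n",
                q_off, k_off, v_off, row);
    return 0;
}
```

Because the views alias the same buffer, the separate `ggml_cont_3d` materialization of V (or of all three tensors, in the models that used it) is no longer needed.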
@@ -7186,9 +7221,7 @@ struct llm_build_dbrx : public llm_graph_context {

                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur =
-
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

                 Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
@@ -7308,13 +7341,9 @@ struct llm_build_starcoder : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);

-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
-                ggml_tensor * Vcur =
-
-                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
@@ -7530,14 +7559,16 @@ struct llm_build_bert : public llm_graph_context {
                     cb(cur, "bqkv", il);
                 }

-                Qcur =
-                Kcur =
-                Vcur =
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
             } else {
                 Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
                 Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
                 Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }

@@ -7548,8 +7579,6 @@ struct llm_build_bert : public llm_graph_context {
                     LLM_NORM, il);

                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            } else {
-                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             }

             if (model.layers[il].attn_k_norm) {
@@ -7559,8 +7588,6 @@ struct llm_build_bert : public llm_graph_context {
                     LLM_NORM, il);

                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            } else {
-                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
             }

             // RoPE
@@ -7706,9 +7733,7 @@ struct llm_build_neo_bert : public llm_graph_context {

                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur =
-
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

                 // RoPE
                 Qcur = ggml_rope_ext(
@@ -7815,13 +7840,9 @@ struct llm_build_bloom : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);

-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
-                ggml_tensor * Vcur =
-
-                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
@@ -7937,13 +7958,9 @@ struct llm_build_mpt : public llm_graph_context {
                     cb(cur, "wqkv_clamped", il);
                 }

-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
-                ggml_tensor * Vcur =
-
-                cb(Qcur, "Qcur", il);
-                cb(Kcur, "Kcur", il);
-                cb(Vcur, "Vcur", il);
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

                 // Q/K Layernorm
                 if (model.layers[il].attn_q_norm) {
@@ -7951,26 +7968,16 @@ struct llm_build_mpt : public llm_graph_context {
                         model.layers[il].attn_q_norm,
                         model.layers[il].attn_q_norm_b,
                         LLM_NORM, il);
-                    cb(Qcur, "Qcur", il);

                     Kcur = build_norm(Kcur,
                         model.layers[il].attn_k_norm,
                         model.layers[il].attn_k_norm_b,
                         LLM_NORM, il);
-                    cb(Kcur, "Kcur", il);

                     Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                     Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                } else {
-                    Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                    cb(Qcur, "Qcur", il);
-
-                    Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                    cb(Kcur, "Kcur", il);
                 }

-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
@@ -8219,11 +8226,9 @@ struct llm_build_qwen : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);

-                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur =
-
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 2*sizeof(float)*(n_embd));

                 // using mode = 2 for neox mode
                 Qcur = ggml_rope_ext(
@@ -9198,21 +9203,17 @@ struct llm_build_phi2 : public llm_graph_context {

                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur =
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
             } else {
                 Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
                 Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
                 Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }

-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9336,21 +9337,17 @@ struct llm_build_phi3 : public llm_graph_context {

                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
-                Vcur =
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
             } else {
                 Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
                 Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
                 Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
             }

-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
-
             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, rope_factors,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9600,18 +9597,14 @@ struct llm_build_gpt2 : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);

-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
-                ggml_tensor * Vcur =
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);

-                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
                         Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
@@ -9706,9 +9699,7 @@ struct llm_build_codeshell : public llm_graph_context {

                 ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur =
-
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

                 Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
@@ -11045,6 +11036,137 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
     }
 };

+struct llm_build_gemma_embedding_iswa : public llm_graph_context {
+    llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_k;
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
+        if (ubatch.token) {
+            inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+            cb(inpL, "inp_scaled", -1);
+        }
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        // TODO: support cacheless iSWA embeddings [TAG_NO_CACHE_ISWA]
+        auto * inp_attn = build_attn_inp_kv_iswa();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            const float freq_base_l = model.get_rope_freq_base (cparams, il);
+            const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+            // norm
+            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
+                Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            cur = build_norm(cur,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_post_norm", il);
+
+            ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = build_norm(sa_out,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = build_norm(cur,
+                    model.layers[il].ffn_post_norm, NULL,
+                    LLM_NORM_RMS, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, sa_out);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 // TODO: move up next to build_starcoder
 struct llm_build_starcoder2 : public llm_graph_context {
     llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
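One detail of the new `llm_build_gemma_embedding_iswa` graph: the attention scale computed in the hparams hunk (`f_attention_scale = 1 / sqrt(n_embd_head_k)`) is folded into Q with `ggml_scale`, and `build_attn` is then called with a scale of `1.0f`. A toy, self-contained sketch of why that is numerically the same as the usual softmax(QKᵀ / sqrt(d)) scaling; scalar stand-ins only, `build_attn` itself is not reimplemented here.

```cpp
// Toy check: pre-scaling Q and passing kq_scale = 1.0f is equivalent to
// scaling the Q*K^T logits by 1/sqrt(d) inside the attention call.
#include <cmath>
#include <cstdio>

int main() {
    const float d = 256.0f;            // n_embd_head_k
    const float q = 0.7f, k = -1.3f;   // one query/key component

    const float f_attention_scale = 1.0f / std::sqrt(d);

    const float scaled_inside  = (q * k) * f_attention_scale;        // usual 1/sqrt(d) in attention
    const float scaled_q_first = (q * f_attention_scale) * k * 1.0f; // Q pre-scaled, kq_scale = 1.0f

    std::printf("%.6f == %.6f\n", scaled_inside, scaled_q_first);
    return 0;
}
```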
@@ -12449,9 +12571,7 @@ struct llm_build_gptneox : public llm_graph_context {

                 ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                ggml_tensor * Vcur =
-
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

                 Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
@@ -13405,7 +13525,9 @@ struct llm_build_t5_dec : public llm_graph_context {

         ggml_tensor * inp_out_ids = build_inp_out_ids();

-
+        const int64_t dec_n_layer = hparams.dec_n_layer;
+
+        for (int il = 0; il < dec_n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

             // norm
@@ -13496,7 +13618,7 @@ struct llm_build_t5_dec : public llm_graph_context {
                 //cb(cur, "kqv_out", il);
             }

-            if (il ==
+            if (il == dec_n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
             }
@@ -13517,8 +13639,8 @@ struct llm_build_t5_dec : public llm_graph_context {
                     model.layers[il].ffn_gate, NULL, NULL,
                     model.layers[il].ffn_down, NULL, NULL,
                     NULL,
-                    model.layers[il].
-                    model.layers[il].
+                    model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
+                    model.layers[il].ffn_gate ? LLM_FFN_PAR : LLM_FFN_SEQ,
                     il);
             cb(cur, "ffn_out", il);
         }
@@ -13584,18 +13706,14 @@ struct llm_build_jais : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);

-                ggml_tensor * Qcur =
-                ggml_tensor * Kcur =
-                ggml_tensor * Vcur =
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*cur->nb[0]*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd));
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));

                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);

-                Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
                         Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
@@ -13707,8 +13825,7 @@ struct llm_build_chatglm : public llm_graph_context {
                 }
                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur =
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
             }

             //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
@@ -13841,8 +13958,7 @@ struct llm_build_glm4 : public llm_graph_context {
                 }
                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-                Vcur =
-                Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
             }

             Qcur = ggml_rope_ext(
@@ -17141,16 +17257,14 @@ private:
         const int64_t k_offset = n_embd_head_q * n_head;
         const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;

-        ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head,
+        ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
         ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
-        ggml_tensor * Vcur =
+        ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head_v, n_head_kv, n_tokens, n_embd_head_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));

         cb(Qcur, "Qcur", il);
         cb(Kcur, "Kcur", il);
         cb(Vcur, "Vcur", il);

-        Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
-
         Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
         cb(Qcur, "Qcur_normed", il);

@@ -18481,6 +18595,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
             case LLM_ARCH_NOMIC_BERT_MOE:
             case LLM_ARCH_NEO_BERT:
             case LLM_ARCH_WAVTOKENIZER_DEC:
+            //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
             case LLM_ARCH_DREAM:
             case LLM_ARCH_LLADA:
                 {
@@ -18761,6 +18876,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
             } break;
+        case LLM_ARCH_GEMMA_EMBEDDING:
+            {
+                llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 llm = std::make_unique<llm_build_starcoder2>(*this, params);
@@ -19161,6 +19280,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GEMMA2:
         case LLM_ARCH_GEMMA3:
         case LLM_ARCH_GEMMA3N:
+        case LLM_ARCH_GEMMA_EMBEDDING:
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
@@ -920,7 +920,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             new_type = tensor->type;
             new_data = tensor->data;
             new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f
+            LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
         } else {
             const int64_t nelements = ggml_nelements(tensor);

@@ -1037,8 +1037,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
     close_ofstream();

-    LLAMA_LOG_INFO("%s: model size = %8.2f
-    LLAMA_LOG_INFO("%s: quant size = %8.2f
+    LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);

     if (qs.n_fallback > 0) {
         LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",