@fugood/llama.node 1.2.0-rc.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +16 -15
- package/src/llama.cpp/CMakeLists.txt +7 -0
- package/src/llama.cpp/common/arg.cpp +141 -21
- package/src/llama.cpp/common/chat.cpp +139 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.h +23 -8
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +28 -7
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +12 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -6
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +10 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +7 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +0 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +161 -1
- package/src/llama.cpp/src/llama-arch.cpp +44 -10
- package/src/llama.cpp/src/llama-arch.h +9 -0
- package/src/llama.cpp/src/llama-chat.cpp +17 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +13 -11
- package/src/llama.cpp/src/llama-graph.cpp +6 -5
- package/src/llama.cpp/src/llama-hparams.h +14 -3
- package/src/llama.cpp/src/llama-kv-cache.cpp +55 -15
- package/src/llama.cpp/src/llama-kv-cache.h +8 -0
- package/src/llama.cpp/src/llama-model.cpp +386 -140
- package/src/llama.cpp/src/llama-model.h +3 -0
- package/src/llama.cpp/src/llama-quant.cpp +6 -4
- package/src/llama.cpp/src/llama-vocab.cpp +13 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/llama.cpp +53 -10
--- package/src/llama.cpp/src/llama-model.cpp
+++ package/src/llama.cpp/src/llama-model.cpp
@@ -36,6 +36,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_80M:  return "80M";
         case LLM_TYPE_109M: return "109M";
         case LLM_TYPE_137M: return "137M";
+        case LLM_TYPE_140M: return "140M";
         case LLM_TYPE_160M: return "160M";
         case LLM_TYPE_190M: return "190M";
         case LLM_TYPE_220M: return "220M";
@@ -44,6 +45,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_270M: return "270M";
         case LLM_TYPE_335M: return "335M";
         case LLM_TYPE_350M: return "350M";
+        case LLM_TYPE_360M: return "360M";
         case LLM_TYPE_410M: return "410M";
         case LLM_TYPE_450M: return "450M";
         case LLM_TYPE_475M: return "475M";
@@ -51,6 +53,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_700M: return "700M";
         case LLM_TYPE_770M: return "770M";
         case LLM_TYPE_780M: return "780M";
+        case LLM_TYPE_950M: return "950M";
         case LLM_TYPE_0_3B: return "0.3B";
         case LLM_TYPE_0_5B: return "0.5B";
         case LLM_TYPE_0_6B: return "0.6B";
@@ -622,19 +625,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
             ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,  hparams.n_moe_layer_step);

-            hparams.
-            hparams.n_swa
-
+            const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+            if (found_swa && hparams.n_swa == 0) {
+                hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
+            } else {
+                hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
+                hparams.n_swa = 8192;
+                hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+            }

             switch (hparams.n_expert) {
+                case 0: {
+                    // MobileLLM (no MoE)
+                    switch (hparams.n_embd) {
+                        case 2048: type = LLM_TYPE_140M; break;
+                        case 4096: type = LLM_TYPE_360M; break;
+                        case 6144: type = LLM_TYPE_950M; break;
+                        default:   type = LLM_TYPE_UNKNOWN;
+                    }
+                } break;
                 case 16:  type = LLM_TYPE_17B_16E;  break;
                 case 128: type = LLM_TYPE_17B_128E; break;
                 default:  type = LLM_TYPE_UNKNOWN;
             }

-
-            hparams.use_kq_norm = false;
-            }
+            hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
         } break;
     case LLM_ARCH_ARCEE:
         {
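Note: the hunk above switches Llama 4 between ordinary full attention and chunked sliding-window attention based on the GGUF sliding-window key, with set_swa_pattern(4) described as "3 chunked - 1 full". As a minimal sketch only, assuming the convention that every fourth layer keeps full attention (the real predicate lives in llama-hparams, not in this diff), the per-layer decision would look like:

    // Sketch, not llama.cpp API: which layers use chunked SWA under a
    // "3 chunked - 1 full" pattern selected by set_swa_pattern(4).
    #include <cstdint>

    static bool layer_is_swa(uint32_t il, uint32_t pattern = 4) {
        // layers 0,1,2 -> chunked sliding window, layer 3 -> full attention, repeat
        return (il % pattern) != (pattern - 1);
    }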
@@ -685,7 +701,30 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         } break;
     case LLM_ARCH_GROK:
         {
-
+            // defaults for old GGUFs
+            hparams.yarn_beta_fast = 8.0f;
+            hparams.f_logit_scale = 0.5773502691896257f;
+            hparams.f_embedding_scale = 78.38367176906169f;
+            hparams.f_attn_out_scale = 0.08838834764831845f;
+            hparams.f_attn_logit_softcapping = 30.0f;
+            hparams.f_router_logit_softcapping = 30.0f;
+            // no final_logit_softcapping in grok-1
+            hparams.f_final_logit_softcapping = 0.0f;
+
+            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+            ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+            ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
+            ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
+            ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale, false);
+            ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
+            ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping, false);
+            ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+
+            ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length, false);
+            ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
+            ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
+            ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+            ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);

             switch (hparams.n_layer) {
                 case 64: type = LLM_TYPE_314B; break;
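Note: the Grok defaults above look like magic numbers, but they appear to be closed forms (an observation, not something the diff states): 0.5773502691896257 is 1/sqrt(3), 78.38367176906169 is sqrt(6144), and 0.08838834764831845 is 1/sqrt(128); the 6144 embedding width and 128 head size are assumed Grok-1 dimensions. A quick check:

    // Sanity check of the apparent closed forms behind the Grok-1 default scales.
    // n_embd = 6144 and head size = 128 are assumptions about Grok-1, not stated here.
    #include <cmath>
    #include <cstdio>

    int main() {
        std::printf("1/sqrt(3)   = %.16f\n", 1.0 / std::sqrt(3.0));   // ~0.5773502691896257 (logit scale)
        std::printf("sqrt(6144)  = %.14f\n", std::sqrt(6144.0));      // ~78.38367176906169  (embedding scale)
        std::printf("1/sqrt(128) = %.17f\n", 1.0 / std::sqrt(128.0)); // ~0.08838834764831845 (attn out scale)
        return 0;
    }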
@@ -913,6 +952,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.causal_attn = false;
             }
             break;
+        case LLM_ARCH_LLADA_MOE:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                // diffusion language model uses non-causal attention
+                hparams.causal_attn = false;
+                switch (hparams.n_layer) {
+                    case 16: type = LLM_TYPE_A1_7B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_QWEN2MOE:
             {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -1315,6 +1366,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

+               const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+               if (found_swa && hparams.n_swa > 0) {
+                   hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                   hparams.set_swa_pattern(4);
+               } else {
+                   hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+               }
+
                switch (hparams.n_layer) {
                    case 16: type = LLM_TYPE_1B; break;
                    case 32: type = LLM_TYPE_7B; break;
@@ -1542,6 +1601,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    hparams.dec_start_token_id = dec_start_token_id;
                }

+               hparams.dec_n_layer = hparams.n_layer;
+               ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
+
                switch (hparams.n_layer) {
                    case 6:  type = LLM_TYPE_60M; break; // t5-small
                    case 8:  type = LLM_TYPE_80M; break; // flan-t5-small
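Note: the decoder block count now defaults to the encoder depth and is only overridden when the GGUF carries the decoder block count key, which is what later lets the T5 tensor loader and llm_build_t5_dec handle encoder/decoder depths that differ (see the layers.resize and dec_n_layer hunks further down). A minimal sketch of that default-then-override pattern, with a hypothetical optional value standing in for ml.get_key(..., false):

    // Sketch of the "default to n_layer, override if the key exists" pattern.
    #include <cstdint>
    #include <optional>

    struct t5_hparams {
        uint32_t n_layer     = 12;
        uint32_t dec_n_layer = 0;
    };

    static void load_decoder_depth(t5_hparams & hp, std::optional<uint32_t> decoder_block_count) {
        hp.dec_n_layer = hp.n_layer;               // default: symmetric encoder/decoder
        if (decoder_block_count) {
            hp.dec_n_layer = *decoder_block_count; // override only when the GGUF has the key
        }
    }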
@@ -2361,6 +2423,40 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    }
                }
                break;
+           case LLM_ARCH_LLADA_MOE:
+               {
+                   tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                   // output
+                   output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                   output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                   GGML_ASSERT(n_expert      > 0 && "n_expert must be > 0 for llada-moe");
+                   GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
+
+                   for (int i = 0; i < n_layer; ++i) {
+                       auto & layer = layers[i];
+
+                       layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                       layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                       layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                       layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                       layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                       layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+                       layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                       layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                       layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                       const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+                       layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+                       layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+                       layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+                   }
+               } break;
            case LLM_ARCH_LLAMA4:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -2374,9 +2470,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

-                   GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
                    for (int i = 0; i < n_layer; ++i) {
-                       bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
+                       bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;

                        auto & layer = layers[i];

@@ -2537,6 +2632,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

+                   const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

@@ -2551,12 +2647,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

+                       layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+                       layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+                       layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                       layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,
-                       layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {
-                       layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd,
+                       layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+                       layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                       layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);

-                       layer.
+                       layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                       if (!layer.ffn_post_norm) {
+                           layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+                       }
                    }
                } break;
            case LLM_ARCH_DBRX:
@@ -4414,6 +4517,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                    }

+                   // n_layer:     number of encoder_layers
+                   // dec_n_layer: number of decoder_layers
+                   const int dec_n_layer = hparams.dec_n_layer;
+                   if (dec_n_layer > n_layer) {
+                       layers.resize(dec_n_layer);
+                   }
+
+                   // load encoder layers
                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

@@ -4429,6 +4540,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
                        layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                   }
+
+                   // load decoder layers
+                   for (int i = 0; i < dec_n_layer; ++i) {
+                       auto & layer = layers[i];

                        layer.attn_norm  = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd}, 0);
                        layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
@@ -6227,6 +6343,14 @@ struct llm_build_llama : public llm_graph_context {
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

+               if (hparams.use_kq_norm) {
+                   // Llama4TextL2Norm
+                   Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
+                   Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+                   cb(Qcur, "Qcur_normed", il);
+                   cb(Kcur, "Kcur_normed", il);
+               }
+
                cur = build_attn(inp_attn,
                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
@@ -6334,7 +6458,8 @@ struct llm_build_llama_iswa : public llm_graph_context {
        for (int il = 0; il < n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

-           const bool use_rope =
+           const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+                                 (il + 1) % hparams.n_no_rope_layer_step != 0;

            // norm
            cur = build_norm(inpL,
@@ -6927,9 +7052,7 @@ struct llm_build_falcon : public llm_graph_context {

                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-               ggml_tensor * Vcur =
-
-               Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+               ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

                // using mode = 2 for neox mode
                Qcur = ggml_rope_ext(
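Note: this and many of the builder hunks that follow (DBRX, StarCoder, BERT, NeoBERT, Bloom, MPT, Qwen, Phi-2/3, GPT-2, CodeShell, GPT-NeoX, JAIS, ChatGLM, GLM-4) replace a view plus ggml_cont_3d copy of V with a direct 3-D view into the fused QKV projection. The offsets assume each row is laid out as [Q (n_embd) | K (n_embd_gqa) | V (n_embd_gqa)], so Q starts at element 0, K at n_embd, and V at n_embd + n_embd_gqa. A standalone sketch of the same slicing on a plain float buffer (hypothetical helper, not llama.cpp API):

    // Sketch: where Q/K/V begin inside one fused QKV row, matching the
    // 0, n_embd, n_embd + n_embd_gqa offsets used by the ggml_view_3d calls above.
    #include <cstddef>

    struct qkv_slices {
        const float * q;
        const float * k;
        const float * v;
    };

    static qkv_slices split_qkv_row(const float * row, size_t n_embd, size_t n_embd_gqa) {
        return {
            row,                        // Q: n_embd values
            row + n_embd,               // K: n_embd_gqa values
            row + n_embd + n_embd_gqa,  // V: n_embd_gqa values
        };
    }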
@@ -7014,9 +7137,6 @@ struct llm_build_grok : public llm_graph_context {

        inpL = build_inp_embd(model.tok_embd);

-       // multiply by embedding_multiplier_scale of 78.38367176906169
-       inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
-
        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

@@ -7088,26 +7208,22 @@ struct llm_build_grok : public llm_graph_context {
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

-
-
-
-
-                   model.layers[il].attn_out_norm, NULL,
-                   LLM_NORM_RMS, il);
-               cb(cur, "attn_out_norm", il);
-           }
+           cur = build_norm(cur,
+                   model.layers[il].attn_out_norm, NULL,
+                   LLM_NORM_RMS, il);
+           cb(cur, "attn_out_norm", il);

            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

            // feed-forward network
-           // MoE branch
            cur = build_norm(ffn_inp,
                    model.layers[il].ffn_norm, NULL,
                    LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

-
+           // MoE branch
+           ggml_tensor * moe_out = build_moe_ffn(cur,
                    model.layers[il].ffn_gate_inp,
                    model.layers[il].ffn_up_exps,
                    model.layers[il].ffn_gate_exps,
@@ -7118,18 +7234,28 @@ struct llm_build_grok : public llm_graph_context {
                    false, 0.0,
                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                    il);
-           cb(
+           cb(moe_out, "ffn_moe_out", il);

-
-
-
-
-
-
-
-           cb(
+           if (model.layers[il].ffn_up) {
+               ggml_tensor * ffn_out = build_ffn(cur,
+                       model.layers[il].ffn_up,   NULL, NULL,
+                       model.layers[il].ffn_gate, NULL, NULL,
+                       model.layers[il].ffn_down, NULL, NULL,
+                       NULL,
+                       LLM_FFN_GELU, LLM_FFN_PAR, il);
+               cb(ffn_out, "ffn_out", il);
+
+               cur = ggml_scale(ctx0, ggml_add(ctx0, ffn_out, moe_out), std::sqrt(2) / 2);
+               cb(cur, "ffn_out", il);
+           } else {
+               cur = moe_out;
            }

+           cur = build_norm(cur,
+                   model.layers[il].ffn_post_norm, NULL,
+                   LLM_NORM_RMS, il);
+           cb(cur, "ffn_post_norm", il);
+
            cur = ggml_add(ctx0, cur, ffn_inp);
            cb(cur, "ffn_out", il);

@@ -7152,10 +7278,14 @@ struct llm_build_grok : public llm_graph_context {
        // lm_head
        cur = build_lora_mm(model.output, cur);

-
-       // multiply logits by output_multiplier_scale of 0.5773502691896257
+       cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);

-
+       // final logit soft-capping
+       if (hparams.f_final_logit_softcapping) {
+           cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+           cur = ggml_tanh(ctx0, cur);
+           cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+       }

        cb(cur, "result_output", -1);
        res->t_logits = cur;
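Note: the final logit soft-capping added above is the usual cap * tanh(x / cap) squashing, expressed as two ggml_scale calls around a ggml_tanh; it bounds every logit to (-cap, +cap) while staying roughly linear near zero, and the hparams default of 0.0 disables it for grok-1. A scalar sketch of the same function:

    // Scalar version of the soft-capping applied to the Grok logits above.
    #include <cmath>

    static float soft_cap(float x, float cap) {
        if (cap == 0.0f) {
            return x;                      // cap of 0 means "disabled", as in the hparams default
        }
        return cap * std::tanh(x / cap);   // output is confined to (-cap, +cap)
    }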
@@ -7207,9 +7337,7 @@ struct llm_build_dbrx : public llm_graph_context {

                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-               Vcur =
-
-               Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+               Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, nullptr,
@@ -7329,13 +7457,9 @@ struct llm_build_starcoder : public llm_graph_context {
                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                cb(cur, "bqkv", il);

-               ggml_tensor * Qcur =
-               ggml_tensor * Kcur =
-               ggml_tensor * Vcur =
-
-               Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-               Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-               Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+               ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+               ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+               ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
@@ -7551,14 +7675,16 @@ struct llm_build_bert : public llm_graph_context {
                    cb(cur, "bqkv", il);
                }

-               Qcur =
-               Kcur =
-               Vcur =
-               Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+               Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+               Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+               Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
            } else {
                Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
                Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
                Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+
+               Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+               Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
            }

@@ -7569,8 +7695,6 @@ struct llm_build_bert : public llm_graph_context {
                        LLM_NORM, il);

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-           } else {
-               Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
            }

            if (model.layers[il].attn_k_norm) {
@@ -7580,8 +7704,6 @@ struct llm_build_bert : public llm_graph_context {
                        LLM_NORM, il);

                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-           } else {
-               Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            }

            // RoPE
@@ -7727,9 +7849,7 @@ struct llm_build_neo_bert : public llm_graph_context {

            Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
            Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-           Vcur =
-
-           Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+           Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

            // RoPE
            Qcur = ggml_rope_ext(
@@ -7836,13 +7956,9 @@ struct llm_build_bloom : public llm_graph_context {
            cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
            cb(cur, "bqkv", il);

-           ggml_tensor * Qcur =
-           ggml_tensor * Kcur =
-           ggml_tensor * Vcur =
-
-           Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-           Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-           Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+           ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+           ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+           ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
@@ -7958,13 +8074,9 @@ struct llm_build_mpt : public llm_graph_context {
                cb(cur, "wqkv_clamped", il);
            }

-           ggml_tensor * Qcur =
-           ggml_tensor * Kcur =
-           ggml_tensor * Vcur =
-
-           cb(Qcur, "Qcur", il);
-           cb(Kcur, "Kcur", il);
-           cb(Vcur, "Vcur", il);
+           ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+           ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+           ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

            // Q/K Layernorm
            if (model.layers[il].attn_q_norm) {
@@ -7972,26 +8084,16 @@ struct llm_build_mpt : public llm_graph_context {
                        model.layers[il].attn_q_norm,
                        model.layers[il].attn_q_norm_b,
                        LLM_NORM, il);
-               cb(Qcur, "Qcur", il);

                Kcur = build_norm(Kcur,
                        model.layers[il].attn_k_norm,
                        model.layers[il].attn_k_norm_b,
                        LLM_NORM, il);
-               cb(Kcur, "Kcur", il);

                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-           } else {
-               Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-               cb(Qcur, "Qcur", il);
-
-               Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-               cb(Kcur, "Kcur", il);
            }

-           Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);
@@ -8240,11 +8342,9 @@ struct llm_build_qwen : public llm_graph_context {
            cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
            cb(cur, "bqkv", il);

-           ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,
+           ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
            ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-           ggml_tensor * Vcur =
-
-           Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+           ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 2*sizeof(float)*(n_embd));

            // using mode = 2 for neox mode
            Qcur = ggml_rope_ext(
@@ -9219,21 +9319,17 @@ struct llm_build_phi2 : public llm_graph_context {

                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-               Vcur =
-               Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+               Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
            } else {
                Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
                Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
                Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
            }

-           cb(Qcur, "Qcur", il);
-           cb(Kcur, "Kcur", il);
-           cb(Vcur, "Vcur", il);
-
            Qcur = ggml_rope_ext(
                    ctx0, Qcur, inp_pos, nullptr,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9357,21 +9453,17 @@ struct llm_build_phi3 : public llm_graph_context {

                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
-               Vcur =
-               Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+               Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
            } else {
                Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
                Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
                Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
            }

-           cb(Qcur, "Qcur", il);
-           cb(Kcur, "Kcur", il);
-           cb(Vcur, "Vcur", il);
-
            Qcur = ggml_rope_ext(
                    ctx0, Qcur, inp_pos, rope_factors,
                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9621,18 +9713,14 @@ struct llm_build_gpt2 : public llm_graph_context {
            cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
            cb(cur, "bqkv", il);

-           ggml_tensor * Qcur =
-           ggml_tensor * Kcur =
-           ggml_tensor * Vcur =
+           ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+           ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+           ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

-           Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-           Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-           Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
            cur = build_attn(inp_attn,
                    model.layers[il].wo, model.layers[il].bo,
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
@@ -9727,9 +9815,7 @@ struct llm_build_codeshell : public llm_graph_context {

            ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
            ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-           ggml_tensor * Vcur =
-
-           Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+           ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

            Qcur = ggml_rope_ext(
                    ctx0, Qcur, inp_pos, nullptr,
@@ -12179,6 +12265,7 @@ struct llm_build_olmo : public llm_graph_context {
    }
};

+template <bool iswa>
struct llm_build_olmo2 : public llm_graph_context {
    llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
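Note: llm_build_olmo2 becomes a template over a compile-time iswa flag; the next hunk picks the matching attention-input type with std::conditional_t and if constexpr, and build_graph later instantiates llm_build_olmo2<true> or llm_build_olmo2<false> depending on hparams.swa_type. A minimal standalone sketch of that dispatch pattern (placeholder types, not the real llama.cpp classes):

    // Sketch of compile-time selection between two input types, mirroring the
    // std::conditional_t / if constexpr pattern introduced for llm_build_olmo2.
    #include <memory>
    #include <type_traits>

    struct attn_inp_plain {};   // stand-in for llm_graph_input_attn_kv
    struct attn_inp_iswa  {};   // stand-in for llm_graph_input_attn_kv_iswa

    template <bool iswa>
    struct graph_builder {
        using inp_attn_type = std::conditional_t<iswa, attn_inp_iswa, attn_inp_plain>;

        std::unique_ptr<inp_attn_type> inp_attn;

        graph_builder() {
            if constexpr (iswa) {
                inp_attn = std::make_unique<attn_inp_iswa>();   // sliding-window variant
            } else {
                inp_attn = std::make_unique<attn_inp_plain>();  // regular KV variant
            }
        }
    };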
@@ -12194,7 +12281,14 @@ struct llm_build_olmo2 : public llm_graph_context {
        // inp_pos - contains the positions
        ggml_tensor * inp_pos = build_inp_pos();

-
+       using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+       inp_attn_type * inp_attn = nullptr;
+
+       if constexpr (iswa) {
+           inp_attn = build_attn_inp_kv_iswa();
+       } else {
+           inp_attn = build_attn_inp_kv();
+       }

        ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12227,17 +12321,36 @@ struct llm_build_olmo2 : public llm_graph_context {
            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

-
+           const bool is_swa = hparams.is_swa(il);
+
+           if (is_swa) {
+               // For sliding window layers, Olmo3 use regular rope with no yarn rope scaling.
+               // This is achieved here by setting freq_scale and attn_factor to 1.
+               // We also set ext_factor to 0 to avoid a few unnecessary computations.
+               Qcur = ggml_rope_ext(
+                       ctx0, Qcur, inp_pos, nullptr,
+                       n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+                       0.0, 1.0, beta_fast, beta_slow
+                       );
+
+               Kcur = ggml_rope_ext(
+                       ctx0, Kcur, inp_pos, nullptr,
+                       n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+                       0.0, 1.0, beta_fast, beta_slow
+                       );
+           } else {
+               Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );

-
+               Kcur = ggml_rope_ext(
                        ctx0, Kcur, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow
                        );
+           }

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
@@ -12436,6 +12549,132 @@ struct llm_build_olmoe : public llm_graph_context {
    }
};

+struct llm_build_llada_moe : public llm_graph_context {
+    llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_no_cache();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, false,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
struct llm_build_openelm : public llm_graph_context {
    llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
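Note: the new llm_build_llada_moe graph is a standard decoder block with Q/K RMS-norm, RoPE, and a softmax-gated top-k expert FFN (LLM_FFN_SILU), but it runs with non-causal attention and no KV cache, matching the LLM_ARCH_LLADA_MOE entries added to create_memory and build_graph further down. As a rough per-token sketch only of what the softmax routing in build_moe_ffn amounts to conceptually (illustrative names; the real implementation is batched ggml ops, and expert_out[e] stands in for down_e(silu(gate_e(x)) * up_e(x))):

    // Hedged sketch of softmax top-k expert mixing, not llama.cpp code.
    #include <algorithm>
    #include <cmath>
    #include <numeric>
    #include <vector>

    static std::vector<float> moe_mix(const std::vector<float> & router_logits,
                                      const std::vector<std::vector<float>> & expert_out,
                                      int n_expert_used) {
        const int n_expert = (int) router_logits.size();
        const int n_embd   = (int) expert_out[0].size();

        // softmax over all expert logits
        std::vector<float> p(n_expert);
        const float m = *std::max_element(router_logits.begin(), router_logits.end());
        float sum = 0.0f;
        for (int e = 0; e < n_expert; ++e) { p[e] = std::exp(router_logits[e] - m); sum += p[e]; }
        for (float & v : p) { v /= sum; }

        // pick the n_expert_used highest-probability experts
        std::vector<int> idx(n_expert);
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](int a, int b) { return p[a] > p[b]; });

        // weighted sum of the selected expert outputs (no re-normalization of the weights)
        std::vector<float> out(n_embd, 0.0f);
        for (int k = 0; k < n_expert_used; ++k) {
            const int e = idx[k];
            for (int d = 0; d < n_embd; ++d) { out[d] += p[e] * expert_out[e][d]; }
        }
        return out;
    }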
@@ -12601,9 +12840,7 @@ struct llm_build_gptneox : public llm_graph_context {

            ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
            ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-           ggml_tensor * Vcur =
-
-           Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+           ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

            Qcur = ggml_rope_ext(
                    ctx0, Qcur, inp_pos, nullptr,
@@ -13557,7 +13794,9 @@ struct llm_build_t5_dec : public llm_graph_context {

        ggml_tensor * inp_out_ids = build_inp_out_ids();

-
+       const int64_t dec_n_layer = hparams.dec_n_layer;
+
+       for (int il = 0; il < dec_n_layer; ++il) {
            ggml_tensor * inpSA = inpL;

            // norm
@@ -13648,7 +13887,7 @@ struct llm_build_t5_dec : public llm_graph_context {
                //cb(cur, "kqv_out", il);
            }

-           if (il ==
+           if (il == dec_n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
            }
@@ -13669,8 +13908,8 @@ struct llm_build_t5_dec : public llm_graph_context {
                        model.layers[il].ffn_gate, NULL, NULL,
                        model.layers[il].ffn_down, NULL, NULL,
                        NULL,
-                       model.layers[il].
-                       model.layers[il].
+                       model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
+                       model.layers[il].ffn_gate ? LLM_FFN_PAR  : LLM_FFN_SEQ,
                        il);
                cb(cur, "ffn_out", il);
            }
@@ -13736,18 +13975,14 @@ struct llm_build_jais : public llm_graph_context {
            cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
            cb(cur, "bqkv", il);

-           ggml_tensor * Qcur =
-           ggml_tensor * Kcur =
-           ggml_tensor * Vcur =
+           ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*cur->nb[0]*(n_embd));
+           ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd));
+           ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

-           Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-           Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-           Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
            cur = build_attn(inp_attn,
                    model.layers[il].wo, model.layers[il].bo,
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
@@ -13859,8 +14094,7 @@ struct llm_build_chatglm : public llm_graph_context {
                }
                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-               Vcur =
-               Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+               Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
            }

            //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
@@ -13993,8 +14227,7 @@ struct llm_build_glm4 : public llm_graph_context {
                }
                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
-               Vcur =
-               Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+               Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
            }

            Qcur = ggml_rope_ext(
@@ -17293,16 +17526,14 @@ private:
        const int64_t k_offset = n_embd_head_q * n_head;
        const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;

-       ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head,
+       ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head,    n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
        ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
-       ggml_tensor * Vcur =
+       ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head_v, n_head_kv, n_tokens, n_embd_head_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));

        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);

-       Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
-
        Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
        cb(Qcur, "Qcur_normed", il);

@@ -18636,6 +18867,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
        //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
        case LLM_ARCH_DREAM:
        case LLM_ARCH_LLADA:
+       case LLM_ARCH_LLADA_MOE:
            {
                res = nullptr;
            } break;
@@ -18773,7 +19005,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            } break;
        case LLM_ARCH_LLAMA4:
            {
-
+               if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
+                   llm = std::make_unique<llm_build_llama>(*this, params);
+               } else {
+                   llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+               }
            } break;
        case LLM_ARCH_DECI:
            {
@@ -18841,6 +19077,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                    llm = std::make_unique<llm_build_llada>(*this, params);
                }
                break;
+       case LLM_ARCH_LLADA_MOE:
+           {
+               llm = std::make_unique<llm_build_llada_moe>(*this, params);
+           }
+           break;
        case LLM_ARCH_QWEN2VL:
            {
                llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -18953,7 +19194,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            } break;
        case LLM_ARCH_OLMO2:
            {
-
+               if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+                   llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
+               } else {
+                   llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
+               }
            } break;
        case LLM_ARCH_OLMOE:
            {
@@ -19307,6 +19552,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
+       case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2: