@fugood/llama.node 1.2.0-rc.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/package.json +16 -15
  2. package/src/llama.cpp/CMakeLists.txt +7 -0
  3. package/src/llama.cpp/common/arg.cpp +141 -21
  4. package/src/llama.cpp/common/chat.cpp +139 -0
  5. package/src/llama.cpp/common/chat.h +1 -0
  6. package/src/llama.cpp/common/common.h +23 -8
  7. package/src/llama.cpp/common/json-schema-to-grammar.cpp +28 -7
  8. package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
  9. package/src/llama.cpp/ggml/include/ggml-backend.h +12 -0
  10. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  11. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -6
  12. package/src/llama.cpp/ggml/include/ggml-zdnn.h +0 -2
  13. package/src/llama.cpp/ggml/include/ggml.h +10 -5
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +7 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +0 -3
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +161 -1
  21. package/src/llama.cpp/src/llama-arch.cpp +44 -10
  22. package/src/llama.cpp/src/llama-arch.h +9 -0
  23. package/src/llama.cpp/src/llama-chat.cpp +17 -0
  24. package/src/llama.cpp/src/llama-chat.h +1 -0
  25. package/src/llama.cpp/src/llama-context.cpp +13 -11
  26. package/src/llama.cpp/src/llama-graph.cpp +6 -5
  27. package/src/llama.cpp/src/llama-hparams.h +14 -3
  28. package/src/llama.cpp/src/llama-kv-cache.cpp +55 -15
  29. package/src/llama.cpp/src/llama-kv-cache.h +8 -0
  30. package/src/llama.cpp/src/llama-model.cpp +386 -140
  31. package/src/llama.cpp/src/llama-model.h +3 -0
  32. package/src/llama.cpp/src/llama-quant.cpp +6 -4
  33. package/src/llama.cpp/src/llama-vocab.cpp +13 -1
  34. package/src/llama.cpp/src/llama-vocab.h +1 -0
  35. package/src/llama.cpp/src/llama.cpp +53 -10
@@ -36,6 +36,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_80M: return "80M";
  case LLM_TYPE_109M: return "109M";
  case LLM_TYPE_137M: return "137M";
+ case LLM_TYPE_140M: return "140M";
  case LLM_TYPE_160M: return "160M";
  case LLM_TYPE_190M: return "190M";
  case LLM_TYPE_220M: return "220M";
@@ -44,6 +45,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_270M: return "270M";
  case LLM_TYPE_335M: return "335M";
  case LLM_TYPE_350M: return "350M";
+ case LLM_TYPE_360M: return "360M";
  case LLM_TYPE_410M: return "410M";
  case LLM_TYPE_450M: return "450M";
  case LLM_TYPE_475M: return "475M";
@@ -51,6 +53,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_700M: return "700M";
  case LLM_TYPE_770M: return "770M";
  case LLM_TYPE_780M: return "780M";
+ case LLM_TYPE_950M: return "950M";
  case LLM_TYPE_0_3B: return "0.3B";
  case LLM_TYPE_0_5B: return "0.5B";
  case LLM_TYPE_0_6B: return "0.6B";
@@ -622,19 +625,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);

- hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
- hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
- hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa == 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
+ hparams.n_swa = 8192;
+ hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+ }

  switch (hparams.n_expert) {
+ case 0: {
+ // MobileLLM (no MoE)
+ switch (hparams.n_embd) {
+ case 2048: type = LLM_TYPE_140M; break;
+ case 4096: type = LLM_TYPE_360M; break;
+ case 6144: type = LLM_TYPE_950M; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case 16: type = LLM_TYPE_17B_16E; break;
  case 128: type = LLM_TYPE_17B_128E; break;
  default: type = LLM_TYPE_UNKNOWN;
  }

- if (type == LLM_TYPE_17B_128E) {
- hparams.use_kq_norm = false;
- }
+ hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
  } break;
  case LLM_ARCH_ARCEE:
  {
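Note: the "pattern: 3 chunked - 1 full" comment above means that with set_swa_pattern(4) three out of every four layers use the restricted (chunked) attention window and every fourth layer keeps full attention. A minimal illustrative sketch of that split, assuming the convention that the last layer in each group of four stays dense (the layer_uses_swa helper below is hypothetical and not part of llama.cpp):

    // hypothetical helper, for illustration only
    static bool layer_uses_swa(int il, int n_pattern = 4) {
        // layers 0,1,2 of each group -> chunked/sliding-window attention
        // layer 3 of each group      -> full attention
        return n_pattern > 0 && (il % n_pattern) < (n_pattern - 1);
    }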
@@ -685,7 +701,30 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_GROK:
  {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ // defaults for old GGUFs
+ hparams.yarn_beta_fast = 8.0f;
+ hparams.f_logit_scale = 0.5773502691896257f;
+ hparams.f_embedding_scale = 78.38367176906169f;
+ hparams.f_attn_out_scale = 0.08838834764831845f;
+ hparams.f_attn_logit_softcapping = 30.0f;
+ hparams.f_router_logit_softcapping = 30.0f;
+ // no final_logit_softcapping in grok-1
+ hparams.f_final_logit_softcapping = 0.0f;
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
+ ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale, false);
+ ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
+ ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping, false);
+ ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+
+ ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);

  switch (hparams.n_layer) {
  case 64: type = LLM_TYPE_314B; break;
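Note: the fallback constants above are not arbitrary; they appear to be Grok-1 values in closed form. The dimension attributions below are assumptions, but the arithmetic itself is exact:

    // using std::sqrt from <cmath>
    // 0.5773502691896257  == 1/sqrt(3)
    // 78.38367176906169   == sqrt(6144)    (presumably sqrt(n_embd) for Grok-1)
    // 0.08838834764831845 == 1/sqrt(128)   (presumably 1/sqrt(n_embd_head))
    const double logit_scale = 1.0 / std::sqrt(3.0);
    const double embd_scale  = std::sqrt(6144.0);
    const double attn_scale  = 1.0 / std::sqrt(128.0);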
@@ -913,6 +952,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.causal_attn = false;
  }
  break;
+ case LLM_ARCH_LLADA_MOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ // diffusion language model uses non-causal attention
+ hparams.causal_attn = false;
+ switch (hparams.n_layer) {
+ case 16: type = LLM_TYPE_A1_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_QWEN2MOE:
  {
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -1315,6 +1366,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(4);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
  switch (hparams.n_layer) {
  case 16: type = LLM_TYPE_1B; break;
  case 32: type = LLM_TYPE_7B; break;
@@ -1542,6 +1601,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.dec_start_token_id = dec_start_token_id;
  }

+ hparams.dec_n_layer = hparams.n_layer;
+ ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
+
  switch (hparams.n_layer) {
  case 6: type = LLM_TYPE_60M; break; // t5-small
  case 8: type = LLM_TYPE_80M; break; // flan-t5-small
@@ -2361,6 +2423,40 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  }
  break;
+ case LLM_ARCH_LLADA_MOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
+ GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ }
+ } break;
  case LLM_ARCH_LLAMA4:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
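Note: in the LLM_ARCH_LLADA_MOE block added above, each *_EXPS tensor packs all experts into a single 3D tensor of shape {n_embd, n_ff_exp, n_expert} (or the transpose for the down projection). A small sketch of what that implies for the per-layer expert parameter count (illustrative only, not code from the diff):

    // gate + up + down expert matrices, plus the router (ffn_gate_inp)
    int64_t llada_moe_ffn_params_per_layer(int64_t n_embd, int64_t n_ff_exp, int64_t n_expert) {
        return 3 * n_embd * n_ff_exp * n_expert + n_embd * n_expert;
    }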
@@ -2374,9 +2470,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  }

- GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
  for (int i = 0; i < n_layer; ++i) {
- bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
+ bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;

  auto & layer = layers[i];

@@ -2537,6 +2632,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  }

+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];

@@ -2551,12 +2647,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
  layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);

- layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ if (!layer.ffn_post_norm) {
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
  }
  } break;
  case LLM_ARCH_DBRX:
@@ -4414,6 +4517,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  }

+ // n_layer: number of encoder_layers
+ // dec_n_layer: number of decoder_layers
+ const int dec_n_layer = hparams.dec_n_layer;
+ if (dec_n_layer > n_layer) {
+ layers.resize(dec_n_layer);
+ }
+
+ // load encoder layers
  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];

@@ -4429,6 +4540,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
  layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+
+ // load decoder layers
+ for (int i = 0; i < dec_n_layer; ++i) {
+ auto & layer = layers[i];

  layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
  layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
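Note: the T5 hunks above let the loader handle models whose decoder depth differs from the encoder depth: n_layer keeps counting encoder blocks, the new LLM_KV_DECODER_BLOCK_COUNT key fills dec_n_layer, and the layers vector is grown when the decoder is deeper. A hypothetical example of the resulting hparams (values invented for illustration):

    llama_hparams hp;      // sketch only
    hp.n_layer     = 12;   // encoder blocks (existing block_count key)
    hp.dec_n_layer = 24;   // decoder blocks (new decoder_block_count key)
    // load_tensors() then grows `layers` so the decoder-only layers have slots:
    // if (dec_n_layer > n_layer) layers.resize(dec_n_layer);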
@@ -6227,6 +6343,14 @@ struct llm_build_llama : public llm_graph_context {
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

+ if (hparams.use_kq_norm) {
+ // Llama4TextL2Norm
+ Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
+ Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+ cb(Qcur, "Qcur_normed", il);
+ cb(Kcur, "Kcur_normed", il);
+ }
+
  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
  Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
@@ -6334,7 +6458,8 @@ struct llm_build_llama_iswa : public llm_graph_context {
  for (int il = 0; il < n_layer; ++il) {
  ggml_tensor * inpSA = inpL;

- const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
+ const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+ (il + 1) % hparams.n_no_rope_layer_step != 0;

  // norm
  cur = build_norm(inpL,
@@ -6927,9 +7052,7 @@ struct llm_build_falcon : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  // using mode = 2 for neox mode
  Qcur = ggml_rope_ext(
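Note: this hunk is the first of many below that replace a ggml_view_2d plus ggml_cont_3d pair with a single strided ggml_view_3d over the fused QKV tensor, dropping the explicit copy. The indexing the 3D view relies on, written out (illustrative comment, simplified):

    // within one row of `cur` (one token), element (d, h) of V lives at
    //   offset = (n_embd + n_embd_gqa)*sizeof(float)  // skip Q and K
    //          + h*n_embd_head*sizeof(float)          // per-head stride = nb1 of the view
    //          + d*sizeof(float);
    // so ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens,
    //                 n_embd_head*sizeof(float), cur->nb[1], offset)
    // addresses V in place, and the previous ggml_cont_3d copy is no longer needed.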
@@ -7014,9 +7137,6 @@ struct llm_build_grok : public llm_graph_context {

  inpL = build_inp_embd(model.tok_embd);

- // multiply by embedding_multiplier_scale of 78.38367176906169
- inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
-
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

@@ -7088,26 +7208,22 @@ struct llm_build_grok : public llm_graph_context {
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  }

- // Grok
- // if attn_out_norm is present then apply it before adding the input
- if (model.layers[il].attn_out_norm) {
- cur = build_norm(cur,
- model.layers[il].attn_out_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_out_norm", il);
- }
+ cur = build_norm(cur,
+ model.layers[il].attn_out_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_out_norm", il);

  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  cb(ffn_inp, "ffn_inp", il);

  // feed-forward network
- // MoE branch
  cur = build_norm(ffn_inp,
  model.layers[il].ffn_norm, NULL,
  LLM_NORM_RMS, il);
  cb(cur, "ffn_norm", il);

- cur = build_moe_ffn(cur,
+ // MoE branch
+ ggml_tensor * moe_out = build_moe_ffn(cur,
  model.layers[il].ffn_gate_inp,
  model.layers[il].ffn_up_exps,
  model.layers[il].ffn_gate_exps,
@@ -7118,18 +7234,28 @@ struct llm_build_grok : public llm_graph_context {
  false, 0.0,
  LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  il);
- cb(cur, "ffn_moe_out", il);
+ cb(moe_out, "ffn_moe_out", il);

- // Grok
- // if layer_out_norm is present then apply it before adding the input
- // Idea: maybe ffn_out_norm is a better name
- if (model.layers[il].layer_out_norm) {
- cur = build_norm(cur,
- model.layers[il].layer_out_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "layer_out_norm", il);
+ if (model.layers[il].ffn_up) {
+ ggml_tensor * ffn_out = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, il);
+ cb(ffn_out, "ffn_out", il);
+
+ cur = ggml_scale(ctx0, ggml_add(ctx0, ffn_out, moe_out), std::sqrt(2) / 2);
+ cb(cur, "ffn_out", il);
+ } else {
+ cur = moe_out;
  }

+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_post_norm", il);
+
  cur = ggml_add(ctx0, cur, ffn_inp);
  cb(cur, "ffn_out", il);

@@ -7152,10 +7278,14 @@ struct llm_build_grok : public llm_graph_context {
  // lm_head
  cur = build_lora_mm(model.output, cur);

- // Grok
- // multiply logits by output_multiplier_scale of 0.5773502691896257
+ cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);

- cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
+ // final logit soft-capping
+ if (hparams.f_final_logit_softcapping) {
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+ cur = ggml_tanh(ctx0, cur);
+ cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+ }

  cb(cur, "result_output", -1);
  res->t_logits = cur;
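Note: two scalar details in the Grok graph above, spelled out (reference sketch, not code from the diff):

    // 1) combining the dense FFN with the MoE branch: std::sqrt(2)/2 == 1/sqrt(2),
    //    so the sum of the two branches is rescaled to roughly preserve its magnitude:
    //      cur = (ffn_out + moe_out) / sqrt(2);
    // 2) final logit soft-capping squashes logits into (-cap, +cap):
    float softcap(float x, float cap) {     // cap = hparams.f_final_logit_softcapping
        return cap * std::tanh(x / cap);    // same as the scale -> tanh -> scale sequence above
    }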
@@ -7207,9 +7337,7 @@ struct llm_build_dbrx : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -7329,13 +7457,9 @@ struct llm_build_starcoder : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -7551,14 +7675,16 @@ struct llm_build_bert : public llm_graph_context {
  cb(cur, "bqkv", il);
  }

- Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
- Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

@@ -7569,8 +7695,6 @@ struct llm_build_bert : public llm_graph_context {
  LLM_NORM, il);

  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- } else {
- Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  }

  if (model.layers[il].attn_k_norm) {
@@ -7580,8 +7704,6 @@ struct llm_build_bert : public llm_graph_context {
  LLM_NORM, il);

  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- } else {
- Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  }

  // RoPE
@@ -7727,9 +7849,7 @@ struct llm_build_neo_bert : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  // RoPE
  Qcur = ggml_rope_ext(
@@ -7836,13 +7956,9 @@ struct llm_build_bloom : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -7958,13 +8074,9 @@ struct llm_build_mpt : public llm_graph_context {
  cb(cur, "wqkv_clamped", il);
  }

- ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  // Q/K Layernorm
  if (model.layers[il].attn_q_norm) {
@@ -7972,26 +8084,16 @@ struct llm_build_mpt : public llm_graph_context {
  model.layers[il].attn_q_norm,
  model.layers[il].attn_q_norm_b,
  LLM_NORM, il);
- cb(Qcur, "Qcur", il);

  Kcur = build_norm(Kcur,
  model.layers[il].attn_k_norm,
  model.layers[il].attn_k_norm_b,
  LLM_NORM, il);
- cb(Kcur, "Kcur", il);

  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- } else {
- Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- cb(Qcur, "Qcur", il);
-
- Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- cb(Kcur, "Kcur", il);
  }

- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);
@@ -8240,11 +8342,9 @@ struct llm_build_qwen : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd));
-
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 2*sizeof(float)*(n_embd));

  // using mode = 2 for neox mode
  Qcur = ggml_rope_ext(
@@ -9219,21 +9319,17 @@ struct llm_build_phi2 : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9357,21 +9453,17 @@ struct llm_build_phi3 : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
- Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

- cb(Qcur, "Qcur", il);
- cb(Kcur, "Kcur", il);
- cb(Vcur, "Vcur", il);
-
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, rope_factors,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9621,18 +9713,14 @@ struct llm_build_gpt2 : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
- ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
  Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
@@ -9727,9 +9815,7 @@ struct llm_build_codeshell : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -12179,6 +12265,7 @@ struct llm_build_olmo : public llm_graph_context {
  }
  };

+ template <bool iswa>
  struct llm_build_olmo2 : public llm_graph_context {
  llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -12194,7 +12281,14 @@ struct llm_build_olmo2 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv();
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+ inp_attn_type * inp_attn = nullptr;
+
+ if constexpr (iswa) {
+ inp_attn = build_attn_inp_kv_iswa();
+ } else {
+ inp_attn = build_attn_inp_kv();
+ }

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12227,17 +12321,36 @@ struct llm_build_olmo2 : public llm_graph_context {
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

- Qcur = ggml_rope_ext(
+ const bool is_swa = hparams.is_swa(il);
+
+ if (is_swa) {
+ // For sliding window layers, Olmo3 use regular rope with no yarn rope scaling.
+ // This is achieved here by setting freq_scale and attn_factor to 1.
+ // We also set ext_factor to 0 to avoid a few unnecessary computations.
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+ 0.0, 1.0, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+ 0.0, 1.0, beta_fast, beta_slow
+ );
+ } else {
+ Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );

- Kcur = ggml_rope_ext(
+ Kcur = ggml_rope_ext(
  ctx0, Kcur, inp_pos, nullptr,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
+ }

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
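Note: llm_build_olmo2 is now a template over a bool (iswa), so the type of the attention input is fixed at compile time instead of being chosen through a common base pointer. A minimal standalone illustration of the same idiom (not llama.cpp code):

    #include <type_traits>

    struct kv_plain {};   // stand-ins for llm_graph_input_attn_kv / ..._iswa
    struct kv_iswa  {};

    template <bool iswa>
    struct builder {
        using input_t = std::conditional_t<iswa, kv_iswa, kv_plain>;
        input_t * inp = nullptr;   // concrete type known at compile time
    };

    // Later in this diff the dispatch site picks the instantiation at runtime:
    //   SWA enabled  -> llm_build_olmo2<true>
    //   SWA disabled -> llm_build_olmo2<false>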
@@ -12436,6 +12549,132 @@ struct llm_build_olmoe : public llm_graph_context {
  }
  };

+ struct llm_build_llada_moe : public llm_graph_context {
+ llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_no_cache();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  struct llm_build_openelm : public llm_graph_context {
  llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
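Note: llm_build_llada_moe, added above, targets a diffusion language model: it uses build_attn_inp_no_cache(), the attention is non-causal (see the hparams hunk earlier), and further down create_memory() returns nullptr for LLM_ARCH_LLADA_MOE, so no KV cache is allocated. A rough sketch of why a cache would not help (assumed denoising loop, not from the diff):

    // for (int step = 0; step < n_steps; ++step) {
    //     logits = forward(masked_tokens);          // full bidirectional pass each step
    //     masked_tokens = remask(sample(logits));   // fill in some positions, keep others masked
    // }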
@@ -12601,9 +12840,7 @@ struct llm_build_gptneox : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
-
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -13557,7 +13794,9 @@ struct llm_build_t5_dec : public llm_graph_context {

  ggml_tensor * inp_out_ids = build_inp_out_ids();

- for (int il = 0; il < n_layer; ++il) {
+ const int64_t dec_n_layer = hparams.dec_n_layer;
+
+ for (int il = 0; il < dec_n_layer; ++il) {
  ggml_tensor * inpSA = inpL;

  // norm
@@ -13648,7 +13887,7 @@ struct llm_build_t5_dec : public llm_graph_context {
  //cb(cur, "kqv_out", il);
  }

- if (il == n_layer - 1 && inp_out_ids) {
+ if (il == dec_n_layer - 1 && inp_out_ids) {
  cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
  }
@@ -13669,8 +13908,8 @@ struct llm_build_t5_dec : public llm_graph_context {
  model.layers[il].ffn_gate, NULL, NULL,
  model.layers[il].ffn_down, NULL, NULL,
  NULL,
- model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
- model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
+ model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
+ model.layers[il].ffn_gate ? LLM_FFN_PAR : LLM_FFN_SEQ,
  il);
  cb(cur, "ffn_out", il);
  }
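Note: the decoder FFN above now keys its activation off the decoder's own ffn_gate instead of the encoder's ffn_gate_enc. That matters once dec_n_layer can exceed n_layer (see the loader hunks earlier), because decoder-only layers have no encoder tensors at all; presumably the intended mapping is:

    // gated (GEGLU-style) decoder block -> GELU with a parallel gate,
    // plain decoder block               -> ReLU applied sequentially
    //   type_op   = layer.ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU;
    //   type_gate = layer.ffn_gate ? LLM_FFN_PAR  : LLM_FFN_SEQ;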
@@ -13736,18 +13975,14 @@ struct llm_build_jais : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd));
- ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*cur->nb[0]*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
  Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/float(n_embd_head), il);
@@ -13859,8 +14094,7 @@ struct llm_build_chatglm : public llm_graph_context {
  }
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  }

  //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
@@ -13993,8 +14227,7 @@ struct llm_build_glm4 : public llm_graph_context {
  }
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
  }

  Qcur = ggml_rope_ext(
@@ -17293,16 +17526,14 @@ private:
  const int64_t k_offset = n_embd_head_q * n_head;
  const int64_t v_offset = k_offset + n_embd_head_k * n_head_kv;

- ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
- ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head_v, n_head_kv, n_tokens, n_embd_head_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
-
  Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  cb(Qcur, "Qcur_normed", il);
@@ -18636,6 +18867,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
  case LLM_ARCH_DREAM:
  case LLM_ARCH_LLADA:
+ case LLM_ARCH_LLADA_MOE:
  {
  res = nullptr;
  } break;
@@ -18773,7 +19005,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  } break;
  case LLM_ARCH_LLAMA4:
  {
- llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+ if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
+ llm = std::make_unique<llm_build_llama>(*this, params);
+ } else {
+ llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+ }
  } break;
  case LLM_ARCH_DECI:
  {
@@ -18841,6 +19077,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  llm = std::make_unique<llm_build_llada>(*this, params);
  }
  break;
+ case LLM_ARCH_LLADA_MOE:
+ {
+ llm = std::make_unique<llm_build_llada_moe>(*this, params);
+ }
+ break;
  case LLM_ARCH_QWEN2VL:
  {
  llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -18953,7 +19194,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  } break;
  case LLM_ARCH_OLMO2:
  {
- llm = std::make_unique<llm_build_olmo2>(*this, params);
+ if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+ llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
+ } else {
+ llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
+ }
  } break;
  case LLM_ARCH_OLMOE:
  {
@@ -19307,6 +19552,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_QWEN2MOE:
  case LLM_ARCH_QWEN3:
  case LLM_ARCH_QWEN3MOE:
+ case LLM_ARCH_LLADA_MOE:
  case LLM_ARCH_OLMO2:
  case LLM_ARCH_OLMOE:
  case LLM_ARCH_PHI2: