@fugood/llama.node 1.2.0 → 1.2.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their public registries.
@@ -35,10 +35,10 @@ llama_context::llama_context(
 
 cparams.n_threads = params.n_threads;
 cparams.n_threads_batch = params.n_threads_batch;
- cparams.yarn_ext_factor = params.yarn_ext_factor;
- cparams.yarn_attn_factor = params.yarn_attn_factor;
- cparams.yarn_beta_fast = params.yarn_beta_fast;
- cparams.yarn_beta_slow = params.yarn_beta_slow;
+ cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor;
+ cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
+ cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast;
+ cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow;
 cparams.embeddings = params.embeddings;
 cparams.offload_kqv = params.offload_kqv;
 cparams.no_perf = params.no_perf;
@@ -181,7 +181,7 @@ llama_context::llama_context(
 // graph outputs buffer
 {
 // resized during inference when a batch uses more outputs
- if ((uint32_t) output_reserve(params.n_seq_max) < params.n_seq_max) {
+ if (output_reserve(params.n_seq_max) < params.n_seq_max) {
 throw std::runtime_error("failed to reserve initial output buffer");
 }
 
@@ -2263,9 +2263,9 @@ llama_context_params llama_context_default_params() {
 /*.rope_freq_base =*/ 0.0f,
 /*.rope_freq_scale =*/ 0.0f,
 /*.yarn_ext_factor =*/ -1.0f,
- /*.yarn_attn_factor =*/ 1.0f,
- /*.yarn_beta_fast =*/ 32.0f,
- /*.yarn_beta_slow =*/ 1.0f,
+ /*.yarn_attn_factor =*/ -1.0f,
+ /*.yarn_beta_fast =*/ -1.0f,
+ /*.yarn_beta_slow =*/ -1.0f,
 /*.yarn_orig_ctx =*/ 0,
 /*.defrag_thold =*/ -1.0f,
 /*.cb_eval =*/ nullptr,
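
Taken together with the first hunk, these defaults change how YaRN parameters are resolved: every per-context YaRN field now starts at -1.0f and acts as an optional override, with negative values falling back to the yarn_* values stored in the model's hparams (which Grok GGUFs can now supply). A minimal usage sketch against the public llama.h API, illustrative rather than taken from the package:

#include <cassert>
#include "llama.h"

int main() {
    llama_context_params cp = llama_context_default_params();
    // The YaRN fields now default to -1.0f, meaning "use the value from the
    // model's own hparams" when the context is created.
    assert(cp.yarn_attn_factor < 0.0f && cp.yarn_beta_fast < 0.0f);
    // Any non-negative value still overrides whatever the GGUF provides.
    cp.yarn_beta_fast = 24.0f;
    return 0;
}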
@@ -1335,14 +1335,14 @@ ggml_tensor * llm_graph_context::build_attn_mha(
 
 if (arch == LLM_ARCH_GROK) {
 // need to do the following:
- // multiply by attn_output_multiplyer of 0.08838834764831845
+ // multiply by attn_output_multiplier
 // and then :
 // kq = 30 * tanh(kq / 30)
 // before the softmax below
 
- kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f));
+ kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, hparams.f_attn_out_scale / hparams.f_attn_logit_softcapping));
 cb(kq, "kq_tanh", il);
- kq = ggml_scale(ctx0, kq, 30);
+ kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
 cb(kq, "kq_scaled", il);
 }
 
@@ -82,8 +82,9 @@ struct llama_hparams {
 float f_norm_rms_eps;
 float f_norm_group_eps;
 
- float f_attn_logit_softcapping = 50.0f;
- float f_final_logit_softcapping = 30.0f;
+ float f_attn_logit_softcapping = 50.0f;
+ float f_router_logit_softcapping = 30.0f;
+ float f_final_logit_softcapping = 30.0f;
 
 // for RWKV
 uint32_t rescale_every_n_layers = 0;
@@ -104,6 +105,11 @@ struct llama_hparams {
 uint32_t n_ctx_orig_yarn;
 float rope_yarn_log_mul = 0.0f;
 
+ float yarn_ext_factor = -1.0f;
+ float yarn_attn_factor = 1.0f;
+ float yarn_beta_fast = 32.0f;
+ float yarn_beta_slow = 1.0f;
+
 std::array<int, 4> rope_sections;
 
 // Sliding Window Attention (SWA)
@@ -136,10 +142,14 @@ struct llama_hparams {
 float f_embedding_scale = 0.0f;
 float f_attention_scale = 0.0f;
 
+ // grok-2
+ float f_attn_out_scale = 0.0f;
+ uint32_t attn_temp_length = 0;
+
 bool causal_attn = true;
 bool use_alibi = false;
 bool attn_soft_cap = false;
- bool use_kq_norm = true;
+ bool use_kq_norm = false;
 
 // for Classifiers
 uint32_t n_cls_out = 1;
@@ -36,6 +36,7 @@ const char * llm_type_name(llm_type type) {
 case LLM_TYPE_80M: return "80M";
 case LLM_TYPE_109M: return "109M";
 case LLM_TYPE_137M: return "137M";
+ case LLM_TYPE_140M: return "140M";
 case LLM_TYPE_160M: return "160M";
 case LLM_TYPE_190M: return "190M";
 case LLM_TYPE_220M: return "220M";
@@ -44,6 +45,7 @@ const char * llm_type_name(llm_type type) {
 case LLM_TYPE_270M: return "270M";
 case LLM_TYPE_335M: return "335M";
 case LLM_TYPE_350M: return "350M";
+ case LLM_TYPE_360M: return "360M";
 case LLM_TYPE_410M: return "410M";
 case LLM_TYPE_450M: return "450M";
 case LLM_TYPE_475M: return "475M";
@@ -51,6 +53,7 @@ const char * llm_type_name(llm_type type) {
 case LLM_TYPE_700M: return "700M";
 case LLM_TYPE_770M: return "770M";
 case LLM_TYPE_780M: return "780M";
+ case LLM_TYPE_950M: return "950M";
 case LLM_TYPE_0_3B: return "0.3B";
 case LLM_TYPE_0_5B: return "0.5B";
 case LLM_TYPE_0_6B: return "0.6B";
@@ -622,19 +625,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
 ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
 
- hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
- hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
- hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa == 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
+ hparams.n_swa = 8192;
+ hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+ }
 
 switch (hparams.n_expert) {
+ case 0: {
+ // MobileLLM (no MoE)
+ switch (hparams.n_embd) {
+ case 2048: type = LLM_TYPE_140M; break;
+ case 4096: type = LLM_TYPE_360M; break;
+ case 6144: type = LLM_TYPE_950M; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
 case 16: type = LLM_TYPE_17B_16E; break;
 case 128: type = LLM_TYPE_17B_128E; break;
 default: type = LLM_TYPE_UNKNOWN;
 }
 
- if (type == LLM_TYPE_17B_128E) {
- hparams.use_kq_norm = false;
- }
+ hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
 } break;
 case LLM_ARCH_ARCEE:
 {
@@ -685,7 +701,30 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 } break;
 case LLM_ARCH_GROK:
 {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ // defaults for old GGUFs
+ hparams.yarn_beta_fast = 8.0f;
+ hparams.f_logit_scale = 0.5773502691896257f;
+ hparams.f_embedding_scale = 78.38367176906169f;
+ hparams.f_attn_out_scale = 0.08838834764831845f;
+ hparams.f_attn_logit_softcapping = 30.0f;
+ hparams.f_router_logit_softcapping = 30.0f;
+ // no final_logit_softcapping in grok-1
+ hparams.f_final_logit_softcapping = 0.0f;
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
+ ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale, false);
+ ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
+ ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping, false);
+ ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+
+ ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
 
 switch (hparams.n_layer) {
 case 64: type = LLM_TYPE_314B; break;
@@ -913,6 +952,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 hparams.causal_attn = false;
 }
 break;
+ case LLM_ARCH_LLADA_MOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ // diffusion language model uses non-causal attention
+ hparams.causal_attn = false;
+ switch (hparams.n_layer) {
+ case 16: type = LLM_TYPE_A1_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
 case LLM_ARCH_QWEN2MOE:
 {
 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -1315,6 +1366,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(4);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
 switch (hparams.n_layer) {
 case 16: type = LLM_TYPE_1B; break;
 case 32: type = LLM_TYPE_7B; break;
@@ -2364,6 +2423,40 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 }
 }
 break;
+ case LLM_ARCH_LLADA_MOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
+ GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ }
+ } break;
 case LLM_ARCH_LLAMA4:
 {
 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -2377,9 +2470,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
 }
 
- GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
 for (int i = 0; i < n_layer; ++i) {
- bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
+ bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
 
 auto & layer = layers[i];
 
@@ -2540,6 +2632,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
 }
 
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
 for (int i = 0; i < n_layer; ++i) {
 auto & layer = layers[i];
 
@@ -2554,12 +2647,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
 layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
 layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
 
- layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ if (!layer.ffn_post_norm) {
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
 }
 } break;
 case LLM_ARCH_DBRX:
@@ -6243,6 +6343,14 @@ struct llm_build_llama : public llm_graph_context {
 cb(Kcur, "Kcur", il);
 cb(Vcur, "Vcur", il);
 
+ if (hparams.use_kq_norm) {
+ // Llama4TextL2Norm
+ Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
+ Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+ cb(Qcur, "Qcur_normed", il);
+ cb(Kcur, "Kcur_normed", il);
+ }
+
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].bo,
 Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
@@ -6350,7 +6458,8 @@ struct llm_build_llama_iswa : public llm_graph_context {
 for (int il = 0; il < n_layer; ++il) {
 ggml_tensor * inpSA = inpL;
 
- const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
+ const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+ (il + 1) % hparams.n_no_rope_layer_step != 0;
 
 // norm
 cur = build_norm(inpL,
@@ -7028,9 +7137,6 @@ struct llm_build_grok : public llm_graph_context {
 
 inpL = build_inp_embd(model.tok_embd);
 
- // multiply by embedding_multiplier_scale of 78.38367176906169
- inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
-
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
@@ -7102,26 +7208,22 @@ struct llm_build_grok : public llm_graph_context {
 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
 }
 
- // Grok
- // if attn_out_norm is present then apply it before adding the input
- if (model.layers[il].attn_out_norm) {
- cur = build_norm(cur,
- model.layers[il].attn_out_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_out_norm", il);
- }
+ cur = build_norm(cur,
+ model.layers[il].attn_out_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_out_norm", il);
 
 ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
 cb(ffn_inp, "ffn_inp", il);
 
 // feed-forward network
- // MoE branch
 cur = build_norm(ffn_inp,
 model.layers[il].ffn_norm, NULL,
 LLM_NORM_RMS, il);
 cb(cur, "ffn_norm", il);
 
- cur = build_moe_ffn(cur,
+ // MoE branch
+ ggml_tensor * moe_out = build_moe_ffn(cur,
 model.layers[il].ffn_gate_inp,
 model.layers[il].ffn_up_exps,
 model.layers[il].ffn_gate_exps,
@@ -7132,18 +7234,28 @@ struct llm_build_grok : public llm_graph_context {
 false, 0.0,
 LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
 il);
- cb(cur, "ffn_moe_out", il);
+ cb(moe_out, "ffn_moe_out", il);
 
- // Grok
- // if layer_out_norm is present then apply it before adding the input
- // Idea: maybe ffn_out_norm is a better name
- if (model.layers[il].layer_out_norm) {
- cur = build_norm(cur,
- model.layers[il].layer_out_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "layer_out_norm", il);
+ if (model.layers[il].ffn_up) {
+ ggml_tensor * ffn_out = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, il);
+ cb(ffn_out, "ffn_out", il);
+
+ cur = ggml_scale(ctx0, ggml_add(ctx0, ffn_out, moe_out), std::sqrt(2) / 2);
+ cb(cur, "ffn_out", il);
+ } else {
+ cur = moe_out;
 }
 
+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_post_norm", il);
+
 cur = ggml_add(ctx0, cur, ffn_inp);
 cb(cur, "ffn_out", il);
 
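
A side note on the new dense-plus-MoE path above: when a Grok layer has both a dense FFN (ffn_up is present) and an expert branch, their outputs are added and scaled by sqrt(2)/2, i.e. divided by sqrt(2), which keeps the combined activation on roughly the same scale as a single branch. A scalar sketch of that combination, illustrative and not taken from the package:

#include <cmath>
#include <cstdio>

// Mirror of the graph-level combination: (ffn_out + moe_out) * sqrt(2)/2.
static float combine_branches(float ffn_out, float moe_out) {
    return (ffn_out + moe_out) * (std::sqrt(2.0f) / 2.0f);
}

int main() {
    // Two branches of equal magnitude combine to sqrt(2) times one branch, not double.
    std::printf("%f\n", combine_branches(1.0f, 1.0f)); // ~1.414214
    return 0;
}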
@@ -7166,10 +7278,14 @@ struct llm_build_grok : public llm_graph_context {
 // lm_head
 cur = build_lora_mm(model.output, cur);
 
- // Grok
- // multiply logits by output_multiplier_scale of 0.5773502691896257
+ cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);
 
- cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
+ // final logit soft-capping
+ if (hparams.f_final_logit_softcapping) {
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+ cur = ggml_tanh(ctx0, cur);
+ cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+ }
 
 cb(cur, "result_output", -1);
 res->t_logits = cur;
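
The block above replaces Grok's hard-coded output multiplier with hparams.f_logit_scale and adds optional final-logit soft-capping, the same cap * tanh(x / cap) shape that the build_attn_mha hunk near the top now applies to the attention logits (there with an extra attn_out_scale factor folded into the argument). With the Grok-1 default of f_final_logit_softcapping = 0.0f the branch is skipped entirely. A scalar sketch of the capping math, illustrative and not taken from the package:

#include <cmath>
#include <cstdio>

// Soft-capping keeps a logit finite: the output is bounded to (-cap, +cap)
// while staying close to the identity for |x| much smaller than cap.
static float soft_cap(float x, float cap) {
    return cap * std::tanh(x / cap);
}

int main() {
    const float cap = 30.0f;                      // the Grok-1 attention cap set in the loader hunk
    std::printf("%f\n", soft_cap(  5.0f, cap));   // ~4.95, nearly unchanged
    std::printf("%f\n", soft_cap(500.0f, cap));   // ~30.0, clamped smoothly
    return 0;
}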
@@ -12149,6 +12265,7 @@ struct llm_build_olmo : public llm_graph_context {
 }
 };
 
+ template <bool iswa>
 struct llm_build_olmo2 : public llm_graph_context {
 llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
 const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -12164,7 +12281,14 @@ struct llm_build_olmo2 : public llm_graph_context {
 // inp_pos - contains the positions
 ggml_tensor * inp_pos = build_inp_pos();
 
- auto * inp_attn = build_attn_inp_kv();
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+ inp_attn_type * inp_attn = nullptr;
+
+ if constexpr (iswa) {
+ inp_attn = build_attn_inp_kv_iswa();
+ } else {
+ inp_attn = build_attn_inp_kv();
+ }
 
 ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -12197,17 +12321,36 @@ struct llm_build_olmo2 : public llm_graph_context {
 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
- Qcur = ggml_rope_ext(
+ const bool is_swa = hparams.is_swa(il);
+
+ if (is_swa) {
+ // For sliding window layers, Olmo3 use regular rope with no yarn rope scaling.
+ // This is achieved here by setting freq_scale and attn_factor to 1.
+ // We also set ext_factor to 0 to avoid a few unnecessary computations.
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+ 0.0, 1.0, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+ 0.0, 1.0, beta_fast, beta_slow
+ );
+ } else {
+ Qcur = ggml_rope_ext(
 ctx0, Qcur, inp_pos, nullptr,
 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 
- Kcur = ggml_rope_ext(
+ Kcur = ggml_rope_ext(
 ctx0, Kcur, inp_pos, nullptr,
 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
+ }
 
 cb(Qcur, "Qcur", il);
 cb(Kcur, "Kcur", il);
@@ -12406,6 +12549,132 @@ struct llm_build_olmoe : public llm_graph_context {
 }
 };
 
+ struct llm_build_llada_moe : public llm_graph_context {
+ llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_no_cache();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
 struct llm_build_openelm : public llm_graph_context {
 llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
 const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -18598,6 +18867,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
 case LLM_ARCH_DREAM:
 case LLM_ARCH_LLADA:
+ case LLM_ARCH_LLADA_MOE:
 {
 res = nullptr;
 } break;
@@ -18735,7 +19005,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
 } break;
 case LLM_ARCH_LLAMA4:
 {
- llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+ if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
+ llm = std::make_unique<llm_build_llama>(*this, params);
+ } else {
+ llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+ }
 } break;
 case LLM_ARCH_DECI:
 {
@@ -18803,6 +19077,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
 llm = std::make_unique<llm_build_llada>(*this, params);
 }
 break;
+ case LLM_ARCH_LLADA_MOE:
+ {
+ llm = std::make_unique<llm_build_llada_moe>(*this, params);
+ }
+ break;
 case LLM_ARCH_QWEN2VL:
 {
 llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -18915,7 +19194,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
 } break;
 case LLM_ARCH_OLMO2:
 {
- llm = std::make_unique<llm_build_olmo2>(*this, params);
+ if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+ llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
+ } else {
+ llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
+ }
 } break;
 case LLM_ARCH_OLMOE:
 {
@@ -19269,6 +19552,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
 case LLM_ARCH_QWEN2MOE:
 case LLM_ARCH_QWEN3:
 case LLM_ARCH_QWEN3MOE:
+ case LLM_ARCH_LLADA_MOE:
 case LLM_ARCH_OLMO2:
 case LLM_ARCH_OLMOE:
 case LLM_ARCH_PHI2: