@fugood/llama.node 1.2.0 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,6 +36,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_80M: return "80M";
  case LLM_TYPE_109M: return "109M";
  case LLM_TYPE_137M: return "137M";
+ case LLM_TYPE_140M: return "140M";
  case LLM_TYPE_160M: return "160M";
  case LLM_TYPE_190M: return "190M";
  case LLM_TYPE_220M: return "220M";
@@ -44,6 +45,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_270M: return "270M";
  case LLM_TYPE_335M: return "335M";
  case LLM_TYPE_350M: return "350M";
+ case LLM_TYPE_360M: return "360M";
  case LLM_TYPE_410M: return "410M";
  case LLM_TYPE_450M: return "450M";
  case LLM_TYPE_475M: return "475M";
@@ -51,6 +53,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_700M: return "700M";
  case LLM_TYPE_770M: return "770M";
  case LLM_TYPE_780M: return "780M";
+ case LLM_TYPE_950M: return "950M";
  case LLM_TYPE_0_3B: return "0.3B";
  case LLM_TYPE_0_5B: return "0.5B";
  case LLM_TYPE_0_6B: return "0.6B";
@@ -622,19 +625,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);

- hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
- hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
- hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa == 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
+ hparams.n_swa = 8192;
+ hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+ }

  switch (hparams.n_expert) {
+ case 0: {
+ // MobileLLM (no MoE)
+ switch (hparams.n_embd) {
+ case 2048: type = LLM_TYPE_140M; break;
+ case 4096: type = LLM_TYPE_360M; break;
+ case 6144: type = LLM_TYPE_950M; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case 16: type = LLM_TYPE_17B_16E; break;
  case 128: type = LLM_TYPE_17B_128E; break;
  default: type = LLM_TYPE_UNKNOWN;
  }

- if (type == LLM_TYPE_17B_128E) {
- hparams.use_kq_norm = false;
- }
+ hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
  } break;
  case LLM_ARCH_ARCEE:
  {
@@ -685,7 +701,30 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_GROK:
  {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ // defaults for old GGUFs
+ hparams.yarn_beta_fast = 8.0f;
+ hparams.f_logit_scale = 0.5773502691896257f;
+ hparams.f_embedding_scale = 78.38367176906169f;
+ hparams.f_attn_out_scale = 0.08838834764831845f;
+ hparams.f_attn_logit_softcapping = 30.0f;
+ hparams.f_router_logit_softcapping = 30.0f;
+ // no final_logit_softcapping in grok-1
+ hparams.f_final_logit_softcapping = 0.0f;
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
+ ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale, false);
+ ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
+ ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping, false);
+ ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+
+ ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+ ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);

  switch (hparams.n_layer) {
  case 64: type = LLM_TYPE_314B; break;
@@ -913,6 +952,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.causal_attn = false;
  }
  break;
+ case LLM_ARCH_LLADA_MOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ // diffusion language model uses non-causal attention
+ hparams.causal_attn = false;
+ switch (hparams.n_layer) {
+ case 16: type = LLM_TYPE_A1_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_QWEN2MOE:
  {
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -1315,6 +1366,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.set_swa_pattern(4);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
  switch (hparams.n_layer) {
  case 16: type = LLM_TYPE_1B; break;
  case 32: type = LLM_TYPE_7B; break;
@@ -2364,6 +2423,40 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }
  }
  break;
+ case LLM_ARCH_LLADA_MOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada-moe");
+ GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ }
+ } break;
  case LLM_ARCH_LLAMA4:
  {
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -2377,9 +2470,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  }

- GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
  for (int i = 0; i < n_layer; ++i) {
- bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
+ bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;

  auto & layer = layers[i];

@@ -2540,6 +2632,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  }

+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
  for (int i = 0; i < n_layer; ++i) {
  auto & layer = layers[i];

@@ -2554,12 +2647,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

  layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
  layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);

- layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ if (!layer.ffn_post_norm) {
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
  }
  } break;
  case LLM_ARCH_DBRX:
@@ -6243,6 +6343,14 @@ struct llm_build_llama : public llm_graph_context {
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

+ if (hparams.use_kq_norm) {
+ // Llama4TextL2Norm
+ Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
+ Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+ cb(Qcur, "Qcur_normed", il);
+ cb(Kcur, "Kcur_normed", il);
+ }
+
  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
  Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
@@ -6350,7 +6458,8 @@ struct llm_build_llama_iswa : public llm_graph_context {
  for (int il = 0; il < n_layer; ++il) {
  ggml_tensor * inpSA = inpL;

- const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
+ const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+ (il + 1) % hparams.n_no_rope_layer_step != 0;

  // norm
  cur = build_norm(inpL,
@@ -7028,9 +7137,6 @@ struct llm_build_grok : public llm_graph_context {

  inpL = build_inp_embd(model.tok_embd);

- // multiply by embedding_multiplier_scale of 78.38367176906169
- inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
-
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

@@ -7102,26 +7208,22 @@ struct llm_build_grok : public llm_graph_context {
  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  }

- // Grok
- // if attn_out_norm is present then apply it before adding the input
- if (model.layers[il].attn_out_norm) {
- cur = build_norm(cur,
- model.layers[il].attn_out_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "attn_out_norm", il);
- }
+ cur = build_norm(cur,
+ model.layers[il].attn_out_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_out_norm", il);

  ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  cb(ffn_inp, "ffn_inp", il);

  // feed-forward network
- // MoE branch
  cur = build_norm(ffn_inp,
  model.layers[il].ffn_norm, NULL,
  LLM_NORM_RMS, il);
  cb(cur, "ffn_norm", il);

- cur = build_moe_ffn(cur,
+ // MoE branch
+ ggml_tensor * moe_out = build_moe_ffn(cur,
  model.layers[il].ffn_gate_inp,
  model.layers[il].ffn_up_exps,
  model.layers[il].ffn_gate_exps,
@@ -7132,18 +7234,28 @@ struct llm_build_grok : public llm_graph_context {
  false, 0.0,
  LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  il);
- cb(cur, "ffn_moe_out", il);
+ cb(moe_out, "ffn_moe_out", il);

- // Grok
- // if layer_out_norm is present then apply it before adding the input
- // Idea: maybe ffn_out_norm is a better name
- if (model.layers[il].layer_out_norm) {
- cur = build_norm(cur,
- model.layers[il].layer_out_norm, NULL,
- LLM_NORM_RMS, il);
- cb(cur, "layer_out_norm", il);
+ if (model.layers[il].ffn_up) {
+ ggml_tensor * ffn_out = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_PAR, il);
+ cb(ffn_out, "ffn_out", il);
+
+ cur = ggml_scale(ctx0, ggml_add(ctx0, ffn_out, moe_out), std::sqrt(2) / 2);
+ cb(cur, "ffn_out", il);
+ } else {
+ cur = moe_out;
  }

+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_post_norm", il);
+
  cur = ggml_add(ctx0, cur, ffn_inp);
  cb(cur, "ffn_out", il);

@@ -7166,10 +7278,14 @@ struct llm_build_grok : public llm_graph_context {
  // lm_head
  cur = build_lora_mm(model.output, cur);

- // Grok
- // multiply logits by output_multiplier_scale of 0.5773502691896257
+ cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);

- cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
+ // final logit soft-capping
+ if (hparams.f_final_logit_softcapping) {
+ cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+ cur = ggml_tanh(ctx0, cur);
+ cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+ }

  cb(cur, "result_output", -1);
  res->t_logits = cur;
@@ -12149,6 +12265,7 @@ struct llm_build_olmo : public llm_graph_context {
  }
  };

+ template <bool iswa>
  struct llm_build_olmo2 : public llm_graph_context {
  llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -12164,7 +12281,14 @@ struct llm_build_olmo2 : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv();
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+ inp_attn_type * inp_attn = nullptr;
+
+ if constexpr (iswa) {
+ inp_attn = build_attn_inp_kv_iswa();
+ } else {
+ inp_attn = build_attn_inp_kv();
+ }

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12197,17 +12321,36 @@ struct llm_build_olmo2 : public llm_graph_context {
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

- Qcur = ggml_rope_ext(
+ const bool is_swa = hparams.is_swa(il);
+
+ if (is_swa) {
+ // For sliding window layers, Olmo3 use regular rope with no yarn rope scaling.
+ // This is achieved here by setting freq_scale and attn_factor to 1.
+ // We also set ext_factor to 0 to avoid a few unnecessary computations.
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+ 0.0, 1.0, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+ 0.0, 1.0, beta_fast, beta_slow
+ );
+ } else {
+ Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );

- Kcur = ggml_rope_ext(
+ Kcur = ggml_rope_ext(
  ctx0, Kcur, inp_pos, nullptr,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow
  );
+ }

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -12406,6 +12549,132 @@ struct llm_build_olmoe : public llm_graph_context {
  }
  };

+ struct llm_build_llada_moe : public llm_graph_context {
+ llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_no_cache();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(cur, "ffn_moe_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  struct llm_build_openelm : public llm_graph_context {
  llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -18598,6 +18867,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
  case LLM_ARCH_DREAM:
  case LLM_ARCH_LLADA:
+ case LLM_ARCH_LLADA_MOE:
  {
  res = nullptr;
  } break;
@@ -18735,7 +19005,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  } break;
  case LLM_ARCH_LLAMA4:
  {
- llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+ if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
+ llm = std::make_unique<llm_build_llama>(*this, params);
+ } else {
+ llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+ }
  } break;
  case LLM_ARCH_DECI:
  {
@@ -18803,6 +19077,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  llm = std::make_unique<llm_build_llada>(*this, params);
  }
  break;
+ case LLM_ARCH_LLADA_MOE:
+ {
+ llm = std::make_unique<llm_build_llada_moe>(*this, params);
+ }
+ break;
  case LLM_ARCH_QWEN2VL:
  {
  llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -18915,7 +19194,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  } break;
  case LLM_ARCH_OLMO2:
  {
- llm = std::make_unique<llm_build_olmo2>(*this, params);
+ if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+ llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
+ } else {
+ llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
+ }
  } break;
  case LLM_ARCH_OLMOE:
  {
@@ -19269,6 +19552,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_QWEN2MOE:
  case LLM_ARCH_QWEN3:
  case LLM_ARCH_QWEN3MOE:
+ case LLM_ARCH_LLADA_MOE:
  case LLM_ARCH_OLMO2:
  case LLM_ARCH_OLMOE:
  case LLM_ARCH_PHI2:
@@ -28,6 +28,7 @@ enum llm_type {
  LLM_TYPE_80M,
  LLM_TYPE_109M,
  LLM_TYPE_137M,
+ LLM_TYPE_140M,
  LLM_TYPE_160M,
  LLM_TYPE_190M,
  LLM_TYPE_220M,
@@ -36,6 +37,7 @@ enum llm_type {
  LLM_TYPE_270M,
  LLM_TYPE_335M,
  LLM_TYPE_350M,
+ LLM_TYPE_360M,
  LLM_TYPE_410M,
  LLM_TYPE_450M,
  LLM_TYPE_475M,
@@ -43,6 +45,7 @@ enum llm_type {
  LLM_TYPE_700M,
  LLM_TYPE_770M,
  LLM_TYPE_780M,
+ LLM_TYPE_950M,
  LLM_TYPE_0_3B,
  LLM_TYPE_0_5B,
  LLM_TYPE_0_6B,
@@ -725,7 +725,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  // attention layers have a non-zero number of kv heads
  int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
  if (llama_model_has_encoder(&model)) {
- n_attn_layer *= 3;
+ // now n_attn_layer is the number of attention layers in the encoder
+ // for each decoder block, there are 2 attention layers
+ n_attn_layer += 2 * model.hparams.dec_n_layer;
  }
  GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
  }
@@ -434,6 +434,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
  "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
  };
  break;
+ case LLAMA_VOCAB_PRE_TYPE_GROK_2:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
  default:
  // default regex for BPE tokenization pre-processing
  regex_exprs = {
@@ -1955,7 +1962,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
  clean_spaces = false;
  } else if (
- tokenizer_pre == "bailingmoe") {
+ tokenizer_pre == "bailingmoe" ||
+ tokenizer_pre == "llada-moe") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
  clean_spaces = false;
  } else if (
@@ -1974,6 +1982,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  tokenizer_pre == "kimi-k2") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
  clean_spaces = false;
+ } else if (
+ tokenizer_pre == "grok-2") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
+ clean_spaces = false;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
@@ -47,6 +47,7 @@ enum llama_vocab_pre_type {
  LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
  LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
  LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
+ LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
  };

  struct LLM_KV;
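
Reading aid: the Grok graph changes above replace hard-coded embedding and logit multipliers with hparams-driven scaling plus an optional tanh soft-cap on the final logits. Below is a minimal standalone C++ sketch of that soft-cap step (y = cap * tanh(x / cap)); the soft_cap helper and the sample values are made up for illustration and are not part of the package.

#include <cmath>
#include <cstdio>

// Mirrors the scale -> tanh -> scale sequence guarded by
// hparams.f_final_logit_softcapping in llm_build_grok above.
// A cap of 0 means the step is skipped (the grok-1 default).
static float soft_cap(float logit, float cap) {
    if (cap == 0.0f) {
        return logit;
    }
    return cap * std::tanh(logit / cap);
}

int main() {
    const float cap = 30.0f; // illustrative value only; real models read theirs from the GGUF
    for (float x : {1.5f, 40.0f, -250.0f}) {
        std::printf("%8.2f -> %8.4f\n", x, soft_cap(x, cap));
    }
    return 0;
}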