@fugood/llama.node 1.2.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +16 -15
- package/src/llama.cpp/CMakeLists.txt +7 -0
- package/src/llama.cpp/common/arg.cpp +141 -21
- package/src/llama.cpp/common/common.h +23 -8
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +7 -6
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +4 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -1
- package/src/llama.cpp/src/llama-arch.cpp +43 -10
- package/src/llama.cpp/src/llama-arch.h +8 -0
- package/src/llama.cpp/src/llama-chat.cpp +17 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +8 -8
- package/src/llama.cpp/src/llama-graph.cpp +3 -3
- package/src/llama.cpp/src/llama-hparams.h +13 -3
- package/src/llama.cpp/src/llama-model.cpp +328 -44
- package/src/llama.cpp/src/llama-model.h +3 -0
- package/src/llama.cpp/src/llama-quant.cpp +3 -1
- package/src/llama.cpp/src/llama-vocab.cpp +13 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
package/src/llama.cpp/src/llama-context.cpp

@@ -35,10 +35,10 @@ llama_context::llama_context(
 
     cparams.n_threads        = params.n_threads;
     cparams.n_threads_batch  = params.n_threads_batch;
-    cparams.yarn_ext_factor  = params.yarn_ext_factor;
-    cparams.yarn_attn_factor = params.yarn_attn_factor;
-    cparams.yarn_beta_fast   = params.yarn_beta_fast;
-    cparams.yarn_beta_slow   = params.yarn_beta_slow;
+    cparams.yarn_ext_factor  = params.yarn_ext_factor  >= 0.0f ? params.yarn_ext_factor  : hparams.yarn_ext_factor;
+    cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
+    cparams.yarn_beta_fast   = params.yarn_beta_fast   >= 0.0f ? params.yarn_beta_fast   : hparams.yarn_beta_fast;
+    cparams.yarn_beta_slow   = params.yarn_beta_slow   >= 0.0f ? params.yarn_beta_slow   : hparams.yarn_beta_slow;
     cparams.embeddings       = params.embeddings;
     cparams.offload_kqv      = params.offload_kqv;
     cparams.no_perf          = params.no_perf;
@@ -181,7 +181,7 @@ llama_context::llama_context(
     // graph outputs buffer
     {
         // resized during inference when a batch uses more outputs
-        if ((uint32_t) output_reserve(params.n_seq_max) < params.n_seq_max) {
+        if (output_reserve(params.n_seq_max) < params.n_seq_max) {
             throw std::runtime_error("failed to reserve initial output buffer");
         }
 
@@ -2263,9 +2263,9 @@ llama_context_params llama_context_default_params() {
         /*.rope_freq_base   =*/ 0.0f,
         /*.rope_freq_scale  =*/ 0.0f,
         /*.yarn_ext_factor  =*/ -1.0f,
-        /*.yarn_attn_factor =*/ 1.0f,
-        /*.yarn_beta_fast   =*/ 32.0f,
-        /*.yarn_beta_slow   =*/ 1.0f,
+        /*.yarn_attn_factor =*/ -1.0f,
+        /*.yarn_beta_fast   =*/ -1.0f,
+        /*.yarn_beta_slow   =*/ -1.0f,
         /*.yarn_orig_ctx    =*/ 0,
         /*.defrag_thold     =*/ -1.0f,
        /*.cb_eval          =*/ nullptr,
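Note on the YaRN context-parameter change above: a negative value now means "fall back to the model's own hyperparameter" rather than a fixed built-in default. A minimal standalone sketch of that fallback pattern, with hypothetical names (resolve_yarn and the two structs are illustrative, not llama.cpp API):

    #include <cstdio>

    struct model_hparams  { float yarn_beta_fast = 32.0f; };  // per-model default (read from GGUF in llama.cpp)
    struct context_params { float yarn_beta_fast = -1.0f; };  // -1.0f acts as "not set by the user"

    // mirrors "params.x >= 0.0f ? params.x : hparams.x" from the hunk above
    static float resolve_yarn(float user_value, float model_default) {
        return user_value >= 0.0f ? user_value : model_default;
    }

    int main() {
        model_hparams  hp;
        context_params cp;
        std::printf("yarn_beta_fast = %.1f\n", resolve_yarn(cp.yarn_beta_fast, hp.yarn_beta_fast)); // prints 32.0
    }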
package/src/llama.cpp/src/llama-graph.cpp

@@ -1335,14 +1335,14 @@ ggml_tensor * llm_graph_context::build_attn_mha(
 
     if (arch == LLM_ARCH_GROK) {
         // need to do the following:
-        // multiply by attn_output_multiplyer of 0.08838834764831845
+        // multiply by attn_output_multiplier
         // and then :
         // kq = 30 * tanh(kq / 30)
         // before the softmax below
 
-        kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f));
+        kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, hparams.f_attn_out_scale / hparams.f_attn_logit_softcapping));
         cb(kq, "kq_tanh", il);
 
-        kq = ggml_scale(ctx0, kq, 30);
+        kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
         cb(kq, "kq_scaled", il);
     }
 
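Note on the Grok attention change above: the hard-coded constants are replaced by hparams, but the math is still a soft-cap, y = cap * tanh(x * scale / cap), applied to the attention logits before the softmax. A scalar sketch of that function (softcap here is illustrative, not a ggml/llama.cpp symbol):

    #include <cmath>
    #include <cstdio>

    // y = cap * tanh(x * scale / cap): bounded to (-cap, cap), roughly linear for small x * scale
    static float softcap(float x, float scale, float cap) {
        return cap * std::tanh(x * scale / cap);
    }

    int main() {
        const float scale = 0.08838834764831845f; // grok-1 default for f_attn_out_scale (from this diff)
        const float cap   = 30.0f;                // grok-1 default for f_attn_logit_softcapping
        std::printf("%.3f %.3f\n", softcap(10.0f, scale, cap), softcap(1e6f, scale, cap)); // ~0.884, ~30.000
    }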
package/src/llama.cpp/src/llama-hparams.h

@@ -82,8 +82,9 @@ struct llama_hparams {
     float f_norm_rms_eps;
     float f_norm_group_eps;
 
-    float f_attn_logit_softcapping  = 50.0f;
-    float f_final_logit_softcapping = 30.0f;
+    float f_attn_logit_softcapping   = 50.0f;
+    float f_router_logit_softcapping = 30.0f;
+    float f_final_logit_softcapping  = 30.0f;
 
     // for RWKV
     uint32_t rescale_every_n_layers = 0;
@@ -104,6 +105,11 @@ struct llama_hparams {
     uint32_t n_ctx_orig_yarn;
     float    rope_yarn_log_mul = 0.0f;
 
+    float yarn_ext_factor  = -1.0f;
+    float yarn_attn_factor = 1.0f;
+    float yarn_beta_fast   = 32.0f;
+    float yarn_beta_slow   = 1.0f;
+
     std::array<int, 4> rope_sections;
 
     // Sliding Window Attention (SWA)
@@ -136,10 +142,14 @@ struct llama_hparams {
     float f_embedding_scale = 0.0f;
     float f_attention_scale = 0.0f;
 
+    // grok-2
+    float    f_attn_out_scale = 0.0f;
+    uint32_t attn_temp_length = 0;
+
     bool causal_attn   = true;
     bool use_alibi     = false;
     bool attn_soft_cap = false;
-    bool use_kq_norm   = true;
+    bool use_kq_norm   = false;
 
     // for Classifiers
     uint32_t n_cls_out = 1;
package/src/llama.cpp/src/llama-model.cpp

@@ -36,6 +36,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_80M:  return "80M";
         case LLM_TYPE_109M: return "109M";
         case LLM_TYPE_137M: return "137M";
+        case LLM_TYPE_140M: return "140M";
         case LLM_TYPE_160M: return "160M";
         case LLM_TYPE_190M: return "190M";
         case LLM_TYPE_220M: return "220M";
@@ -44,6 +45,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_270M: return "270M";
         case LLM_TYPE_335M: return "335M";
         case LLM_TYPE_350M: return "350M";
+        case LLM_TYPE_360M: return "360M";
         case LLM_TYPE_410M: return "410M";
         case LLM_TYPE_450M: return "450M";
         case LLM_TYPE_475M: return "475M";
@@ -51,6 +53,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_700M: return "700M";
         case LLM_TYPE_770M: return "770M";
         case LLM_TYPE_780M: return "780M";
+        case LLM_TYPE_950M: return "950M";
         case LLM_TYPE_0_3B: return "0.3B";
         case LLM_TYPE_0_5B: return "0.5B";
         case LLM_TYPE_0_6B: return "0.6B";
@@ -622,19 +625,32 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                 ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,  hparams.n_moe_layer_step);
 
-                hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
-                hparams.n_swa = 8192;
-                hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa == 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                    hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
+                    hparams.n_swa = 8192;
+                    hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+                }
 
                 switch (hparams.n_expert) {
+                    case 0: {
+                        // MobileLLM (no MoE)
+                        switch (hparams.n_embd) {
+                            case 2048: type = LLM_TYPE_140M; break;
+                            case 4096: type = LLM_TYPE_360M; break;
+                            case 6144: type = LLM_TYPE_950M; break;
+                            default:   type = LLM_TYPE_UNKNOWN;
+                        }
+                    } break;
                     case 16:  type = LLM_TYPE_17B_16E;  break;
                     case 128: type = LLM_TYPE_17B_128E; break;
                     default:  type = LLM_TYPE_UNKNOWN;
                 }
 
-                if (type == LLM_TYPE_17B_128E) {
-                    hparams.use_kq_norm = false;
-                }
+                hparams.use_kq_norm = type != LLM_TYPE_17B_128E;
             } break;
         case LLM_ARCH_ARCEE:
             {
@@ -685,7 +701,30 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GROK:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                // defaults for old GGUFs
+                hparams.yarn_beta_fast = 8.0f;
+                hparams.f_logit_scale = 0.5773502691896257f;
+                hparams.f_embedding_scale = 78.38367176906169f;
+                hparams.f_attn_out_scale = 0.08838834764831845f;
+                hparams.f_attn_logit_softcapping = 30.0f;
+                hparams.f_router_logit_softcapping = 30.0f;
+                // no final_logit_softcapping in grok-1
+                hparams.f_final_logit_softcapping = 0.0f;
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
+                ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale, false);
+                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
+                ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping, false);
+                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
 
                 switch (hparams.n_layer) {
                     case 64: type = LLM_TYPE_314B; break;
@@ -913,6 +952,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.causal_attn = false;
                 }
                 break;
+        case LLM_ARCH_LLADA_MOE:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                // diffusion language model uses non-causal attention
+                hparams.causal_attn = false;
+                switch (hparams.n_layer) {
+                    case 16: type = LLM_TYPE_A1_7B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_QWEN2MOE:
             {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -1315,6 +1366,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    hparams.set_swa_pattern(4);
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }
+
                 switch (hparams.n_layer) {
                     case 16: type = LLM_TYPE_1B; break;
                     case 32: type = LLM_TYPE_7B; break;
@@ -2364,6 +2423,40 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        }
                    }
                    break;
+            case LLM_ARCH_LLADA_MOE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    GGML_ASSERT(n_expert      > 0 && "n_expert must be > 0 for llada-moe");
+                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada-moe");
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
+                    }
+                } break;
             case LLM_ARCH_LLAMA4:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -2377,9 +2470,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                     }
 
-                    GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
                     for (int i = 0; i < n_layer; ++i) {
-                        bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
+                        bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
 
                         auto & layer = layers[i];
 
@@ -2540,6 +2632,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                     }
 
+                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff/* / n_expert_used*/; // grok-1 n_ff_exp == n_ff
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
 
@@ -2554,12 +2647,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
                         layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff,   n_expert}, 0);
-                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
-                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff,   n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
 
-                        layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        if (!layer.ffn_post_norm) {
+                            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+                        }
                     }
                 } break;
             case LLM_ARCH_DBRX:
@@ -6243,6 +6343,14 @@ struct llm_build_llama : public llm_graph_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
+                if (hparams.use_kq_norm) {
+                    // Llama4TextL2Norm
+                    Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
+                    Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+                    cb(Qcur, "Qcur_normed", il);
+                    cb(Kcur, "Kcur_normed", il);
+                }
+
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, model.layers[il].bo,
                         Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
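Note on the use_kq_norm branch added above: ggml_rms_norm normalizes each Q/K head vector by its root-mean-square (the "Llama4TextL2Norm" in the comment has no learned weight). A scalar sketch of that normalization, assuming a plain std::vector in place of a ggml tensor row:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // divide the vector by sqrt(mean(x^2) + eps), as ggml_rms_norm does per row
    static void rms_norm(std::vector<float> & v, float eps) {
        float sum_sq = 0.0f;
        for (float x : v) sum_sq += x * x;
        const float inv_rms = 1.0f / std::sqrt(sum_sq / v.size() + eps);
        for (float & x : v) x *= inv_rms;
    }

    int main() {
        std::vector<float> q = {1.0f, 2.0f, 3.0f, 4.0f};
        rms_norm(q, 1e-5f); // rms ~= 2.739, so q becomes ~{0.365, 0.730, 1.095, 1.461}
        std::printf("%.3f %.3f %.3f %.3f\n", q[0], q[1], q[2], q[3]);
    }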
@@ -6350,7 +6458,8 @@ struct llm_build_llama_iswa : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
-            const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
+            const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+                                  (il + 1) % hparams.n_no_rope_layer_step != 0;
 
             // norm
             cur = build_norm(inpL,
@@ -7028,9 +7137,6 @@ struct llm_build_grok : public llm_graph_context {
 
         inpL = build_inp_embd(model.tok_embd);
 
-        // multiply by embedding_multiplier_scale of 78.38367176906169
-        inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
-
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
@@ -7102,26 +7208,22 @@ struct llm_build_grok : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
-            // Grok
-            // if attn_out_norm is present then apply it before adding the input
-            if (model.layers[il].attn_out_norm) {
-                cur = build_norm(cur,
-                        model.layers[il].attn_out_norm, NULL,
-                        LLM_NORM_RMS, il);
-                cb(cur, "attn_out_norm", il);
-            }
+            cur = build_norm(cur,
+                    model.layers[il].attn_out_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_out_norm", il);
 
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
             // feed-forward network
-            // MoE branch
             cur = build_norm(ffn_inp,
                     model.layers[il].ffn_norm, NULL,
                     LLM_NORM_RMS, il);
             cb(cur, "ffn_norm", il);
 
-            cur = build_moe_ffn(cur,
+            // MoE branch
+            ggml_tensor * moe_out = build_moe_ffn(cur,
                     model.layers[il].ffn_gate_inp,
                     model.layers[il].ffn_up_exps,
                     model.layers[il].ffn_gate_exps,
@@ -7132,18 +7234,28 @@ struct llm_build_grok : public llm_graph_context {
                     false, 0.0,
                     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                     il);
-            cb(cur, "ffn_moe_out", il);
+            cb(moe_out, "ffn_moe_out", il);
 
-            // Grok
-            // if layer_out_norm is present then apply it before adding the input
-            // Idea: maybe ffn_out_norm is a better name
-            if (model.layers[il].layer_out_norm) {
-                cur = build_norm(cur,
-                        model.layers[il].layer_out_norm, NULL,
-                        LLM_NORM_RMS, il);
-                cb(cur, "layer_out_norm", il);
+            if (model.layers[il].ffn_up) {
+                ggml_tensor * ffn_out = build_ffn(cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, il);
+                cb(ffn_out, "ffn_out", il);
+
+                cur = ggml_scale(ctx0, ggml_add(ctx0, ffn_out, moe_out), std::sqrt(2) / 2);
+                cb(cur, "ffn_out", il);
+            } else {
+                cur = moe_out;
             }
 
+            cur = build_norm(cur,
+                    model.layers[il].ffn_post_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_post_norm", il);
+
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
@@ -7166,10 +7278,14 @@ struct llm_build_grok : public llm_graph_context {
         // lm_head
         cur = build_lora_mm(model.output, cur);
 
-        // Grok
-        // multiply logits by output_multiplier_scale of 0.5773502691896257
+        cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);
 
-        cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
+        // final logit soft-capping
+        if (hparams.f_final_logit_softcapping) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+            cur = ggml_tanh(ctx0, cur);
+            cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+        }
 
         cb(cur, "result_output", -1);
         res->t_logits = cur;
@@ -12149,6 +12265,7 @@ struct llm_build_olmo : public llm_graph_context {
     }
 };
 
+template <bool iswa>
 struct llm_build_olmo2 : public llm_graph_context {
     llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -12164,7 +12281,14 @@ struct llm_build_olmo2 : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv();
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+        inp_attn_type * inp_attn = nullptr;
+
+        if constexpr (iswa) {
+            inp_attn = build_attn_inp_kv_iswa();
+        } else {
+            inp_attn = build_attn_inp_kv();
+        }
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -12197,17 +12321,36 @@ struct llm_build_olmo2 : public llm_graph_context {
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-            Qcur = ggml_rope_ext(
+            const bool is_swa = hparams.is_swa(il);
+
+            if (is_swa) {
+                // For sliding window layers, Olmo3 use regular rope with no yarn rope scaling.
+                // This is achieved here by setting freq_scale and attn_factor to 1.
+                // We also set ext_factor to 0 to avoid a few unnecessary computations.
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+                    0.0, 1.0, beta_fast, beta_slow
+                    );
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, 1.0,
+                    0.0, 1.0, beta_fast, beta_slow
+                    );
+            } else {
+                Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
 
-            Kcur = ggml_rope_ext(
+                Kcur = ggml_rope_ext(
                     ctx0, Kcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
@@ -12406,6 +12549,132 @@ struct llm_build_olmoe : public llm_graph_context {
     }
 };
 
+struct llm_build_llada_moe : public llm_graph_context {
+    llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_no_cache();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, false,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_openelm : public llm_graph_context {
     llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -18598,6 +18867,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
+        case LLM_ARCH_LLADA_MOE:
            {
                res = nullptr;
            } break;
@@ -18735,7 +19005,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            } break;
        case LLM_ARCH_LLAMA4:
            {
-                llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+                if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_llama>(*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_llama_iswa>(*this, params);
+                }
            } break;
        case LLM_ARCH_DECI:
            {
@@ -18803,6 +19077,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                llm = std::make_unique<llm_build_llada>(*this, params);
            }
            break;
+        case LLM_ARCH_LLADA_MOE:
+            {
+                llm = std::make_unique<llm_build_llada_moe>(*this, params);
+            }
+            break;
        case LLM_ARCH_QWEN2VL:
            {
                llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -18915,7 +19194,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            } break;
        case LLM_ARCH_OLMO2:
            {
-                llm = std::make_unique<llm_build_olmo2>(*this, params);
+                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+                    llm = std::make_unique<llm_build_olmo2<true>>(*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_olmo2<false>>(*this, params);
+                }
            } break;
        case LLM_ARCH_OLMOE:
            {
@@ -19269,6 +19552,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
+        case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2: