@fugood/llama.node 1.1.9 → 1.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +7 -1
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +15 -5
- package/src/LlamaCompletionWorker.cpp +12 -3
- package/src/LlamaCompletionWorker.h +3 -1
- package/src/LlamaContext.cpp +20 -2
- package/src/llama.cpp/common/arg.cpp +29 -19
- package/src/llama.cpp/common/chat.cpp +153 -3
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +10 -3
- package/src/llama.cpp/common/common.h +4 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +14 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -1
- package/src/llama.cpp/include/llama.h +27 -1
- package/src/llama.cpp/src/llama-adapter.cpp +68 -4
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +46 -2
- package/src/llama.cpp/src/llama-arch.h +4 -0
- package/src/llama.cpp/src/llama-context.cpp +80 -39
- package/src/llama.cpp/src/llama-context.h +0 -4
- package/src/llama.cpp/src/llama-graph.cpp +20 -10
- package/src/llama.cpp/src/llama-graph.h +2 -1
- package/src/llama.cpp/src/llama-hparams.cpp +25 -0
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-impl.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +24 -7
- package/src/llama.cpp/src/llama-kv-cache-iswa.h +4 -2
- package/src/llama.cpp/src/llama-kv-cache.cpp +67 -130
- package/src/llama.cpp/src/llama-kv-cache.h +16 -28
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +29 -28
- package/src/llama.cpp/src/llama-memory-hybrid.h +18 -22
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
- package/src/llama.cpp/src/llama-memory-recurrent.h +7 -11
- package/src/llama.cpp/src/llama-memory.h +8 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +302 -31
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1 -1
- package/src/llama.cpp/src/llama.cpp +12 -0
package/src/llama.cpp/src/llama-model.cpp

@@ -47,6 +47,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_410M: return "410M";
         case LLM_TYPE_450M: return "450M";
         case LLM_TYPE_475M: return "475M";
+        case LLM_TYPE_558M: return "558M";
         case LLM_TYPE_700M: return "700M";
         case LLM_TYPE_770M: return "770M";
         case LLM_TYPE_780M: return "780M";
@@ -772,6 +773,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_JINA_BERT_V3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+                switch (hparams.n_layer) {
+                    case 24:
+                        type = LLM_TYPE_558M; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
             {
@@ -1115,6 +1128,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                 hparams.set_swa_pattern(5);

+                hparams.n_layer_kv_from_start = 20;
                 hparams.rope_freq_base_train_swa = 10000.0f;
                 hparams.rope_freq_scale_train_swa = 1.0f;
                 hparams.f_attention_scale = 1.0f;
@@ -1474,12 +1488,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 // Expert gating function (GLM-4.5 uses sigmoid)
                 ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
                 if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
-                    hparams.expert_gating_func =
+                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
                 }

                 // NextN/MTP parameters
                 ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);

+                // TODO: when MTP is implemented, this should probably be updated if needed
+                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
                 switch (hparams.n_layer) {
                     case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
                     case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
@@ -1553,6 +1570,27 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_NEMOTRON_H:
+            {
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+                // A layer is recurrent IFF the n_head_kv value is set to 0 and
+                // the n_ff value is set to 0
+                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+                    hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
+                }
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 56: type = LLM_TYPE_9B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_EXAONE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -2627,6 +2665,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        case LLM_ARCH_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
+        case LLM_ARCH_JINA_BERT_V3:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
@@ -2662,24 +2701,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    }

                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                    layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

                    layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
                    layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);

                    if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
-                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                    } else {
-                        layer.ffn_up
-                        layer.
-
-
-
-
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-                    } else {
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+                        if (arch == LLM_ARCH_NOMIC_BERT) {
                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                        }
                    }
@@ -4672,6 +4709,75 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                    }
                } break;
+            case LLM_ARCH_NEMOTRON_H:
+                {
+                    // mamba2 Mixer SSM params
+                    // NOTE: int64_t for tensor dimensions
+                    const int64_t d_conv = hparams.ssm_d_conv;
+                    const int64_t d_inner = hparams.ssm_d_inner;
+                    const int64_t d_state = hparams.ssm_d_state;
+                    const int64_t n_ssm_head = hparams.ssm_dt_rank;
+                    const int64_t n_group = hparams.ssm_n_group;
+                    const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
+
+                    // embeddings
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    {
+                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
+                        if (output == NULL) {
+                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                        }
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        // all blocks use the attn norm
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        if (hparams.is_recurrent(i)) {
+                            // ssm layers
+                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
+
+                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
+                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
+
+                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
+
+                            // no "weight" suffix for these
+                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
+                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
+
+                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
+
+                            // out_proj
+                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+                        } else if (hparams.n_ff(i) == 0) {
+                            // attention layers (with optional bias)
+                            const int64_t n_head_i = hparams.n_head(i);
+                            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
+                            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
+                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
+                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
+                            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        } else {
+                            // mlp layers
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
+                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
+                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+                        }
+                    }
+                } break;
            case LLM_ARCH_EXAONE:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5846,7 +5952,8 @@ void llama_model::print_info() const {
        arch == LLM_ARCH_JAMBA ||
        arch == LLM_ARCH_FALCON_H1 ||
        arch == LLM_ARCH_PLAMO2 ||
-        arch == LLM_ARCH_GRANITE_HYBRID
+        arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_NEMOTRON_H) {
        LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
        LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
        LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -7457,7 +7564,7 @@ struct llm_build_bert : public llm_graph_context {
            }

            // RoPE
-            if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+            if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -7516,7 +7623,7 @@ struct llm_build_bert : public llm_graph_context {
                    0.0f,
                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
            cb(cur, "ffn_moe_out", il);
-        } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+        } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
            cur = build_ffn(cur,
                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                    NULL, NULL, NULL,
@@ -10524,7 +10631,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
    const int64_t n_embd_altup;
    const int64_t n_altup;
    const int i_altup_act;
-    const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
    const int n_layer_sparsity = 10; // number of layers using activation sparsity
    const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)

@@ -10574,8 +10680,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {

        for (int il = 0; il < n_layer; ++il) {
            // this block is made to be closely resemble Gemma3p5DecoderLayer on python code
-            const bool has_kv = (il < n_layer_kv);
-
            const float freq_base_l = model.get_rope_freq_base (cparams, il);
            const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

@@ -10595,7 +10699,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
            ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]

            // self-attention
-            if (has_kv) {
+            if (hparams.has_kv(il)) {
                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
@@ -10635,7 +10739,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
                        model.layers[il].wo, NULL,
                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
            } else {
-                //
+                // reuse KV cache of earlier layers
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -14116,6 +14220,138 @@ struct llm_build_nemotron : public llm_graph_context {
    }
};

+struct llm_build_nemotron_h : public llm_graph_context_mamba {
+    llm_build_nemotron_h(
+        const llama_model & model,
+        const llm_graph_params & params) :
+        llm_graph_context_mamba(params) {
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        auto * inp = build_inp_mem_hybrid();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            if (hparams.is_recurrent(il)) {
+                // ssm layer //
+                cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
+            } else if (hparams.n_ff(il) == 0) {
+                // attention layer //
+                cur = build_attention_layer(cur, inp->get_attn(), model, n_embd_head, il);
+            } else {
+                cur = build_ffn_layer(cur, model, il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // add residual
+            cur = ggml_add(ctx0, cur, inpSA);
+            cb(cur, "block_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+
+    ggml_tensor * build_attention_layer(
+        ggml_tensor * cur,
+        llm_graph_input_attn_kv * inp_attn,
+        const llama_model & model,
+        const int64_t n_embd_head,
+        const int il) {
+
+        // compute Q and K and (optionally) RoPE them
+        ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+        cb(Qcur, "Qcur", il);
+        if (model.layers[il].bq) {
+            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+            cb(Qcur, "Qcur", il);
+        }
+
+        ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+        cb(Kcur, "Kcur", il);
+        if (model.layers[il].bk) {
+            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+            cb(Kcur, "Kcur", il);
+        }
+
+        ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+        cb(Vcur, "Vcur", il);
+        if (model.layers[il].bv) {
+            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+            cb(Vcur, "Vcur", il);
+        }
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+
+        cb(Qcur, "Qcur", il);
+        cb(Kcur, "Kcur", il);
+        cb(Vcur, "Vcur", il);
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        cur = build_attn(inp_attn,
+                model.layers[il].wo, model.layers[il].bo,
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        cb(cur, "attn_out", il);
+        return cur;
+    }
+
+    ggml_tensor * build_ffn_layer(
+        ggml_tensor * cur,
+        const llama_model & model,
+        const int il) {
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                NULL, NULL, NULL,
+                model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        return cur;
+    }
+};
+
struct llm_build_exaone : public llm_graph_context {
    llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -18240,6 +18476,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
        // switch statement
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_NEO_BERT:
@@ -18256,13 +18493,30 @@
    if (llm_arch_is_recurrent(arch)) {
        res = new llama_memory_recurrent(
                *this,
-                nullptr,
                GGML_TYPE_F32,
                GGML_TYPE_F32,
                cparams.offload_kqv,
                std::max((uint32_t) 1, cparams.n_seq_max),
-                cparams.n_seq_max
+                cparams.n_seq_max,
+                nullptr);
    } else if (llm_arch_is_hybrid(arch)) {
+
+        // The main difference between hybrid architectures is the
+        // layer filters, so pick the right one here
+        llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
+        llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
+        if (arch == LLM_ARCH_FALCON_H1) {
+            filter_attn = [&](int32_t) { return true; };
+            filter_recr = [&](int32_t) { return true; };
+        } else if (arch == LLM_ARCH_NEMOTRON_H) {
+            filter_attn = [&](int32_t il) {
+                return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+            };
+            filter_recr = [&](int32_t il) {
+                return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+            };
+        }
+
        const auto padding = llama_kv_cache::get_padding(cparams);

        cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
@@ -18282,8 +18536,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                /* n_seq_max */ cparams.n_seq_max,
                /* offload */ cparams.offload_kqv,
                /* unified */ cparams.kv_unified,
-                /* filter_attn */ (
-                /* filter_recr */ (
+                /* filter_attn */ std::move(filter_attn),
+                /* filter_recr */ std::move(filter_recr));
    } else {
        const auto padding = llama_kv_cache::get_padding(cparams);

@@ -18302,6 +18556,18 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,

        LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);

+        llama_memory_i::layer_reuse_cb reuse = nullptr;
+
+        if (arch == LLM_ARCH_GEMMA3N) {
+            reuse = [&](int32_t il) {
+                if (il >= (int32_t) hparams.n_layer_kv_from_start) {
+                    return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
+                }
+
+                return -1;
+            };
+        }
+
        if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
            GGML_ASSERT(hparams.is_swa_any());

|
@@ -18316,13 +18582,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
18316
18582
|
n_ctx_per_stream,
|
|
18317
18583
|
cparams.n_seq_max,
|
|
18318
18584
|
cparams.n_ubatch,
|
|
18319
|
-
padding
|
|
18585
|
+
padding,
|
|
18586
|
+
nullptr,
|
|
18587
|
+
reuse);
|
|
18320
18588
|
} else {
|
|
18321
18589
|
GGML_ASSERT(!hparams.is_swa_any());
|
|
18322
18590
|
|
|
18323
18591
|
res = new llama_kv_cache(
|
|
18324
18592
|
*this,
|
|
18325
|
-
nullptr,
|
|
18326
18593
|
params.type_k,
|
|
18327
18594
|
params.type_v,
|
|
18328
18595
|
!cparams.flash_attn,
|
|
@@ -18332,7 +18599,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                    cparams.n_seq_max,
                    padding,
                    hparams.n_swa,
-                    hparams.swa_type
+                    hparams.swa_type,
+                    nullptr,
+                    nullptr);
        }
    }
}
@@ -18379,6 +18648,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            } break;
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
            {
@@ -18595,6 +18865,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_nemotron>(*this, params);
            } break;
+        case LLM_ARCH_NEMOTRON_H:
+            {
+                llm = std::make_unique<llm_build_nemotron_h>(*this, params);
+            } break;
        case LLM_ARCH_EXAONE:
            {
                llm = std::make_unique<llm_build_exaone>(*this, params);
@@ -18720,7 +18994,7 @@ llama_model_params llama_model_default_params() {
    llama_model_params result = {
        /*.devices =*/ nullptr,
        /*.tensor_buft_overrides =*/ nullptr,
-        /*.n_gpu_layers =*/
+        /*.n_gpu_layers =*/ 999,
        /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu =*/ 0,
        /*.tensor_split =*/ nullptr,
@@ -18734,11 +19008,6 @@ llama_model_params llama_model_default_params() {
        /*.use_extra_bufts =*/ true,
    };

-#ifdef GGML_USE_METAL
-    // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
-    result.n_gpu_layers = 999;
-#endif
-
    return result;
}

@@ -18830,6 +19099,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
+        case LLM_ARCH_NEMOTRON_H:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -18869,6 +19139,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:
package/src/llama.cpp/src/llama-vocab.cpp

@@ -2470,7 +2470,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
    // set attributes by model/tokenizer/architecture name
    if (false
        || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
-        || _contains_any(general_arch, {"nomic-bert-moe"})
+        || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
        ) {
        if (token_to_id.count("<mask>") == 0) {
            LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
package/src/llama.cpp/src/llama.cpp

@@ -25,6 +25,18 @@
// interface implementation
//

+const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
+    switch (flash_attn_type) {
+        case LLAMA_FLASH_ATTN_TYPE_AUTO:
+            return "auto";
+        case LLAMA_FLASH_ATTN_TYPE_DISABLED:
+            return "disabled";
+        case LLAMA_FLASH_ATTN_TYPE_ENABLED:
+            return "enabled";
+    }
+    GGML_ABORT("fatal error");
+}
+
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
        /*.no_perf =*/ true,
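For reference, a minimal C++ sketch of how the llama_flash_attn_type_name() helper added above could be called from code that links against this vendored llama.cpp. It assumes (not shown in this diff) that llama.h declares the function alongside the llama_flash_attn_type enum values used in the hunk.

// Usage sketch only, not part of the package diff.
// Assumes llama.h exposes llama_flash_attn_type_name() and the enum values below.
#include <cstdio>
#include "llama.h"

int main() {
    const llama_flash_attn_type types[] = {
        LLAMA_FLASH_ATTN_TYPE_AUTO,
        LLAMA_FLASH_ATTN_TYPE_DISABLED,
        LLAMA_FLASH_ATTN_TYPE_ENABLED,
    };
    for (llama_flash_attn_type t : types) {
        // expected to print "auto", "disabled", "enabled"
        std::printf("flash_attn_type %d -> %s\n", (int) t, llama_flash_attn_type_name(t));
    }
    return 0;
}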