@fugood/llama.node 1.1.10 → 1.2.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +20 -2
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +174 -388
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +67 -37
- package/src/llama.cpp/common/chat.cpp +263 -2
- package/src/llama.cpp/common/chat.h +4 -0
- package/src/llama.cpp/common/common.cpp +10 -3
- package/src/llama.cpp/common/common.h +5 -2
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +50 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +19 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +18 -18
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +234 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +80 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +161 -20
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +399 -50
- package/src/llama.cpp/include/llama.h +32 -7
- package/src/llama.cpp/src/llama-adapter.cpp +101 -4
- package/src/llama.cpp/src/llama-adapter.h +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +69 -2
- package/src/llama.cpp/src/llama-arch.h +6 -0
- package/src/llama.cpp/src/llama-context.cpp +92 -45
- package/src/llama.cpp/src/llama-context.h +1 -5
- package/src/llama.cpp/src/llama-graph.cpp +74 -19
- package/src/llama.cpp/src/llama-graph.h +10 -1
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -3
- package/src/llama.cpp/src/llama-impl.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +33 -120
- package/src/llama.cpp/src/llama-kv-cache.h +4 -13
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +434 -21
- package/src/llama.cpp/src/llama-model.h +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/llama.cpp/src/llama-vocab.cpp +1 -1
- package/src/llama.cpp/src/llama.cpp +12 -0
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
package/src/llama.cpp/src/llama-model.cpp

@@ -47,6 +47,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_410M: return "410M";
         case LLM_TYPE_450M: return "450M";
         case LLM_TYPE_475M: return "475M";
+        case LLM_TYPE_558M: return "558M";
         case LLM_TYPE_700M: return "700M";
         case LLM_TYPE_770M: return "770M";
         case LLM_TYPE_780M: return "780M";
@@ -772,6 +773,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_JINA_BERT_V3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+                switch (hparams.n_layer) {
+                    case 24:
+                        type = LLM_TYPE_558M; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
             {
@@ -1097,7 +1110,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                 switch (hparams.n_layer) {
-                    case 18: type =
+                    case 18: type = LLM_TYPE_270M; break;
                     case 26: type = LLM_TYPE_1B; break;
                     case 34: type = LLM_TYPE_4B; break;
                     case 48: type = LLM_TYPE_12B; break;
@@ -1129,6 +1142,26 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GEMMA_EMBEDDING:
+            {
+                hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
+                hparams.set_swa_pattern(6);
+
+                hparams.causal_attn = false; // embeddings do not use causal attention
+                hparams.rope_freq_base_train_swa = 10000.0f;
+                hparams.rope_freq_scale_train_swa = 1.0f;
+
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_0_3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+                hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
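Editor's note: the new GEMMA_EMBEDDING block derives its query scale from the key head size, f_attention_scale = 1 / sqrt(n_embd_head_k); the graph builder added later in this file multiplies Q by this value and then passes a scale of 1.0f to build_attn. A minimal standalone sketch of that computation, using a hypothetical head size of 256 rather than a value taken from any real model:

    #include <cmath>
    #include <cstdio>

    // Sketch only: mirrors the hparams.f_attention_scale formula above.
    // The head size is a hypothetical example, not read from an actual GGUF.
    int main() {
        const float n_embd_head_k     = 256.0f;
        const float f_attention_scale = 1.0f / std::sqrt(n_embd_head_k);
        std::printf("f_attention_scale = %.4f\n", f_attention_scale); // 1/16 = 0.0625
        return 0;
    }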
@@ -1557,6 +1590,27 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_NEMOTRON_H:
+            {
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+                // A layer is recurrent IFF the n_head_kv value is set to 0 and
+                // the n_ff value is set to 0
+                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+                    hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
+                }
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 56: type = LLM_TYPE_9B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_EXAONE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
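Editor's note: the comment in this hunk states the rule the rest of the diff relies on: a Nemotron-H layer is recurrent exactly when both n_head_kv(i) and n_ff(i) are zero, an attention layer when only n_ff(i) is zero, and a plain MLP layer otherwise. A self-contained sketch of that three-way split, with hypothetical per-layer values rather than the real Nemotron-H schedule:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Sketch of the classification implied by the comment above:
    //   recurrent (Mamba)  <=> n_head_kv == 0 && n_ff == 0
    //   attention          <=> n_head_kv  > 0 && n_ff == 0
    //   mlp                <=> n_ff > 0
    // The sample layers are hypothetical, not the actual Nemotron-H layout.
    struct layer_meta { uint32_t n_head_kv; uint32_t n_ff; };

    static const char * classify(const layer_meta & l) {
        if (l.n_head_kv == 0 && l.n_ff == 0) return "recurrent";
        if (l.n_ff == 0)                     return "attention";
        return "mlp";
    }

    int main() {
        const std::vector<layer_meta> layers = { {0, 0}, {8, 0}, {0, 12288}, {0, 0} };
        for (size_t i = 0; i < layers.size(); ++i) {
            std::printf("layer %zu -> %s\n", i, classify(layers[i]));
        }
        return 0;
    }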
@@ -2631,6 +2685,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
+        case LLM_ARCH_JINA_BERT_V3:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                 type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
@@ -2666,24 +2721,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 }

                 layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

                 layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
                 layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);

                 if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
-                    layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
                     layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), { n_embd, n_ff, n_expert}, 0);
                     layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                     layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
                 } else {
-                    layer.ffn_up
-                    layer.
-
-
-
-
-                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-                    } else {
+                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias",   i), {n_ff}, TENSOR_NOT_REQUIRED);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                    layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+                    if (arch == LLM_ARCH_NOMIC_BERT) {
                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                     }
                 }
@@ -3451,6 +3504,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_GEMMA3:
+        case LLM_ARCH_GEMMA_EMBEDDING:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -4676,6 +4730,75 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                     }
                 } break;
+        case LLM_ARCH_NEMOTRON_H:
+            {
+                // mamba2 Mixer SSM params
+                // NOTE: int64_t for tensor dimensions
+                const int64_t d_conv     = hparams.ssm_d_conv;
+                const int64_t d_inner    = hparams.ssm_d_inner;
+                const int64_t d_state    = hparams.ssm_d_state;
+                const int64_t n_ssm_head = hparams.ssm_dt_rank;
+                const int64_t n_group    = hparams.ssm_n_group;
+                const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;
+
+                // embeddings
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                {
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed, duplicated to allow offloading
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    // all blocks use the attn norm
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                    if (hparams.is_recurrent(i)) {
+                        // ssm layers
+                        layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
+
+                        layer.ssm_conv1d   = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
+                        layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
+
+                        layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
+
+                        // no "weight" suffix for these
+                        layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
+                        layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
+
+                        layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
+
+                        // out_proj
+                        layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+                    } else if (hparams.n_ff(i) == 0) {
+                        // attention layers (with optional bias)
+                        const int64_t n_head_i       = hparams.n_head(i);
+                        const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
+                        const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                    } else {
+                        // mlp layers
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+                    }
+                }
+            } break;
         case LLM_ARCH_EXAONE:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
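Editor's note: the in-projection width used for the SSM_IN tensor above is d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head. A worked example with hypothetical Mamba-2-style dimensions (not the actual Nemotron-H hyperparameters) shows how the three terms add up:

    #include <cstdint>
    #include <cstdio>

    // Worked example of the SSM in-projection width computed above.
    // d_inner, d_state, n_group and n_ssm_head are placeholder values,
    // not the real Nemotron-H hyperparameters.
    int main() {
        const int64_t d_inner    = 4096;
        const int64_t d_state    = 128;
        const int64_t n_group    = 8;
        const int64_t n_ssm_head = 64;

        // two d_inner-wide streams, per-group B and C states, plus one dt value per SSM head
        const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
        std::printf("d_in_proj = %lld\n", (long long) d_in_proj); // 8192 + 2048 + 64 = 10304
        return 0;
    }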
@@ -5850,7 +5973,8 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_JAMBA ||
         arch == LLM_ARCH_FALCON_H1 ||
         arch == LLM_ARCH_PLAMO2 ||
-        arch == LLM_ARCH_GRANITE_HYBRID
+        arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_NEMOTRON_H) {
         LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
         LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -7461,7 +7585,7 @@ struct llm_build_bert : public llm_graph_context {
             }

             // RoPE
-            if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+            if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
                 Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -7520,7 +7644,7 @@ struct llm_build_bert : public llm_graph_context {
                     0.0f,
                     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
             cb(cur, "ffn_moe_out", il);
-        } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+        } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
             cur = build_ffn(cur,
                     model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                     NULL, NULL, NULL,
@@ -10942,6 +11066,137 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
     }
 };

+struct llm_build_gemma_embedding_iswa : public llm_graph_context {
+    llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_k;
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
+        if (ubatch.token) {
+            inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+            cb(inpL, "inp_scaled", -1);
+        }
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        // TODO: support cacheless iSWA embeddings [TAG_NO_CACHE_ISWA]
+        auto * inp_attn = build_attn_inp_kv_iswa();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+            const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+            // norm
+            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315
+                Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            cur = build_norm(cur,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_post_norm", il);
+
+            ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = build_norm(sa_out,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = build_norm(cur,
+                    model.layers[il].ffn_post_norm, NULL,
+                    LLM_NORM_RMS, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, sa_out);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 // TODO: move up next to build_starcoder
 struct llm_build_starcoder2 : public llm_graph_context {
     llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
@@ -14117,6 +14372,138 @@ struct llm_build_nemotron : public llm_graph_context {
     }
 };

+struct llm_build_nemotron_h : public llm_graph_context_mamba {
+    llm_build_nemotron_h(
+            const llama_model & model,
+            const llm_graph_params & params) :
+        llm_graph_context_mamba(params) {
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        auto * inp = build_inp_mem_hybrid();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            if (hparams.is_recurrent(il)) {
+                // ssm layer //
+                cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
+            } else if (hparams.n_ff(il) == 0) {
+                // attention layer //
+                cur = build_attention_layer(cur, inp->get_attn(), model, n_embd_head, il);
+            } else {
+                cur = build_ffn_layer(cur, model, il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // add residual
+            cur = ggml_add(ctx0, cur, inpSA);
+            cb(cur, "block_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+
+    ggml_tensor * build_attention_layer(
+              ggml_tensor             * cur,
+              llm_graph_input_attn_kv * inp_attn,
+        const llama_model             & model,
+        const int64_t                   n_embd_head,
+        const int                       il) {
+
+        // compute Q and K and (optionally) RoPE them
+        ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+        cb(Qcur, "Qcur", il);
+        if (model.layers[il].bq) {
+            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+            cb(Qcur, "Qcur", il);
+        }
+
+        ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+        cb(Kcur, "Kcur", il);
+        if (model.layers[il].bk) {
+            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+            cb(Kcur, "Kcur", il);
+        }
+
+        ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+        cb(Vcur, "Vcur", il);
+        if (model.layers[il].bv) {
+            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+            cb(Vcur, "Vcur", il);
+        }
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+
+        cb(Qcur, "Qcur", il);
+        cb(Kcur, "Kcur", il);
+        cb(Vcur, "Vcur", il);
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        cur = build_attn(inp_attn,
+                model.layers[il].wo, model.layers[il].bo,
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        cb(cur, "attn_out", il);
+        return cur;
+    }
+
+    ggml_tensor * build_ffn_layer(
+              ggml_tensor * cur,
+        const llama_model & model,
+        const int           il) {
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                NULL, NULL, NULL,
+                model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        return cur;
+    }
+};
+
 struct llm_build_exaone : public llm_graph_context {
     llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -18241,10 +18628,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         // switch statement
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_JINA_BERT_V3:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_WAVTOKENIZER_DEC:
+        //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
        case LLM_ARCH_DREAM:
        case LLM_ARCH_LLADA:
            {
@@ -18264,6 +18653,23 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         cparams.n_seq_max,
                         nullptr);
             } else if (llm_arch_is_hybrid(arch)) {
+
+                // The main difference between hybrid architectures is the
+                // layer filters, so pick the right one here
+                llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
+                llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
+                if (arch == LLM_ARCH_FALCON_H1) {
+                    filter_attn = [&](int32_t) { return true; };
+                    filter_recr = [&](int32_t) { return true; };
+                } else if (arch == LLM_ARCH_NEMOTRON_H) {
+                    filter_attn = [&](int32_t il) {
+                        return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+                    };
+                    filter_recr = [&](int32_t il) {
+                        return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+                    };
+                }
+
                 const auto padding = llama_kv_cache::get_padding(cparams);

                 cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
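Editor's note: this hunk selects the architecture-specific per-layer predicates before the hybrid memory is constructed, and the following hunk moves them into the constructor call. A simplified sketch of that pattern, with a made-up constructor signature and layer rule standing in for the real llama_memory_hybrid interface:

    #include <cstdint>
    #include <functional>
    #include <utility>

    // Sketch only: a hybrid-memory-like type that takes per-layer filters by value,
    // so the caller can std::move the std::function objects it just configured.
    // The signature and the even/odd rule are hypothetical, not the real API.
    using layer_filter_cb = std::function<bool(int32_t)>;

    struct hybrid_memory_sketch {
        layer_filter_cb filter_attn;
        layer_filter_cb filter_recr;

        hybrid_memory_sketch(layer_filter_cb attn, layer_filter_cb recr)
            : filter_attn(std::move(attn)), filter_recr(std::move(recr)) {}
    };

    int main() {
        // pick the architecture-specific filters first, as the diff does...
        layer_filter_cb filter_attn = [](int32_t il) { return il % 2 == 0; };
        layer_filter_cb filter_recr = [](int32_t il) { return il % 2 == 1; };

        // ...then hand them off without copying the captured state
        hybrid_memory_sketch mem(std::move(filter_attn), std::move(filter_recr));
        return mem.filter_attn(0) ? 0 : 1;
    }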
@@ -18283,8 +18689,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     /* n_seq_max */ cparams.n_seq_max,
                     /* offload */ cparams.offload_kqv,
                     /* unified */ cparams.kv_unified,
-                    /* filter_attn */ (
-                    /* filter_recr */ (
+                    /* filter_attn */ std::move(filter_attn),
+                    /* filter_recr */ std::move(filter_recr));
             } else {
                 const auto padding = llama_kv_cache::get_padding(cparams);

@@ -18395,6 +18801,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_JINA_BERT_V3:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
             {
@@ -18507,6 +18914,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_gemma3n_iswa>(*this, params);
             } break;
+        case LLM_ARCH_GEMMA_EMBEDDING:
+            {
+                llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 llm = std::make_unique<llm_build_starcoder2>(*this, params);
@@ -18611,6 +19022,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_nemotron>(*this, params);
            } break;
+        case LLM_ARCH_NEMOTRON_H:
+            {
+                llm = std::make_unique<llm_build_nemotron_h>(*this, params);
+            } break;
        case LLM_ARCH_EXAONE:
            {
                llm = std::make_unique<llm_build_exaone>(*this, params);
@@ -18736,7 +19151,7 @@ llama_model_params llama_model_default_params() {
     llama_model_params result = {
         /*.devices =*/ nullptr,
         /*.tensor_buft_overrides =*/ nullptr,
-        /*.n_gpu_layers =*/
+        /*.n_gpu_layers =*/ 999,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
@@ -18750,11 +19165,6 @@ llama_model_params llama_model_default_params() {
         /*.use_extra_bufts =*/ true,
     };

-#ifdef GGML_USE_METAL
-    // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
-    result.n_gpu_layers = 999;
-#endif
-
     return result;
 }

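Editor's note: taken together, the last two hunks change llama_model_default_params() from offloading all layers only on Metal builds to requesting full offload (n_gpu_layers = 999) on every backend. A minimal sketch, assuming the public llama.h C API, of how an embedder that relied on the old CPU-only default would opt back in (the model path is a placeholder):

    #include "llama.h"

    // Minimal sketch: with 1.2.0-rc.0 the default model params request full GPU
    // offload on every backend, so CPU-only weight loading must now be explicit.
    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 0; // opt back into CPU-only weights if that is what you relied on

        // "model.gguf" is a placeholder path, not a file shipped with the package
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == NULL) {
            llama_backend_free();
            return 1;
        }

        llama_model_free(model);
        llama_backend_free();
        return 0;
    }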
@@ -18846,6 +19256,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_RWKV7:
         case LLM_ARCH_ARWKV7:
         case LLM_ARCH_WAVTOKENIZER_DEC:
+        case LLM_ARCH_NEMOTRON_H:
             return LLAMA_ROPE_TYPE_NONE;

         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -18885,6 +19296,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GROK:
         case LLM_ARCH_DBRX:
         case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V3:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_STABLELM:
@@ -18906,6 +19318,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GEMMA2:
         case LLM_ARCH_GEMMA3:
         case LLM_ARCH_GEMMA3N:
+        case LLM_ARCH_GEMMA_EMBEDDING:
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX: