@fugood/llama.node 1.1.10 → 1.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +2 -1
- package/package.json +14 -14
- package/src/LlamaContext.cpp +17 -1
- package/src/llama.cpp/common/arg.cpp +29 -19
- package/src/llama.cpp/common/chat.cpp +152 -1
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +10 -3
- package/src/llama.cpp/common/common.h +4 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +14 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -1
- package/src/llama.cpp/include/llama.h +27 -1
- package/src/llama.cpp/src/llama-adapter.cpp +68 -4
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +46 -2
- package/src/llama.cpp/src/llama-arch.h +4 -0
- package/src/llama.cpp/src/llama-context.cpp +80 -39
- package/src/llama.cpp/src/llama-context.h +0 -4
- package/src/llama.cpp/src/llama-graph.cpp +20 -10
- package/src/llama.cpp/src/llama-graph.h +2 -1
- package/src/llama.cpp/src/llama-impl.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +32 -97
- package/src/llama.cpp/src/llama-kv-cache.h +3 -13
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +275 -20
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1 -1
- package/src/llama.cpp/src/llama.cpp +12 -0

package/src/llama.cpp/src/llama-model-loader.cpp
@@ -788,6 +788,7 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
 }
 
 struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
+    LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
     const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
     if (cur == NULL) {

package/src/llama.cpp/src/llama-model.cpp
@@ -47,6 +47,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_410M: return "410M";
         case LLM_TYPE_450M: return "450M";
         case LLM_TYPE_475M: return "475M";
+        case LLM_TYPE_558M: return "558M";
         case LLM_TYPE_700M: return "700M";
         case LLM_TYPE_770M: return "770M";
         case LLM_TYPE_780M: return "780M";
@@ -772,6 +773,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_JINA_BERT_V3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+                switch (hparams.n_layer) {
+                    case 24:
+                        type = LLM_TYPE_558M; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
             {
@@ -1557,6 +1570,27 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_NEMOTRON_H:
+            {
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+                // A layer is recurrent IFF the n_head_kv value is set to 0 and
+                // the n_ff value is set to 0
+                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+                    hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0);
+                }
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 56: type = LLM_TYPE_9B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_EXAONE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -2631,6 +2665,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_BERT:
             case LLM_ARCH_NOMIC_BERT:
             case LLM_ARCH_NOMIC_BERT_MOE:
+            case LLM_ARCH_JINA_BERT_V3:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                     type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
@@ -2666,24 +2701,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         }
 
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
 
                         layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
                         layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
 
                         if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
-                            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
                             layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
                             layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                             layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                         } else {
-                            layer.ffn_up
-                            layer.
-
-
-
-
-                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-                            } else {
+                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+                            if (arch == LLM_ARCH_NOMIC_BERT) {
                                 layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                             }
                         }
@@ -4676,6 +4709,75 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
                     }
                 } break;
+            case LLM_ARCH_NEMOTRON_H:
+                {
+                    // mamba2 Mixer SSM params
+                    // NOTE: int64_t for tensor dimensions
+                    const int64_t d_conv = hparams.ssm_d_conv;
+                    const int64_t d_inner = hparams.ssm_d_inner;
+                    const int64_t d_state = hparams.ssm_d_state;
+                    const int64_t n_ssm_head = hparams.ssm_dt_rank;
+                    const int64_t n_group = hparams.ssm_n_group;
+                    const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
+
+                    // embeddings
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    {
+                        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
+                        if (output == NULL) {
+                            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                        }
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        // all blocks use the attn norm
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        if (hparams.is_recurrent(i)) {
+                            // ssm layers
+                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
+
+                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
+                            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
+
+                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
+
+                            // no "weight" suffix for these
+                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
+                            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
+
+                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
+
+                            // out_proj
+                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+                        } else if (hparams.n_ff(i) == 0) {
+                            // attention layers (with optional bias)
+                            const int64_t n_head_i = hparams.n_head(i);
+                            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
+                            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
+                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
+                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
+                            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        } else {
+                            // mlp layers
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
+                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
+                            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
+                        }
+                    }
+                } break;
             case LLM_ARCH_EXAONE:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5850,7 +5952,8 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_JAMBA ||
         arch == LLM_ARCH_FALCON_H1 ||
         arch == LLM_ARCH_PLAMO2 ||
-        arch == LLM_ARCH_GRANITE_HYBRID
+        arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_NEMOTRON_H) {
         LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
         LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -7461,7 +7564,7 @@ struct llm_build_bert : public llm_graph_context {
            }
 
            // RoPE
-           if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+           if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
                Qcur = ggml_rope_ext(
                        ctx0, Qcur, inp_pos, nullptr,
                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -7520,7 +7623,7 @@ struct llm_build_bert : public llm_graph_context {
                        0.0f,
                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
                cb(cur, "ffn_moe_out", il);
-           } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+           } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) {
                cur = build_ffn(cur,
                        model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                        NULL, NULL, NULL,
@@ -14117,6 +14220,138 @@ struct llm_build_nemotron : public llm_graph_context {
     }
 };
 
+struct llm_build_nemotron_h : public llm_graph_context_mamba {
+    llm_build_nemotron_h(
+        const llama_model & model,
+        const llm_graph_params & params) :
+            llm_graph_context_mamba(params) {
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        auto * inp = build_inp_mem_hybrid();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            if (hparams.is_recurrent(il)) {
+                // ssm layer //
+                cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il);
+            } else if (hparams.n_ff(il) == 0) {
+                // attention layer //
+                cur = build_attention_layer(cur, inp->get_attn(), model, n_embd_head, il);
+            } else {
+                cur = build_ffn_layer(cur, model, il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // add residual
+            cur = ggml_add(ctx0, cur, inpSA);
+            cb(cur, "block_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+
+    ggml_tensor * build_attention_layer(
+        ggml_tensor * cur,
+        llm_graph_input_attn_kv * inp_attn,
+        const llama_model & model,
+        const int64_t n_embd_head,
+        const int il) {
+
+        // compute Q and K and (optionally) RoPE them
+        ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+        cb(Qcur, "Qcur", il);
+        if (model.layers[il].bq) {
+            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+            cb(Qcur, "Qcur", il);
+        }
+
+        ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+        cb(Kcur, "Kcur", il);
+        if (model.layers[il].bk) {
+            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+            cb(Kcur, "Kcur", il);
+        }
+
+        ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+        cb(Vcur, "Vcur", il);
+        if (model.layers[il].bv) {
+            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+            cb(Vcur, "Vcur", il);
+        }
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+
+        cb(Qcur, "Qcur", il);
+        cb(Kcur, "Kcur", il);
+        cb(Vcur, "Vcur", il);
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        cur = build_attn(inp_attn,
+                model.layers[il].wo, model.layers[il].bo,
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        cb(cur, "attn_out", il);
+        return cur;
+    }
+
+    ggml_tensor * build_ffn_layer(
+        ggml_tensor * cur,
+        const llama_model & model,
+        const int il) {
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                NULL, NULL, NULL,
+                model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        return cur;
+    }
+};
+
 struct llm_build_exaone : public llm_graph_context {
     llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -18241,6 +18476,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         // switch statement
         case LLM_ARCH_BERT:
         case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_JINA_BERT_V3:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_NEO_BERT:
@@ -18264,6 +18500,23 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                    cparams.n_seq_max,
                    nullptr);
        } else if (llm_arch_is_hybrid(arch)) {
+
+            // The main difference between hybrid architectures is the
+            // layer filters, so pick the right one here
+            llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
+            llama_memory_hybrid::layer_filter_cb filter_recr = nullptr;
+            if (arch == LLM_ARCH_FALCON_H1) {
+                filter_attn = [&](int32_t) { return true; };
+                filter_recr = [&](int32_t) { return true; };
+            } else if (arch == LLM_ARCH_NEMOTRON_H) {
+                filter_attn = [&](int32_t il) {
+                    return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+                };
+                filter_recr = [&](int32_t il) {
+                    return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
+                };
+            }
+
            const auto padding = llama_kv_cache::get_padding(cparams);
 
            cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
@@ -18283,8 +18536,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                    /* n_seq_max */ cparams.n_seq_max,
                    /* offload */ cparams.offload_kqv,
                    /* unified */ cparams.kv_unified,
-                   /* filter_attn */ (
-                   /* filter_recr */ (
+                   /* filter_attn */ std::move(filter_attn),
+                   /* filter_recr */ std::move(filter_recr));
        } else {
            const auto padding = llama_kv_cache::get_padding(cparams);
 
@@ -18395,6 +18648,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            } break;
        case LLM_ARCH_BERT:
        case LLM_ARCH_JINA_BERT_V2:
+       case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
            {
@@ -18611,6 +18865,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_nemotron>(*this, params);
            } break;
+       case LLM_ARCH_NEMOTRON_H:
+           {
+               llm = std::make_unique<llm_build_nemotron_h>(*this, params);
+           } break;
        case LLM_ARCH_EXAONE:
            {
                llm = std::make_unique<llm_build_exaone>(*this, params);
@@ -18736,7 +18994,7 @@ llama_model_params llama_model_default_params() {
    llama_model_params result = {
        /*.devices =*/ nullptr,
        /*.tensor_buft_overrides =*/ nullptr,
-       /*.n_gpu_layers =*/
+       /*.n_gpu_layers =*/ 999,
        /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu =*/ 0,
        /*.tensor_split =*/ nullptr,
@@ -18750,11 +19008,6 @@ llama_model_params llama_model_default_params() {
        /*.use_extra_bufts =*/ true,
    };
 
-#ifdef GGML_USE_METAL
-   // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
-   result.n_gpu_layers = 999;
-#endif
-
    return result;
 }
 
@@ -18846,6 +19099,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
        case LLM_ARCH_WAVTOKENIZER_DEC:
+       case LLM_ARCH_NEMOTRON_H:
            return LLAMA_ROPE_TYPE_NONE;
 
        // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -18885,6 +19139,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
+       case LLM_ARCH_JINA_BERT_V3:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_STABLELM:

package/src/llama.cpp/src/llama-vocab.cpp
@@ -2470,7 +2470,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
    // set attributes by model/tokenizer/architecture name
    if (false
            || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
-           || _contains_any(general_arch, {"nomic-bert-moe"})
+           || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
        ) {
        if (token_to_id.count("<mask>") == 0) {
            LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);

package/src/llama.cpp/src/llama.cpp
@@ -25,6 +25,18 @@
 // interface implementation
 //
 
+const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) {
+    switch (flash_attn_type) {
+        case LLAMA_FLASH_ATTN_TYPE_AUTO:
+            return "auto";
+        case LLAMA_FLASH_ATTN_TYPE_DISABLED:
+            return "disabled";
+        case LLAMA_FLASH_ATTN_TYPE_ENABLED:
+            return "enabled";
+    }
+    GGML_ABORT("fatal error");
+}
+
 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
        /*.no_perf =*/ true,