@fugood/llama.node 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +12 -12
- package/src/llama.cpp/common/arg.cpp +10 -0
- package/src/llama.cpp/common/chat.cpp +37 -20
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.h +3 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +90 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +26 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +534 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +54 -0
- package/src/llama.cpp/src/llama-arch.cpp +18 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +31 -31
- package/src/llama.cpp/src/llama-graph.h +2 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +21 -16
- package/src/llama.cpp/src/llama-model.cpp +178 -0
- package/src/llama.cpp/src/llama-model.h +1 -0
|
@@ -76,6 +76,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
|
76
76
|
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
|
77
77
|
{ LLM_ARCH_DOTS1, "dots1" },
|
|
78
78
|
{ LLM_ARCH_ARCEE, "arcee" },
|
|
79
|
+
{ LLM_ARCH_ERNIE4_5, "ernie4_5" },
|
|
79
80
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
|
80
81
|
};
|
|
81
82
|
|
|
@@ -1658,6 +1659,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
|
1658
1659
|
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
|
1659
1660
|
}
|
|
1660
1661
|
},
|
|
1662
|
+
{
|
|
1663
|
+
LLM_ARCH_ERNIE4_5,
|
|
1664
|
+
{
|
|
1665
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
1666
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
1667
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
|
1668
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
1669
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
1670
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
1671
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|
1672
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
1673
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|
1674
|
+
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
|
1675
|
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
1676
|
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
1677
|
+
},
|
|
1678
|
+
},
|
|
1661
1679
|
{
|
|
1662
1680
|
LLM_ARCH_UNKNOWN,
|
|
1663
1681
|
{
|
|
@@ -560,12 +560,20 @@ ggml_tensor * llm_graph_context::build_ffn(
|
|
|
560
560
|
|
|
561
561
|
switch (type_op) {
|
|
562
562
|
case LLM_FFN_SILU:
|
|
563
|
-
{
|
|
563
|
+
if (gate && type_gate == LLM_FFN_PAR) {
|
|
564
|
+
cur = ggml_swiglu_split(ctx0, cur, tmp);
|
|
565
|
+
cb(cur, "ffn_swiglu", il);
|
|
566
|
+
type_gate = LLM_FFN_SEQ;
|
|
567
|
+
} else {
|
|
564
568
|
cur = ggml_silu(ctx0, cur);
|
|
565
569
|
cb(cur, "ffn_silu", il);
|
|
566
570
|
} break;
|
|
567
571
|
case LLM_FFN_GELU:
|
|
568
|
-
{
|
|
572
|
+
if (gate && type_gate == LLM_FFN_PAR) {
|
|
573
|
+
cur = ggml_geglu_split(ctx0, cur, tmp);
|
|
574
|
+
cb(cur, "ffn_geglu", il);
|
|
575
|
+
type_gate = LLM_FFN_SEQ;
|
|
576
|
+
} else {
|
|
569
577
|
cur = ggml_gelu(ctx0, cur);
|
|
570
578
|
cb(cur, "ffn_gelu", il);
|
|
571
579
|
if (act_scales != NULL) {
|
|
@@ -574,7 +582,11 @@ ggml_tensor * llm_graph_context::build_ffn(
|
|
|
574
582
|
}
|
|
575
583
|
} break;
|
|
576
584
|
case LLM_FFN_RELU:
|
|
577
|
-
{
|
|
585
|
+
if (gate && type_gate == LLM_FFN_PAR) {
|
|
586
|
+
cur = ggml_reglu_split(ctx0, cur, tmp);
|
|
587
|
+
cb(cur, "ffn_reglu", il);
|
|
588
|
+
type_gate = LLM_FFN_SEQ;
|
|
589
|
+
} else {
|
|
578
590
|
cur = ggml_relu(ctx0, cur);
|
|
579
591
|
cb(cur, "ffn_relu", il);
|
|
580
592
|
} break;
|
|
@@ -588,32 +600,19 @@ ggml_tensor * llm_graph_context::build_ffn(
|
|
|
588
600
|
} break;
|
|
589
601
|
case LLM_FFN_SWIGLU:
|
|
590
602
|
{
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
// TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
|
|
594
|
-
ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
|
|
595
|
-
ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
|
|
596
|
-
|
|
597
|
-
x0 = ggml_silu(ctx0, x0);
|
|
598
|
-
cb(cur, "ffn_silu", il);
|
|
599
|
-
|
|
600
|
-
cur = ggml_mul(ctx0, x0, x1);
|
|
601
|
-
cb(cur, "ffn_mul", il);
|
|
603
|
+
cur = ggml_swiglu(ctx0, cur);
|
|
604
|
+
cb(cur, "ffn_swiglu", il);
|
|
602
605
|
} break;
|
|
603
606
|
case LLM_FFN_GEGLU:
|
|
604
607
|
{
|
|
605
|
-
|
|
606
|
-
int64_t split_point = cur->ne[0] / 2;
|
|
607
|
-
// TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
|
|
608
|
-
ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
|
|
609
|
-
ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
|
|
610
|
-
|
|
611
|
-
x0 = ggml_gelu(ctx0, x0);
|
|
612
|
-
cb(x0, "ffn_gelu", il);
|
|
613
|
-
|
|
614
|
-
cur = ggml_mul(ctx0, x0, x1);
|
|
608
|
+
cur = ggml_geglu(ctx0, cur);
|
|
615
609
|
cb(cur, "ffn_geglu", il);
|
|
616
610
|
} break;
|
|
611
|
+
case LLM_FFN_REGLU:
|
|
612
|
+
{
|
|
613
|
+
cur = ggml_reglu(ctx0, cur);
|
|
614
|
+
cb(cur, "ffn_reglu", il);
|
|
615
|
+
} break;
|
|
617
616
|
}
|
|
618
617
|
|
|
619
618
|
if (gate && type_gate == LLM_FFN_PAR) {
|
|
@@ -743,12 +742,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|
|
743
742
|
|
|
744
743
|
switch (type_op) {
|
|
745
744
|
case LLM_FFN_SILU:
|
|
746
|
-
{
|
|
745
|
+
if (gate_exps) {
|
|
746
|
+
cur = ggml_swiglu_split(ctx0, cur, up);
|
|
747
|
+
cb(cur, "ffn_moe_swiglu", il);
|
|
748
|
+
} else {
|
|
747
749
|
cur = ggml_silu(ctx0, cur);
|
|
748
750
|
cb(cur, "ffn_moe_silu", il);
|
|
749
751
|
} break;
|
|
750
752
|
case LLM_FFN_GELU:
|
|
751
|
-
{
|
|
753
|
+
if (gate_exps) {
|
|
754
|
+
cur = ggml_geglu_split(ctx0, cur, up);
|
|
755
|
+
cb(cur, "ffn_moe_geglu", il);
|
|
756
|
+
} else {
|
|
752
757
|
cur = ggml_gelu(ctx0, cur);
|
|
753
758
|
cb(cur, "ffn_moe_gelu", il);
|
|
754
759
|
} break;
|
|
@@ -756,11 +761,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|
|
756
761
|
GGML_ABORT("fatal error");
|
|
757
762
|
}
|
|
758
763
|
|
|
759
|
-
if (gate_exps) {
|
|
760
|
-
cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
|
|
761
|
-
cb(cur, "ffn_moe_gate_par", il);
|
|
762
|
-
}
|
|
763
|
-
|
|
764
764
|
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
|
765
765
|
cb(experts, "ffn_moe_down", il);
|
|
766
766
|
|
|
@@ -38,6 +38,7 @@ enum llm_ffn_op_type {
|
|
|
38
38
|
LLM_FFN_RELU_SQR,
|
|
39
39
|
LLM_FFN_SWIGLU,
|
|
40
40
|
LLM_FFN_GEGLU,
|
|
41
|
+
LLM_FFN_REGLU,
|
|
41
42
|
};
|
|
42
43
|
|
|
43
44
|
enum llm_ffn_gate_type {
|
|
@@ -475,6 +476,7 @@ struct llm_graph_context {
|
|
|
475
476
|
std::unique_ptr<llm_graph_result> res;
|
|
476
477
|
|
|
477
478
|
llm_graph_context(const llm_graph_params & params);
|
|
479
|
+
virtual ~llm_graph_context() = default;
|
|
478
480
|
|
|
479
481
|
void cb(ggml_tensor * cur, const char * name, int il) const;
|
|
480
482
|
|
|
@@ -363,30 +363,35 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
|
|
|
363
363
|
}
|
|
364
364
|
|
|
365
365
|
llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
|
|
366
|
-
|
|
366
|
+
do {
|
|
367
|
+
balloc.split_reset();
|
|
367
368
|
|
|
368
|
-
|
|
369
|
-
|
|
369
|
+
std::vector<llama_ubatch> ubatches;
|
|
370
|
+
while (true) {
|
|
371
|
+
llama_ubatch ubatch;
|
|
370
372
|
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
373
|
+
if (embd_all) {
|
|
374
|
+
// if all tokens are output, split by sequence
|
|
375
|
+
ubatch = balloc.split_seq(n_ubatch);
|
|
376
|
+
} else {
|
|
377
|
+
ubatch = balloc.split_equal(n_ubatch);
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
if (ubatch.n_tokens == 0) {
|
|
381
|
+
break;
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
ubatches.push_back(std::move(ubatch)); // NOLINT
|
|
376
385
|
}
|
|
377
386
|
|
|
378
|
-
if (
|
|
387
|
+
if (!prepare(ubatches)) {
|
|
379
388
|
break;
|
|
380
389
|
}
|
|
381
390
|
|
|
382
|
-
|
|
383
|
-
}
|
|
384
|
-
|
|
385
|
-
if (!prepare(ubatches)) {
|
|
386
|
-
return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
|
387
|
-
}
|
|
391
|
+
return std::make_unique<llama_memory_recurrent_context>(this, std::move(ubatches));
|
|
392
|
+
} while (false);
|
|
388
393
|
|
|
389
|
-
return std::make_unique<llama_memory_recurrent_context>(
|
|
394
|
+
return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
|
390
395
|
}
|
|
391
396
|
|
|
392
397
|
llama_memory_context_ptr llama_memory_recurrent::init_full() {
|
|
@@ -47,6 +47,7 @@ const char * llm_type_name(llm_type type) {
|
|
|
47
47
|
case LLM_TYPE_475M: return "475M";
|
|
48
48
|
case LLM_TYPE_770M: return "770M";
|
|
49
49
|
case LLM_TYPE_780M: return "780M";
|
|
50
|
+
case LLM_TYPE_0_3B: return "0.3B";
|
|
50
51
|
case LLM_TYPE_0_5B: return "0.5B";
|
|
51
52
|
case LLM_TYPE_0_6B: return "0.6B";
|
|
52
53
|
case LLM_TYPE_1B: return "1B";
|
|
@@ -1504,6 +1505,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1504
1505
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1505
1506
|
}
|
|
1506
1507
|
} break;
|
|
1508
|
+
case LLM_ARCH_ERNIE4_5:
|
|
1509
|
+
{
|
|
1510
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1511
|
+
switch (hparams.n_layer) {
|
|
1512
|
+
case 18: type = LLM_TYPE_0_3B; break;
|
|
1513
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1514
|
+
}
|
|
1515
|
+
} break;
|
|
1507
1516
|
default: throw std::runtime_error("unsupported model architecture");
|
|
1508
1517
|
}
|
|
1509
1518
|
|
|
@@ -4344,6 +4353,40 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
4344
4353
|
|
|
4345
4354
|
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
4346
4355
|
|
|
4356
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4357
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4358
|
+
}
|
|
4359
|
+
} break;
|
|
4360
|
+
case LLM_ARCH_ERNIE4_5:
|
|
4361
|
+
{
|
|
4362
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4363
|
+
|
|
4364
|
+
// output
|
|
4365
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4366
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
4367
|
+
// if output is NULL, init from the input tok embed
|
|
4368
|
+
if (output == NULL) {
|
|
4369
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
4370
|
+
}
|
|
4371
|
+
|
|
4372
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4373
|
+
auto & layer = layers[i];
|
|
4374
|
+
|
|
4375
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4376
|
+
|
|
4377
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
4378
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
4379
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
4380
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
4381
|
+
|
|
4382
|
+
// optional bias tensors
|
|
4383
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
4384
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
4385
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
4386
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
4387
|
+
|
|
4388
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
4389
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
4347
4390
|
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
4348
4391
|
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
4349
4392
|
}
|
|
@@ -14125,6 +14168,136 @@ struct llm_build_dots1 : public llm_graph_context {
|
|
|
14125
14168
|
}
|
|
14126
14169
|
};
|
|
14127
14170
|
|
|
14171
|
+
struct llm_build_ernie4_5 : public llm_graph_context {
|
|
14172
|
+
llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
14173
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
14174
|
+
|
|
14175
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
14176
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
14177
|
+
|
|
14178
|
+
ggml_tensor * cur;
|
|
14179
|
+
ggml_tensor * inpL;
|
|
14180
|
+
|
|
14181
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
14182
|
+
|
|
14183
|
+
// inp_pos - contains the positions
|
|
14184
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
14185
|
+
|
|
14186
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
14187
|
+
|
|
14188
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
14189
|
+
ggml_tensor * inpSA = inpL;
|
|
14190
|
+
|
|
14191
|
+
// norm
|
|
14192
|
+
{
|
|
14193
|
+
cur = build_norm(inpL,
|
|
14194
|
+
model.layers[il].attn_norm, NULL,
|
|
14195
|
+
LLM_NORM_RMS, il);
|
|
14196
|
+
cb(cur, "attn_norm", il);
|
|
14197
|
+
}
|
|
14198
|
+
|
|
14199
|
+
// self-attention
|
|
14200
|
+
{
|
|
14201
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
14202
|
+
cb(Qcur, "Qcur", il);
|
|
14203
|
+
if (model.layers[il].bq) {
|
|
14204
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
14205
|
+
cb(Qcur, "Qcur", il);
|
|
14206
|
+
}
|
|
14207
|
+
|
|
14208
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
14209
|
+
cb(Kcur, "Kcur", il);
|
|
14210
|
+
if (model.layers[il].bk) {
|
|
14211
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
14212
|
+
cb(Kcur, "Kcur", il);
|
|
14213
|
+
}
|
|
14214
|
+
|
|
14215
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
14216
|
+
cb(Vcur, "Vcur", il);
|
|
14217
|
+
if (model.layers[il].bv) {
|
|
14218
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
14219
|
+
cb(Vcur, "Vcur", il);
|
|
14220
|
+
}
|
|
14221
|
+
|
|
14222
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
14223
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
14224
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
14225
|
+
|
|
14226
|
+
Qcur = ggml_rope_ext(
|
|
14227
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
14228
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14229
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14230
|
+
);
|
|
14231
|
+
|
|
14232
|
+
Kcur = ggml_rope_ext(
|
|
14233
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
14234
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
14235
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
14236
|
+
);
|
|
14237
|
+
|
|
14238
|
+
cb(Qcur, "Qcur", il);
|
|
14239
|
+
cb(Kcur, "Kcur", il);
|
|
14240
|
+
cb(Vcur, "Vcur", il);
|
|
14241
|
+
|
|
14242
|
+
cur = build_attn(inp_attn, gf,
|
|
14243
|
+
model.layers[il].wo, NULL,
|
|
14244
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
14245
|
+
}
|
|
14246
|
+
|
|
14247
|
+
if (il == n_layer - 1) {
|
|
14248
|
+
// skip computing output for unused tokens
|
|
14249
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
14250
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
14251
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
14252
|
+
}
|
|
14253
|
+
|
|
14254
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
14255
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
14256
|
+
|
|
14257
|
+
// feed-forward network
|
|
14258
|
+
{
|
|
14259
|
+
cur = build_norm(ffn_inp,
|
|
14260
|
+
model.layers[il].ffn_norm, NULL,
|
|
14261
|
+
LLM_NORM_RMS, il);
|
|
14262
|
+
cb(cur, "ffn_norm", il);
|
|
14263
|
+
|
|
14264
|
+
cur = build_ffn(cur,
|
|
14265
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
14266
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
14267
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
14268
|
+
NULL,
|
|
14269
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
14270
|
+
cb(cur, "ffn_out", il);
|
|
14271
|
+
}
|
|
14272
|
+
|
|
14273
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
14274
|
+
|
|
14275
|
+
cur = build_cvec(cur, il);
|
|
14276
|
+
cb(cur, "l_out", il);
|
|
14277
|
+
|
|
14278
|
+
// input for next layer
|
|
14279
|
+
inpL = cur;
|
|
14280
|
+
}
|
|
14281
|
+
|
|
14282
|
+
cur = inpL;
|
|
14283
|
+
|
|
14284
|
+
cur = build_norm(cur,
|
|
14285
|
+
model.output_norm, NULL,
|
|
14286
|
+
LLM_NORM_RMS, -1);
|
|
14287
|
+
|
|
14288
|
+
cb(cur, "result_norm", -1);
|
|
14289
|
+
res->t_embd = cur;
|
|
14290
|
+
|
|
14291
|
+
// lm_head
|
|
14292
|
+
cur = build_lora_mm(model.output, cur);
|
|
14293
|
+
|
|
14294
|
+
cb(cur, "result_output", -1);
|
|
14295
|
+
res->t_logits = cur;
|
|
14296
|
+
|
|
14297
|
+
ggml_build_forward_expand(gf, cur);
|
|
14298
|
+
}
|
|
14299
|
+
};
|
|
14300
|
+
|
|
14128
14301
|
struct llm_build_arcee : public llm_graph_context {
|
|
14129
14302
|
llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
14130
14303
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
@@ -14635,6 +14808,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
14635
14808
|
{
|
|
14636
14809
|
llm = std::make_unique<llm_build_arcee>(*this, params, gf);
|
|
14637
14810
|
} break;
|
|
14811
|
+
case LLM_ARCH_ERNIE4_5:
|
|
14812
|
+
{
|
|
14813
|
+
llm = std::make_unique<llm_build_ernie4_5>(*this, params, gf);
|
|
14814
|
+
} break;
|
|
14638
14815
|
default:
|
|
14639
14816
|
GGML_ABORT("fatal error");
|
|
14640
14817
|
}
|
|
@@ -14786,6 +14963,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
14786
14963
|
case LLM_ARCH_BAILINGMOE:
|
|
14787
14964
|
case LLM_ARCH_NEO_BERT:
|
|
14788
14965
|
case LLM_ARCH_ARCEE:
|
|
14966
|
+
case LLM_ARCH_ERNIE4_5:
|
|
14789
14967
|
return LLAMA_ROPE_TYPE_NORM;
|
|
14790
14968
|
|
|
14791
14969
|
// the pairs of head values are offset by n_rot/2
|