@fugood/llama.node 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -76,6 +76,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_BAILINGMOE, "bailingmoe" },
     { LLM_ARCH_DOTS1, "dots1" },
     { LLM_ARCH_ARCEE, "arcee" },
+    { LLM_ARCH_ERNIE4_5, "ernie4_5" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -1658,6 +1659,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
         }
     },
+    {
+        LLM_ARCH_ERNIE4_5,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -80,6 +80,7 @@ enum llm_arch {
     LLM_ARCH_BAILINGMOE,
     LLM_ARCH_DOTS1,
     LLM_ARCH_ARCEE,
+    LLM_ARCH_ERNIE4_5,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -560,12 +560,20 @@ ggml_tensor * llm_graph_context::build_ffn(
 
     switch (type_op) {
         case LLM_FFN_SILU:
-            {
+            if (gate && type_gate == LLM_FFN_PAR) {
+                cur = ggml_swiglu_split(ctx0, cur, tmp);
+                cb(cur, "ffn_swiglu", il);
+                type_gate = LLM_FFN_SEQ;
+            } else {
                 cur = ggml_silu(ctx0, cur);
                 cb(cur, "ffn_silu", il);
             } break;
         case LLM_FFN_GELU:
-            {
+            if (gate && type_gate == LLM_FFN_PAR) {
+                cur = ggml_geglu_split(ctx0, cur, tmp);
+                cb(cur, "ffn_geglu", il);
+                type_gate = LLM_FFN_SEQ;
+            } else {
                 cur = ggml_gelu(ctx0, cur);
                 cb(cur, "ffn_gelu", il);
                 if (act_scales != NULL) {
@@ -574,7 +582,11 @@ ggml_tensor * llm_graph_context::build_ffn(
                 }
             } break;
         case LLM_FFN_RELU:
-            {
+            if (gate && type_gate == LLM_FFN_PAR) {
+                cur = ggml_reglu_split(ctx0, cur, tmp);
+                cb(cur, "ffn_reglu", il);
+                type_gate = LLM_FFN_SEQ;
+            } else {
                 cur = ggml_relu(ctx0, cur);
                 cb(cur, "ffn_relu", il);
             } break;
@@ -588,32 +600,19 @@ ggml_tensor * llm_graph_context::build_ffn(
             } break;
         case LLM_FFN_SWIGLU:
             {
-                // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
-                int64_t split_point = cur->ne[0] / 2;
-                // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
-                ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
-                ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
-
-                x0 = ggml_silu(ctx0, x0);
-                cb(cur, "ffn_silu", il);
-
-                cur = ggml_mul(ctx0, x0, x1);
-                cb(cur, "ffn_mul", il);
+                cur = ggml_swiglu(ctx0, cur);
+                cb(cur, "ffn_swiglu", il);
             } break;
         case LLM_FFN_GEGLU:
             {
-                // Split into two equal parts
-                int64_t split_point = cur->ne[0] / 2;
-                // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
-                ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
-                ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
-
-                x0 = ggml_gelu(ctx0, x0);
-                cb(x0, "ffn_gelu", il);
-
-                cur = ggml_mul(ctx0, x0, x1);
+                cur = ggml_geglu(ctx0, cur);
                 cb(cur, "ffn_geglu", il);
             } break;
+        case LLM_FFN_REGLU:
+            {
+                cur = ggml_reglu(ctx0, cur);
+                cb(cur, "ffn_reglu", il);
+            } break;
     }
 
     if (gate && type_gate == LLM_FFN_PAR) {
@@ -743,12 +742,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 
     switch (type_op) {
         case LLM_FFN_SILU:
-            {
+            if (gate_exps) {
+                cur = ggml_swiglu_split(ctx0, cur, up);
+                cb(cur, "ffn_moe_swiglu", il);
+            } else {
                 cur = ggml_silu(ctx0, cur);
                 cb(cur, "ffn_moe_silu", il);
             } break;
         case LLM_FFN_GELU:
-            {
+            if (gate_exps) {
+                cur = ggml_geglu_split(ctx0, cur, up);
+                cb(cur, "ffn_moe_geglu", il);
+            } else {
                 cur = ggml_gelu(ctx0, cur);
                 cb(cur, "ffn_moe_gelu", il);
             } break;
@@ -756,11 +761,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             GGML_ABORT("fatal error");
     }
 
-    if (gate_exps) {
-        cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
-        cb(cur, "ffn_moe_gate_par", il);
-    }
-
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
@@ -38,6 +38,7 @@ enum llm_ffn_op_type {
     LLM_FFN_RELU_SQR,
     LLM_FFN_SWIGLU,
     LLM_FFN_GEGLU,
+    LLM_FFN_REGLU,
 };
 
 enum llm_ffn_gate_type {
@@ -475,6 +476,7 @@ struct llm_graph_context {
     std::unique_ptr<llm_graph_result> res;
 
     llm_graph_context(const llm_graph_params & params);
+    virtual ~llm_graph_context() = default;
 
     void cb(ggml_tensor * cur, const char * name, int il) const;
 
@@ -363,30 +363,35 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
 }
 
 llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
-    std::vector<llama_ubatch> ubatches;
+    do {
+        balloc.split_reset();
 
-    while (true) {
-        llama_ubatch ubatch;
+        std::vector<llama_ubatch> ubatches;
+        while (true) {
+            llama_ubatch ubatch;
 
-        if (embd_all) {
-            // if all tokens are output, split by sequence
-            ubatch = balloc.split_seq(n_ubatch);
-        } else {
-            ubatch = balloc.split_equal(n_ubatch);
+            if (embd_all) {
+                // if all tokens are output, split by sequence
+                ubatch = balloc.split_seq(n_ubatch);
+            } else {
+                ubatch = balloc.split_equal(n_ubatch);
+            }
+
+            if (ubatch.n_tokens == 0) {
+                break;
+            }
+
+            ubatches.push_back(std::move(ubatch)); // NOLINT
         }
 
-        if (ubatch.n_tokens == 0) {
+        if (!prepare(ubatches)) {
             break;
         }
 
-        ubatches.push_back(std::move(ubatch)); // NOLINT
-    }
-
-    if (!prepare(ubatches)) {
-        return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
-    }
+        return std::make_unique<llama_memory_recurrent_context>(this, std::move(ubatches));
+    } while (false);
 
-    return std::make_unique<llama_memory_recurrent_context>(this, std::move(ubatches));
+    return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
 }
 
 llama_memory_context_ptr llama_memory_recurrent::init_full() {
@@ -47,6 +47,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_475M: return "475M";
         case LLM_TYPE_770M: return "770M";
        case LLM_TYPE_780M: return "780M";
+        case LLM_TYPE_0_3B: return "0.3B";
        case LLM_TYPE_0_5B: return "0.5B";
        case LLM_TYPE_0_6B: return "0.6B";
        case LLM_TYPE_1B: return "1B";
@@ -1504,6 +1505,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_ERNIE4_5:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 18: type = LLM_TYPE_0_3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
 
@@ -4344,6 +4353,40 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
 
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                }
+            } break;
+        case LLM_ARCH_ERNIE4_5:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                    // optional bias tensors
+                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                    layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                     layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                     layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                 }
@@ -14125,6 +14168,136 @@ struct llm_build_dots1 : public llm_graph_context {
     }
 };
 
+struct llm_build_ernie4_5 : public llm_graph_context {
+    llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            {
+                cur = build_norm(inpL,
+                        model.layers[il].attn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "attn_norm", il);
+            }
+
+            // self-attention
+            {
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_arcee : public llm_graph_context {
     llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -14635,6 +14808,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_arcee>(*this, params, gf);
             } break;
+        case LLM_ARCH_ERNIE4_5:
+            {
+                llm = std::make_unique<llm_build_ernie4_5>(*this, params, gf);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -14786,6 +14963,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_BAILINGMOE:
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_ARCEE:
+        case LLM_ARCH_ERNIE4_5:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
@@ -39,6 +39,7 @@ enum llm_type {
     LLM_TYPE_475M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
+    LLM_TYPE_0_3B,
    LLM_TYPE_0_5B,
    LLM_TYPE_0_6B,
    LLM_TYPE_1B,