@fugood/llama.node 1.1.6 → 1.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +6 -1
  3. package/lib/index.ts +6 -0
  4. package/lib/version.js +5 -0
  5. package/lib/version.ts +2 -0
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +9 -9
  8. package/src/LlamaCompletionWorker.cpp +73 -20
  9. package/src/LlamaCompletionWorker.h +8 -0
  10. package/src/LlamaContext.cpp +9 -0
  11. package/src/common.hpp +8 -1
  12. package/src/llama.cpp/CMakeLists.txt +2 -0
  13. package/src/llama.cpp/common/arg.cpp +132 -41
  14. package/src/llama.cpp/common/chat-parser.cpp +9 -1
  15. package/src/llama.cpp/common/chat.cpp +311 -9
  16. package/src/llama.cpp/common/chat.h +4 -1
  17. package/src/llama.cpp/common/common.cpp +54 -0
  18. package/src/llama.cpp/common/common.h +46 -9
  19. package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
  20. package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
  21. package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
  22. package/src/llama.cpp/ggml/include/ggml.h +28 -2
  23. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
  27. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -1
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
  30. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -2
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -1
  33. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
  34. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
  36. package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
  37. package/src/llama.cpp/include/llama.h +25 -0
  38. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  39. package/src/llama.cpp/src/llama-chat.cpp +2 -4
  40. package/src/llama.cpp/src/llama-context.cpp +29 -22
  41. package/src/llama.cpp/src/llama-context.h +6 -5
  42. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
  43. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
  44. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +89 -69
  45. package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
  46. package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
  47. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
  48. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
  49. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  50. package/src/llama.cpp/src/llama-memory.h +2 -2
  51. package/src/llama.cpp/src/llama-model.cpp +81 -70
  52. package/src/llama.cpp/src/llama-model.h +2 -0
  53. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  54. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
package/src/llama.cpp/src/llama-model.cpp
@@ -86,6 +86,7 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_40B: return "40B";
  case LLM_TYPE_65B: return "65B";
  case LLM_TYPE_70B: return "70B";
+ case LLM_TYPE_120B: return "120B";
  case LLM_TYPE_142B: return "142B";
  case LLM_TYPE_236B: return "236B";
  case LLM_TYPE_290B: return "290B";
@@ -1095,6 +1096,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

  switch (hparams.n_layer) {
+ case 18: type = LLM_TYPE_537M; break;
  case 26: type = LLM_TYPE_1B; break;
  case 34: type = LLM_TYPE_4B; break;
  case 48: type = LLM_TYPE_12B; break;
@@ -1833,7 +1835,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(2);

- // TODO: switch (hparams.n_layer)
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_20B; break;
+ case 36: type = LLM_TYPE_120B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
  } break;
  case LLM_ARCH_LFM2:
  {
@@ -6742,9 +6748,9 @@ struct llm_build_falcon : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  // using mode = 2 for neox mode
  Qcur = ggml_rope_ext(
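Note: nearly every llama-model.cpp hunk in this release is the same refactor. The old code materialized a strided Q/K/V slice with ggml_cont() and then reshaped the copy with ggml_reshape_3d(); the new code keeps the view lazy and fuses the copy and the target shape into a single ggml_cont_3d() node, saving one intermediate tensor per split. A minimal before/after sketch in ggml's C API, reusing the names from the surrounding builders (v_old/v_new are illustrative renames; ggml_view_2d, ggml_cont, ggml_reshape_3d, and ggml_cont_3d are existing ggml ops):

// Before: copy the strided view, then reshape the copy (two graph nodes).
ggml_tensor * v_old = ggml_cont(ctx0,
        ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens,
                     cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
v_old = ggml_reshape_3d(ctx0, v_old, n_embd_head, n_head_kv, n_tokens);

// After: one node copies the strided view directly into a contiguous
// [n_embd_head, n_head_kv, n_tokens] tensor.
ggml_tensor * v_new = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens,
                                   cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
v_new = ggml_cont_3d(ctx0, v_new, n_embd_head, n_head_kv, n_tokens);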
@@ -7022,9 +7028,9 @@ struct llm_build_dbrx : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -7144,13 +7150,13 @@ struct llm_build_starcoder : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -7366,13 +7372,15 @@ struct llm_build_bert : public llm_graph_context {
  cb(cur, "bqkv", il);
  }

- Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+ Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

  if (model.layers[il].attn_q_norm) {
@@ -7380,6 +7388,10 @@ struct llm_build_bert : public llm_graph_context {
  model.layers[il].attn_q_norm,
  model.layers[il].attn_q_norm_b,
  LLM_NORM, il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ } else {
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  }

  if (model.layers[il].attn_k_norm) {
@@ -7387,11 +7399,11 @@ struct llm_build_bert : public llm_graph_context {
  model.layers[il].attn_k_norm,
  model.layers[il].attn_k_norm_b,
  LLM_NORM, il);
- }

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ } else {
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ }

  // RoPE
  if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
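Note: the asymmetry introduced in the bert hunks above is deliberate, on our reading of the change: build_norm() produces a fresh contiguous tensor, so the normed branch can use the zero-copy ggml_reshape_3d(), while the un-normed branch still holds a strided view into the fused QKV buffer and needs ggml_cont_3d() to materialize it. A sketch of the resulting flow for K (Q is handled the same way; names as in the builder):

if (model.layers[il].attn_k_norm) {
    // norm output is a new contiguous tensor -> reshape costs nothing
    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
} else {
    // still a strided view into the fused QKV activation -> copy + shape in one op
    Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
}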
@@ -7536,9 +7548,9 @@ struct llm_build_neo_bert : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  // RoPE
  Qcur = ggml_rope_ext(
@@ -7645,13 +7657,13 @@ struct llm_build_bloom : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -7769,7 +7781,7 @@ struct llm_build_mpt : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -7788,17 +7800,18 @@ struct llm_build_mpt : public llm_graph_context {
  model.layers[il].attn_k_norm_b,
  LLM_NORM, il);
  cb(Kcur, "Kcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  } else {
- Qcur = ggml_cont(ctx0, Qcur);
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  cb(Qcur, "Qcur", il);

- Kcur = ggml_cont(ctx0, Kcur);
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  cb(Kcur, "Kcur", il);
  }

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
@@ -8050,9 +8063,9 @@ struct llm_build_qwen : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  // using mode = 2 for neox mode
  Qcur = ggml_rope_ext(
@@ -9025,21 +9038,21 @@ struct llm_build_phi2 : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9163,21 +9176,21 @@ struct llm_build_phi3 : public llm_graph_context {

  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  } else {
  Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, rope_factors,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -9427,17 +9440,17 @@ struct llm_build_gpt2 : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
@@ -9533,9 +9546,9 @@ struct llm_build_codeshell : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -10863,8 +10876,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
  ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens]
  all_coefs = ggml_scale_bias(ctx0, all_coefs, 1.0f, 1.0f); // + 1.0
  cb(all_coefs, "all_coefs", il);
- all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup]
- all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]
+ all_coefs = ggml_transpose(ctx0, all_coefs); // [n_tokens, n_altup]
+ all_coefs = ggml_cont_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup]

  innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1);
  ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup]
@@ -12277,9 +12290,9 @@ struct llm_build_gptneox : public llm_graph_context {

  ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -13412,17 +13425,17 @@ struct llm_build_jais : public llm_graph_context {
  cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  cb(cur, "bqkv", il);

- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)));
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)));
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd));
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+ Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_cont_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

  cur = build_attn(inp_attn,
  model.layers[il].wo, model.layers[il].bo,
@@ -13525,6 +13538,7 @@ struct llm_build_chatglm : public llm_graph_context {
  }
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  } else {
  cur = build_lora_mm(model.layers[il].wqkv, cur);
  cb(cur, "wqkv", il);
@@ -13534,11 +13548,10 @@ struct llm_build_chatglm : public llm_graph_context {
  }
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
@@ -13659,6 +13672,7 @@ struct llm_build_glm4 : public llm_graph_context {
  }
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  } else {
  cur = build_lora_mm(model.layers[il].wqkv, cur);
  cb(cur, "wqkv", il);
@@ -13668,11 +13682,10 @@ struct llm_build_glm4 : public llm_graph_context {
  }
  Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
  Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
- Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
  }

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
  Qcur = ggml_rope_ext(
  ctx0, Qcur, inp_pos, nullptr,
  n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -16839,13 +16852,13 @@ private:

  ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head_q, n_head, n_tokens, n_embd_head_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
  ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head_k, n_head_kv, n_tokens, n_embd_head_k * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
- ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv)));
+ ggml_tensor * Vcur = ggml_view_2d(ctx0, qkv, n_embd_head_v * n_head_kv, n_tokens, qkv->nb[1], v_offset * ggml_element_size(qkv));

  cb(Qcur, "Qcur", il);
  cb(Kcur, "Kcur", il);
  cb(Vcur, "Vcur", il);

- Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);
+ Vcur = ggml_cont_3d(ctx0, Vcur, n_embd_head_v, n_head_kv, n_tokens);

  Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
  cb(Qcur, "Qcur_normed", il);
@@ -16912,15 +16925,13 @@ private:
  cb(zx, "mamba_in_proj", il);
  // {8192, 5, 1, 1} -> {8192, 1, 5, 1}
  zx = ggml_permute(ctx0, zx, 0, 2, 1, 3);
- zx = ggml_cont(ctx0, zx);
- zx = ggml_reshape_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
+ zx = ggml_cont_4d(ctx0, zx, head_dim * 2, n_heads, n_seq_tokens, n_seqs);
  cb(zx, "mamba_in_proj_out", il);

  // split into z and x
  // => {head_dim * n_heads, n_seq_tokens, n_seqs}
  ggml_tensor * x = ggml_view_4d(ctx0, zx, head_dim, n_heads, n_seq_tokens, n_seqs, zx->nb[1], zx->nb[2], zx->nb[3], head_dim*ggml_element_size(zx));
- x = ggml_cont(ctx0, x);
- x = ggml_reshape_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
+ x = ggml_cont_3d(ctx0, x, head_dim * n_heads, n_seq_tokens, n_seqs);
  // x = ggml_permute(ctx0, x, 0, 2, 1, 3);
  cb(x, "mamba_x_split", il);

package/src/llama.cpp/src/llama-model.h
@@ -39,6 +39,7 @@ enum llm_type {
  LLM_TYPE_410M,
  LLM_TYPE_450M,
  LLM_TYPE_475M,
+ LLM_TYPE_537M,
  LLM_TYPE_700M,
  LLM_TYPE_770M,
  LLM_TYPE_780M,
@@ -78,6 +79,7 @@ enum llm_type {
  LLM_TYPE_40B,
  LLM_TYPE_65B,
  LLM_TYPE_70B,
+ LLM_TYPE_120B,
  LLM_TYPE_142B,
  LLM_TYPE_236B,
  LLM_TYPE_290B,
package/src/llama.cpp/src/llama-quant.cpp
@@ -999,7 +999,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);

  // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
- #if 1
+ #if 0
  if (new_type == GGML_TYPE_MXFP4) {
  auto * x = f32_data_03;

package/src/llama.cpp/src/llama-vocab.cpp
@@ -2341,7 +2341,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

  // @ngxson : quick hack for gpt-oss, always render these tokens
  for (const auto & t : token_to_id) {
- if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
+ if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
  id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
  }
  }
@@ -2388,6 +2388,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

  if (has_return && has_call && has_end) {
  special_eog_ids.erase(end_id);
+ id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
  LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
  }
  }