@fugood/llama.node 1.3.0-rc.6 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. package/CMakeLists.txt +12 -2
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +8 -9
  4. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  5. package/src/llama.cpp/common/arg.cpp +39 -1001
  6. package/src/llama.cpp/common/arg.h +2 -2
  7. package/src/llama.cpp/common/chat.cpp +216 -2
  8. package/src/llama.cpp/common/chat.h +1 -0
  9. package/src/llama.cpp/common/common.cpp +33 -0
  10. package/src/llama.cpp/common/common.h +13 -0
  11. package/src/llama.cpp/common/download.cpp +1054 -0
  12. package/src/llama.cpp/common/download.h +55 -0
  13. package/src/llama.cpp/common/json-schema-to-grammar.cpp +19 -3
  14. package/src/llama.cpp/ggml/CMakeLists.txt +3 -1
  15. package/src/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  16. package/src/llama.cpp/ggml/include/ggml.h +2 -0
  17. package/src/llama.cpp/ggml/src/CMakeLists.txt +7 -3
  18. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +10 -3
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
  23. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +0 -5
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -35
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
  26. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
  27. package/src/llama.cpp/include/llama.h +7 -3
  28. package/src/llama.cpp/src/CMakeLists.txt +95 -0
  29. package/src/llama.cpp/src/llama-arch.cpp +108 -0
  30. package/src/llama.cpp/src/llama-arch.h +11 -0
  31. package/src/llama.cpp/src/llama-batch.cpp +63 -31
  32. package/src/llama.cpp/src/llama-batch.h +12 -1
  33. package/src/llama.cpp/src/llama-chat.cpp +32 -0
  34. package/src/llama.cpp/src/llama-chat.h +1 -0
  35. package/src/llama.cpp/src/llama-context.cpp +44 -16
  36. package/src/llama.cpp/src/llama-context.h +5 -5
  37. package/src/llama.cpp/src/llama-cparams.h +1 -0
  38. package/src/llama.cpp/src/llama-graph.cpp +12 -7
  39. package/src/llama.cpp/src/llama-hparams.cpp +11 -1
  40. package/src/llama.cpp/src/llama-hparams.h +6 -0
  41. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
  42. package/src/llama.cpp/src/llama-kv-cache.cpp +56 -21
  43. package/src/llama.cpp/src/llama-kv-cache.h +2 -4
  44. package/src/llama.cpp/src/llama-kv-cells.h +44 -2
  45. package/src/llama.cpp/src/llama-memory-recurrent.cpp +18 -14
  46. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  47. package/src/llama.cpp/src/llama-model.cpp +350 -13194
  48. package/src/llama.cpp/src/llama-model.h +9 -2
  49. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  50. package/src/llama.cpp/src/llama-vocab.cpp +5 -0
  51. package/src/llama.cpp/src/llama-vocab.h +1 -0
  52. package/src/llama.cpp/src/models/apertus.cpp +125 -0
  53. package/src/llama.cpp/src/models/arcee.cpp +135 -0
  54. package/src/llama.cpp/src/models/arctic.cpp +138 -0
  55. package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
  56. package/src/llama.cpp/src/models/baichuan.cpp +122 -0
  57. package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
  58. package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  59. package/src/llama.cpp/src/models/bert.cpp +176 -0
  60. package/src/llama.cpp/src/models/bitnet.cpp +160 -0
  61. package/src/llama.cpp/src/models/bloom.cpp +101 -0
  62. package/src/llama.cpp/src/models/chameleon.cpp +178 -0
  63. package/src/llama.cpp/src/models/chatglm.cpp +132 -0
  64. package/src/llama.cpp/src/models/codeshell.cpp +111 -0
  65. package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
  66. package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  67. package/src/llama.cpp/src/models/command-r.cpp +122 -0
  68. package/src/llama.cpp/src/models/dbrx.cpp +123 -0
  69. package/src/llama.cpp/src/models/deci.cpp +135 -0
  70. package/src/llama.cpp/src/models/deepseek.cpp +144 -0
  71. package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
  72. package/src/llama.cpp/src/models/dots1.cpp +134 -0
  73. package/src/llama.cpp/src/models/dream.cpp +105 -0
  74. package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  75. package/src/llama.cpp/src/models/ernie4-5.cpp +111 -0
  76. package/src/llama.cpp/src/models/exaone.cpp +114 -0
  77. package/src/llama.cpp/src/models/exaone4.cpp +123 -0
  78. package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
  79. package/src/llama.cpp/src/models/falcon.cpp +120 -0
  80. package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  81. package/src/llama.cpp/src/models/gemma.cpp +112 -0
  82. package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  83. package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  84. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  85. package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
  86. package/src/llama.cpp/src/models/glm4.cpp +127 -0
  87. package/src/llama.cpp/src/models/gpt2.cpp +105 -0
  88. package/src/llama.cpp/src/models/gptneox.cpp +144 -0
  89. package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  90. package/src/llama.cpp/src/models/granite.cpp +211 -0
  91. package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  92. package/src/llama.cpp/src/models/grok.cpp +159 -0
  93. package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
  94. package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  95. package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  96. package/src/llama.cpp/src/models/internlm2.cpp +120 -0
  97. package/src/llama.cpp/src/models/jais.cpp +86 -0
  98. package/src/llama.cpp/src/models/jamba.cpp +106 -0
  99. package/src/llama.cpp/src/models/lfm2.cpp +173 -0
  100. package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
  101. package/src/llama.cpp/src/models/llada.cpp +99 -0
  102. package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
  103. package/src/llama.cpp/src/models/llama.cpp +155 -0
  104. package/src/llama.cpp/src/models/mamba.cpp +55 -0
  105. package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
  106. package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
  107. package/src/llama.cpp/src/models/models.h +481 -0
  108. package/src/llama.cpp/src/models/mpt.cpp +126 -0
  109. package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
  110. package/src/llama.cpp/src/models/nemotron.cpp +122 -0
  111. package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
  112. package/src/llama.cpp/src/models/olmo.cpp +121 -0
  113. package/src/llama.cpp/src/models/olmo2.cpp +150 -0
  114. package/src/llama.cpp/src/models/olmoe.cpp +124 -0
  115. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +123 -0
  116. package/src/llama.cpp/src/models/openelm.cpp +124 -0
  117. package/src/llama.cpp/src/models/orion.cpp +123 -0
  118. package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  119. package/src/llama.cpp/src/models/phi2.cpp +121 -0
  120. package/src/llama.cpp/src/models/phi3.cpp +152 -0
  121. package/src/llama.cpp/src/models/plamo.cpp +110 -0
  122. package/src/llama.cpp/src/models/plamo2.cpp +316 -0
  123. package/src/llama.cpp/src/models/plm.cpp +168 -0
  124. package/src/llama.cpp/src/models/qwen.cpp +108 -0
  125. package/src/llama.cpp/src/models/qwen2.cpp +117 -0
  126. package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
  127. package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
  128. package/src/llama.cpp/src/models/qwen3.cpp +117 -0
  129. package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
  130. package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  131. package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
  132. package/src/llama.cpp/src/models/refact.cpp +94 -0
  133. package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  134. package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
  135. package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  136. package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  137. package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
  138. package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
  139. package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
  140. package/src/llama.cpp/src/models/smollm3.cpp +128 -0
  141. package/src/llama.cpp/src/models/stablelm.cpp +146 -0
  142. package/src/llama.cpp/src/models/starcoder.cpp +100 -0
  143. package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
  144. package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
  145. package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
  146. package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  147. package/src/llama.cpp/src/models/xverse.cpp +108 -0
package/src/llama.cpp/src/llama-model.h

@@ -114,6 +114,7 @@ enum llm_type {
     LLM_TYPE_30B_A3B,
     LLM_TYPE_100B_A6B,
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
+    LLM_TYPE_230B_A10B, // Minimax M2
     LLM_TYPE_235B_A22B,
     LLM_TYPE_300B_A47B, // Ernie MoE big
     LLM_TYPE_355B_A32B, // GLM-4.5

@@ -384,6 +385,13 @@ struct llama_layer {
     // openai-moe
     struct ggml_tensor * attn_sinks = nullptr;

+    // cogvlm
+    struct ggml_tensor * visexp_attn_wqkv = nullptr;
+    struct ggml_tensor * visexp_attn_wo = nullptr;
+    struct ggml_tensor * visexp_ffn_gate = nullptr;
+    struct ggml_tensor * visexp_ffn_down = nullptr;
+    struct ggml_tensor * visexp_ffn_up = nullptr;
+
     // xIELU activation parameters for Apertus
     struct ggml_tensor * ffn_act_alpha_n = nullptr;
     struct ggml_tensor * ffn_act_alpha_p = nullptr;

@@ -500,9 +508,8 @@ struct llama_model {

     ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;

-    // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
+    llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;

     // TODO: move this to new llm_arch_model_i interface
     ggml_cgraph * build_graph(const llm_graph_params & params) const;
package/src/llama.cpp/src/llama-quant.cpp

@@ -653,7 +653,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
         } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
             // Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
-            gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)abs(o.val_i64));
+            gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)std::abs(o.val_i64));
         } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
             gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
         } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
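The one-line change above is easy to miss: o.val_i64 is an int64_t, and depending on which overloads are visible, an unqualified abs() can resolve to C's abs(int) and narrow the argument before the absolute value is taken; std::abs picks the 64-bit overload. A standalone sketch of the difference (illustrative only, not code from the package; the example value is hypothetical):

#include <cstdint>
#include <cstdlib>
#include <iostream>

int main() {
    // a KV-override value just below INT32_MIN
    const int64_t val_i64 = -2147483658LL;

    // std::abs selects the 64-bit overload, so the full magnitude survives the cast
    std::cout << (uint32_t)std::abs(val_i64) << "\n";          // 2147483658

    // emulating a narrowed call path: truncating to 32 bits first changes the result
    // (on typical two's-complement targets)
    std::cout << (uint32_t)std::abs((int32_t)val_i64) << "\n"; // 2147483638
}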
package/src/llama.cpp/src/llama-vocab.cpp

@@ -401,6 +401,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                 };
                 break;
             case LLAMA_VOCAB_PRE_TYPE_GPT4O:
+            case LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2:
                 regex_exprs = {
                     // original regex from tokenizer.json
                     // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",

@@ -1992,6 +1993,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "grok-2") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
             clean_spaces = false;
+        } else if (
+                tokenizer_pre == "minimax-m2") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
+            clean_spaces = false;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
package/src/llama.cpp/src/llama-vocab.h

@@ -49,6 +49,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
     LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
     LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
+    LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
 };

 struct LLM_KV;
package/src/llama.cpp/src/models/apertus.cpp

@@ -0,0 +1,125 @@
+#include "models.h"
+
+
+
+llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    ggml_tensor * inp_pos = build_inp_pos();
+    auto * inp_attn = build_attn_inp_kv();
+
+    const float kq_scale =
+        hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_normed", il);
+
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+            cb(Kcur, "Kcur_normed", il);
+
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(Qcur, "Qcur_pos", il);
+            cb(Kcur, "Kcur_pos", il);
+            cb(Vcur, "Vcur_pos", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network with xIELU activation
+        {
+            cur = build_norm(ffn_inp, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            // Up projection
+            ggml_tensor * up = build_lora_mm(model.layers[il].ffn_up, cur);
+            cb(up, "ffn_up", il);
+
+            float alpha_n_val = hparams.xielu_alpha_n[il];
+            float alpha_p_val = hparams.xielu_alpha_p[il];
+            float beta_val = hparams.xielu_beta[il];
+            float eps_val = hparams.xielu_eps[il];
+
+            // Apply xIELU activation
+            ggml_tensor * activated = ggml_xielu(ctx0, up, alpha_n_val, alpha_p_val, beta_val, eps_val);
+            cb(activated, "ffn_xielu", il);
+
+            // Down projection
+            cur = build_lora_mm(model.layers[il].ffn_down, activated);
+            cb(cur, "ffn_down", il);
+        }
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
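One pattern worth calling out, because it recurs in every one of the new model graphs in this release: near the end of the layer loop, the `if (il == n_layer - 1 && inp_out_ids)` block uses ggml_get_rows to keep only the rows whose outputs were actually requested, so the final norm and lm_head run on just those tokens. A plain-C++ sketch of that row gather (an illustration of the indexing only, not ggml's implementation):

#include <cstdint>
#include <vector>

// x is [n_tokens][n_embd]; out_ids lists the token positions whose outputs are needed.
// The result keeps just those rows, so the output head runs on a smaller matrix.
static std::vector<std::vector<float>> get_rows(const std::vector<std::vector<float>> & x,
                                                const std::vector<int32_t> & out_ids) {
    std::vector<std::vector<float>> out;
    out.reserve(out_ids.size());
    for (int32_t id : out_ids) {
        out.push_back(x[id]);   // gather one row per requested position
    }
    return out;
}

int main() {
    std::vector<std::vector<float>> hidden = {{0.1f, 0.2f}, {0.3f, 0.4f}, {0.5f, 0.6f}};
    auto last_only = get_rows(hidden, {2});   // e.g. keep only the last token's row
    return last_only.size() == 1 ? 0 : 1;
}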
package/src/llama.cpp/src/models/arcee.cpp

@@ -0,0 +1,135 @@
+#include "models.h"
+
+
+llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // rope freq factors for llama3; may return nullptr for llama2 and other models
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network
+        // ARCEE uses relu^2 instead of silu
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up, NULL, NULL,
+                NULL, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+        cb(cur, "ffn_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
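The Arcee FFN above notes "relu^2 instead of silu"; the LLM_FFN_RELU_SQR / LLM_FFN_SEQ arguments select a non-gated MLP whose activation squares the positive part of its input. A minimal scalar sketch of that activation (illustrative only, not the ggml kernel):

#include <algorithm>

// Squared ReLU: f(x) = max(0, x)^2 -- zero for non-positive inputs,
// quadratic growth for positive ones.
static inline float relu_sqr(float x) {
    const float r = std::max(x, 0.0f);
    return r * r;
}

int main() {
    // relu_sqr(-1.5f) == 0.0f, relu_sqr(2.0f) == 4.0f
    return (relu_sqr(-1.5f) == 0.0f && relu_sqr(2.0f) == 4.0f) ? 0 : 1;
}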
package/src/llama.cpp/src/models/arctic.cpp

@@ -0,0 +1,138 @@
+#include "models.h"
+
+
+llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, NULL,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+        }
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up, NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
+        cb(ffn_out, "ffn_out", il);
+
+        // MoE
+        cur = build_norm(inpSA,
+                model.layers[il].ffn_norm_exps, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm_exps", il);
+
+        cur = build_moe_ffn(cur,
+                model.layers[il].ffn_gate_inp,
+                model.layers[il].ffn_up_exps,
+                model.layers[il].ffn_gate_exps,
+                model.layers[il].ffn_down_exps,
+                nullptr,
+                n_expert, n_expert_used,
+                LLM_FFN_SILU, true,
+                false, 0.0,
+                LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                il);
+        cb(cur, "ffn_moe_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_out);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
package/src/llama.cpp/src/models/arwkv7.cpp

@@ -0,0 +1,86 @@
+#include "models.h"
+
+
+llm_build_arwkv7::llm_build_arwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) {
+    GGML_ASSERT(n_embd == hparams.n_embd_r());
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+    ggml_tensor * v_first = nullptr;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    auto * rs_inp = build_rs_inp();
+
+    const auto n_embd = hparams.n_embd;
+    const auto n_seq_tokens = ubatch.n_seq_tokens;
+    const auto n_seqs = ubatch.n_seqs;
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const llama_layer * layer = &model.layers[il];
+        inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+
+        ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
+
+        ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
+        cb(att_norm, "attn_norm", il);
+
+        ggml_tensor * x_prev = ggml_concat(
+                ctx0,
+                token_shift,
+                ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
+                1
+                );
+
+        cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
+
+        token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
+        ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+        cb(ffn_inp, "ffn_inp", il);
+
+        cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+        ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+        }
+        // feed-forward network
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up, NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+    cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
package/src/llama.cpp/src/models/baichuan.cpp

@@ -0,0 +1,122 @@
+#include "models.h"
+
+
+llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            switch (model.type) {
+                case LLM_TYPE_7B:
+                    Qcur = ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, nullptr,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+                    Kcur = ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, nullptr,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+                    break;
+                case LLM_TYPE_13B:
+                    break;
+                default:
+                    GGML_ABORT("fatal error");
+            }
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, NULL,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+        }
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network
+        {
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        }
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}