@fugood/llama.node 1.3.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +8 -8
  3. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  4. package/src/llama.cpp/common/arg.cpp +44 -999
  5. package/src/llama.cpp/common/arg.h +2 -2
  6. package/src/llama.cpp/common/chat.cpp +17 -2
  7. package/src/llama.cpp/common/common.cpp +33 -0
  8. package/src/llama.cpp/common/common.h +15 -1
  9. package/src/llama.cpp/common/download.cpp +1054 -0
  10. package/src/llama.cpp/common/download.h +55 -0
  11. package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
  12. package/src/llama.cpp/ggml/include/ggml.h +2 -0
  13. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -11
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +21 -21
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -75
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +0 -4
  23. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
  24. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
  25. package/src/llama.cpp/include/llama.h +7 -3
  26. package/src/llama.cpp/src/CMakeLists.txt +95 -0
  27. package/src/llama.cpp/src/llama-arch.cpp +108 -0
  28. package/src/llama.cpp/src/llama-arch.h +11 -0
  29. package/src/llama.cpp/src/llama-batch.cpp +63 -31
  30. package/src/llama.cpp/src/llama-batch.h +12 -1
  31. package/src/llama.cpp/src/llama-chat.cpp +32 -0
  32. package/src/llama.cpp/src/llama-chat.h +1 -0
  33. package/src/llama.cpp/src/llama-context.cpp +36 -13
  34. package/src/llama.cpp/src/llama-context.h +5 -5
  35. package/src/llama.cpp/src/llama-cparams.h +1 -0
  36. package/src/llama.cpp/src/llama-graph.cpp +3 -3
  37. package/src/llama.cpp/src/llama-hparams.cpp +11 -1
  38. package/src/llama.cpp/src/llama-hparams.h +6 -0
  39. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
  40. package/src/llama.cpp/src/llama-kv-cache.cpp +33 -1
  41. package/src/llama.cpp/src/llama-kv-cells.h +44 -2
  42. package/src/llama.cpp/src/llama-memory-recurrent.cpp +4 -3
  43. package/src/llama.cpp/src/llama-model.cpp +320 -13171
  44. package/src/llama.cpp/src/llama-model.h +8 -0
  45. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  46. package/src/llama.cpp/src/llama-vocab.cpp +5 -0
  47. package/src/llama.cpp/src/llama-vocab.h +1 -0
  48. package/src/llama.cpp/src/models/apertus.cpp +125 -0
  49. package/src/llama.cpp/src/models/arcee.cpp +135 -0
  50. package/src/llama.cpp/src/models/arctic.cpp +138 -0
  51. package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
  52. package/src/llama.cpp/src/models/baichuan.cpp +122 -0
  53. package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
  54. package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  55. package/src/llama.cpp/src/models/bert.cpp +176 -0
  56. package/src/llama.cpp/src/models/bitnet.cpp +160 -0
  57. package/src/llama.cpp/src/models/bloom.cpp +101 -0
  58. package/src/llama.cpp/src/models/chameleon.cpp +178 -0
  59. package/src/llama.cpp/src/models/chatglm.cpp +132 -0
  60. package/src/llama.cpp/src/models/codeshell.cpp +111 -0
  61. package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
  62. package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  63. package/src/llama.cpp/src/models/command-r.cpp +122 -0
  64. package/src/llama.cpp/src/models/dbrx.cpp +123 -0
  65. package/src/llama.cpp/src/models/deci.cpp +135 -0
  66. package/src/llama.cpp/src/models/deepseek.cpp +144 -0
  67. package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
  68. package/src/llama.cpp/src/models/dots1.cpp +134 -0
  69. package/src/llama.cpp/src/models/dream.cpp +105 -0
  70. package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  71. package/src/llama.cpp/src/models/ernie4-5.cpp +110 -0
  72. package/src/llama.cpp/src/models/exaone.cpp +114 -0
  73. package/src/llama.cpp/src/models/exaone4.cpp +123 -0
  74. package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
  75. package/src/llama.cpp/src/models/falcon.cpp +120 -0
  76. package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  77. package/src/llama.cpp/src/models/gemma.cpp +112 -0
  78. package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  79. package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  80. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  81. package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
  82. package/src/llama.cpp/src/models/glm4.cpp +127 -0
  83. package/src/llama.cpp/src/models/gpt2.cpp +105 -0
  84. package/src/llama.cpp/src/models/gptneox.cpp +144 -0
  85. package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  86. package/src/llama.cpp/src/models/granite.cpp +211 -0
  87. package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  88. package/src/llama.cpp/src/models/grok.cpp +159 -0
  89. package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
  90. package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  91. package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  92. package/src/llama.cpp/src/models/internlm2.cpp +120 -0
  93. package/src/llama.cpp/src/models/jais.cpp +86 -0
  94. package/src/llama.cpp/src/models/jamba.cpp +106 -0
  95. package/src/llama.cpp/src/models/lfm2.cpp +173 -0
  96. package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
  97. package/src/llama.cpp/src/models/llada.cpp +99 -0
  98. package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
  99. package/src/llama.cpp/src/models/llama.cpp +155 -0
  100. package/src/llama.cpp/src/models/mamba.cpp +55 -0
  101. package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
  102. package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
  103. package/src/llama.cpp/src/models/models.h +481 -0
  104. package/src/llama.cpp/src/models/mpt.cpp +126 -0
  105. package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
  106. package/src/llama.cpp/src/models/nemotron.cpp +122 -0
  107. package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
  108. package/src/llama.cpp/src/models/olmo.cpp +121 -0
  109. package/src/llama.cpp/src/models/olmo2.cpp +150 -0
  110. package/src/llama.cpp/src/models/olmoe.cpp +124 -0
  111. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
  112. package/src/llama.cpp/src/models/openelm.cpp +124 -0
  113. package/src/llama.cpp/src/models/orion.cpp +123 -0
  114. package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  115. package/src/llama.cpp/src/models/phi2.cpp +121 -0
  116. package/src/llama.cpp/src/models/phi3.cpp +152 -0
  117. package/src/llama.cpp/src/models/plamo.cpp +110 -0
  118. package/src/llama.cpp/src/models/plamo2.cpp +316 -0
  119. package/src/llama.cpp/src/models/plm.cpp +168 -0
  120. package/src/llama.cpp/src/models/qwen.cpp +108 -0
  121. package/src/llama.cpp/src/models/qwen2.cpp +117 -0
  122. package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
  123. package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
  124. package/src/llama.cpp/src/models/qwen3.cpp +117 -0
  125. package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
  126. package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  127. package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
  128. package/src/llama.cpp/src/models/refact.cpp +94 -0
  129. package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  130. package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
  131. package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  132. package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  133. package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
  134. package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
  135. package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
  136. package/src/llama.cpp/src/models/smollm3.cpp +128 -0
  137. package/src/llama.cpp/src/models/stablelm.cpp +146 -0
  138. package/src/llama.cpp/src/models/starcoder.cpp +100 -0
  139. package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
  140. package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
  141. package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
  142. package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  143. package/src/llama.cpp/src/models/xverse.cpp +108 -0
@@ -0,0 +1,86 @@
1
+ #include "models.h"
2
+
3
+ llm_build_rwkv6qwen2::llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
4
+ GGML_ASSERT(n_embd == hparams.n_embd_r());
5
+
6
+ ggml_tensor * cur;
7
+ ggml_tensor * inpL;
8
+
9
+ inpL = build_inp_embd(model.tok_embd);
10
+
11
+ auto * rs_inp = build_rs_inp();
12
+
13
+ const auto n_embd = hparams.n_embd;
14
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
15
+ const auto n_seqs = ubatch.n_seqs;
16
+
17
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
18
+
19
+ for (int il = 0; il < n_layer; ++il) {
20
+ const llama_layer * layer = &model.layers[il];
21
+ inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
22
+
23
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
24
+
25
+ ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
26
+ cb(att_norm, "attn_norm", il);
27
+
28
+ ggml_tensor * x_prev = ggml_concat(
29
+ ctx0,
30
+ token_shift,
31
+ ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
32
+ 1
33
+ );
34
+
35
+ cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
36
+
37
+ token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
38
+ ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
39
+
40
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
41
+ cb(ffn_inp, "ffn_inp", il);
42
+
43
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
44
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
45
+
46
+ if (il == n_layer - 1 && inp_out_ids) {
47
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
48
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
49
+ }
50
+
51
+ // feed-forward network
52
+ cur = build_norm(ffn_inp,
53
+ model.layers[il].ffn_norm, NULL,
54
+ LLM_NORM_RMS, il);
55
+ cb(cur, "ffn_norm", il);
56
+
57
+ cur = build_ffn(cur,
58
+ model.layers[il].ffn_up, NULL, NULL,
59
+ model.layers[il].ffn_gate, NULL, NULL,
60
+ model.layers[il].ffn_down, NULL, NULL,
61
+ NULL,
62
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
63
+ cb(cur, "ffn_out", il);
64
+
65
+ cur = ggml_add(ctx0, cur, ffn_inp);
66
+
67
+ cur = build_cvec(cur, il);
68
+ cb(cur, "l_out", il);
69
+
70
+ // input for next layer
71
+ inpL = cur;
72
+ }
73
+
74
+ cur = inpL;
75
+ cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
76
+
77
+ cb(cur, "result_norm", -1);
78
+ res->t_embd = cur;
79
+
80
+ cur = build_lora_mm(model.output, cur);
81
+
82
+ cb(cur, "result_output", -1);
83
+ res->t_logits = cur;
84
+
85
+ ggml_build_forward_expand(gf, cur);
86
+ }
@@ -0,0 +1,135 @@
1
+ #include "models.h"
2
+
3
+ llm_build_rwkv7_base::llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params) :
4
+ llm_graph_context(params),
5
+ model(model) {}
6
+
7
+ ggml_tensor * llm_build_rwkv7_base::build_rwkv7_channel_mix(const llama_layer * layer,
8
+ ggml_tensor * cur,
9
+ ggml_tensor * x_prev,
10
+ llm_arch arch) const {
11
+ ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
12
+ switch (arch) {
13
+ case LLM_ARCH_RWKV7:
14
+ {
15
+ ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
16
+
17
+ ggml_tensor * k = ggml_sqr(ctx0, ggml_relu(ctx0, build_lora_mm(layer->channel_mix_key, xk)));
18
+
19
+ cur = build_lora_mm(layer->channel_mix_value, k);
20
+ }
21
+ break;
22
+ default:
23
+ GGML_ABORT("fatal error");
24
+ }
25
+ return cur;
26
+ }
27
+
28
+ ggml_tensor * llm_build_rwkv7_base::build_rwkv7_time_mix(llm_graph_input_rs * inp,
29
+ ggml_tensor * cur,
30
+ ggml_tensor * x_prev,
31
+ ggml_tensor *& first_layer_value,
32
+ const llama_ubatch & ubatch,
33
+ int il) const {
34
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
35
+
36
+ const auto n_tokens = ubatch.n_tokens;
37
+ const auto n_seqs = ubatch.n_seqs;
38
+ const auto n_embd = hparams.n_embd;
39
+ const auto head_size = hparams.wkv_head_size;
40
+ const auto head_count = n_embd / head_size;
41
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
42
+
43
+ const auto kv_head = mctx_cur->get_head();
44
+
45
+ const auto & layer = model.layers[il];
46
+
47
+ bool has_gating = layer.time_mix_g1 && layer.time_mix_g2;
48
+
49
+ ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
50
+ ggml_tensor * dummy = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_embd, n_seq_tokens, n_seqs, has_gating ? 6 : 5);
51
+ sx = ggml_repeat(ctx0, sx, dummy);
52
+
53
+ ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_fused), cur);
54
+
55
+ ggml_tensor * xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
56
+ ggml_tensor * xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
57
+ ggml_tensor * xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
58
+ ggml_tensor * xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
59
+ ggml_tensor * xa = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
60
+ ggml_tensor * xg =
61
+ has_gating ? ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 5 * sizeof(float)) :
62
+ nullptr;
63
+
64
+ ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
65
+ ggml_tensor * w = ggml_add(
66
+ ctx0, ggml_mul_mat(ctx0, layer.time_mix_w2, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_w1, xw))),
67
+ layer.time_mix_w0);
68
+ w = ggml_exp(ctx0, ggml_scale(ctx0, ggml_sigmoid(ctx0, w), -0.606531));
69
+
70
+ ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
71
+ ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
72
+ if (first_layer_value == nullptr) {
73
+ first_layer_value = v;
74
+ } else {
75
+ // Add the first layer value as a residual connection.
76
+ v = ggml_add(ctx0, v,
77
+ ggml_mul(ctx0, ggml_sub(ctx0, first_layer_value, v),
78
+ ggml_sigmoid(ctx0, ggml_add(ctx0,
79
+ ggml_mul_mat(ctx0, layer.time_mix_v2,
80
+ ggml_mul_mat(ctx0, layer.time_mix_v1, xv)),
81
+ layer.time_mix_v0))));
82
+ }
83
+ ggml_tensor * g = nullptr;
84
+ if (layer.time_mix_g1 && layer.time_mix_g2) {
85
+ g = ggml_mul_mat(ctx0, layer.time_mix_g2, ggml_sigmoid(ctx0, ggml_mul_mat(ctx0, layer.time_mix_g1, xg)));
86
+ }
87
+ ggml_tensor * a = ggml_sigmoid(
88
+ ctx0, ggml_add(ctx0, ggml_mul_mat(ctx0, layer.time_mix_a2, ggml_mul_mat(ctx0, layer.time_mix_a1, xa)),
89
+ layer.time_mix_a0));
90
+
91
+ ggml_tensor * kk = ggml_reshape_3d(ctx0, ggml_mul(ctx0, k, layer.time_mix_k_k), head_size, head_count, n_tokens);
92
+ kk = ggml_l2_norm(ctx0, kk, 1e-12);
93
+
94
+ ggml_tensor * ka = ggml_mul(ctx0, k, layer.time_mix_k_a);
95
+ k = ggml_add(ctx0, k, ggml_sub(ctx0, ggml_mul(ctx0, a, ka), ka));
96
+
97
+ r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens);
98
+ w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens);
99
+ k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens);
100
+ v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
101
+ a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
102
+
103
+ ggml_tensor * wkv_state = build_rs(inp, mctx_cur->get_s_l(il), hparams.n_embd_s(), n_seqs);
104
+
105
+ ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
106
+ cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
107
+ wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
108
+
109
+ ggml_build_forward_expand(
110
+ gf, ggml_cpy(ctx0, wkv_state,
111
+ ggml_view_1d(ctx0, mctx_cur->get_s_l(il), hparams.n_embd_s() * n_seqs,
112
+ hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il)))));
113
+
114
+ if (layer.time_mix_ln && layer.time_mix_ln_b) {
115
+ // group norm with head_count groups
116
+ cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens);
117
+ cur = ggml_norm(ctx0, cur, 64e-5f);
118
+
119
+ // Convert back to regular vectors.
120
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
121
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
122
+ } else {
123
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
124
+ }
125
+ ggml_tensor * rk = ggml_sum_rows(
126
+ ctx0, ggml_mul(ctx0, ggml_mul(ctx0, k, r), ggml_reshape_2d(ctx0, layer.time_mix_r_k, head_size, head_count)));
127
+ cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, ggml_mul(ctx0, v, rk), n_embd, n_tokens));
128
+
129
+ if (has_gating) {
130
+ cur = ggml_mul(ctx0, cur, g);
131
+ }
132
+ cur = build_lora_mm(layer.time_mix_output, cur);
133
+
134
+ return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
135
+ }
@@ -0,0 +1,90 @@
1
+ #include "models.h"
2
+
3
+ llm_build_rwkv7::llm_build_rwkv7(const llama_model & model, const llm_graph_params & params) :
4
+ llm_build_rwkv7_base(model, params) {
5
+ GGML_ASSERT(hparams.token_shift_count == 2);
6
+
7
+ ggml_tensor * cur;
8
+ ggml_tensor * inpL;
9
+ ggml_tensor * v_first = nullptr;
10
+
11
+ inpL = build_inp_embd(model.tok_embd);
12
+ inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
13
+
14
+ auto * rs_inp = build_rs_inp();
15
+
16
+ const auto n_embd = hparams.n_embd;
17
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
18
+ const auto n_seqs = ubatch.n_seqs;
19
+
20
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
21
+
22
+ for (int il = 0; il < n_layer; ++il) {
23
+ const llama_layer * layer = &model.layers[il];
24
+ inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
25
+
26
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
27
+
28
+ ggml_tensor * att_shift =
29
+ ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
30
+ ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1],
31
+ token_shift->nb[2], n_embd * ggml_element_size(token_shift));
32
+
33
+ ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
34
+ cb(att_norm, "attn_norm", il);
35
+
36
+ ggml_tensor * x_prev = ggml_concat(
37
+ ctx0, att_shift,
38
+ ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), 1);
39
+
40
+ cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
41
+
42
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
43
+ cb(ffn_inp, "ffn_inp", il);
44
+
45
+ ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
46
+ cb(ffn_norm, "ffn_norm", il);
47
+
48
+ x_prev = ggml_concat(
49
+ ctx0, ffn_shift,
50
+ ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0), 1);
51
+
52
+ token_shift = ggml_concat(ctx0,
53
+ ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2],
54
+ (n_seq_tokens - 1) * n_embd * ggml_element_size(att_norm)),
55
+ ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2],
56
+ (n_seq_tokens - 1) * n_embd * ggml_element_size(ffn_norm)),
57
+ 1);
58
+ ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
59
+
60
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
61
+ ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
62
+ x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
63
+
64
+ if (il == n_layer - 1 && inp_out_ids) {
65
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
66
+ ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
67
+ x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
68
+ }
69
+ cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
70
+ cur = ggml_add(ctx0, cur, ffn_inp);
71
+
72
+ cur = build_cvec(cur, il);
73
+ cb(cur, "l_out", il);
74
+
75
+ // input for next layer
76
+ inpL = cur;
77
+ }
78
+ cur = inpL;
79
+ cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
80
+
81
+ cb(cur, "result_norm", -1);
82
+ res->t_embd = cur;
83
+
84
+ cur = build_lora_mm(model.output, cur);
85
+
86
+ cb(cur, "result_output", -1);
87
+ res->t_logits = cur;
88
+
89
+ ggml_build_forward_expand(gf, cur);
90
+ }
@@ -0,0 +1,124 @@
1
+ #include "models.h"
2
+
3
+ llm_build_seed_oss::llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
4
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5
+
6
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
8
+
9
+ ggml_tensor * cur;
10
+ ggml_tensor * inpL;
11
+
12
+ inpL = build_inp_embd(model.tok_embd);
13
+
14
+ // inp_pos - contains the positions
15
+ ggml_tensor * inp_pos = build_inp_pos();
16
+
17
+ auto * inp_attn = build_attn_inp_kv();
18
+
19
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
20
+
21
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
22
+
23
+ for (int il = 0; il < n_layer; ++il) {
24
+ ggml_tensor * inpSA = inpL;
25
+
26
+ // norm
27
+ cur = build_norm(inpL,
28
+ model.layers[il].attn_norm, NULL,
29
+ LLM_NORM_RMS, il);
30
+ cb(cur, "attn_norm", il);
31
+
32
+ // self-attention
33
+ {
34
+ // compute Q and K and RoPE them
35
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
36
+ cb(Qcur, "Qcur", il);
37
+ if (model.layers[il].bq) {
38
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
39
+ cb(Qcur, "Qcur", il);
40
+ }
41
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
42
+ cb(Kcur, "Kcur", il);
43
+ if (model.layers[il].bk) {
44
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
45
+ cb(Kcur, "Kcur", il);
46
+ }
47
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
48
+ cb(Vcur, "Vcur", il);
49
+ if (model.layers[il].bv) {
50
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
51
+ cb(Vcur, "Vcur", il);
52
+ }
53
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
54
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
55
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
56
+
57
+ Qcur = ggml_rope_ext(
58
+ ctx0, Qcur, inp_pos, nullptr,
59
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
60
+ ext_factor, attn_factor, beta_fast, beta_slow
61
+ );
62
+
63
+ Kcur = ggml_rope_ext(
64
+ ctx0, Kcur, inp_pos, nullptr,
65
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
66
+ ext_factor, attn_factor, beta_fast, beta_slow
67
+ );
68
+
69
+ cb(Qcur, "Qcur", il);
70
+ cb(Kcur, "Kcur", il);
71
+ cb(Vcur, "Vcur", il);
72
+
73
+ cur = build_attn(inp_attn,
74
+ model.layers[il].wo, model.layers[il].bo,
75
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
76
+ cb(cur, "attn_out", il);
77
+ }
78
+ if (il == n_layer - 1 && inp_out_ids) {
79
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
80
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
81
+ }
82
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
83
+ cb(ffn_inp, "ffn_inp", il);
84
+
85
+ // feed-forward network
86
+ cur = build_norm(ffn_inp,
87
+ model.layers[il].attn_post_norm, NULL,
88
+ LLM_NORM_RMS, il);
89
+ cb(cur, "attn_post_norm", il);
90
+
91
+ cur = build_ffn(cur,
92
+ model.layers[il].ffn_up, NULL, NULL,
93
+ model.layers[il].ffn_gate, NULL, NULL,
94
+ model.layers[il].ffn_down, NULL, NULL,
95
+ NULL,
96
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
97
+ cb(cur, "ffn_out", il);
98
+
99
+ cur = ggml_add(ctx0, cur, ffn_inp);
100
+ cb(cur, "ffn_out", il);
101
+
102
+ cur = build_cvec(cur, il);
103
+ cb(cur, "l_out", il);
104
+
105
+ // input for next layer
106
+ inpL = cur;
107
+ }
108
+ cur = inpL;
109
+
110
+ cur = build_norm(cur,
111
+ model.output_norm, NULL,
112
+ LLM_NORM_RMS, -1);
113
+
114
+ cb(cur, "result_norm", -1);
115
+ res->t_embd = cur;
116
+
117
+ // lm_head
118
+ cur = build_lora_mm(model.output, cur);
119
+
120
+ cb(cur, "result_output", -1);
121
+ res->t_logits = cur;
122
+
123
+ ggml_build_forward_expand(gf, cur);
124
+ }
@@ -0,0 +1,120 @@
1
+ #include "models.h"
2
+
3
+ template <bool iswa>
4
+ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
5
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6
+
7
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
9
+
10
+ ggml_tensor * cur;
11
+ ggml_tensor * inpL;
12
+
13
+ inpL = build_inp_embd(model.tok_embd);
14
+
15
+ // inp_pos - contains the positions
16
+ ggml_tensor * inp_pos = build_inp_pos();
17
+
18
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
19
+ inp_attn_type * inp_attn = nullptr;
20
+
21
+ if constexpr (iswa) {
22
+ inp_attn = build_attn_inp_kv_iswa();
23
+ } else {
24
+ inp_attn = build_attn_inp_kv();
25
+ }
26
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
27
+
28
+ for (int il = 0; il < n_layer; ++il) {
29
+ ggml_tensor * inpSA = inpL;
30
+ ggml_tensor * probs = nullptr;
31
+
32
+ probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
33
+ cb(probs, "ffn_moe_logits", il);
34
+
35
+ // norm
36
+ cur = build_norm(inpL,model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
37
+ cb(cur, "attn_norm", il);
38
+
39
+ // self_attention
40
+ {
41
+ // compute Q and K and RoPE them
42
+ struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
43
+ cb(Qcur, "Qcur", il);
44
+
45
+ struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
46
+ cb(Kcur, "Kcur", il);
47
+
48
+ struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
49
+ cb(Vcur, "Vcur", il);
50
+
51
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
52
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
53
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
54
+
55
+ if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
56
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
57
+ ext_factor, attn_factor, beta_fast, beta_slow);
58
+
59
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
60
+ ext_factor, attn_factor, beta_fast, beta_slow);
61
+ }
62
+ cb(Qcur, "Qcur", il);
63
+ cb(Kcur, "Kcur", il);
64
+
65
+ cur = build_attn(inp_attn,
66
+ model.layers[il].wo, model.layers[il].bo,
67
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
68
+ }
69
+ if (il == n_layer - 1 && inp_out_ids) {
70
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
71
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
72
+ probs = ggml_get_rows(ctx0, probs, inp_out_ids);
73
+ }
74
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
75
+ cb(ffn_inp, "ffn_inp", il);
76
+
77
+ // MoE branch
78
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
79
+ cb(cur, "ffn_norm", il);
80
+
81
+ ggml_tensor * ffn_out =
82
+ build_moe_ffn(cur,
83
+ nullptr,
84
+ model.layers[il].ffn_up_exps,
85
+ model.layers[il].ffn_gate_exps,
86
+ model.layers[il].ffn_down_exps,
87
+ nullptr,
88
+ n_expert, n_expert_used,
89
+ LLM_FFN_RELU, true,
90
+ false, 0.0,
91
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
92
+ il, probs);
93
+
94
+ cb(ffn_out, "ffn_out", il);
95
+ cur = ffn_out;
96
+
97
+ cur = ggml_add(ctx0, cur, ffn_inp);
98
+ cur = build_cvec(cur, il);
99
+ cb(cur, "l_out", il);
100
+
101
+ // input for next layer
102
+ inpL = cur;
103
+ }
104
+ cur = inpL;
105
+
106
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
107
+ cb(cur, "result_norm", -1);
108
+ res->t_embd = cur;
109
+
110
+ // lm_head
111
+ cur = build_lora_mm(model.output, cur);
112
+ cb(cur, "result_output", -1);
113
+ res->t_logits = cur;
114
+
115
+ ggml_build_forward_expand(gf, cur);
116
+ }
117
+
118
+ // Explicit template instantiations
119
+ template struct llm_build_smallthinker<false>;
120
+ template struct llm_build_smallthinker<true>;
@@ -0,0 +1,128 @@
1
+ #include "models.h"
2
+
3
+ llm_build_smollm3::llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
4
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5
+
6
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
8
+
9
+ ggml_tensor * cur;
10
+ ggml_tensor * inpL;
11
+
12
+ inpL = build_inp_embd(model.tok_embd);
13
+
14
+ // inp_pos - contains the positions
15
+ ggml_tensor * inp_pos = build_inp_pos();
16
+
17
+ auto * inp_attn = build_attn_inp_kv();
18
+
19
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
20
+
21
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
22
+
23
+ for (int il = 0; il < n_layer; ++il) {
24
+ ggml_tensor * inpSA = inpL;
25
+
26
+ const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
27
+
28
+ // norm
29
+ cur = build_norm(inpL,
30
+ model.layers[il].attn_norm, NULL,
31
+ LLM_NORM_RMS, il);
32
+ cb(cur, "attn_norm", il);
33
+
34
+ // self-attention
35
+ {
36
+ // compute Q and K and RoPE them
37
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
38
+ cb(Qcur, "Qcur", il);
39
+ if (model.layers[il].bq) {
40
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
41
+ cb(Qcur, "Qcur", il);
42
+ }
43
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
44
+ cb(Kcur, "Kcur", il);
45
+ if (model.layers[il].bk) {
46
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
47
+ cb(Kcur, "Kcur", il);
48
+ }
49
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
50
+ cb(Vcur, "Vcur", il);
51
+ if (model.layers[il].bv) {
52
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
53
+ cb(Vcur, "Vcur", il);
54
+ }
55
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
56
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
57
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
58
+
59
+ if (use_rope) {
60
+ Qcur = ggml_rope_ext(
61
+ ctx0, Qcur, inp_pos, nullptr,
62
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
63
+ ext_factor, attn_factor, beta_fast, beta_slow
64
+ );
65
+
66
+ Kcur = ggml_rope_ext(
67
+ ctx0, Kcur, inp_pos, nullptr,
68
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
69
+ ext_factor, attn_factor, beta_fast, beta_slow
70
+ );
71
+ }
72
+ cb(Qcur, "Qcur", il);
73
+ cb(Kcur, "Kcur", il);
74
+ cb(Vcur, "Vcur", il);
75
+
76
+ cur = build_attn(inp_attn,
77
+ model.layers[il].wo, model.layers[il].bo,
78
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
79
+ cb(cur, "attn_out", il);
80
+ }
81
+ if (il == n_layer - 1 && inp_out_ids) {
82
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
83
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
84
+ }
85
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
86
+ cb(ffn_inp, "ffn_inp", il);
87
+
88
+ // feed-forward network
89
+ {
90
+ cur = build_norm(ffn_inp,
91
+ model.layers[il].ffn_norm, NULL,
92
+ LLM_NORM_RMS, il);
93
+ cb(cur, "ffn_norm", il);
94
+
95
+ cur = build_ffn(cur,
96
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
97
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
98
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
99
+ NULL,
100
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
101
+ cb(cur, "ffn_out", il);
102
+ }
103
+ cur = ggml_add(ctx0, cur, ffn_inp);
104
+ cb(cur, "ffn_out", il);
105
+
106
+ cur = build_cvec(cur, il);
107
+ cb(cur, "l_out", il);
108
+
109
+ // input for next layer
110
+ inpL = cur;
111
+ }
112
+ cur = inpL;
113
+
114
+ cur = build_norm(cur,
115
+ model.output_norm, NULL,
116
+ LLM_NORM_RMS, -1);
117
+
118
+ cb(cur, "result_norm", -1);
119
+ res->t_embd = cur;
120
+
121
+ // lm_head
122
+ cur = build_lora_mm(model.output, cur);
123
+
124
+ cb(cur, "result_output", -1);
125
+ res->t_logits = cur;
126
+
127
+ ggml_build_forward_expand(gf, cur);
128
+ }