@fugood/llama.node 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/CMakeLists.txt +4 -3
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +5 -5
  4. package/src/llama.cpp/CMakeLists.txt +4 -0
  5. package/src/llama.cpp/common/CMakeLists.txt +6 -37
  6. package/src/llama.cpp/common/common.cpp +1 -5
  7. package/src/llama.cpp/common/download.cpp +47 -29
  8. package/src/llama.cpp/common/log.cpp +6 -0
  9. package/src/llama.cpp/common/log.h +2 -0
  10. package/src/llama.cpp/ggml/include/ggml.h +71 -0
  11. package/src/llama.cpp/ggml/src/CMakeLists.txt +16 -0
  12. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -3
  13. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +29 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +283 -0
  15. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +1 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +235 -34
  17. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +289 -277
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +95 -42
  20. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +16 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +2 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +17 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +10 -0
  24. package/src/llama.cpp/src/CMakeLists.txt +6 -0
  25. package/src/llama.cpp/src/llama-arch.cpp +32 -0
  26. package/src/llama.cpp/src/llama-arch.h +2 -0
  27. package/src/llama.cpp/src/llama-graph.cpp +2 -1
  28. package/src/llama.cpp/src/llama-model.cpp +102 -0
  29. package/src/llama.cpp/src/llama-model.h +2 -0
  30. package/src/llama.cpp/src/llama-sampling.cpp +10 -5
  31. package/src/llama.cpp/src/llama-vocab.cpp +16 -1
  32. package/src/llama.cpp/src/llama-vocab.h +1 -0
  33. package/src/llama.cpp/src/models/afmoe.cpp +187 -0
  34. package/src/llama.cpp/src/models/models.h +4 -0
  35. package/src/llama.cpp/src/unicode.cpp +77 -0
package/src/llama.cpp/src/llama-vocab.cpp
@@ -443,6 +443,17 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_AFMOE:
+                regex_exprs = {
+                    // Digit handling - uses custom implementation in unicode.cpp
+                    // Groups digits with leading 1-2 based on total length modulo 3
+                    "\\p{AFMoE_digits}",
+                    // CJK and Asian scripts (using direct Unicode literals)
+                    "[一-鿿㐀-䶿豈-﫿぀-ゟ゠-ヿ・-゚⼀-⿟เ-๿຀-໿ក-៿က-႟ꩠ-ꩿꧠ-꧿가-힯ᄀ-ᇿ]+",
+                    // Main BPE pattern
+                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
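Note that "\\p{AFMoE_digits}" is not a real Unicode property: it is a sentinel string that unicode_regex_split_custom() (patched further down in this diff) matches verbatim to dispatch digit runs to a custom splitter. Under the modulo-3 rule the comments describe, a run of digits is emitted in groups of three, preceded by a group of one or two digits when the run length is not a multiple of three, e.g. "1234" → "1", "234" and "1234567" → "1", "234", "567" (my worked examples, not upstream test cases; a runnable sketch follows the unicode.cpp hunks below).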
@@ -1013,7 +1024,7 @@ private:
     }
 private:
     uint32_t get_node(size_t index) {
-        if (index > xcda_array_size) {
+        if (index >= xcda_array_size) {
             throw std::runtime_error("Index out of array bounds in XCDA array!");
         }
         return xcda_array[index];
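This is a straight off-by-one fix: with `>`, an index equal to `xcda_array_size` passed the check and read one element past the end of `xcda_array`; `>=` rejects it.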
@@ -1993,6 +2004,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "grok-2") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "afmoe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_AFMOE;
+                clean_spaces = false;
             } else if (
                 tokenizer_pre == "minimax-m2") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
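For context: `tokenizer_pre` here is read from the model's GGUF metadata (the `tokenizer.ggml.pre` key in llama.cpp), so a converted AFMoE checkpoint selects this branch by carrying `tokenizer.ggml.pre = "afmoe"`.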
package/src/llama.cpp/src/llama-vocab.h
@@ -50,6 +50,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_GROK_2          = 39,
     LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
     LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2      = 41,
+    LLAMA_VOCAB_PRE_TYPE_AFMOE           = 42,
 };

 struct LLM_KV;
package/src/llama.cpp/src/models/afmoe.cpp (new file)
@@ -0,0 +1,187 @@
+#include "models.h"
+
+llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // MuP scaling: embeddings * sqrt(hidden_size)
+    // mup_enabled = true, hidden_size = 1024, scale = 32.0
+    inpL = ggml_scale(ctx0, inpL, sqrtf(float(n_embd)));
+    cb(inpL, "inp_embd_scaled", -1);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+    auto * inp_attn = build_attn_inp_kv_iswa();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // dual attention normalization (pre)
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * attn_inp = cur;  // save input for gate computation
+
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            // compute gate from input
+            ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, attn_inp);
+            cb(gate, "attn_gate_proj", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+            // Q/K normalization
+            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_normed", il);
+            cb(Kcur, "Kcur_normed", il);
+
+            // RoPE only for sliding_attention layers
+            const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+                ((il + 1) % hparams.n_no_rope_layer_step) != 0;
+            if (use_rope) {
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur_rope", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur_rope", il);
+            }
+
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            cur = build_attn(inp_attn,
+                    NULL, NULL,  // wo will be applied after gating
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+
+            // attention gating: attn_out * sigmoid(gate) BEFORE o_proj
+            gate = ggml_sigmoid(ctx0, gate);
+            cb(gate, "attn_gate_sig", il);
+            cur = ggml_mul(ctx0, cur, gate);
+            cb(cur, "attn_gated", il);
+
+            // now apply output projection
+            cur = build_lora_mm(model.layers[il].wo, cur);
+            cb(cur, "attn_o_proj", il);
+        }
+
+        // dual attention normalization (post)
+        cur = build_norm(cur,
+                model.layers[il].attn_post_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_post_norm", il);
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // dual ffn normalization (pre)
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        // MoE or dense FFN
+        if ((uint32_t)il >= hparams.n_layer_dense_lead) {
+            // MoE layer with sigmoid routing, normalization, and scaling
+            ggml_tensor * moe_out = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    model.layers[il].ffn_exp_probs_b,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU,
+                    hparams.expert_weights_norm,   // norm_w (route_norm=True)
+                    hparams.expert_weights_scale,  // scale_w
+                    hparams.expert_weights_scale,  // w_scale (route_scale=2.826)
+                    (llama_expert_gating_func_type) hparams.expert_gating_func,
+                    il);
+            cb(moe_out, "ffn_moe_out", il);
+
+            // shared expert
+            if (hparams.n_expert_shared > 0) {
+                ggml_tensor * ffn_shexp = build_ffn(cur,
+                        model.layers[il].ffn_up_shexp,   NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(ffn_shexp, "ffn_shexp", il);
+
+                cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                cb(cur, "ffn_out", il);
+            } else {
+                cur = moe_out;
+            }
+        } else {
+            // dense layer
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        }
+
+        // dual ffn normalization (post)
+        cur = build_norm(cur,
+                model.layers[il].ffn_post_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_post_norm", il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
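Two details worth calling out in this graph: RoPE is skipped on every layer where (il + 1) is a multiple of n_no_rope_layer_step (the non-sliding, global-attention layers), and the attention gate is computed from the pre-normalized hidden state, then applied before the output projection. Condensed into math form (my summary of the code above, not an upstream formula; RMS is RMSNorm, σ the logistic sigmoid, ⊙ elementwise product):

```latex
\begin{aligned}
\hat{x} &= \mathrm{RMS}_{\mathrm{attn,pre}}(x) \\
h &= x + \mathrm{RMS}_{\mathrm{attn,post}}\!\left( W_o \left[ \sigma(W_g \hat{x}) \odot \mathrm{Attn}(\hat{x}) \right] \right) \\
y &= h + \mathrm{RMS}_{\mathrm{ffn,post}}\!\left( \mathrm{FFN}\!\left( \mathrm{RMS}_{\mathrm{ffn,pre}}(h) \right) \right)
\end{aligned}
```

where FFN is the dense SwiGLU block for the first n_layer_dense_lead layers and the routed MoE (plus optional shared expert) thereafter.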
package/src/llama.cpp/src/models/models.h
@@ -57,6 +57,10 @@ struct llm_build_rwkv7_base : public llm_graph_context {
         int il) const;
 };

+struct llm_build_afmoe : public llm_graph_context {
+    llm_build_afmoe(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_apertus : public llm_graph_context {
     llm_build_apertus(const llama_model & model, const llm_graph_params & params);
 };
package/src/llama.cpp/src/unicode.cpp
@@ -729,6 +729,80 @@ static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string
     return bpe_offsets;
 }

+// AFMOE digit handling: splits digits with leading 1-2 based on total length modulo 3
+static std::vector<size_t> unicode_regex_split_custom_afmoe(const std::string & text, const std::vector<size_t> & offsets) {
+    std::vector<size_t> bpe_offsets;
+    bpe_offsets.reserve(offsets.size());
+
+    const auto cpts = unicode_cpts_from_utf8(text);
+
+    size_t start = 0;
+    for (auto offset : offsets) {
+        const size_t offset_ini = start;
+        const size_t offset_end = start + offset;
+        assert(offset_end <= cpts.size());
+        start = offset_end;
+
+        auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
+        };
+
+        size_t _prev_end = offset_ini;
+        auto _add_token = [&] (const size_t end) -> size_t {
+            assert(_prev_end <= end && end <= offset_end);
+            size_t len = end - _prev_end;
+            if (len > 0) {
+                bpe_offsets.push_back(len);
+            }
+            _prev_end = end;
+            return len;
+        };
+
+        for (size_t pos = offset_ini; pos < offset_end; ) {
+            const auto flags = _get_flags(pos);
+
+            // Handle digit sequences with special splitting logic
+            if (flags.is_number) {
+                size_t digit_start = pos;
+                size_t digit_count = 0;
+
+                // Count consecutive digits
+                while (_get_flags(pos).is_number && pos < offset_end) {
+                    digit_count++;
+                    pos++;
+                }
+
+                // Split based on total length modulo 3
+                size_t remainder = digit_count % 3;
+                size_t current = digit_start;
+
+                // Emit leading 1-2 digits if needed
+                if (remainder > 0) {
+                    _add_token(current + remainder);
+                    current += remainder;
+                }
+
+                // Emit groups of 3
+                while (current < digit_start + digit_count) {
+                    _add_token(current + 3);
+                    current += 3;
+                }
+                continue;
+            }
+
+            // For non-digits, just move forward
+            pos++;
+        }
+
+        // Add any remaining content
+        if (_prev_end < offset_end) {
+            _add_token(offset_end);
+        }
+    }
+
+    return bpe_offsets;
+}
+
 static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
     std::vector<size_t> bpe_offsets;

@@ -742,6 +816,9 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
     } else if (regex_expr == "\\p{Han}+") {
        // K2's first pattern - handle all K2 patterns together
        bpe_offsets = unicode_regex_split_custom_kimi_k2(text, offsets);
+    } else if (regex_expr == "\\p{AFMoE_digits}") {
+        // AFMOE digit pattern - use custom implementation for proper splitting
+        bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
     }

     return bpe_offsets;
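For a concrete feel of the splitting rule, here is a standalone sketch that reproduces the modulo-3 grouping on an ASCII digit string (illustration only: the real implementation above works on codepoint offsets and unicode flags, not on substrings, and `afmoe_digit_groups` is a hypothetical helper, not part of llama.cpp):

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Mimics unicode_regex_split_custom_afmoe's grouping: a leading group of
// 1-2 digits when the run length is not a multiple of 3, then groups of 3.
static std::vector<std::string> afmoe_digit_groups(const std::string & digits) {
    std::vector<std::string> out;
    size_t pos = digits.size() % 3;  // length of the leading group (0, 1 or 2)
    if (pos > 0) {
        out.push_back(digits.substr(0, pos));
    }
    for (; pos < digits.size(); pos += 3) {
        out.push_back(digits.substr(pos, 3));
    }
    return out;
}

int main() {
    for (const char * run : { "7", "42", "1234", "1234567" }) {
        printf("%-8s ->", run);
        for (const auto & g : afmoe_digit_groups(run)) {
            printf(" \"%s\"", g.c_str());
        }
        printf("\n");
    }
    return 0;
}
```

Running it prints, e.g., `1234567 -> "1" "234" "567"`, matching the behavior described in the tokenizer comment ("Groups digits with leading 1-2 based on total length modulo 3").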