@fugood/llama.node 1.3.2 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +8 -3
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +5 -5
- package/src/LlamaCompletionWorker.cpp +33 -33
- package/src/LlamaContext.cpp +17 -16
- package/src/llama.cpp/CMakeLists.txt +4 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -37
- package/src/llama.cpp/common/common.cpp +1 -5
- package/src/llama.cpp/common/download.cpp +47 -29
- package/src/llama.cpp/common/log.cpp +6 -0
- package/src/llama.cpp/common/log.h +2 -0
- package/src/llama.cpp/ggml/include/ggml.h +71 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +29 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +283 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +235 -34
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +289 -277
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +95 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +10 -0
- package/src/llama.cpp/src/CMakeLists.txt +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +32 -0
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +2 -1
- package/src/llama.cpp/src/llama-model.cpp +102 -0
- package/src/llama.cpp/src/llama-model.h +2 -0
- package/src/llama.cpp/src/llama-sampling.cpp +10 -5
- package/src/llama.cpp/src/llama-vocab.cpp +16 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/models/afmoe.cpp +187 -0
- package/src/llama.cpp/src/models/models.h +4 -0
- package/src/llama.cpp/src/unicode.cpp +77 -0
|
@@ -443,6 +443,17 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|
|
443
443
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
|
444
444
|
};
|
|
445
445
|
break;
|
|
446
|
+
case LLAMA_VOCAB_PRE_TYPE_AFMOE:
|
|
447
|
+
regex_exprs = {
|
|
448
|
+
// Digit handling - uses custom implementation in unicode.cpp
|
|
449
|
+
// Groups digits with leading 1-2 based on total length modulo 3
|
|
450
|
+
"\\p{AFMoE_digits}",
|
|
451
|
+
// CJK and Asian scripts (using direct Unicode literals)
|
|
452
|
+
"[一-鿿㐀-䶿豈--ゟ゠-ヿ・-゚⼀-เ--ក-က-႟ꩠ-ꩿꧠ-가-ᄀ-ᇿ]+",
|
|
453
|
+
// Main BPE pattern
|
|
454
|
+
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
|
455
|
+
};
|
|
456
|
+
break;
|
|
446
457
|
default:
|
|
447
458
|
// default regex for BPE tokenization pre-processing
|
|
448
459
|
regex_exprs = {
|
|
@@ -1013,7 +1024,7 @@ private:
|
|
|
1013
1024
|
}
|
|
1014
1025
|
private:
|
|
1015
1026
|
uint32_t get_node(size_t index) {
|
|
1016
|
-
if (index > xcda_array_size) {
|
|
1027
|
+
if (index >= xcda_array_size) {
|
|
1017
1028
|
throw std::runtime_error("Index out of array bounds in XCDA array!");
|
|
1018
1029
|
}
|
|
1019
1030
|
return xcda_array[index];
|
|
@@ -1993,6 +2004,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
1993
2004
|
tokenizer_pre == "grok-2") {
|
|
1994
2005
|
pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
|
|
1995
2006
|
clean_spaces = false;
|
|
2007
|
+
} else if (
|
|
2008
|
+
tokenizer_pre == "afmoe") {
|
|
2009
|
+
pre_type = LLAMA_VOCAB_PRE_TYPE_AFMOE;
|
|
2010
|
+
clean_spaces = false;
|
|
1996
2011
|
} else if (
|
|
1997
2012
|
tokenizer_pre == "minimax-m2") {
|
|
1998
2013
|
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
#include "models.h"
|
|
2
|
+
|
|
3
|
+
// Builds the forward graph for the AFMoE architecture.
//
// Per layer: RMS-norm -> Q/K/V + gate projections -> per-head Q/K RMS-norm ->
// optional RoPE -> attention -> sigmoid gating -> output projection -> RMS-norm ->
// residual add -> RMS-norm -> dense or mixture-of-experts FFN -> RMS-norm ->
// residual add. After the last layer the result is normalized and projected to
// logits; both the normalized embeddings and the logits are stored in `res`.
llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    // The reshapes below use a single head width for Q, K and V, so the K and V
    // head sizes have to agree.
    const int64_t n_embd_head = hparams.n_embd_head_v;
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

    // Token embeddings, multiplied by sqrt(n_embd) before entering the first layer.
    ggml_tensor * hidden = build_inp_embd(model.tok_embd);
    hidden = ggml_scale(ctx0, hidden, sqrtf(static_cast<float>(n_embd)));
    cb(hidden, "inp_embd_scaled", -1);

    // Graph inputs shared by all layers: token positions, attention input
    // (built by build_attn_inp_kv_iswa) and the row indices of requested outputs.
    ggml_tensor * inp_pos     = build_inp_pos();
    auto *        inp_attn    = build_attn_inp_kv_iswa();
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    const float kq_scale = 1.0f/sqrtf(static_cast<float>(n_embd_head));

    // Rotary embedding with the context-wide parameters; no frequency factors are passed.
    const auto apply_rope = [&](ggml_tensor * t) -> ggml_tensor * {
        return ggml_rope_ext(
                ctx0, t, inp_pos, nullptr,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow);
    };

    ggml_tensor * cur = nullptr;

    for (int il = 0; il < n_layer; ++il) {
        const auto & layer = model.layers[il];

        // Kept for the residual connection around the attention block.
        ggml_tensor * residual = hidden;

        // Pre-attention RMS norm.
        cur = build_norm(hidden, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // ---- gated self-attention ----
        {
            ggml_tensor * q = build_lora_mm(layer.wq, cur);
            cb(q, "Qcur", il);

            ggml_tensor * k = build_lora_mm(layer.wk, cur);
            cb(k, "Kcur", il);

            ggml_tensor * v = build_lora_mm(layer.wv, cur);
            cb(v, "Vcur", il);

            // The gate is projected from the same normalized input as Q/K/V.
            ggml_tensor * gate = build_lora_mm(layer.wqkv_gate, cur);
            cb(gate, "attn_gate_proj", il);

            q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head,    n_tokens);
            k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);

            // RMS norm on the reshaped Q and K.
            q = build_norm(q, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il);
            k = build_norm(k, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il);
            cb(q, "Qcur_normed", il);
            cb(k, "Kcur_normed", il);

            // RoPE is skipped on every n_no_rope_layer_step-th layer, and on all
            // layers when that step is 0.
            const auto no_rope_step = hparams.n_no_rope_layer_step;
            const bool use_rope     = no_rope_step > 0 && ((il + 1) % no_rope_step) != 0;
            if (use_rope) {
                q = apply_rope(q);
                cb(q, "Qcur_rope", il);

                k = apply_rope(k);
                cb(k, "Kcur_rope", il);
            }

            v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);

            // No output projection is handed to build_attn: it has to run after
            // the gating below.
            cur = build_attn(inp_attn,
                    nullptr, nullptr,
                    q, k, v, nullptr, nullptr, nullptr, kq_scale, il);
            cb(cur, "attn_out", il);

            // attn_out * sigmoid(gate), applied before the output projection.
            gate = ggml_sigmoid(ctx0, gate);
            cb(gate, "attn_gate_sig", il);
            cur = ggml_mul(ctx0, cur, gate);
            cb(cur, "attn_gated", il);

            cur = build_lora_mm(layer.wo, cur);
            cb(cur, "attn_o_proj", il);
        }

        // Post-attention RMS norm.
        cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il);
        cb(cur, "attn_post_norm", il);

        // On the final layer keep only the rows listed in inp_out_ids (when present).
        const bool is_last_layer = il == n_layer - 1;
        if (is_last_layer && inp_out_ids) {
            cur      = ggml_get_rows(ctx0, cur,      inp_out_ids);
            residual = ggml_get_rows(ctx0, residual, inp_out_ids);
        }

        ggml_tensor * ffn_residual = ggml_add(ctx0, cur, residual);
        cb(ffn_residual, "ffn_inp", il);

        // Pre-FFN RMS norm.
        cur = build_norm(ffn_residual, layer.ffn_norm, nullptr, LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

        // The first n_layer_dense_lead layers use a dense FFN, the rest use experts.
        const bool is_dense_layer = static_cast<uint32_t>(il) < hparams.n_layer_dense_lead;
        if (is_dense_layer) {
            cur = build_ffn(cur,
                    layer.ffn_up,   nullptr, nullptr,
                    layer.ffn_gate, nullptr, nullptr,
                    layer.ffn_down, nullptr, nullptr,
                    nullptr,
                    LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            // Routed experts; routing behaviour comes from the hparams passed below.
            ggml_tensor * routed = build_moe_ffn(cur,
                    layer.ffn_gate_inp,
                    layer.ffn_up_exps,
                    layer.ffn_gate_exps,
                    layer.ffn_down_exps,
                    layer.ffn_exp_probs_b,
                    n_expert, n_expert_used,
                    LLM_FFN_SILU,
                    hparams.expert_weights_norm,   // norm_w
                    hparams.expert_weights_scale,  // scale_w
                    hparams.expert_weights_scale,  // w_scale
                    (llama_expert_gating_func_type) hparams.expert_gating_func,
                    il);
            cb(routed, "ffn_moe_out", il);

            if (hparams.n_expert_shared > 0) {
                // The shared expert output is added to the routed output.
                ggml_tensor * shared = build_ffn(cur,
                        layer.ffn_up_shexp,   nullptr, nullptr,
                        layer.ffn_gate_shexp, nullptr, nullptr,
                        layer.ffn_down_shexp, nullptr, nullptr,
                        nullptr,
                        LLM_FFN_SILU, LLM_FFN_PAR, il);
                cb(shared, "ffn_shexp", il);

                cur = ggml_add(ctx0, routed, shared);
                cb(cur, "ffn_out", il);
            } else {
                cur = routed;
            }
        }

        // Post-FFN RMS norm.
        cur = build_norm(cur, layer.ffn_post_norm, nullptr, LLM_NORM_RMS, il);
        cb(cur, "ffn_post_norm", il);

        // Second residual connection, then build_cvec for this layer.
        cur = ggml_add(ctx0, cur, ffn_residual);
        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // Feed the next layer.
        hidden = cur;
    }

    // Final RMS norm over the last layer's output.
    cur = build_norm(hidden, model.output_norm, nullptr, LLM_NORM_RMS, -1);
    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(model.output, cur);
    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}
|
|
@@ -57,6 +57,10 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
|
|
57
57
|
int il) const;
|
|
58
58
|
};
|
|
59
59
|
|
|
60
|
+
// Graph context for the AFMoE architecture. The only member declared here is the
// constructor, which receives the model and the graph parameters.
struct llm_build_afmoe : public llm_graph_context {
|
|
61
|
+
llm_build_afmoe(const llama_model & model, const llm_graph_params & params);
|
|
62
|
+
};
|
|
63
|
+
|
|
60
64
|
struct llm_build_apertus : public llm_graph_context {
|
|
61
65
|
llm_build_apertus(const llama_model & model, const llm_graph_params & params);
|
|
62
66
|
};
|
|
@@ -729,6 +729,80 @@ static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string
|
|
|
729
729
|
return bpe_offsets;
|
|
730
730
|
}
|
|
731
731
|
|
|
732
|
+
// AFMOE digit handling: splits digits with leading 1-2 based on total length modulo 3
|
|
733
|
+
static std::vector<size_t> unicode_regex_split_custom_afmoe(const std::string & text, const std::vector<size_t> & offsets) {
|
|
734
|
+
std::vector<size_t> bpe_offsets;
|
|
735
|
+
bpe_offsets.reserve(offsets.size());
|
|
736
|
+
|
|
737
|
+
const auto cpts = unicode_cpts_from_utf8(text);
|
|
738
|
+
|
|
739
|
+
size_t start = 0;
|
|
740
|
+
for (auto offset : offsets) {
|
|
741
|
+
const size_t offset_ini = start;
|
|
742
|
+
const size_t offset_end = start + offset;
|
|
743
|
+
assert(offset_end <= cpts.size());
|
|
744
|
+
start = offset_end;
|
|
745
|
+
|
|
746
|
+
auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
|
|
747
|
+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
|
|
748
|
+
};
|
|
749
|
+
|
|
750
|
+
size_t _prev_end = offset_ini;
|
|
751
|
+
auto _add_token = [&] (const size_t end) -> size_t {
|
|
752
|
+
assert(_prev_end <= end && end <= offset_end);
|
|
753
|
+
size_t len = end - _prev_end;
|
|
754
|
+
if (len > 0) {
|
|
755
|
+
bpe_offsets.push_back(len);
|
|
756
|
+
}
|
|
757
|
+
_prev_end = end;
|
|
758
|
+
return len;
|
|
759
|
+
};
|
|
760
|
+
|
|
761
|
+
for (size_t pos = offset_ini; pos < offset_end; ) {
|
|
762
|
+
const auto flags = _get_flags(pos);
|
|
763
|
+
|
|
764
|
+
// Handle digit sequences with special splitting logic
|
|
765
|
+
if (flags.is_number) {
|
|
766
|
+
size_t digit_start = pos;
|
|
767
|
+
size_t digit_count = 0;
|
|
768
|
+
|
|
769
|
+
// Count consecutive digits
|
|
770
|
+
while (_get_flags(pos).is_number && pos < offset_end) {
|
|
771
|
+
digit_count++;
|
|
772
|
+
pos++;
|
|
773
|
+
}
|
|
774
|
+
|
|
775
|
+
// Split based on total length modulo 3
|
|
776
|
+
size_t remainder = digit_count % 3;
|
|
777
|
+
size_t current = digit_start;
|
|
778
|
+
|
|
779
|
+
// Emit leading 1-2 digits if needed
|
|
780
|
+
if (remainder > 0) {
|
|
781
|
+
_add_token(current + remainder);
|
|
782
|
+
current += remainder;
|
|
783
|
+
}
|
|
784
|
+
|
|
785
|
+
// Emit groups of 3
|
|
786
|
+
while (current < digit_start + digit_count) {
|
|
787
|
+
_add_token(current + 3);
|
|
788
|
+
current += 3;
|
|
789
|
+
}
|
|
790
|
+
continue;
|
|
791
|
+
}
|
|
792
|
+
|
|
793
|
+
// For non-digits, just move forward
|
|
794
|
+
pos++;
|
|
795
|
+
}
|
|
796
|
+
|
|
797
|
+
// Add any remaining content
|
|
798
|
+
if (_prev_end < offset_end) {
|
|
799
|
+
_add_token(offset_end);
|
|
800
|
+
}
|
|
801
|
+
}
|
|
802
|
+
|
|
803
|
+
return bpe_offsets;
|
|
804
|
+
}
|
|
805
|
+
|
|
732
806
|
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
|
|
733
807
|
std::vector<size_t> bpe_offsets;
|
|
734
808
|
|
|
@@ -742,6 +816,9 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
|
|
|
742
816
|
} else if (regex_expr == "\\p{Han}+") {
|
|
743
817
|
// K2's first pattern - handle all K2 patterns together
|
|
744
818
|
bpe_offsets = unicode_regex_split_custom_kimi_k2(text, offsets);
|
|
819
|
+
} else if (regex_expr == "\\p{AFMoE_digits}") {
|
|
820
|
+
// AFMOE digit pattern - use custom implementation for proper splitting
|
|
821
|
+
bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
|
|
745
822
|
}
|
|
746
823
|
|
|
747
824
|
return bpe_offsets;
|