npm - @fugood/llama.node - Versions diffs - 1.3.2 → 1.3.4 - Mend

@fugood/llama.node 1.3.2 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

package/CMakeLists.txt +8 -3
package/package.json +14 -14
package/scripts/llama.cpp.patch +5 -5
package/src/LlamaCompletionWorker.cpp +33 -33
package/src/LlamaContext.cpp +17 -16
package/src/llama.cpp/CMakeLists.txt +4 -0
package/src/llama.cpp/common/CMakeLists.txt +6 -37
package/src/llama.cpp/common/common.cpp +1 -5
package/src/llama.cpp/common/download.cpp +47 -29
package/src/llama.cpp/common/log.cpp +6 -0
package/src/llama.cpp/common/log.h +2 -0
package/src/llama.cpp/ggml/include/ggml.h +71 -0
package/src/llama.cpp/ggml/src/CMakeLists.txt +16 -0
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -3
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +29 -0
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +283 -0
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +1 -0
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +235 -34
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +289 -277
package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -0
package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +95 -42
package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +16 -0
package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +2 -0
package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +17 -0
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +10 -0
package/src/llama.cpp/src/CMakeLists.txt +6 -0
package/src/llama.cpp/src/llama-arch.cpp +32 -0
package/src/llama.cpp/src/llama-arch.h +2 -0
package/src/llama.cpp/src/llama-graph.cpp +2 -1
package/src/llama.cpp/src/llama-model.cpp +102 -0
package/src/llama.cpp/src/llama-model.h +2 -0
package/src/llama.cpp/src/llama-sampling.cpp +10 -5
package/src/llama.cpp/src/llama-vocab.cpp +16 -1
package/src/llama.cpp/src/llama-vocab.h +1 -0
package/src/llama.cpp/src/models/afmoe.cpp +187 -0
package/src/llama.cpp/src/models/models.h +4 -0
package/src/llama.cpp/src/unicode.cpp +77 -0

package/src/llama.cpp/src/llama-vocab.cpp CHANGED Viewed

@@ -443,6 +443,17 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_AFMOE:
+                regex_exprs = {
+                    // Digit handling - uses custom implementation in unicode.cpp
+                    // Groups digits with leading 1-2 based on total length modulo 3
+                    "\\p{AFMoE_digits}",
+                    // CJK and Asian scripts (using direct Unicode literals)
+                    "[一-鿿㐀-䶿豈-﫿぀-ゟ゠-ヿ･-ﾟ⼀-⿟เ-๿຀-໿ក-៿က-႟ꩠ-ꩿꧠ-꧿가-힯ᄀ-ᇿ]+",
+                    // Main BPE pattern
+                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1013,7 +1024,7 @@ private:
         }
     private:
         uint32_t get_node(size_t index) {
-            if (index > xcda_array_size) {
+            if (index >= xcda_array_size) {
                 throw std::runtime_error("Index out of array bounds in XCDA array!");
             }
             return xcda_array[index];
@@ -1993,6 +2004,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "grok-2") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "afmoe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_AFMOE;
+                clean_spaces = false;
             } else if (
                 tokenizer_pre == "minimax-m2") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;

package/src/llama.cpp/src/llama-vocab.h CHANGED Viewed

@@ -50,6 +50,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_GROK_2          = 39,
     LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
     LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2      = 41,
+    LLAMA_VOCAB_PRE_TYPE_AFMOE           = 42,
 };
 struct LLM_KV;

package/src/llama.cpp/src/models/afmoe.cpp ADDED Viewed

@@ -0,0 +1,187 @@
+#include "models.h"
+llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {  // builds the whole AFMoE forward graph into gf; results are stored in res->t_embd / res->t_logits
+    const int64_t n_embd_head = hparams.n_embd_head_v;  // per-head size; K and V head sizes must match (asserted on the next line)
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+    inpL = build_inp_embd(model.tok_embd);
+    // MuP scaling: multiply the token embeddings by sqrt(n_embd)
+    // e.g. n_embd = 1024 gives scale = 32.0 (NOTE(review): presumably the reference config has mup_enabled = true - confirm)
+    inpL = ggml_scale(ctx0, inpL, sqrtf(float(n_embd)));
+    cb(inpL, "inp_embd_scaled", -1);
+    // inp_pos - token positions, consumed by the RoPE calls below
+    ggml_tensor * inp_pos = build_inp_pos();
+    auto * inp_attn = build_attn_inp_kv_iswa();  // NOTE(review): "iswa" presumably means interleaved sliding-window attention - confirm
+    ggml_tensor * inp_out_ids = build_inp_out_ids();  // may be null; used on the last layer to select output rows
+    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));  // 1/sqrt(head size), passed to build_attn
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;  // layer input, kept for the residual add after attention
+        // dual attention normalization, pre half: RMS norm on the layer input
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+        // self-attention
+        {
+            ggml_tensor * attn_inp = cur;  // normed layer input, reused for the gate projection
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+            // gate projection from the same normed input as Q/K/V; the sigmoid is applied after attention
+            ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, attn_inp);
+            cb(gate, "attn_gate_proj", il);
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            // RMS norm on the reshaped Q and K, applied before RoPE
+            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_normed", il);
+            cb(Kcur, "Kcur_normed", il);
+            // RoPE is applied per layer (NOTE(review): presumably the RoPE layers are the sliding-attention ones - confirm)
+            const bool use_rope = hparams.n_no_rope_layer_step > 0 &&  // step == 0 -> no RoPE on any layer
+                                ((il + 1) % hparams.n_no_rope_layer_step) != 0;  // every step-th layer skips RoPE
+            if (use_rope) {
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur_rope", il);
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur_rope", il);
+            }
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+            cur = build_attn(inp_attn,
+                    NULL, NULL,  // no output projection here: wo is applied after gating
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+            // attention gating: attn_out * sigmoid(gate), applied BEFORE o_proj
+            gate = ggml_sigmoid(ctx0, gate);
+            cb(gate, "attn_gate_sig", il);
+            cur = ggml_mul(ctx0, cur, gate);
+            cb(cur, "attn_gated", il);
+            // output projection on the gated attention result
+            cur = build_lora_mm(model.layers[il].wo, cur);
+            cb(cur, "attn_o_proj", il);
+        }
+        // dual attention normalization, post half: RMS norm on the attention output before the residual add
+        cur = build_norm(cur,
+                model.layers[il].attn_post_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_post_norm", il);
+        if (il == n_layer - 1 && inp_out_ids) {  // last layer only: keep just the rows listed in inp_out_ids
+            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);  // residual connection around attention
+        cb(ffn_inp, "ffn_inp", il);
+        // dual ffn normalization, pre half: RMS norm on the FFN input
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+        // the first n_layer_dense_lead layers use a dense FFN, all later layers use MoE
+        if ((uint32_t)il >= hparams.n_layer_dense_lead) {
+            // MoE layer; gating function, weight normalization and scaling are all taken from hparams
+            ggml_tensor * moe_out = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    model.layers[il].ffn_exp_probs_b,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU,
+                    hparams.expert_weights_norm,           // norm_w (NOTE(review): presumably route_norm=True in the reference config - confirm)
+                    hparams.expert_weights_scale,          // scale_w - NOTE(review): same value as w_scale; presumably used as an on/off flag (non-zero = enabled) - confirm against build_moe_ffn
+                    hparams.expert_weights_scale,          // w_scale (NOTE(review): presumably route_scale, e.g. 2.826 - confirm)
+                    (llama_expert_gating_func_type) hparams.expert_gating_func,
+                    il);
+            cb(moe_out, "ffn_moe_out", il);
+            // shared expert: evaluated on the same normed input and added to the routed output
+            if (hparams.n_expert_shared > 0) {
+                ggml_tensor * ffn_shexp = build_ffn(cur,
+                        model.layers[il].ffn_up_shexp,   NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(ffn_shexp, "ffn_shexp", il);
+                cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                cb(cur, "ffn_out", il);
+            } else {
+                cur = moe_out;
+            }
+        } else {
+            // dense layer: up/gate/down FFN with LLM_FFN_SILU and LLM_FFN_PAR
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        }
+        // dual ffn normalization, post half: RMS norm on the FFN output before the residual add
+        cur = build_norm(cur,
+                model.layers[il].ffn_post_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_post_norm", il);
+        cur = ggml_add(ctx0, cur, ffn_inp);  // residual connection around the FFN
+        cur = build_cvec(cur, il);  // NOTE(review): presumably applies the per-layer control vector, if any - confirm
+        cb(cur, "l_out", il);
+        // this layer's output is the next layer's input
+        inpL = cur;
+    }
+    cur = inpL;
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;  // final normed hidden state
+    // lm_head: project the normed hidden state with model.output
+    cur = build_lora_mm(model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+    ggml_build_forward_expand(gf, cur);
+}

package/src/llama.cpp/src/models/models.h CHANGED Viewed

@@ -57,6 +57,10 @@ struct llm_build_rwkv7_base : public llm_graph_context {
                                        int                  il) const;
 };
+struct llm_build_afmoe : public llm_graph_context {  // graph builder for the AFMoE architecture
+    llm_build_afmoe(const llama_model & model, const llm_graph_params & params);  // defined in models/afmoe.cpp; assembles the whole forward graph
+};
 struct llm_build_apertus : public llm_graph_context {  // NOTE(review): presumably the graph builder for the Apertus architecture - definition not visible here
     llm_build_apertus(const llama_model & model, const llm_graph_params & params);
 };

package/src/llama.cpp/src/unicode.cpp CHANGED Viewed

@@ -729,6 +729,80 @@ static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string
     return bpe_offsets;
 }
+// AFMoE digit pre-tokenization (custom handler selected by the "\\p{AFMoE_digits}" pattern).
+//
+// `offsets` holds the lengths, in code points, of consecutive chunks of `text`. Each chunk is
+// split into:
+//   - runs of non-digit code points, each emitted unchanged as a single piece, and
+//   - runs of digits (code points whose flags have is_number set), emitted as groups of three
+//     aligned to the END of the run, so a run whose length is not a multiple of 3 starts with
+//     a shorter group of 1-2 digits (e.g. 7 digits -> 1 + 3 + 3).
+//
+// Returns the lengths, in code points, of the resulting pieces in order; they sum to the sum
+// of `offsets`.
+static std::vector<size_t> unicode_regex_split_custom_afmoe(const std::string & text, const std::vector<size_t> & offsets) {
+    std::vector<size_t> bpe_offsets;
+    bpe_offsets.reserve(offsets.size());
+    const auto cpts = unicode_cpts_from_utf8(text);
+    size_t start = 0;
+    for (auto offset : offsets) {
+        const size_t offset_ini = start;
+        const size_t offset_end = start + offset;
+        assert(offset_end <= cpts.size());
+        start = offset_end;
+        // flags of the code point at `pos`; empty flags for positions outside the current chunk
+        auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
+        };
+        size_t _prev_end = offset_ini;
+        // emits [_prev_end, end) as one piece (nothing when empty) and advances _prev_end
+        auto _add_token = [&] (const size_t end) -> size_t {
+            assert(_prev_end <= end && end <= offset_end);
+            size_t len = end - _prev_end;
+            if (len > 0) {
+                bpe_offsets.push_back(len);
+            }
+            _prev_end = end;
+            return len;
+        };
+        for (size_t pos = offset_ini; pos < offset_end; ) {
+            if (!_get_flags(pos).is_number) {
+                // non-digits accumulate until the next digit run or the end of the chunk
+                pos++;
+                continue;
+            }
+            // Flush the pending non-digit text first. Without this, the first digit group
+            // would be emitted from _prev_end and swallow the text in front of it
+            // (e.g. "abc1234" -> "abc1", "234" instead of "abc", "1", "234").
+            _add_token(pos);
+            const size_t digit_start = pos;
+            // find the end of the digit run (bound checked before the flags are read)
+            while (pos < offset_end && _get_flags(pos).is_number) {
+                pos++;
+            }
+            const size_t digit_end = pos;
+            // leading group of 1-2 digits when the run length is not a multiple of 3
+            const size_t remainder = (digit_end - digit_start) % 3;
+            size_t current = digit_start;
+            if (remainder > 0) {
+                current += remainder;
+                _add_token(current);
+            }
+            // the rest of the run is a whole number of groups of 3
+            while (current < digit_end) {
+                current += 3;
+                _add_token(current);
+            }
+        }
+        // trailing non-digit text, if any (no-op when the chunk ended with a digit run)
+        _add_token(offset_end);
+    }
+    return bpe_offsets;
+}
 static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
     std::vector<size_t> bpe_offsets;
@@ -742,6 +816,9 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
     } else if (regex_expr == "\\p{Han}+") {
         // K2's first pattern - handle all K2 patterns together
         bpe_offsets = unicode_regex_split_custom_kimi_k2(text, offsets);
+    } else if (regex_expr == "\\p{AFMoE_digits}") {
+        // AFMOE digit pattern - use custom implementation for proper splitting
+        bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
     }
     return bpe_offsets;