@fugood/llama.node 1.3.7 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/lib/binding.js +18 -1
  2. package/lib/binding.ts +19 -1
  3. package/lib/index.js +3 -3
  4. package/lib/index.ts +1 -1
  5. package/package.json +15 -15
  6. package/scripts/llama.cpp.patch +7 -7
  7. package/src/LlamaCompletionWorker.cpp +2 -2
  8. package/src/llama.cpp/common/arg.cpp +27 -2
  9. package/src/llama.cpp/common/chat-parser.cpp +968 -0
  10. package/src/llama.cpp/common/chat.cpp +0 -952
  11. package/src/llama.cpp/common/common.cpp +55 -0
  12. package/src/llama.cpp/common/common.h +18 -0
  13. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -2
  14. package/src/llama.cpp/ggml/CMakeLists.txt +6 -4
  15. package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -1
  16. package/src/llama.cpp/ggml/include/ggml.h +12 -4
  17. package/src/llama.cpp/ggml/src/CMakeLists.txt +26 -4
  18. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -15
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +721 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +22 -2
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +9 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +71 -4
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +243 -4
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +6 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +84 -85
  28. package/src/llama.cpp/include/llama.h +18 -0
  29. package/src/llama.cpp/src/CMakeLists.txt +2 -0
  30. package/src/llama.cpp/src/llama-arch.cpp +95 -16
  31. package/src/llama.cpp/src/llama-arch.h +15 -0
  32. package/src/llama.cpp/src/llama-context.cpp +7 -3
  33. package/src/llama.cpp/src/llama-graph.cpp +3 -3
  34. package/src/llama.cpp/src/llama-hparams.h +1 -1
  35. package/src/llama.cpp/src/llama-model.cpp +141 -6
  36. package/src/llama.cpp/src/llama-model.h +4 -0
  37. package/src/llama.cpp/src/llama-quant.cpp +13 -5
  38. package/src/llama.cpp/src/models/lfm2.cpp +5 -3
  39. package/src/llama.cpp/src/models/models.h +55 -1
  40. package/src/llama.cpp/src/models/qwen3next.cpp +1042 -0
  41. package/src/llama.cpp/src/models/rnd1.cpp +126 -0
@@ -0,0 +1,126 @@
+ #include "models.h"
+
+ // RND1 is a Qwen3Moe AR model converted to diffusion model.
+ llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+     const int64_t n_embd_head = hparams.n_embd_head_v;
+
+     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+     GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+     ggml_tensor * cur;
+     ggml_tensor * inpL;
+
+     inpL = build_inp_embd(model.tok_embd);
+
+     // inp_pos - contains the positions
+     ggml_tensor * inp_pos = build_inp_pos();
+
+     // Non-causal attention for diffusion
+     auto * inp_attn = build_attn_inp_no_cache();
+
+     ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+     for (int il = 0; il < n_layer; ++il) {
+         ggml_tensor * inpSA = inpL;
+
+         // norm
+         cur = build_norm(inpL,
+                 model.layers[il].attn_norm, NULL,
+                 LLM_NORM_RMS, il);
+         cb(cur, "attn_norm", il);
+
+         // self_attention
+         {
+             // compute Q and K and RoPE them
+             ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+             cb(Qcur, "Qcur", il);
+
+             ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+             cb(Kcur, "Kcur", il);
+
+             ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+             cb(Vcur, "Vcur", il);
+
+             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+             Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+             cb(Qcur, "Qcur_normed", il);
+
+             Qcur = ggml_rope_ext(
+                     ctx0, Qcur, inp_pos, nullptr,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                     );
+
+             Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+             cb(Kcur, "Kcur_normed", il);
+
+             Kcur = ggml_rope_ext(
+                     ctx0, Kcur, inp_pos, nullptr,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                     );
+
+             cb(Qcur, "Qcur", il);
+             cb(Kcur, "Kcur", il);
+             cb(Vcur, "Vcur", il);
+
+             cur = build_attn(inp_attn,
+                     model.layers[il].wo, model.layers[il].bo,
+                     Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+         }
+         if (il == n_layer - 1 && inp_out_ids) {
+             cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+         }
+         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+         cb(ffn_inp, "ffn_inp", il);
+
+         // MoE branch
+         cur = build_norm(ffn_inp,
+                 model.layers[il].ffn_norm, NULL,
+                 LLM_NORM_RMS, il);
+         cb(cur, "ffn_norm", il);
+
+         ggml_tensor * moe_out =
+             build_moe_ffn(cur,
+                     model.layers[il].ffn_gate_inp,
+                     model.layers[il].ffn_up_exps,
+                     model.layers[il].ffn_gate_exps,
+                     model.layers[il].ffn_down_exps,
+                     nullptr,
+                     n_expert, n_expert_used,
+                     LLM_FFN_SILU, true,
+                     false, 0.0,
+                     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                     il);
+         cb(moe_out, "ffn_moe_out", il);
+         cur = moe_out;
+
+         cur = ggml_add(ctx0, cur, ffn_inp);
+
+         cur = build_cvec(cur, il);
+         cb(cur, "l_out", il);
+
+         // input for next layer
+         inpL = cur;
+     }
+     cur = inpL;
+
+     cur = build_norm(cur,
+             model.output_norm, NULL,
+             LLM_NORM_RMS, -1);
+
+     cb(cur, "result_norm", -1);
+     res->t_embd = cur;
+
+     // lm_head
+     cur = build_lora_mm(model.output, cur);
+
+     cb(cur, "result_output", -1);
+     res->t_logits = cur;
+
+     ggml_build_forward_expand(gf, cur);
+ }
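
Note on the new rnd1.cpp graph above: unlike the autoregressive Qwen3-MoE graph it is derived from, RND1 requests a non-causal attention input (build_attn_inp_no_cache), because diffusion-style decoding attends across all positions in both directions rather than left-to-right. The standalone sketch below is illustrative only and not part of the package or of llama.cpp; it simply contrasts a causal mask with the full bidirectional mask such a model implies, using a hypothetical boolean-matrix representation and a made-up sequence length.

// Illustrative sketch (not llama.cpp code): causal vs. non-causal attention masks.
// A causal mask lets token i attend only to positions j <= i; a non-causal mask,
// as used for diffusion-style models, lets every token attend to every position.
#include <cstdio>
#include <vector>

int main() {
    const int n_tokens = 4; // hypothetical sequence length for the demo

    std::vector<std::vector<bool>> causal(n_tokens, std::vector<bool>(n_tokens));
    std::vector<std::vector<bool>> non_causal(n_tokens, std::vector<bool>(n_tokens));

    for (int i = 0; i < n_tokens; ++i) {
        for (int j = 0; j < n_tokens; ++j) {
            causal[i][j]     = (j <= i); // autoregressive: no attending to future tokens
            non_causal[i][j] = true;     // diffusion: full bidirectional attention
        }
    }

    printf("causal | non-causal (1 = may attend):\n");
    for (int i = 0; i < n_tokens; ++i) {
        for (int j = 0; j < n_tokens; ++j) printf("%d ", (int) causal[i][j]);
        printf("| ");
        for (int j = 0; j < n_tokens; ++j) printf("%d ", (int) non_causal[i][j]);
        printf("\n");
    }
    return 0;
}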