@fugood/llama.node 1.3.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +8 -8
  3. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  4. package/src/llama.cpp/common/arg.cpp +44 -999
  5. package/src/llama.cpp/common/arg.h +2 -2
  6. package/src/llama.cpp/common/chat.cpp +17 -2
  7. package/src/llama.cpp/common/common.cpp +33 -0
  8. package/src/llama.cpp/common/common.h +15 -1
  9. package/src/llama.cpp/common/download.cpp +1054 -0
  10. package/src/llama.cpp/common/download.h +55 -0
  11. package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
  12. package/src/llama.cpp/ggml/include/ggml.h +2 -0
  13. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -11
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +21 -21
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -75
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +0 -4
  23. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
  24. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
  25. package/src/llama.cpp/include/llama.h +7 -3
  26. package/src/llama.cpp/src/CMakeLists.txt +95 -0
  27. package/src/llama.cpp/src/llama-arch.cpp +108 -0
  28. package/src/llama.cpp/src/llama-arch.h +11 -0
  29. package/src/llama.cpp/src/llama-batch.cpp +63 -31
  30. package/src/llama.cpp/src/llama-batch.h +12 -1
  31. package/src/llama.cpp/src/llama-chat.cpp +32 -0
  32. package/src/llama.cpp/src/llama-chat.h +1 -0
  33. package/src/llama.cpp/src/llama-context.cpp +36 -13
  34. package/src/llama.cpp/src/llama-context.h +5 -5
  35. package/src/llama.cpp/src/llama-cparams.h +1 -0
  36. package/src/llama.cpp/src/llama-graph.cpp +3 -3
  37. package/src/llama.cpp/src/llama-hparams.cpp +11 -1
  38. package/src/llama.cpp/src/llama-hparams.h +6 -0
  39. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
  40. package/src/llama.cpp/src/llama-kv-cache.cpp +33 -1
  41. package/src/llama.cpp/src/llama-kv-cells.h +44 -2
  42. package/src/llama.cpp/src/llama-memory-recurrent.cpp +4 -3
  43. package/src/llama.cpp/src/llama-model.cpp +320 -13171
  44. package/src/llama.cpp/src/llama-model.h +8 -0
  45. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  46. package/src/llama.cpp/src/llama-vocab.cpp +5 -0
  47. package/src/llama.cpp/src/llama-vocab.h +1 -0
  48. package/src/llama.cpp/src/models/apertus.cpp +125 -0
  49. package/src/llama.cpp/src/models/arcee.cpp +135 -0
  50. package/src/llama.cpp/src/models/arctic.cpp +138 -0
  51. package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
  52. package/src/llama.cpp/src/models/baichuan.cpp +122 -0
  53. package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
  54. package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  55. package/src/llama.cpp/src/models/bert.cpp +176 -0
  56. package/src/llama.cpp/src/models/bitnet.cpp +160 -0
  57. package/src/llama.cpp/src/models/bloom.cpp +101 -0
  58. package/src/llama.cpp/src/models/chameleon.cpp +178 -0
  59. package/src/llama.cpp/src/models/chatglm.cpp +132 -0
  60. package/src/llama.cpp/src/models/codeshell.cpp +111 -0
  61. package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
  62. package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  63. package/src/llama.cpp/src/models/command-r.cpp +122 -0
  64. package/src/llama.cpp/src/models/dbrx.cpp +123 -0
  65. package/src/llama.cpp/src/models/deci.cpp +135 -0
  66. package/src/llama.cpp/src/models/deepseek.cpp +144 -0
  67. package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
  68. package/src/llama.cpp/src/models/dots1.cpp +134 -0
  69. package/src/llama.cpp/src/models/dream.cpp +105 -0
  70. package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  71. package/src/llama.cpp/src/models/ernie4-5.cpp +110 -0
  72. package/src/llama.cpp/src/models/exaone.cpp +114 -0
  73. package/src/llama.cpp/src/models/exaone4.cpp +123 -0
  74. package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
  75. package/src/llama.cpp/src/models/falcon.cpp +120 -0
  76. package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  77. package/src/llama.cpp/src/models/gemma.cpp +112 -0
  78. package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  79. package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  80. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  81. package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
  82. package/src/llama.cpp/src/models/glm4.cpp +127 -0
  83. package/src/llama.cpp/src/models/gpt2.cpp +105 -0
  84. package/src/llama.cpp/src/models/gptneox.cpp +144 -0
  85. package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  86. package/src/llama.cpp/src/models/granite.cpp +211 -0
  87. package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  88. package/src/llama.cpp/src/models/grok.cpp +159 -0
  89. package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
  90. package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  91. package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  92. package/src/llama.cpp/src/models/internlm2.cpp +120 -0
  93. package/src/llama.cpp/src/models/jais.cpp +86 -0
  94. package/src/llama.cpp/src/models/jamba.cpp +106 -0
  95. package/src/llama.cpp/src/models/lfm2.cpp +173 -0
  96. package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
  97. package/src/llama.cpp/src/models/llada.cpp +99 -0
  98. package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
  99. package/src/llama.cpp/src/models/llama.cpp +155 -0
  100. package/src/llama.cpp/src/models/mamba.cpp +55 -0
  101. package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
  102. package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
  103. package/src/llama.cpp/src/models/models.h +481 -0
  104. package/src/llama.cpp/src/models/mpt.cpp +126 -0
  105. package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
  106. package/src/llama.cpp/src/models/nemotron.cpp +122 -0
  107. package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
  108. package/src/llama.cpp/src/models/olmo.cpp +121 -0
  109. package/src/llama.cpp/src/models/olmo2.cpp +150 -0
  110. package/src/llama.cpp/src/models/olmoe.cpp +124 -0
  111. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
  112. package/src/llama.cpp/src/models/openelm.cpp +124 -0
  113. package/src/llama.cpp/src/models/orion.cpp +123 -0
  114. package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  115. package/src/llama.cpp/src/models/phi2.cpp +121 -0
  116. package/src/llama.cpp/src/models/phi3.cpp +152 -0
  117. package/src/llama.cpp/src/models/plamo.cpp +110 -0
  118. package/src/llama.cpp/src/models/plamo2.cpp +316 -0
  119. package/src/llama.cpp/src/models/plm.cpp +168 -0
  120. package/src/llama.cpp/src/models/qwen.cpp +108 -0
  121. package/src/llama.cpp/src/models/qwen2.cpp +117 -0
  122. package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
  123. package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
  124. package/src/llama.cpp/src/models/qwen3.cpp +117 -0
  125. package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
  126. package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  127. package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
  128. package/src/llama.cpp/src/models/refact.cpp +94 -0
  129. package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  130. package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
  131. package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  132. package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  133. package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
  134. package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
  135. package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
  136. package/src/llama.cpp/src/models/smollm3.cpp +128 -0
  137. package/src/llama.cpp/src/models/stablelm.cpp +146 -0
  138. package/src/llama.cpp/src/models/starcoder.cpp +100 -0
  139. package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
  140. package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
  141. package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
  142. package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  143. package/src/llama.cpp/src/models/xverse.cpp +108 -0
@@ -0,0 +1,86 @@
1
+ #include "models.h"
2
+
3
+ llm_build_rwkv6qwen2::llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) {
4
+ GGML_ASSERT(n_embd == hparams.n_embd_r());
5
+
6
+ ggml_tensor * cur;
7
+ ggml_tensor * inpL;
8
+
9
+ inpL = build_inp_embd(model.tok_embd);
10
+
11
+ auto * rs_inp = build_rs_inp();
12
+
13
+ const auto n_embd = hparams.n_embd;
14
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
15
+ const auto n_seqs = ubatch.n_seqs;
16
+
17
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
18
+
19
+ for (int il = 0; il < n_layer; ++il) {
20
+ const llama_layer * layer = &model.layers[il];
21
+ inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
22
+
23
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
24
+
25
+ ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
26
+ cb(att_norm, "attn_norm", il);
27
+
28
+ ggml_tensor * x_prev = ggml_concat(
29
+ ctx0,
30
+ token_shift,
31
+ ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
32
+ 1
33
+ );
34
+
35
+ cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il);
36
+
37
+ token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
38
+ ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
39
+
40
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
41
+ cb(ffn_inp, "ffn_inp", il);
42
+
43
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
44
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
45
+
46
+ if (il == n_layer - 1 && inp_out_ids) {
47
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
48
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
49
+ }
50
+
51
+ // feed-forward network
52
+ cur = build_norm(ffn_inp,
53
+ model.layers[il].ffn_norm, NULL,
54
+ LLM_NORM_RMS, il);
55
+ cb(cur, "ffn_norm", il);
56
+
57
+ cur = build_ffn(cur,
58
+ model.layers[il].ffn_up, NULL, NULL,
59
+ model.layers[il].ffn_gate, NULL, NULL,
60
+ model.layers[il].ffn_down, NULL, NULL,
61
+ NULL,
62
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
63
+ cb(cur, "ffn_out", il);
64
+
65
+ cur = ggml_add(ctx0, cur, ffn_inp);
66
+
67
+ cur = build_cvec(cur, il);
68
+ cb(cur, "l_out", il);
69
+
70
+ // input for next layer
71
+ inpL = cur;
72
+ }
73
+
74
+ cur = inpL;
75
+ cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
76
+
77
+ cb(cur, "result_norm", -1);
78
+ res->t_embd = cur;
79
+
80
+ cur = build_lora_mm(model.output, cur);
81
+
82
+ cb(cur, "result_output", -1);
83
+ res->t_logits = cur;
84
+
85
+ ggml_build_forward_expand(gf, cur);
86
+ }
@@ -0,0 +1,135 @@
1
+ #include "models.h"
2
+
3
+ llm_build_rwkv7_base::llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params) :
4
+ llm_graph_context(params),
5
+ model(model) {}
6
+
7
+ ggml_tensor * llm_build_rwkv7_base::build_rwkv7_channel_mix(const llama_layer * layer,
8
+ ggml_tensor * cur,
9
+ ggml_tensor * x_prev,
10
+ llm_arch arch) const {
11
+ ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
12
+ switch (arch) {
13
+ case LLM_ARCH_RWKV7:
14
+ {
15
+ ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
16
+
17
+ ggml_tensor * k = ggml_sqr(ctx0, ggml_relu(ctx0, build_lora_mm(layer->channel_mix_key, xk)));
18
+
19
+ cur = build_lora_mm(layer->channel_mix_value, k);
20
+ }
21
+ break;
22
+ default:
23
+ GGML_ABORT("fatal error");
24
+ }
25
+ return cur;
26
+ }
27
+
28
+ ggml_tensor * llm_build_rwkv7_base::build_rwkv7_time_mix(llm_graph_input_rs * inp,
29
+ ggml_tensor * cur,
30
+ ggml_tensor * x_prev,
31
+ ggml_tensor *& first_layer_value,
32
+ const llama_ubatch & ubatch,
33
+ int il) const {
34
+ const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
35
+
36
+ const auto n_tokens = ubatch.n_tokens;
37
+ const auto n_seqs = ubatch.n_seqs;
38
+ const auto n_embd = hparams.n_embd;
39
+ const auto head_size = hparams.wkv_head_size;
40
+ const auto head_count = n_embd / head_size;
41
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
42
+
43
+ const auto kv_head = mctx_cur->get_head();
44
+
45
+ const auto & layer = model.layers[il];
46
+
47
+ bool has_gating = layer.time_mix_g1 && layer.time_mix_g2;
48
+
49
+ ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
50
+ ggml_tensor * dummy = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_embd, n_seq_tokens, n_seqs, has_gating ? 6 : 5);
51
+ sx = ggml_repeat(ctx0, sx, dummy);
52
+
53
+ ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_fused), cur);
54
+
55
+ ggml_tensor * xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
56
+ ggml_tensor * xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
57
+ ggml_tensor * xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
58
+ ggml_tensor * xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
59
+ ggml_tensor * xa = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
60
+ ggml_tensor * xg =
61
+ has_gating ? ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 5 * sizeof(float)) :
62
+ nullptr;
63
+
64
+ ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
65
+ ggml_tensor * w = ggml_add(
66
+ ctx0, ggml_mul_mat(ctx0, layer.time_mix_w2, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_w1, xw))),
67
+ layer.time_mix_w0);
68
+ w = ggml_exp(ctx0, ggml_scale(ctx0, ggml_sigmoid(ctx0, w), -0.606531));
69
+
70
+ ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
71
+ ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
72
+ if (first_layer_value == nullptr) {
73
+ first_layer_value = v;
74
+ } else {
75
+ // Add the first layer value as a residual connection.
76
+ v = ggml_add(ctx0, v,
77
+ ggml_mul(ctx0, ggml_sub(ctx0, first_layer_value, v),
78
+ ggml_sigmoid(ctx0, ggml_add(ctx0,
79
+ ggml_mul_mat(ctx0, layer.time_mix_v2,
80
+ ggml_mul_mat(ctx0, layer.time_mix_v1, xv)),
81
+ layer.time_mix_v0))));
82
+ }
83
+ ggml_tensor * g = nullptr;
84
+ if (layer.time_mix_g1 && layer.time_mix_g2) {
85
+ g = ggml_mul_mat(ctx0, layer.time_mix_g2, ggml_sigmoid(ctx0, ggml_mul_mat(ctx0, layer.time_mix_g1, xg)));
86
+ }
87
+ ggml_tensor * a = ggml_sigmoid(
88
+ ctx0, ggml_add(ctx0, ggml_mul_mat(ctx0, layer.time_mix_a2, ggml_mul_mat(ctx0, layer.time_mix_a1, xa)),
89
+ layer.time_mix_a0));
90
+
91
+ ggml_tensor * kk = ggml_reshape_3d(ctx0, ggml_mul(ctx0, k, layer.time_mix_k_k), head_size, head_count, n_tokens);
92
+ kk = ggml_l2_norm(ctx0, kk, 1e-12);
93
+
94
+ ggml_tensor * ka = ggml_mul(ctx0, k, layer.time_mix_k_a);
95
+ k = ggml_add(ctx0, k, ggml_sub(ctx0, ggml_mul(ctx0, a, ka), ka));
96
+
97
+ r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens);
98
+ w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens);
99
+ k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens);
100
+ v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
101
+ a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
102
+
103
+ ggml_tensor * wkv_state = build_rs(inp, mctx_cur->get_s_l(il), hparams.n_embd_s(), n_seqs);
104
+
105
+ ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
106
+ cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
107
+ wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
108
+
109
+ ggml_build_forward_expand(
110
+ gf, ggml_cpy(ctx0, wkv_state,
111
+ ggml_view_1d(ctx0, mctx_cur->get_s_l(il), hparams.n_embd_s() * n_seqs,
112
+ hparams.n_embd_s() * kv_head * ggml_element_size(mctx_cur->get_s_l(il)))));
113
+
114
+ if (layer.time_mix_ln && layer.time_mix_ln_b) {
115
+ // group norm with head_count groups
116
+ cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens);
117
+ cur = ggml_norm(ctx0, cur, 64e-5f);
118
+
119
+ // Convert back to regular vectors.
120
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
121
+ cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
122
+ } else {
123
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
124
+ }
125
+ ggml_tensor * rk = ggml_sum_rows(
126
+ ctx0, ggml_mul(ctx0, ggml_mul(ctx0, k, r), ggml_reshape_2d(ctx0, layer.time_mix_r_k, head_size, head_count)));
127
+ cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, ggml_mul(ctx0, v, rk), n_embd, n_tokens));
128
+
129
+ if (has_gating) {
130
+ cur = ggml_mul(ctx0, cur, g);
131
+ }
132
+ cur = build_lora_mm(layer.time_mix_output, cur);
133
+
134
+ return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
135
+ }
@@ -0,0 +1,90 @@
1
+ #include "models.h"
2
+
3
+ llm_build_rwkv7::llm_build_rwkv7(const llama_model & model, const llm_graph_params & params) :
4
+ llm_build_rwkv7_base(model, params) {
5
+ GGML_ASSERT(hparams.token_shift_count == 2);
6
+
7
+ ggml_tensor * cur;
8
+ ggml_tensor * inpL;
9
+ ggml_tensor * v_first = nullptr;
10
+
11
+ inpL = build_inp_embd(model.tok_embd);
12
+ inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
13
+
14
+ auto * rs_inp = build_rs_inp();
15
+
16
+ const auto n_embd = hparams.n_embd;
17
+ const auto n_seq_tokens = ubatch.n_seq_tokens;
18
+ const auto n_seqs = ubatch.n_seqs;
19
+
20
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
21
+
22
+ for (int il = 0; il < n_layer; ++il) {
23
+ const llama_layer * layer = &model.layers[il];
24
+ inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
25
+
26
+ ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il);
27
+
28
+ ggml_tensor * att_shift =
29
+ ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
30
+ ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1],
31
+ token_shift->nb[2], n_embd * ggml_element_size(token_shift));
32
+
33
+ ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
34
+ cb(att_norm, "attn_norm", il);
35
+
36
+ ggml_tensor * x_prev = ggml_concat(
37
+ ctx0, att_shift,
38
+ ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), 1);
39
+
40
+ cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il);
41
+
42
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
43
+ cb(ffn_inp, "ffn_inp", il);
44
+
45
+ ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
46
+ cb(ffn_norm, "ffn_norm", il);
47
+
48
+ x_prev = ggml_concat(
49
+ ctx0, ffn_shift,
50
+ ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0), 1);
51
+
52
+ token_shift = ggml_concat(ctx0,
53
+ ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2],
54
+ (n_seq_tokens - 1) * n_embd * ggml_element_size(att_norm)),
55
+ ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2],
56
+ (n_seq_tokens - 1) * n_embd * ggml_element_size(ffn_norm)),
57
+ 1);
58
+ ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
59
+
60
+ ffn_inp = ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens);
61
+ ffn_norm = ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens);
62
+ x_prev = ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens);
63
+
64
+ if (il == n_layer - 1 && inp_out_ids) {
65
+ ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
66
+ ffn_norm = ggml_get_rows(ctx0, ffn_norm, inp_out_ids);
67
+ x_prev = ggml_get_rows(ctx0, x_prev, inp_out_ids);
68
+ }
69
+ cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
70
+ cur = ggml_add(ctx0, cur, ffn_inp);
71
+
72
+ cur = build_cvec(cur, il);
73
+ cb(cur, "l_out", il);
74
+
75
+ // input for next layer
76
+ inpL = cur;
77
+ }
78
+ cur = inpL;
79
+ cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
80
+
81
+ cb(cur, "result_norm", -1);
82
+ res->t_embd = cur;
83
+
84
+ cur = build_lora_mm(model.output, cur);
85
+
86
+ cb(cur, "result_output", -1);
87
+ res->t_logits = cur;
88
+
89
+ ggml_build_forward_expand(gf, cur);
90
+ }
@@ -0,0 +1,124 @@
1
+ #include "models.h"
2
+
3
+ llm_build_seed_oss::llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
4
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5
+
6
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
8
+
9
+ ggml_tensor * cur;
10
+ ggml_tensor * inpL;
11
+
12
+ inpL = build_inp_embd(model.tok_embd);
13
+
14
+ // inp_pos - contains the positions
15
+ ggml_tensor * inp_pos = build_inp_pos();
16
+
17
+ auto * inp_attn = build_attn_inp_kv();
18
+
19
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
20
+
21
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
22
+
23
+ for (int il = 0; il < n_layer; ++il) {
24
+ ggml_tensor * inpSA = inpL;
25
+
26
+ // norm
27
+ cur = build_norm(inpL,
28
+ model.layers[il].attn_norm, NULL,
29
+ LLM_NORM_RMS, il);
30
+ cb(cur, "attn_norm", il);
31
+
32
+ // self-attention
33
+ {
34
+ // compute Q and K and RoPE them
35
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
36
+ cb(Qcur, "Qcur", il);
37
+ if (model.layers[il].bq) {
38
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
39
+ cb(Qcur, "Qcur", il);
40
+ }
41
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
42
+ cb(Kcur, "Kcur", il);
43
+ if (model.layers[il].bk) {
44
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
45
+ cb(Kcur, "Kcur", il);
46
+ }
47
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
48
+ cb(Vcur, "Vcur", il);
49
+ if (model.layers[il].bv) {
50
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
51
+ cb(Vcur, "Vcur", il);
52
+ }
53
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
54
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
55
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
56
+
57
+ Qcur = ggml_rope_ext(
58
+ ctx0, Qcur, inp_pos, nullptr,
59
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
60
+ ext_factor, attn_factor, beta_fast, beta_slow
61
+ );
62
+
63
+ Kcur = ggml_rope_ext(
64
+ ctx0, Kcur, inp_pos, nullptr,
65
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
66
+ ext_factor, attn_factor, beta_fast, beta_slow
67
+ );
68
+
69
+ cb(Qcur, "Qcur", il);
70
+ cb(Kcur, "Kcur", il);
71
+ cb(Vcur, "Vcur", il);
72
+
73
+ cur = build_attn(inp_attn,
74
+ model.layers[il].wo, model.layers[il].bo,
75
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
76
+ cb(cur, "attn_out", il);
77
+ }
78
+ if (il == n_layer - 1 && inp_out_ids) {
79
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
80
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
81
+ }
82
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
83
+ cb(ffn_inp, "ffn_inp", il);
84
+
85
+ // feed-forward network
86
+ cur = build_norm(ffn_inp,
87
+ model.layers[il].attn_post_norm, NULL,
88
+ LLM_NORM_RMS, il);
89
+ cb(cur, "attn_post_norm", il);
90
+
91
+ cur = build_ffn(cur,
92
+ model.layers[il].ffn_up, NULL, NULL,
93
+ model.layers[il].ffn_gate, NULL, NULL,
94
+ model.layers[il].ffn_down, NULL, NULL,
95
+ NULL,
96
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
97
+ cb(cur, "ffn_out", il);
98
+
99
+ cur = ggml_add(ctx0, cur, ffn_inp);
100
+ cb(cur, "ffn_out", il);
101
+
102
+ cur = build_cvec(cur, il);
103
+ cb(cur, "l_out", il);
104
+
105
+ // input for next layer
106
+ inpL = cur;
107
+ }
108
+ cur = inpL;
109
+
110
+ cur = build_norm(cur,
111
+ model.output_norm, NULL,
112
+ LLM_NORM_RMS, -1);
113
+
114
+ cb(cur, "result_norm", -1);
115
+ res->t_embd = cur;
116
+
117
+ // lm_head
118
+ cur = build_lora_mm(model.output, cur);
119
+
120
+ cb(cur, "result_output", -1);
121
+ res->t_logits = cur;
122
+
123
+ ggml_build_forward_expand(gf, cur);
124
+ }
@@ -0,0 +1,120 @@
1
+ #include "models.h"
2
+
3
+ template <bool iswa>
4
+ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
5
+ const int64_t n_embd_head = hparams.n_embd_head_v;
6
+
7
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
8
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
9
+
10
+ ggml_tensor * cur;
11
+ ggml_tensor * inpL;
12
+
13
+ inpL = build_inp_embd(model.tok_embd);
14
+
15
+ // inp_pos - contains the positions
16
+ ggml_tensor * inp_pos = build_inp_pos();
17
+
18
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
19
+ inp_attn_type * inp_attn = nullptr;
20
+
21
+ if constexpr (iswa) {
22
+ inp_attn = build_attn_inp_kv_iswa();
23
+ } else {
24
+ inp_attn = build_attn_inp_kv();
25
+ }
26
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
27
+
28
+ for (int il = 0; il < n_layer; ++il) {
29
+ ggml_tensor * inpSA = inpL;
30
+ ggml_tensor * probs = nullptr;
31
+
32
+ probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
33
+ cb(probs, "ffn_moe_logits", il);
34
+
35
+ // norm
36
+ cur = build_norm(inpL,model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
37
+ cb(cur, "attn_norm", il);
38
+
39
+ // self_attention
40
+ {
41
+ // compute Q and K and RoPE them
42
+ struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
43
+ cb(Qcur, "Qcur", il);
44
+
45
+ struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
46
+ cb(Kcur, "Kcur", il);
47
+
48
+ struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
49
+ cb(Vcur, "Vcur", il);
50
+
51
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
52
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
53
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
54
+
55
+ if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
56
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
57
+ ext_factor, attn_factor, beta_fast, beta_slow);
58
+
59
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
60
+ ext_factor, attn_factor, beta_fast, beta_slow);
61
+ }
62
+ cb(Qcur, "Qcur", il);
63
+ cb(Kcur, "Kcur", il);
64
+
65
+ cur = build_attn(inp_attn,
66
+ model.layers[il].wo, model.layers[il].bo,
67
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
68
+ }
69
+ if (il == n_layer - 1 && inp_out_ids) {
70
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
71
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
72
+ probs = ggml_get_rows(ctx0, probs, inp_out_ids);
73
+ }
74
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
75
+ cb(ffn_inp, "ffn_inp", il);
76
+
77
+ // MoE branch
78
+ cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
79
+ cb(cur, "ffn_norm", il);
80
+
81
+ ggml_tensor * ffn_out =
82
+ build_moe_ffn(cur,
83
+ nullptr,
84
+ model.layers[il].ffn_up_exps,
85
+ model.layers[il].ffn_gate_exps,
86
+ model.layers[il].ffn_down_exps,
87
+ nullptr,
88
+ n_expert, n_expert_used,
89
+ LLM_FFN_RELU, true,
90
+ false, 0.0,
91
+ static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
92
+ il, probs);
93
+
94
+ cb(ffn_out, "ffn_out", il);
95
+ cur = ffn_out;
96
+
97
+ cur = ggml_add(ctx0, cur, ffn_inp);
98
+ cur = build_cvec(cur, il);
99
+ cb(cur, "l_out", il);
100
+
101
+ // input for next layer
102
+ inpL = cur;
103
+ }
104
+ cur = inpL;
105
+
106
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
107
+ cb(cur, "result_norm", -1);
108
+ res->t_embd = cur;
109
+
110
+ // lm_head
111
+ cur = build_lora_mm(model.output, cur);
112
+ cb(cur, "result_output", -1);
113
+ res->t_logits = cur;
114
+
115
+ ggml_build_forward_expand(gf, cur);
116
+ }
117
+
118
+ // Explicit template instantiations
119
+ template struct llm_build_smallthinker<false>;
120
+ template struct llm_build_smallthinker<true>;
@@ -0,0 +1,128 @@
1
+ #include "models.h"
2
+
3
+ llm_build_smollm3::llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
4
+ const int64_t n_embd_head = hparams.n_embd_head_v;
5
+
6
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
7
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
8
+
9
+ ggml_tensor * cur;
10
+ ggml_tensor * inpL;
11
+
12
+ inpL = build_inp_embd(model.tok_embd);
13
+
14
+ // inp_pos - contains the positions
15
+ ggml_tensor * inp_pos = build_inp_pos();
16
+
17
+ auto * inp_attn = build_attn_inp_kv();
18
+
19
+ const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
20
+
21
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
22
+
23
+ for (int il = 0; il < n_layer; ++il) {
24
+ ggml_tensor * inpSA = inpL;
25
+
26
+ const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
27
+
28
+ // norm
29
+ cur = build_norm(inpL,
30
+ model.layers[il].attn_norm, NULL,
31
+ LLM_NORM_RMS, il);
32
+ cb(cur, "attn_norm", il);
33
+
34
+ // self-attention
35
+ {
36
+ // compute Q and K and RoPE them
37
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
38
+ cb(Qcur, "Qcur", il);
39
+ if (model.layers[il].bq) {
40
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
41
+ cb(Qcur, "Qcur", il);
42
+ }
43
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
44
+ cb(Kcur, "Kcur", il);
45
+ if (model.layers[il].bk) {
46
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
47
+ cb(Kcur, "Kcur", il);
48
+ }
49
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
50
+ cb(Vcur, "Vcur", il);
51
+ if (model.layers[il].bv) {
52
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
53
+ cb(Vcur, "Vcur", il);
54
+ }
55
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
56
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
57
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
58
+
59
+ if (use_rope) {
60
+ Qcur = ggml_rope_ext(
61
+ ctx0, Qcur, inp_pos, nullptr,
62
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
63
+ ext_factor, attn_factor, beta_fast, beta_slow
64
+ );
65
+
66
+ Kcur = ggml_rope_ext(
67
+ ctx0, Kcur, inp_pos, nullptr,
68
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
69
+ ext_factor, attn_factor, beta_fast, beta_slow
70
+ );
71
+ }
72
+ cb(Qcur, "Qcur", il);
73
+ cb(Kcur, "Kcur", il);
74
+ cb(Vcur, "Vcur", il);
75
+
76
+ cur = build_attn(inp_attn,
77
+ model.layers[il].wo, model.layers[il].bo,
78
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
79
+ cb(cur, "attn_out", il);
80
+ }
81
+ if (il == n_layer - 1 && inp_out_ids) {
82
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
83
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
84
+ }
85
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
86
+ cb(ffn_inp, "ffn_inp", il);
87
+
88
+ // feed-forward network
89
+ {
90
+ cur = build_norm(ffn_inp,
91
+ model.layers[il].ffn_norm, NULL,
92
+ LLM_NORM_RMS, il);
93
+ cb(cur, "ffn_norm", il);
94
+
95
+ cur = build_ffn(cur,
96
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
97
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
98
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
99
+ NULL,
100
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
101
+ cb(cur, "ffn_out", il);
102
+ }
103
+ cur = ggml_add(ctx0, cur, ffn_inp);
104
+ cb(cur, "ffn_out", il);
105
+
106
+ cur = build_cvec(cur, il);
107
+ cb(cur, "l_out", il);
108
+
109
+ // input for next layer
110
+ inpL = cur;
111
+ }
112
+ cur = inpL;
113
+
114
+ cur = build_norm(cur,
115
+ model.output_norm, NULL,
116
+ LLM_NORM_RMS, -1);
117
+
118
+ cb(cur, "result_norm", -1);
119
+ res->t_embd = cur;
120
+
121
+ // lm_head
122
+ cur = build_lora_mm(model.output, cur);
123
+
124
+ cb(cur, "result_output", -1);
125
+ res->t_logits = cur;
126
+
127
+ ggml_build_forward_expand(gf, cur);
128
+ }