@fugood/llama.node 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +17 -13
  4. package/src/LlamaCompletionWorker.cpp +2 -0
  5. package/src/LlamaContext.cpp +3 -0
  6. package/src/llama.cpp/common/arg.cpp +80 -10
  7. package/src/llama.cpp/common/chat.cpp +52 -8
  8. package/src/llama.cpp/common/chat.h +7 -2
  9. package/src/llama.cpp/common/common.cpp +1 -0
  10. package/src/llama.cpp/common/common.h +16 -6
  11. package/src/llama.cpp/common/speculative.cpp +135 -54
  12. package/src/llama.cpp/common/speculative.h +8 -1
  13. package/src/llama.cpp/ggml/CMakeLists.txt +4 -2
  14. package/src/llama.cpp/ggml/include/ggml.h +37 -1
  15. package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
  23. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  28. package/src/llama.cpp/include/llama.h +9 -4
  29. package/src/llama.cpp/src/llama-arch.cpp +105 -0
  30. package/src/llama.cpp/src/llama-arch.h +12 -0
  31. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  32. package/src/llama.cpp/src/llama-chat.cpp +33 -1
  33. package/src/llama.cpp/src/llama-chat.h +2 -0
  34. package/src/llama.cpp/src/llama-context.cpp +19 -10
  35. package/src/llama.cpp/src/llama-context.h +4 -1
  36. package/src/llama.cpp/src/llama-graph.cpp +175 -148
  37. package/src/llama.cpp/src/llama-graph.h +60 -23
  38. package/src/llama.cpp/src/llama-hparams.h +5 -3
  39. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +6 -2
  40. package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
  41. package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
  42. package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
  43. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  44. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  45. package/src/llama.cpp/src/llama-model.cpp +949 -75
  46. package/src/llama.cpp/src/llama-model.h +24 -4
  47. package/src/llama.cpp/src/llama-quant.cpp +40 -4
  48. package/src/llama.cpp/src/llama-vocab.cpp +49 -1
  49. package/src/llama.cpp/src/llama-vocab.h +1 -0

package/src/llama.cpp/src/llama-graph.h
@@ -39,6 +39,7 @@ enum llm_ffn_op_type {
     LLM_FFN_SWIGLU,
     LLM_FFN_GEGLU,
     LLM_FFN_REGLU,
+    LLM_FFN_SWIGLU_OAI_MOE,
 };
 
 enum llm_ffn_gate_type {
@@ -144,7 +145,7 @@ public:
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
 
-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 };
 
 class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
@@ -158,7 +159,7 @@ public:
 
     ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
 
-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 
     const llama_kv_cache_unified_context * mctx;
 };
@@ -177,8 +178,8 @@ public:
 
     ggml_tensor * out_ids; // I32 [n_outputs]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const uint32_t n_outputs;
 };
@@ -192,7 +193,7 @@ public:
 
     ggml_tensor * mean; // F32 [n_batch, n_batch]
 
-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_cls : public llm_graph_input_i {
@@ -204,7 +205,7 @@ public:
 
     ggml_tensor * cls; // I32 [n_batch]
 
-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_rs : public llm_graph_input_i {
@@ -214,7 +215,12 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    ggml_tensor * s_copy; // I32 [kv_size]
+    ggml_tensor * s_copy; // I32 [n_rs]
+
+    // views of s_copy, computed once per graph
+    // and shared across layers which use build_rs
+    ggml_tensor * s_copy_main;  // I32 [n_seqs]
+    ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
 
     const llama_memory_recurrent_context * mctx;
 };
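
The two new members are views of `s_copy`, computed once per graph and shared by every layer that calls `build_rs`. Below is a minimal sketch of how such views could be carved out of a 1-D I32 tensor with ggml's `ggml_view_1d`; the helper name is illustrative and the real construction lives in llama-graph.cpp.

```cpp
// Illustrative only: split an I32 s_copy tensor of length n_rs into a "main"
// view over the first n_seqs entries and an "extra" view over the rest.
#include "ggml.h"

void split_s_copy(struct ggml_context * ctx,
                  struct ggml_tensor  * s_copy,       // I32 [n_rs]
                  int64_t               n_seqs,
                  struct ggml_tensor ** s_copy_main,  // out: I32 [n_seqs]
                  struct ggml_tensor ** s_copy_extra) // out: I32 [n_rs - n_seqs]
{
    const int64_t n_rs = s_copy->ne[0];

    // views share the parent's data; the offset argument is in bytes
    *s_copy_main  = ggml_view_1d(ctx, s_copy, n_seqs,        0);
    *s_copy_extra = ggml_view_1d(ctx, s_copy, n_rs - n_seqs, n_seqs * ggml_element_size(s_copy));
}
```
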
@@ -247,8 +253,8 @@ public:
     ggml_tensor * kq_mask     = nullptr; // F32 [n_tokens, n_batch, 1, 1]
     ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch, 1, 1]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 };
 
 class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
@@ -278,8 +284,11 @@ public:
     ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    // note: these have to be copies because in order to be able to reuse a graph, its inputs
+    // need to carry these parameters with them. otherwise, they can point to freed
+    // llm_graph_params from a previous batch, causing stack-use-after-return
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_context * mctx;
 };
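
The note added above explains the switch from reference members to by-value copies: a graph input that outlives one batch must not hold references into a `llm_graph_params` that may already be gone. A stand-alone sketch of the same pitfall, using illustrative types that are not part of the library:

```cpp
#include <cstdio>

struct params { int n_ctx; };

struct input_by_ref  { const params & p; };  // holds a reference into caller-owned storage
struct input_by_copy { const params   p; };  // carries its own copy of the parameters

input_by_ref  make_ref () { params local{4096}; return {local}; } // reference dangles on return
input_by_copy make_copy() { params local{4096}; return {local}; } // copy stays valid

int main() {
    input_by_copy ok = make_copy();
    std::printf("n_ctx = %d\n", ok.p.n_ctx);  // fine: the input owns its params

    // input_by_ref bad = make_ref();
    // std::printf("%d\n", bad.p.n_ctx);      // stack-use-after-return, the bug ASan reports
    return 0;
}
```
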
@@ -318,8 +327,8 @@ public:
     ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
 
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 
     const llama_kv_cache_unified_iswa_context * mctx;
 };
@@ -415,7 +424,9 @@ struct llm_graph_params {
             (!ubatch.embd && !other.ubatch.embd)
         );
 
-        if (can_reuse_ubatch && !ubatch.equal_seqs()) {
+        // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
+        // the reason is because the set of attention streams would be different for different sequences
+        if (can_reuse_ubatch && ubatch.equal_seqs()) {
             if (!ubatch.data) {
                 // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
                 // therefore we cannot perform the sequence id check. normally should never happen
@@ -609,6 +620,7 @@ struct llm_graph_context {
             llm_ffn_gate_type type_gate,
             int il) const;
 
+    // build MoE FFN without bias tensors
     ggml_tensor * build_moe_ffn(
             ggml_tensor * cur,
             ggml_tensor * gate_inp,
@@ -623,19 +635,29 @@
             bool scale_w,
             float w_scale,
             llama_expert_gating_func_type gating_op,
-            int il) const;
+            int il,
+            ggml_tensor * probs_in = nullptr) const;
 
-    ggml_tensor * build_moe_ffn_from_probs(
+    ggml_tensor * build_moe_ffn(
             ggml_tensor * cur,
-            ggml_tensor * probs,
+            ggml_tensor * gate_inp,
+            ggml_tensor * gate_inp_b,
             ggml_tensor * up_exps,
+            ggml_tensor * up_exps_b,
             ggml_tensor * gate_exps,
+            ggml_tensor * gate_exps_b,
             ggml_tensor * down_exps,
+            ggml_tensor * down_exps_b,
             ggml_tensor * exp_probs_b,
             int64_t n_expert,
             int64_t n_expert_used,
+            llm_ffn_op_type type_op,
+            bool norm_w,
+            bool scale_w,
+            float w_scale,
             llama_expert_gating_func_type gating_op,
-            int il) const;
+            int il,
+            ggml_tensor * probs_in = nullptr) const;
 
     //
     // inputs
@@ -663,6 +685,7 @@ struct llm_graph_context {
             ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
             ggml_tensor * kq_b,
             ggml_tensor * kq_mask,
+            ggml_tensor * sinks,
             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
             float kq_scale) const;
 
@@ -709,6 +732,20 @@ struct llm_graph_context {
             float kq_scale,
             int il) const;
 
+    // TODO: temporary to keep the diff small. after the code is public will refactor to simplify this
+    ggml_tensor * build_attn_with_sinks(
+            llm_graph_input_attn_kv_unified_iswa * inp,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
+            ggml_tensor * kq_b,
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+            ggml_tensor * sinks, // [n_head_q]
+            float kq_scale,
+            int il) const;
+
     llm_graph_input_attn_cross * build_attn_inp_cross() const;
 
     ggml_tensor * build_attn(
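
`build_attn_with_sinks` threads a per-head `sinks` tensor (shape [n_head_q]) into the attention computation. A scalar sketch of one common formulation, in which the sink acts as an extra logit that enlarges the softmax normalizer without attending to any value; this illustrates the idea only and is not the graph code:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Softmax over attention scores with a per-head "sink" logit that adds
// probability mass to "nothing": the returned weights sum to less than 1.
std::vector<float> softmax_with_sink(const std::vector<float> & scores, float sink) {
    float m = sink;
    for (float s : scores) m = std::max(m, s);       // subtract the max for numerical stability

    float denom = std::exp(sink - m);                // the sink term only affects the normalizer
    for (float s : scores) denom += std::exp(s - m);

    std::vector<float> w(scores.size());
    for (size_t i = 0; i < scores.size(); ++i) {
        w[i] = std::exp(scores[i] - m) / denom;
    }
    return w;
}

int main() {
    for (float x : softmax_with_sink({1.0f, 2.0f, 0.5f}, /*sink =*/ 0.0f)) {
        std::printf("%.3f ", x);
    }
    std::printf("\n");
    return 0;
}
```
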
@@ -727,7 +764,6 @@
     // recurrent
     //
 
-    // TODO: avoid notion of "kv"
     // TODO: move this implementation to llama_memory_recurrent.
     // this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
     // when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
@@ -735,12 +771,13 @@
     // `llama_memory_recurrent`
     ggml_tensor * build_rs(
             ggml_tensor * s,
-            ggml_tensor * state_copy,
+            ggml_tensor * state_copy_main,
+            ggml_tensor * state_copy_extra,
             int32_t state_size,
             int32_t n_seqs,
-            uint32_t n_kv,
-            uint32_t kv_head,
-            uint32_t kv_size,
+            uint32_t n_rs,
+            uint32_t rs_head,
+            uint32_t rs_size,
             int32_t rs_zero,
             const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
 

package/src/llama.cpp/src/llama-hparams.h
@@ -9,9 +9,10 @@
 #define LLAMA_MAX_EXPERTS 384 // Kimi-K2
 
 enum llama_expert_gating_func_type {
-    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE    = 0,
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
-    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_NONE           = 0,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX        = 1,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID        = 2,
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
 };
 
 enum llama_swa_type {
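
The comment on the new `LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT` value says the softmax is applied to the router weights rather than to the logits. One plausible reading, sketched on scalars below, is that the already-selected expert weights are normalized among themselves instead of normalizing all expert logits up front; the selection step and names are illustrative, not the library's implementation:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> softmax(std::vector<float> v) {
    const float m = *std::max_element(v.begin(), v.end());
    float sum = 0.0f;
    for (float & x : v) { x = std::exp(x - m); sum += x; }
    for (float & x : v) { x /= sum; }
    return v;
}

int main() {
    const std::vector<float> logits = {2.0f, 0.5f, 1.5f, -1.0f};
    const size_t k = 2; // n_expert_used

    // pick the indices of the k largest logits (same selection in both variants)
    std::vector<size_t> idx = {0, 1, 2, 3};
    std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                      [&](size_t a, size_t b) { return logits[a] > logits[b]; });

    // SOFTMAX: normalize over all logits, then keep the top-k weights
    const std::vector<float> all = softmax(logits);
    std::printf("softmax over logits: %.3f %.3f\n", all[idx[0]], all[idx[1]]);

    // SOFTMAX_WEIGHT (as read here): normalize only the selected weights,
    // so the k used weights always sum to 1
    const std::vector<float> sel = softmax({logits[idx[0]], logits[idx[1]]});
    std::printf("softmax over top-k : %.3f %.3f\n", sel[0], sel[1]);
    return 0;
}
```
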
@@ -73,6 +74,7 @@ struct llama_hparams {
     bool expert_weights_norm = false;
     uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
     uint32_t moe_every_n_layers = 0;
+    uint32_t nextn_predict_layers = 0;
 
     float f_norm_eps;
     float f_norm_rms_eps;

package/src/llama.cpp/src/llama-kv-cache-unified.cpp
@@ -39,6 +39,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     if (model.arch == LLM_ARCH_GEMMA3N) {
         n_layer_cache = 20;
     }
+    if (model.arch == LLM_ARCH_GLM4_MOE) {
+        // GLM-4.5: Only process up to last layer, skip final NextN layer
+        n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
+    }
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
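
In effect, the unified KV cache for GLM-4.5-style models is sized only for the regular transformer layers: if, hypothetically, `hparams.n_layer` were 47 with `nextn_predict_layers = 1`, the cache would cover 46 layers and the trailing NextN layer would get no cache entries.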
@@ -183,7 +187,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     const size_t memory_size_k = size_k_bytes();
     const size_t memory_size_v = size_v_bytes();
 
-    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
             (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
             ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
             ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
@@ -193,7 +197,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
 
     const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : 0;
+    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
 
     if (!supports_set_rows) {
         // ref: https://github.com/ggml-org/llama.cpp/pull/14363
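
This line now falls back to the member's in-class default (flipped to `true` in the header change below) and lets the `LLAMA_SET_ROWS` environment variable override it only when the variable is actually set: a value that parses to 0 disables the path, anything non-zero enables it. A small stand-alone sketch of the pattern:

```cpp
#include <cstdio>
#include <cstdlib>

int main() {
    // the default comes from the declaration site, as in the header change below
    bool supports_set_rows = true;

    // the env var, when present, overrides the default
    const char * LLAMA_SET_ROWS = std::getenv("LLAMA_SET_ROWS");
    supports_set_rows = LLAMA_SET_ROWS ? std::atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;

    std::printf("supports_set_rows = %s\n", supports_set_rows ? "true" : "false");
    return 0;
}
```
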

package/src/llama.cpp/src/llama-kv-cache-unified.h
@@ -230,7 +230,7 @@ private:
 
     // env: LLAMA_SET_ROWS (temporary)
    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = false;
+    bool supports_set_rows = true;
 
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
 

package/src/llama.cpp/src/llama-memory-hybrid.cpp
@@ -25,6 +25,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         /* common */
         uint32_t n_seq_max,
         bool offload,
+        bool unified,
         /* layer filters */
         layer_filter_cb && filter_attn,
         layer_filter_cb && filter_recr) :
@@ -38,7 +39,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         type_v,
         v_trans,
         offload,
-        1,
+        unified,
         kv_size,
         n_seq_max,
         n_pad,

package/src/llama.cpp/src/llama-memory-hybrid.h
@@ -39,6 +39,7 @@ public:
         /* common */
         uint32_t n_seq_max,
         bool offload,
+        bool unified,
         /* layer filters */
         layer_filter_cb && filter_attn = nullptr,
         layer_filter_cb && filter_recr = nullptr);

package/src/llama.cpp/src/llama-model-loader.cpp
@@ -35,6 +35,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     case LLAMA_FTYPE_MOSTLY_Q5_0:      return "Q5_0";
     case LLAMA_FTYPE_MOSTLY_Q5_1:      return "Q5_1";
     case LLAMA_FTYPE_MOSTLY_Q8_0:      return "Q8_0";
+    case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
     case LLAMA_FTYPE_MOSTLY_Q2_K:      return "Q2_K - Medium";
     case LLAMA_FTYPE_MOSTLY_Q2_K_S:    return "Q2_K - Small";
     case LLAMA_FTYPE_MOSTLY_Q3_K_S:    return "Q3_K - Small";

package/src/llama.cpp/src/llama-model-loader.h
@@ -58,8 +58,9 @@ struct llama_model_loader {
         }
     };
 
-    static const int TENSOR_NOT_REQUIRED = 1;
-    static const int TENSOR_DUPLICATED = 2;
+    static const int TENSOR_NOT_REQUIRED = 1 << 0;
+    static const int TENSOR_DUPLICATED = 1 << 1;
+    static const int TENSOR_SKIP = 1 << 2;
 
     int n_kv = 0;
     int n_tensors = 0;
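
With the `TENSOR_*` constants now distinct powers of two, they act as combinable bit flags: the new `TENSOR_SKIP` can be OR-ed together with the existing flags and each flag tested independently with a bitwise AND. A minimal sketch of that usage outside the loader:

```cpp
#include <cstdio>

// mirrors the constants from llama_model_loader
static const int TENSOR_NOT_REQUIRED = 1 << 0;
static const int TENSOR_DUPLICATED   = 1 << 1;
static const int TENSOR_SKIP         = 1 << 2;

int main() {
    // flags can now be combined ...
    const int flags = TENSOR_NOT_REQUIRED | TENSOR_SKIP;

    // ... and queried independently
    std::printf("not required: %d\n", (flags & TENSOR_NOT_REQUIRED) != 0); // 1
    std::printf("duplicated  : %d\n", (flags & TENSOR_DUPLICATED)   != 0); // 0
    std::printf("skip        : %d\n", (flags & TENSOR_SKIP)         != 0); // 1
    return 0;
}
```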