@fugood/llama.node 1.1.9 → 1.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/binding.ts CHANGED
@@ -100,6 +100,11 @@ export type LlamaCompletionOptions = {
  enable_thinking?: boolean
  thinking_forced_open?: boolean
  prompt?: string
+ /**
+ * Text to prefill the response with.
+ * This text will be added to the beginning of the generated response.
+ */
+ prefill_text?: string
  temperature?: number
  top_k?: number
  top_p?: number
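For reference, a minimal TypeScript sketch of how the new prefill_text option might be used. The loadModel entry point, the completion() call shape, and the result.text field are assumptions for illustration only; the option names (prompt, prefill_text, temperature, top_k, top_p, n_predict) come from this release's LlamaCompletionOptions.

  import { loadModel } from '@fugood/llama.node'

  // Hypothetical usage: steer the start of the reply with a fixed prefix.
  const context = await loadModel({ model: 'path/to/model.gguf' })

  const result = await context.completion({
    prompt: 'User: Summarize the release in one line.\nAssistant:',
    prefill_text: 'Sure: ', // new in 1.1.10; prepended to the generated response
    temperature: 0.8,
    top_k: 40,
    top_p: 0.95,
    n_predict: 64,
  })

  console.log(result.text)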
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.1.9",
+ "version": "1.1.10",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -71,19 +71,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.1.9",
- "@fugood/node-llama-linux-x64-vulkan": "1.1.9",
- "@fugood/node-llama-linux-x64-cuda": "1.1.9",
- "@fugood/node-llama-linux-arm64": "1.1.9",
- "@fugood/node-llama-linux-arm64-vulkan": "1.1.9",
- "@fugood/node-llama-linux-arm64-cuda": "1.1.9",
- "@fugood/node-llama-win32-x64": "1.1.9",
- "@fugood/node-llama-win32-x64-vulkan": "1.1.9",
- "@fugood/node-llama-win32-x64-cuda": "1.1.9",
- "@fugood/node-llama-win32-arm64": "1.1.9",
- "@fugood/node-llama-win32-arm64-vulkan": "1.1.9",
- "@fugood/node-llama-darwin-x64": "1.1.9",
- "@fugood/node-llama-darwin-arm64": "1.1.9"
+ "@fugood/node-llama-linux-x64": "1.1.10",
+ "@fugood/node-llama-linux-x64-vulkan": "1.1.10",
+ "@fugood/node-llama-linux-x64-cuda": "1.1.10",
+ "@fugood/node-llama-linux-arm64": "1.1.10",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.1.10",
+ "@fugood/node-llama-linux-arm64-cuda": "1.1.10",
+ "@fugood/node-llama-win32-x64": "1.1.10",
+ "@fugood/node-llama-win32-x64-vulkan": "1.1.10",
+ "@fugood/node-llama-win32-x64-cuda": "1.1.10",
+ "@fugood/node-llama-win32-arm64": "1.1.10",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.1.10",
+ "@fugood/node-llama-darwin-x64": "1.1.10",
+ "@fugood/node-llama-darwin-arm64": "1.1.10"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -1,5 +1,5 @@
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index 23d3828f9..ca48af00c 100644
+ index 111b4a21b..16ce87672 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
  @@ -6,9 +6,6 @@
@@ -29,6 +29,16 @@ index 23d3828f9..ca48af00c 100644
  struct templates_params {
  json messages;
  json tools;
+ @@ -784,8 +771,7 @@ static std::string apply(
+ if (additional_context) {
+ tmpl_inputs.extra_context.merge_patch(*additional_context);
+ }
+ - // TODO: add flag to control date/time, if only for testing purposes.
+ - // tmpl_inputs.now = std::chrono::system_clock::now();
+ + tmpl_inputs.now = inputs.now;
+
+ minja::chat_template_options tmpl_opts;
+ // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
  diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
  index d1e480c91..437e64e29 100644
  --- a/src/llama.cpp/common/chat.h
@@ -54,10 +64,10 @@ index d1e480c91..437e64e29 100644
  struct common_chat_tool_call {
  std::string name;
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
- index 67dd5404f..909a97c66 100644
+ index fdce1dcde..55aac3412 100644
  --- a/src/llama.cpp/common/common.cpp
  +++ b/src/llama.cpp/common/common.cpp
- @@ -1117,6 +1117,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+ @@ -1103,6 +1103,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }

@@ -66,10 +76,10 @@ index 67dd5404f..909a97c66 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index 75596e6b3..0e04694c8 100644
+ index 390dda5e5..f259ca785 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
- @@ -267,6 +267,7 @@ struct lr_opt {
+ @@ -270,6 +270,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

  struct common_params {
@@ -35,12 +35,14 @@ LlamaCompletionWorker::LlamaCompletionWorker(
  const std::vector<std::string> &media_paths,
  const std::vector<llama_token> &guide_tokens,
  bool has_vocoder,
- tts_type tts_type_val)
+ tts_type tts_type_val,
+ const std::string &prefill_text)
  : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
  _params(params), _stop_words(stop_words), _chat_format(chat_format),
  _thinking_forced_open(thinking_forced_open),
  _reasoning_format(reasoning_format),
  _media_paths(media_paths), _guide_tokens(guide_tokens),
+ _prefill_text(prefill_text),
  _has_vocoder(has_vocoder), _tts_type(tts_type_val) {
  if (!callback.IsEmpty()) {
  _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
@@ -68,8 +70,11 @@ LlamaCompletionWorker::PartialOutput LlamaCompletionWorker::getPartialOutput(con

  chat_syntax.parse_tool_calls = true;

+ // Combine prefill_text with generated_text for parsing
+ std::string full_text = _prefill_text + generated_text;
+
  // Use is_partial=true for streaming partial output
- common_chat_msg parsed_msg = common_chat_parse(generated_text, true, chat_syntax);
+ common_chat_msg parsed_msg = common_chat_parse(full_text, true, chat_syntax);

  result.content = parsed_msg.content;
  result.reasoning_content = parsed_msg.reasoning_content;
@@ -156,6 +161,7 @@ void LlamaCompletionWorker::Execute() {
  auto embd = _sess->tokens_ptr();
  embd->reserve(embd->size() + max_len);

+
  if (is_enc_dec) {
  if (n_input > 0) {
  // Decode tokens in batches using n_batch as chunk size
@@ -378,8 +384,11 @@ void LlamaCompletionWorker::OnOK() {
  chat_syntax.thinking_forced_open = _thinking_forced_open;

  chat_syntax.reasoning_format = common_reasoning_format_from_name(_reasoning_format);
+
+ // Combine prefill_text with generated_text for final parsing
+ std::string full_text = _prefill_text + _result.text;
  common_chat_msg message = common_chat_parse(
- _result.text,
+ full_text,
  false,
  chat_syntax
  );
@@ -26,7 +26,8 @@ public:
  const std::vector<std::string> &media_paths = {},
  const std::vector<llama_token> &guide_tokens = {},
  bool has_vocoder = false,
- tts_type tts_type_val = UNKNOWN);
+ tts_type tts_type_val = UNKNOWN,
+ const std::string &prefill_text = "");

  ~LlamaCompletionWorker();

@@ -58,6 +59,7 @@ private:
  std::string _reasoning_format;
  std::vector<std::string> _media_paths;
  std::vector<llama_token> _guide_tokens;
+ std::string _prefill_text;
  std::function<void()> _onComplete;
  bool _has_callback = false;
  bool _interrupted = false;
@@ -935,6 +935,8 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  json_schema_to_grammar(json::parse(json_schema_str));
  }

+ std::string prefill_text = get_option<std::string>(options, "prefill_text", "");
+
  params.n_predict = get_option<int32_t>(options, "n_predict", -1);
  params.sampling.temp = get_option<float>(options, "temperature", 0.80f);
  params.sampling.top_k = get_option<int32_t>(options, "top_k", 40);
@@ -1007,7 +1009,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  auto *worker =
  new LlamaCompletionWorker(info, _sess, callback, params, stop_words,
  chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens,
- _has_vocoder, _tts_type);
+ _has_vocoder, _tts_type, prefill_text);
  worker->Queue();
  _wip = worker;
  worker->OnComplete([this]() { _wip = nullptr; });
@@ -771,8 +771,7 @@ static std::string apply(
  if (additional_context) {
  tmpl_inputs.extra_context.merge_patch(*additional_context);
  }
- // TODO: add flag to control date/time, if only for testing purposes.
- // tmpl_inputs.now = std::chrono::system_clock::now();
+ tmpl_inputs.now = inputs.now;

  minja::chat_template_options tmpl_opts;
  // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
@@ -153,3 +153,28 @@ bool llama_hparams::is_swa(uint32_t il) const {

  GGML_ABORT("fatal error");
  }
+
+ bool llama_hparams::has_kv(uint32_t il) const {
+ if (n_layer_kv_from_start >= 0) {
+ if (il < (uint32_t) n_layer_kv_from_start) {
+ return true;
+ }
+
+ return false;
+ }
+
+ // by default, all layers have kv
+ return true;
+ }
+
+ uint32_t llama_hparams::n_layer_kv() const {
+ uint32_t res = 0;
+
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ if (has_kv(il)) {
+ res++;
+ }
+ }
+
+ return res;
+ }
@@ -41,6 +41,7 @@ struct llama_hparams {
  uint32_t n_embd;
  uint32_t n_embd_features = 0;
  uint32_t n_layer;
+ int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
  uint32_t n_rot;
  uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
  uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
@@ -221,6 +222,11 @@ struct llama_hparams {
  uint32_t n_pos_per_embd() const;

  bool is_swa(uint32_t il) const;
+
+ bool has_kv(uint32_t il) const;
+
+ // number of layers for which has_kv() returns true
+ uint32_t n_layer_kv() const;
  };

  static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -22,9 +22,26 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
  uint32_t kv_size,
  uint32_t n_seq_max,
  uint32_t n_ubatch,
- uint32_t n_pad) : hparams(model.hparams), unified(unified) {
- llama_kv_cache::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
- llama_kv_cache::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
+ uint32_t n_pad,
+ const layer_filter_cb & filter,
+ const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) {
+
+ // chain filters
+ const layer_filter_cb filter_base = [&](int32_t il) {
+ if (filter && !filter(il)) {
+ return false;
+ }
+
+ return !model.hparams.is_swa(il);
+ };
+
+ const layer_filter_cb filter_swa = [&](int32_t il) {
+ if (filter && !filter(il)) {
+ return false;
+ }
+
+ return model.hparams.is_swa(il);
+ };

  const uint32_t size_base = kv_size;

@@ -41,16 +58,16 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
  LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);

  kv_base = std::make_unique<llama_kv_cache>(
- model, std::move(filter_base), type_k, type_v,
+ model, type_k, type_v,
  v_trans, offload, unified, size_base, n_seq_max, n_pad,
- 0, LLAMA_SWA_TYPE_NONE);
+ 0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);

  LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);

  kv_swa = std::make_unique<llama_kv_cache>(
- model, std::move(filter_swa), type_k, type_v,
+ model, type_k, type_v,
  v_trans, offload, unified, size_swa, n_seq_max, n_pad,
- hparams.n_swa, hparams.swa_type);
+ hparams.n_swa, hparams.swa_type, filter_swa, reuse);
  }

  void llama_kv_cache_iswa::clear(bool data) {
@@ -20,11 +20,13 @@ public:
  bool v_trans,
  bool offload,
  bool swa_full,
- bool ,
+ bool unified,
  uint32_t kv_size,
  uint32_t n_seq_max,
  uint32_t n_ubatch,
- uint32_t n_pad);
+ uint32_t n_pad,
+ const layer_filter_cb & filter,
+ const layer_reuse_cb & reuse);

  ~llama_kv_cache_iswa() = default;

@@ -17,32 +17,25 @@
  //

  llama_kv_cache::llama_kv_cache(
- const llama_model & model,
- layer_filter_cb && filter,
- ggml_type type_k,
- ggml_type type_v,
- bool v_trans,
- bool offload,
- bool unified,
- uint32_t kv_size,
- uint32_t n_seq_max,
- uint32_t n_pad,
- uint32_t n_swa,
- llama_swa_type swa_type) :
+ const llama_model & model,
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool offload,
+ bool unified,
+ uint32_t kv_size,
+ uint32_t n_seq_max,
+ uint32_t n_pad,
+ uint32_t n_swa,
+ llama_swa_type swa_type,
+ const layer_filter_cb & filter,
+ const layer_reuse_cb & reuse) :
  model(model), hparams(model.hparams), v_trans(v_trans),
  n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {

  GGML_ASSERT(kv_size % n_pad == 0);

- // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
- auto n_layer_cache = hparams.n_layer;
- if (model.arch == LLM_ARCH_GEMMA3N) {
- n_layer_cache = 20;
- }
- if (model.arch == LLM_ARCH_GLM4_MOE) {
- // GLM-4.5: Only process up to last layer, skip final NextN layer
- n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
- }
+ const uint32_t n_layer_kv = hparams.n_layer_kv();

  // create a context for each buffer type
  std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -50,7 +43,7 @@ llama_kv_cache::llama_kv_cache(
  auto it = ctx_map.find(buft);
  if (it == ctx_map.end()) {
  ggml_init_params params = {
- /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_cache*ggml_tensor_overhead()),
+ /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
  /*.mem_buffer =*/ NULL,
  /*.no_alloc =*/ true,
  };
@@ -97,9 +90,14 @@ llama_kv_cache::llama_kv_cache(
  __func__, hparams.n_embd_v_gqa_max());
  }

- for (uint32_t il = 0; il < n_layer_cache; il++) {
+ for (uint32_t il = 0; il < hparams.n_layer; il++) {
+ if (!hparams.has_kv(il)) {
+ LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
+ continue;
+ }
+
  if (filter && !filter(il)) {
- LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+ LLAMA_LOG_DEBUG("%s: layer %3d: filtered\n", __func__, il);
  continue;
  }

@@ -147,23 +145,27 @@ llama_kv_cache::llama_kv_cache(
  layers.push_back({ il, k, v, k_stream, v_stream, });
  }

- // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
- if (model.arch == LLM_ARCH_GEMMA3N) {
- LLAMA_LOG_DEBUG("%s: GEMMA3N: reuse layers [%d, %d]\n", __func__, n_layer_cache, hparams.n_layer - 1);
+ if (reuse) {
+ LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);

- for (uint32_t il = n_layer_cache; il < hparams.n_layer; il++) {
- if (filter && !filter(il)) {
- LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+ for (uint32_t il = 0; il < hparams.n_layer; il++) {
+ const int32_t il_reuse = reuse(il);
+
+ if (il_reuse < 0) {
+ LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
  continue;
  }

- const bool is_swa = hparams.is_swa(il);
- const uint32_t il_reuse = n_layer_cache - (is_swa ? 2 : 1);
+ if (filter && !filter(il)) {
+ LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
+ continue;
+ }

  GGML_ASSERT(map_layer_ids.find(il_reuse) != map_layer_ids.end());
+
  map_layer_ids[il] = map_layer_ids[il_reuse];

- LLAMA_LOG_DEBUG("%s: layer %3d: reuse layer %d, isw = %d\n", __func__, il, il_reuse, is_swa);
+ LLAMA_LOG_DEBUG("%s: - layer %3d: reuse layer %d, is_swa = %d\n", __func__, il, il_reuse, hparams.is_swa(il));
  }
  }

@@ -21,9 +21,6 @@ class llama_kv_cache : public llama_memory_i {
  public:
  static uint32_t get_padding(const llama_cparams & cparams);

- // this callback is used to filter out layers that should not be included in the cache
- using layer_filter_cb = std::function<bool(int32_t il)>;
-
  struct stream_copy_info {
  bool empty() const {
  assert(ssrc.size() == sdst.size());
@@ -82,18 +79,19 @@ public:
  using slot_info_vec_t = std::vector<slot_info>;

  llama_kv_cache(
- const llama_model & model,
- layer_filter_cb && filter,
- ggml_type type_k,
- ggml_type type_v,
- bool v_trans,
- bool offload,
- bool unified,
- uint32_t kv_size,
- uint32_t n_seq_max,
- uint32_t n_pad,
- uint32_t n_swa,
- llama_swa_type swa_type);
+ const llama_model & model,
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ bool offload,
+ bool unified,
+ uint32_t kv_size,
+ uint32_t n_seq_max,
+ uint32_t n_pad,
+ uint32_t n_swa,
+ llama_swa_type swa_type,
+ const layer_filter_cb & filter,
+ const layer_reuse_cb & reuse);

  ~llama_kv_cache() = default;

@@ -9,32 +9,29 @@
  //

  llama_memory_hybrid::llama_memory_hybrid(
- const llama_model & model,
- /* attn */
- ggml_type type_k,
- ggml_type type_v,
- bool v_trans,
- uint32_t kv_size,
- uint32_t n_pad,
- uint32_t n_swa,
- llama_swa_type swa_type,
- /* recurrent */
- ggml_type type_r,
- ggml_type type_s,
- uint32_t rs_size,
- /* common */
- uint32_t n_seq_max,
- bool offload,
- bool unified,
- /* layer filters */
- layer_filter_cb && filter_attn,
- layer_filter_cb && filter_recr) :
+ const llama_model & model,
+ /* attn */
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ uint32_t kv_size,
+ uint32_t n_pad,
+ uint32_t n_swa,
+ llama_swa_type swa_type,
+ /* recurrent */
+ ggml_type type_r,
+ ggml_type type_s,
+ uint32_t rs_size,
+ /* common */
+ uint32_t n_seq_max,
+ bool offload,
+ bool unified,
+ /* layer filters */
+ const layer_filter_cb & filter_attn,
+ const layer_filter_cb & filter_recr) :
  hparams(model.hparams),
  mem_attn(new llama_kv_cache(
  model,
- filter_attn == nullptr ?
- [&](int32_t il) { return !hparams.is_recurrent(il); }
- : filter_attn,
  type_k,
  type_v,
  v_trans,
@@ -44,18 +41,22 @@ llama_memory_hybrid::llama_memory_hybrid(
  n_seq_max,
  n_pad,
  n_swa,
- swa_type
+ swa_type,
+ filter_attn == nullptr ?
+ [&](int32_t il) { return !hparams.is_recurrent(il); }
+ : filter_attn,
+ nullptr
  )),
  mem_recr(new llama_memory_recurrent(
  model,
- filter_recr == nullptr ?
- [&](int32_t il) { return hparams.is_recurrent(il); }
- : filter_recr,
  type_r,
  type_s,
  offload,
  rs_size,
- n_seq_max
+ n_seq_max,
+ filter_recr == nullptr ?
+ [&](int32_t il) { return hparams.is_recurrent(il); }
+ : filter_recr
  )) {}

  llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
@@ -18,31 +18,27 @@

  class llama_memory_hybrid : public llama_memory_i {
  public:
-
- // this callback is used to filter out layers that should not be included in the cache
- using layer_filter_cb = std::function<bool(int32_t il)>;
-
  llama_memory_hybrid(
  const llama_model & model,
  /* attn */
- ggml_type type_k,
- ggml_type type_v,
- bool v_trans,
- uint32_t kv_size,
- uint32_t n_pad,
- uint32_t n_swa,
- llama_swa_type swa_type,
- /* recurrent */
- ggml_type type_r,
- ggml_type type_s,
- uint32_t rs_size,
- /* common */
- uint32_t n_seq_max,
- bool offload,
- bool unified,
- /* layer filters */
- layer_filter_cb && filter_attn = nullptr,
- layer_filter_cb && filter_recr = nullptr);
+ ggml_type type_k,
+ ggml_type type_v,
+ bool v_trans,
+ uint32_t kv_size,
+ uint32_t n_pad,
+ uint32_t n_swa,
+ llama_swa_type swa_type,
+ /* recurrent */
+ ggml_type type_r,
+ ggml_type type_s,
+ uint32_t rs_size,
+ /* common */
+ uint32_t n_seq_max,
+ bool offload,
+ bool unified,
+ /* layer filters */
+ const layer_filter_cb & filter_attn = nullptr,
+ const layer_filter_cb & filter_recr = nullptr);

  ~llama_memory_hybrid() = default;

@@ -16,13 +16,13 @@
  //

  llama_memory_recurrent::llama_memory_recurrent(
- const llama_model & model,
- layer_filter_cb && filter,
- ggml_type type_r,
- ggml_type type_s,
- bool offload,
- uint32_t mem_size,
- uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
+ const llama_model & model,
+ ggml_type type_r,
+ ggml_type type_s,
+ bool offload,
+ uint32_t mem_size,
+ uint32_t n_seq_max,
+ const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
  const int32_t n_layer = hparams.n_layer;

  head = 0;
@@ -15,18 +15,14 @@
  // see the implementation of llama_kv_cache_context_i for an example how to do it
  class llama_memory_recurrent : public llama_memory_i {
  public:
-
- // this callback is used to filter out layers that should not be included in the cache
- using layer_filter_cb = std::function<bool(int32_t il)>;
-
  llama_memory_recurrent(
- const llama_model & model,
- layer_filter_cb && filter,
- ggml_type type_r,
- ggml_type type_s,
- bool offload,
- uint32_t mem_size,
- uint32_t n_seq_max);
+ const llama_model & model,
+ ggml_type type_r,
+ ggml_type type_s,
+ bool offload,
+ uint32_t mem_size,
+ uint32_t n_seq_max,
+ const layer_filter_cb & filter);

  ~llama_memory_recurrent() = default;

@@ -3,6 +3,7 @@
  #include "llama.h"

  #include <memory>
+ #include <functional>

  struct llama_ubatch;

@@ -64,6 +65,13 @@ using llama_memory_context_ptr = std::unique_ptr<llama_memory_context_i>;
  // general concept of LLM memory
  // the KV cache is a type of LLM memory, but there can be other types
  struct llama_memory_i {
+ // this callback is used to filter out layers that should not be included in the cache
+ using layer_filter_cb = std::function<bool(int32_t il)>;
+
+ // this callback is used to specify which layers should reuse memory from other layers
+ // return negative value to indicate that the layer il should not reuse memory
+ using layer_reuse_cb = std::function<int32_t(int32_t il)>;
+
  virtual ~llama_memory_i() = default;

  // split the input batch into a set of ubatches and verify that they can fit into the cache
@@ -1115,6 +1115,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.set_swa_pattern(5);

+ hparams.n_layer_kv_from_start = 20;
  hparams.rope_freq_base_train_swa = 10000.0f;
  hparams.rope_freq_scale_train_swa = 1.0f;
  hparams.f_attention_scale = 1.0f;
@@ -1474,12 +1475,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  // Expert gating function (GLM-4.5 uses sigmoid)
  ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
- hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
  }

  // NextN/MTP parameters
  ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);

+ // TODO: when MTP is implemented, this should probably be updated if needed
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
  switch (hparams.n_layer) {
  case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
  case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
@@ -10524,7 +10528,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
  const int64_t n_embd_altup;
  const int64_t n_altup;
  const int i_altup_act;
- const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
  const int n_layer_sparsity = 10; // number of layers using activation sparsity
  const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)

@@ -10574,8 +10577,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {

  for (int il = 0; il < n_layer; ++il) {
  // this block is made to be closely resemble Gemma3p5DecoderLayer on python code
- const bool has_kv = (il < n_layer_kv);
-
  const float freq_base_l = model.get_rope_freq_base (cparams, il);
  const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

@@ -10595,7 +10596,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
  ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]

  // self-attention
- if (has_kv) {
+ if (hparams.has_kv(il)) {
  // compute Q and K and RoPE them
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  cb(Qcur, "Qcur", il);
@@ -10635,7 +10636,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
  model.layers[il].wo, NULL,
  Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
  } else {
- // no KV layers
+ // reuse KV cache of earlier layers
  ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
  cb(Qcur, "Qcur", il);
  Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -18256,12 +18257,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  if (llm_arch_is_recurrent(arch)) {
  res = new llama_memory_recurrent(
  *this,
- nullptr,
  GGML_TYPE_F32,
  GGML_TYPE_F32,
  cparams.offload_kqv,
  std::max((uint32_t) 1, cparams.n_seq_max),
- cparams.n_seq_max);
+ cparams.n_seq_max,
+ nullptr);
  } else if (llm_arch_is_hybrid(arch)) {
  const auto padding = llama_kv_cache::get_padding(cparams);

@@ -18302,6 +18303,18 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,

  LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);

+ llama_memory_i::layer_reuse_cb reuse = nullptr;
+
+ if (arch == LLM_ARCH_GEMMA3N) {
+ reuse = [&](int32_t il) {
+ if (il >= (int32_t) hparams.n_layer_kv_from_start) {
+ return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1);
+ }
+
+ return -1;
+ };
+ }
+
  if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
  GGML_ASSERT(hparams.is_swa_any());

@@ -18316,13 +18329,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  n_ctx_per_stream,
  cparams.n_seq_max,
  cparams.n_ubatch,
- padding);
+ padding,
+ nullptr,
+ reuse);
  } else {
  GGML_ASSERT(!hparams.is_swa_any());

  res = new llama_kv_cache(
  *this,
- nullptr,
  params.type_k,
  params.type_v,
  !cparams.flash_attn,
@@ -18332,7 +18346,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  cparams.n_seq_max,
  padding,
  hparams.n_swa,
- hparams.swa_type);
+ hparams.swa_type,
+ nullptr,
+ nullptr);
  }
  }
  }