@fugood/llama.node 1.1.9 → 1.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/lib/binding.ts +7 -1
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +15 -5
  4. package/src/LlamaCompletionWorker.cpp +12 -3
  5. package/src/LlamaCompletionWorker.h +3 -1
  6. package/src/LlamaContext.cpp +20 -2
  7. package/src/llama.cpp/common/arg.cpp +29 -19
  8. package/src/llama.cpp/common/chat.cpp +153 -3
  9. package/src/llama.cpp/common/chat.h +1 -0
  10. package/src/llama.cpp/common/common.cpp +10 -3
  11. package/src/llama.cpp/common/common.h +4 -1
  12. package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -4
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
  16. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +14 -9
  18. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -12
  20. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
  21. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
  22. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -1
  23. package/src/llama.cpp/include/llama.h +27 -1
  24. package/src/llama.cpp/src/llama-adapter.cpp +68 -4
  25. package/src/llama.cpp/src/llama-adapter.h +3 -0
  26. package/src/llama.cpp/src/llama-arch.cpp +46 -2
  27. package/src/llama.cpp/src/llama-arch.h +4 -0
  28. package/src/llama.cpp/src/llama-context.cpp +80 -39
  29. package/src/llama.cpp/src/llama-context.h +0 -4
  30. package/src/llama.cpp/src/llama-graph.cpp +20 -10
  31. package/src/llama.cpp/src/llama-graph.h +2 -1
  32. package/src/llama.cpp/src/llama-hparams.cpp +25 -0
  33. package/src/llama.cpp/src/llama-hparams.h +6 -0
  34. package/src/llama.cpp/src/llama-impl.h +2 -0
  35. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +24 -7
  36. package/src/llama.cpp/src/llama-kv-cache-iswa.h +4 -2
  37. package/src/llama.cpp/src/llama-kv-cache.cpp +67 -130
  38. package/src/llama.cpp/src/llama-kv-cache.h +16 -28
  39. package/src/llama.cpp/src/llama-memory-hybrid.cpp +29 -28
  40. package/src/llama.cpp/src/llama-memory-hybrid.h +18 -22
  41. package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
  42. package/src/llama.cpp/src/llama-memory-recurrent.h +7 -11
  43. package/src/llama.cpp/src/llama-memory.h +8 -0
  44. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  45. package/src/llama.cpp/src/llama-model.cpp +302 -31
  46. package/src/llama.cpp/src/llama-model.h +1 -0
  47. package/src/llama.cpp/src/llama-vocab.cpp +1 -1
  48. package/src/llama.cpp/src/llama.cpp +12 -0
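
The most visible API change carried by the vendored llama.cpp bump (see the llama-context.cpp, llama.h and llama.cpp hunks below) is that the boolean context parameter flash_attn is replaced by an enum field flash_attn_type, which defaults to LLAMA_FLASH_ATTN_TYPE_AUTO. Below is a minimal sketch of how a C++ embedder of this llama.cpp revision might adapt; the helper function and the model pointer are illustrative, and only llama_context_default_params(), llama_init_from_model() and the enum values visible in the diff are taken from the source.

    #include "llama.h"

    // Illustrative helper (not part of @fugood/llama.node's JS API): build a
    // context with Flash Attention explicitly disabled, as the removed boolean
    // `params.flash_attn = false` used to do. `model` is assumed to have been
    // loaded elsewhere through the usual llama.h model-loading API.
    static llama_context * make_context_no_fa(llama_model * model) {
        llama_context_params params = llama_context_default_params();
        // new in this release: enum instead of bool, default is LLAMA_FLASH_ATTN_TYPE_AUTO
        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
        return llama_init_from_model(model, params);
    }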
package/src/llama.cpp/src/llama-context.cpp
@@ -41,7 +41,6 @@ llama_context::llama_context(
  cparams.yarn_beta_slow = params.yarn_beta_slow;
  cparams.embeddings = params.embeddings;
  cparams.offload_kqv = params.offload_kqv;
- cparams.flash_attn = params.flash_attn;
  cparams.no_perf = params.no_perf;
  cparams.pooling_type = params.pooling_type;
  cparams.warmup = false;
@@ -86,6 +85,8 @@ llama_context::llama_context(
  cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
  }

+ cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
+
  // with causal attention, the batch size is limited by the context size
  cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;

@@ -102,16 +103,6 @@ llama_context::llama_context(
  cparams.op_offload = params.op_offload;
  cparams.kv_unified = params.kv_unified;

- {
- const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
- supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;
-
- if (!supports_set_rows && !cparams.kv_unified) {
- LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
- cparams.kv_unified = true;
- }
- }
-
  {
  const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
  graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
@@ -129,7 +120,7 @@ llama_context::llama_context(
  LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
  LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
  LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
- LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
+ LLAMA_LOG_INFO("%s: flash_attn = %s\n", __func__, llama_flash_attn_type_name(params.flash_attn_type));
  LLAMA_LOG_INFO("%s: kv_unified = %s\n", __func__, cparams.kv_unified ? "true" : "false");
  LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
  LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
@@ -279,8 +270,8 @@ llama_context::llama_context(
  }
  }

- // reserve worst-case graph
- if (!hparams.vocab_only && memory) {
+ // resolve automatic Flash Attention use and reserve worst-case graph
+ if (!hparams.vocab_only) {
  const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
  const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

@@ -292,11 +283,13 @@ llama_context::llama_context(
  int n_splits_tg = -1;
  int n_nodes_tg = -1;

- // simulate full KV cache
-
- const auto mctx = memory->init_full();
- if (!mctx) {
- throw std::runtime_error("failed to initialize KV cache");
+ llama_memory_context_ptr mctx;
+ if (memory) {
+ LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
+ mctx = memory->init_full();
+ if (!mctx) {
+ throw std::runtime_error("failed to initialize memory module");
+ }
  }

  cross.v_embd.clear();
@@ -308,6 +301,48 @@ llama_context::llama_context(
  throw std::runtime_error("failed to allocate compute pp buffers");
  }

+ if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
+ ggml_backend_sched_alloc_graph(sched.get(), gf);
+
+ const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
+ bool fa_device_mismatch = false;
+ for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
+ ggml_tensor * n = ggml_graph_node(gf, i);
+ if (n->op != GGML_OP_FLASH_ATTN_EXT) {
+ continue;
+ }
+ ggml_backend_dev_t device_fa = ggml_backend_get_device(
+ ggml_backend_sched_get_tensor_backend(sched.get(), n));
+
+ // TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
+ GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
+ const int il = std::stoi(n->name + prefix_len);
+ ggml_backend_dev_t device_kv = model.dev_layer(il);
+ if (device_fa != device_kv) {
+ LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
+ "is assigned to device %s (usually due to missing support)\n",
+ __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
+ // FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
+ fa_device_mismatch = true;
+ break;
+ }
+ }
+ if (fa_device_mismatch) {
+ cparams.flash_attn = false;
+ LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
+ if (ggml_is_quantized(params.type_v)) {
+ throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
+ }
+ auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+ if (!gf) {
+ throw std::runtime_error("failed to allocate compute pp buffers");
+ }
+ } else {
+ cparams.flash_attn = true;
+ LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
+ }
+ }
+
  n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
  n_nodes_pp = ggml_graph_n_nodes(gf);
  }
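
For reference, the AUTO path above identifies flash-attention nodes by name: build_attn_mha() (llama-graph.cpp hunks below) tags each ggml_flash_attn_ext result with cb(cur, LLAMA_TENSOR_NAME_FATTN, il), the graph callback appends the layer index, and the check then recovers that index with std::stoi. A standalone sketch of just that parsing step, assuming node names of the form "__fattn__-<layer>":

    #include <cassert>
    #include <cstring>
    #include <string>

    // Illustrative only: recover the layer index from a node name such as
    // "__fattn__-17" (LLAMA_TENSOR_NAME_FATTN is "__fattn__", see llama-impl.h below).
    static int fattn_layer_from_name(const char * name) {
        const char * prefix = "__fattn__-";
        const size_t prefix_len = std::strlen(prefix);
        assert(std::strncmp(name, prefix, prefix_len) == 0);
        return std::stoi(name + prefix_len); // "__fattn__-17" -> 17
    }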
@@ -888,12 +923,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
  }
  }

- if (!supports_set_rows) {
- // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
- // overlap with device computation.
- ggml_backend_sched_reset(sched.get());
- }
-
  // TODO: hacky solution
  if (model.arch == LLM_ARCH_T5 && t_embd) {
  //cross.t_embd = t_embd;
@@ -1056,7 +1085,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
  const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);

  if (!res) {
- // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
+ // the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module
  llama_pos pos_min[LLAMA_MAX_SEQ];
  for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
  pos_min[s] = std::numeric_limits<llama_pos>::max();
@@ -1073,7 +1102,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
  continue;
  }

- LLAMA_LOG_WARN("%s: removing KV cache entries for seq_id = %d, pos = [%d, +inf)\n", __func__, s, pos_min[s]);
+ LLAMA_LOG_WARN("%s: removing memory module entries for seq_id = %d, pos = [%d, +inf)\n", __func__, s, pos_min[s]);

  memory->seq_rm(s, pos_min[s], -1);
  }
@@ -1224,12 +1253,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
  // wait for the computation to finish (automatically done when obtaining the model output)
  //synchronize();

- if (!supports_set_rows) {
- // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
- // overlap with device computation.
- ggml_backend_sched_reset(sched.get());
- }
-
  return 0;
  }
@@ -1857,7 +1880,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
  }

  if (memory != nullptr) {
- LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
+ LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
  memory->state_write(io);
  }

@@ -1943,7 +1966,7 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
  }

  if (memory) {
- LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__);
+ LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);

  memory->state_read(io);
  }
@@ -2228,6 +2251,7 @@ llama_context_params llama_context_default_params() {
  /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
  /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
  /*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
+ /*.flash_attn_type =*/ LLAMA_FLASH_ATTN_TYPE_AUTO,
  /*.rope_freq_base =*/ 0.0f,
  /*.rope_freq_scale =*/ 0.0f,
  /*.yarn_ext_factor =*/ -1.0f,
@@ -2244,7 +2268,6 @@ llama_context_params llama_context_default_params() {
  /*.abort_callback_data =*/ nullptr,
  /*.embeddings =*/ false,
  /*.offload_kqv =*/ true,
- /*.flash_attn =*/ false,
  /*.no_perf =*/ true,
  /*.op_offload =*/ true,
  /*.swa_full =*/ true,
@@ -2272,12 +2295,30 @@ llama_context * llama_init_from_model(
  return nullptr;
  }

- if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
+ if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && model->arch == LLM_ARCH_GROK) {
  LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
- params.flash_attn = false;
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+ }
+
+ if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
+ const uint32_t blck_size = ggml_blck_size(params.type_k);
+ if (model->hparams.n_embd_head_k % blck_size != 0) {
+ LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
+ __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k);
+ return nullptr;
+ }
+ }
+
+ if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
+ const uint32_t blck_size = ggml_blck_size(params.type_v);
+ if (model->hparams.n_embd_head_v % blck_size != 0) {
+ LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_k=%u\n",
+ __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v);
+ return nullptr;
+ }
  }

- if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
+ if (ggml_is_quantized(params.type_v) && params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_DISABLED) {
  LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
  return nullptr;
  }
package/src/llama.cpp/src/llama-context.h
@@ -283,10 +283,6 @@ private:

  bool has_evaluated_once = false;

- // env: LLAMA_SET_ROWS (temporary)
- // ref: https://github.com/ggml-org/llama.cpp/pull/14285
- bool supports_set_rows = true;
-
  // env: LLAMA_GRAPH_REUSE_DISABLE
  bool graph_reuse_disable = false;

package/src/llama.cpp/src/llama-graph.cpp
@@ -314,8 +314,6 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
  res &= self_kq_mask->ne[0] == mctx->get_n_kv();
  res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

- res &= mctx->get_supports_set_rows(); // TODO: tmp
-
  return res;
  }

@@ -350,8 +348,6 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
  res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
  res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

- res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp
-
  return res;
  }
@@ -1225,7 +1221,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
  ggml_tensor * kq_mask,
  ggml_tensor * sinks,
  ggml_tensor * v_mla,
- float kq_scale) const {
+ float kq_scale,
+ int il) const {
  const bool v_trans = v->nb[1] > v->nb[2];

  // split the batch into streams if needed
@@ -1260,6 +1257,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(

  cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
  hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
+ cb(cur, LLAMA_TENSOR_NAME_FATTN, il);

  ggml_flash_attn_ext_add_sinks(cur, sinks);
  ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
@@ -1275,6 +1273,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
  // The permutations are noops and only change how the tensor data is interpreted.
  cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
  cur = ggml_mul_mat(ctx0, v_mla, cur);
+ cb(cur, "fattn_mla", il);
  cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
  cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
  #endif
@@ -1283,6 +1282,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
  cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);
  } else {
  ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+ cb(kq, "kq", il);

  // note: this op tends to require high floating point range
  // while for some models F16 is enough, for others it is not, so we default to F32 here
@@ -1296,32 +1296,42 @@ ggml_tensor * llm_graph_context::build_attn_mha(
  // before the softmax below

  kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f));
+ cb(kq, "kq_tanh", il);
  kq = ggml_scale(ctx0, kq, 30);
+ cb(kq, "kq_scaled", il);
  }

  if (hparams.attn_soft_cap) {
  kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping);
+ cb(kq, "kq_scaled_1", il);
  kq = ggml_tanh (ctx0, kq);
+ cb(kq, "kq_tanh", il);
  kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
+ cb(kq, "kq_scaled_2", il);
  }

  if (kq_b) {
  kq = ggml_add(ctx0, kq, kq_b);
+ cb(kq, "kq_plus_kq_b", il);
  }

  kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
  ggml_soft_max_add_sinks(kq, sinks);
+ cb(kq, "kq_soft_max", il);

  if (!v_trans) {
  // note: avoid this branch
  v = ggml_cont(ctx0, ggml_transpose(ctx0, v));
+ cb(v, "v_cont", il);
  }

  ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
+ cb(kqv, "kqv", il);

  // for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
  if (v_mla) {
  kqv = ggml_mul_mat(ctx0, v_mla, kqv);
+ cb(kqv, "kqv_mla", il);
  }

  cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
@@ -1376,13 +1386,13 @@ ggml_tensor * llm_graph_context::build_attn(

  // [TAG_NO_CACHE_PAD]
  // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
- assert(!ubatch.equal_seqs());
+ assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));

  ggml_tensor * q = q_cur;
  ggml_tensor * k = k_cur;
  ggml_tensor * v = v_cur;

- ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
  cb(cur, "kqv_out", il);

  if (wo) {
@@ -1471,7 +1481,7 @@ ggml_tensor * llm_graph_context::build_attn(
  ggml_tensor * k = mctx_cur->get_k(ctx0, il);
  ggml_tensor * v = mctx_cur->get_v(ctx0, il);

- ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
  cb(cur, "kqv_out", il);

  if (wo) {
@@ -1538,7 +1548,7 @@ ggml_tensor * llm_graph_context::build_attn(
  ggml_tensor * k = mctx_cur->get_k(ctx0, il);
  ggml_tensor * v = mctx_cur->get_v(ctx0, il);

- ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
  cb(cur, "kqv_out", il);

  if (wo) {
@@ -1593,7 +1603,7 @@ ggml_tensor * llm_graph_context::build_attn(
  ggml_tensor * k = k_cur;
  ggml_tensor * v = v_cur;

- ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
  cb(cur, "kqv_out", il);

  if (wo) {
package/src/llama.cpp/src/llama-graph.h
@@ -687,7 +687,8 @@ struct llm_graph_context {
  ggml_tensor * kq_mask,
  ggml_tensor * sinks, // [n_head_q]
  ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
- float kq_scale) const;
+ float kq_scale,
+ int il) const;

  llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;

package/src/llama.cpp/src/llama-hparams.cpp
@@ -153,3 +153,28 @@ bool llama_hparams::is_swa(uint32_t il) const {

  GGML_ABORT("fatal error");
  }
+
+ bool llama_hparams::has_kv(uint32_t il) const {
+ if (n_layer_kv_from_start >= 0) {
+ if (il < (uint32_t) n_layer_kv_from_start) {
+ return true;
+ }
+
+ return false;
+ }
+
+ // by default, all layers have kv
+ return true;
+ }
+
+ uint32_t llama_hparams::n_layer_kv() const {
+ uint32_t res = 0;
+
+ for (uint32_t il = 0; il < n_layer; ++il) {
+ if (has_kv(il)) {
+ res++;
+ }
+ }
+
+ return res;
+ }
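
The new helpers above (declared in the llama-hparams.h hunk that follows) let a model mark only its first n_layer_kv_from_start layers as carrying a KV cache. A small self-contained sketch of the semantics; the struct below only mirrors the relevant members and is not the real llama_hparams type:

    #include <cstdint>

    struct hparams_sketch {
        uint32_t n_layer               = 32;
        int32_t  n_layer_kv_from_start = -1; // -1 (default): every layer has KV

        bool has_kv(uint32_t il) const {
            if (n_layer_kv_from_start >= 0) {
                return il < (uint32_t) n_layer_kv_from_start;
            }
            return true;
        }
    };

    // With n_layer_kv_from_start = 4, layers 0..3 have a KV cache and
    // n_layer_kv() would count exactly 4 of the 32 layers.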
package/src/llama.cpp/src/llama-hparams.h
@@ -41,6 +41,7 @@ struct llama_hparams {
  uint32_t n_embd;
  uint32_t n_embd_features = 0;
  uint32_t n_layer;
+ int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
  uint32_t n_rot;
  uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
  uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
@@ -221,6 +222,11 @@ struct llama_hparams {
  uint32_t n_pos_per_embd() const;

  bool is_swa(uint32_t il) const;
+
+ bool has_kv(uint32_t il) const;
+
+ // number of layers for which has_kv() returns true
+ uint32_t n_layer_kv() const;
  };

  static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
package/src/llama.cpp/src/llama-impl.h
@@ -59,3 +59,5 @@ std::string llama_format_tensor_shape(const std::vector<int64_t> & ne);
  std::string llama_format_tensor_shape(const struct ggml_tensor * t);

  std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
+
+ #define LLAMA_TENSOR_NAME_FATTN "__fattn__"
package/src/llama.cpp/src/llama-kv-cache-iswa.cpp
@@ -22,9 +22,26 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
  uint32_t kv_size,
  uint32_t n_seq_max,
  uint32_t n_ubatch,
- uint32_t n_pad) : hparams(model.hparams), unified(unified) {
- llama_kv_cache::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
- llama_kv_cache::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
+ uint32_t n_pad,
+ const layer_filter_cb & filter,
+ const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) {
+
+ // chain filters
+ const layer_filter_cb filter_base = [&](int32_t il) {
+ if (filter && !filter(il)) {
+ return false;
+ }
+
+ return !model.hparams.is_swa(il);
+ };
+
+ const layer_filter_cb filter_swa = [&](int32_t il) {
+ if (filter && !filter(il)) {
+ return false;
+ }
+
+ return model.hparams.is_swa(il);
+ };

  const uint32_t size_base = kv_size;

@@ -41,16 +58,16 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
  LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);

  kv_base = std::make_unique<llama_kv_cache>(
- model, std::move(filter_base), type_k, type_v,
+ model, type_k, type_v,
  v_trans, offload, unified, size_base, n_seq_max, n_pad,
- 0, LLAMA_SWA_TYPE_NONE);
+ 0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);

  LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);

  kv_swa = std::make_unique<llama_kv_cache>(
- model, std::move(filter_swa), type_k, type_v,
+ model, type_k, type_v,
  v_trans, offload, unified, size_swa, n_seq_max, n_pad,
- hparams.n_swa, hparams.swa_type);
+ hparams.n_swa, hparams.swa_type, filter_swa, reuse);
  }

  void llama_kv_cache_iswa::clear(bool data) {
package/src/llama.cpp/src/llama-kv-cache-iswa.h
@@ -20,11 +20,13 @@ public:
  bool v_trans,
  bool offload,
  bool swa_full,
- bool ,
+ bool unified,
  uint32_t kv_size,
  uint32_t n_seq_max,
  uint32_t n_ubatch,
- uint32_t n_pad);
+ uint32_t n_pad,
+ const layer_filter_cb & filter,
+ const layer_reuse_cb & reuse);

  ~llama_kv_cache_iswa() = default;