cui-llama.rn 1.6.0 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +16 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +4 -1
  4. package/android/src/main/jni.cpp +20 -4
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  13. package/cpp/LICENSE +21 -0
  14. package/cpp/chat.cpp +1 -1
  15. package/cpp/common.cpp +17 -2
  16. package/cpp/common.h +7 -3
  17. package/cpp/ggml-alloc.c +4 -1
  18. package/cpp/ggml-cpp.h +1 -1
  19. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  20. package/cpp/ggml-cpu/amx/amx.h +8 -0
  21. package/cpp/ggml-cpu/amx/common.h +91 -0
  22. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  23. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  24. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  25. package/cpp/ggml-cpu/common.h +72 -0
  26. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -101
  27. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +109 -42
  28. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +3 -0
  29. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +246 -160
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  31. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  32. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  33. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  34. package/cpp/ggml-cpu.h +5 -0
  35. package/cpp/ggml-impl.h +16 -9
  36. package/cpp/ggml-llama-sim.metallib +0 -0
  37. package/cpp/ggml-llama.metallib +0 -0
  38. package/cpp/ggml-metal.m +492 -47
  39. package/cpp/ggml.c +134 -244
  40. package/cpp/ggml.h +61 -94
  41. package/cpp/json-schema-to-grammar.cpp +3 -0
  42. package/cpp/llama-arch.cpp +46 -17
  43. package/cpp/llama-arch.h +9 -0
  44. package/cpp/llama-batch.cpp +5 -1
  45. package/cpp/llama-batch.h +2 -1
  46. package/cpp/llama-chat.cpp +31 -10
  47. package/cpp/llama-chat.h +3 -2
  48. package/cpp/llama-context.cpp +104 -489
  49. package/cpp/llama-context.h +14 -30
  50. package/cpp/llama-graph.cpp +69 -62
  51. package/cpp/llama-graph.h +21 -18
  52. package/cpp/llama-hparams.h +5 -0
  53. package/cpp/llama-kv-cache.cpp +1497 -391
  54. package/cpp/llama-kv-cache.h +272 -80
  55. package/cpp/llama-memory.h +11 -1
  56. package/cpp/llama-model.cpp +502 -176
  57. package/cpp/llama-model.h +13 -3
  58. package/cpp/llama-sampling.cpp +2 -1
  59. package/cpp/llama-vocab.cpp +8 -1
  60. package/cpp/llama.h +14 -11
  61. package/cpp/rn-llama.cpp +20 -172
  62. package/cpp/rn-llama.h +1 -5
  63. package/ios/CMakeLists.txt +13 -10
  64. package/ios/RNLlama.h +6 -0
  65. package/ios/RNLlama.mm +5 -0
  66. package/ios/RNLlamaContext.mm +26 -28
  67. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +7 -3
  68. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  69. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  70. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  71. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +61 -94
  72. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  73. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  74. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
  75. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +14 -30
  76. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
  77. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
  78. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  79. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
  80. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +13 -3
  81. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +14 -11
  82. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
  83. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  84. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  85. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
  86. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  87. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  88. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  89. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
  90. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  91. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  92. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
  93. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
  94. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
  95. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
  96. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  97. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
  98. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
  99. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
  100. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
  101. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  102. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  103. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +7 -3
  104. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  105. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  106. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  107. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +61 -94
  108. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  109. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  110. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
  111. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +14 -30
  112. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
  113. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
  114. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  115. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
  116. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +13 -3
  117. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +14 -11
  118. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
  119. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  120. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  121. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
  122. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  123. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  124. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  125. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
  126. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  127. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  128. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
  129. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
  130. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
  131. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
  132. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  133. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
  134. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
  135. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
  136. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
  137. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  138. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  139. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  140. package/lib/module/NativeRNLlama.js.map +1 -1
  141. package/lib/typescript/NativeRNLlama.d.ts +4 -0
  142. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  143. package/package.json +1 -1
  144. package/src/NativeRNLlama.ts +5 -0
  145. package/cpp/binary-ops.h +0 -16
  146. package/cpp/ops.h +0 -128
  147. package/cpp/simd-mappings.h +0 -888
  148. package/cpp/unary-ops.h +0 -28
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  157. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/vec.h +0 -802
  175. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  176. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  177. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  178. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  179. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  180. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  181. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  182. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  183. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  184. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  185. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  186. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  187. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  188. /package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +0 -0
  189. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  190. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  191. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  192. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  193. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
  194. /package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -0
  195. /package/cpp/{vec.h → ggml-cpu/vec.h} +0 -0
package/cpp/llama-context.h CHANGED
@@ -27,7 +27,12 @@ struct llama_context {
 
     void synchronize();
 
-    const llama_model & get_model() const;
+    const llama_model & get_model() const;
+    const llama_cparams & get_cparams() const;
+
+    lm_ggml_backend_sched_t get_sched() const;
+
+    lm_ggml_context * get_ctx_compute() const;
 
     uint32_t n_ctx() const;
     uint32_t n_ctx_per_seq() const;
@@ -137,50 +142,30 @@ private:
     // Returns max number of outputs for which space was reserved.
     int32_t output_reserve(int32_t n_outputs);
 
-    // make the outputs have the same order they had in the user-provided batch
-    // TODO: maybe remove this
-    void output_reorder();
-
     //
     // graph
     //
 
+public:
     int32_t graph_max_nodes() const;
 
     // zero-out inputs and create the ctx_compute for the compute graph
     lm_ggml_cgraph * graph_init();
 
+    // returns the result of lm_ggml_backend_sched_graph_compute_async execution
+    lm_ggml_status graph_compute(
+            lm_ggml_cgraph * gf,
+            bool batched);
+
+private:
     llm_graph_result_ptr graph_build(
             lm_ggml_context * ctx,
             lm_ggml_cgraph * gf,
             const llama_ubatch & ubatch,
             llm_graph_type gtype);
 
-    // returns the result of lm_ggml_backend_sched_graph_compute_async execution
-    lm_ggml_status graph_compute(
-            lm_ggml_cgraph * gf,
-            bool batched);
-
     llm_graph_cb graph_get_cb() const;
 
-    // used by kv_self_update()
-    lm_ggml_tensor * build_rope_shift(
-            lm_ggml_context * ctx0,
-            lm_ggml_tensor * cur,
-            lm_ggml_tensor * shift,
-            lm_ggml_tensor * factors,
-            float freq_base,
-            float freq_scale,
-            lm_ggml_backend_buffer * bbuf) const;
-
-    llm_graph_result_ptr build_kv_self_shift(
-            lm_ggml_context * ctx0,
-            lm_ggml_cgraph * gf) const;
-
-    llm_graph_result_ptr build_kv_self_defrag(
-            lm_ggml_context * ctx0,
-            lm_ggml_cgraph * gf) const;
-
     // TODO: read/write lora adapters and cvec
     size_t state_write_data(llama_io_write_i & io);
     size_t state_read_data (llama_io_read_i & io);
@@ -197,11 +182,10 @@ private:
     llama_cparams cparams;
     llama_adapter_cvec cvec;
     llama_adapter_loras loras;
-    llama_sbatch sbatch;
 
     llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably
 
-    std::unique_ptr<llama_kv_cache_unified> kv_self;
+    std::unique_ptr<llama_memory_i> memory;
 
     // TODO: remove
     bool logits_all = false;
package/cpp/llama-graph.cpp CHANGED
@@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;
 
-        lm_ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*lm_ggml_element_size(pos));
+        if (ubatch->token && n_pos_per_embd == 4) {
+            // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+            // the 3 first dims are the same, and 4th dim is all 0
+            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
+            // copy the first dimension
+            for (int i = 0; i < n_tokens; ++i) {
+                pos_data[               i] = ubatch->pos[i];
+                pos_data[    n_tokens + i] = ubatch->pos[i];
+                pos_data[2 * n_tokens + i] = ubatch->pos[i];
+                pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
+            }
+            lm_ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*lm_ggml_element_size(pos));
+        } else {
+            lm_ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*lm_ggml_element_size(pos));
+        }
     }
 }
 
@@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
                 ) * f_attn_temp_scale + 1.0;
         }
 
-        lm_ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*lm_ggml_element_size(attn_scale));
+        lm_ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*lm_ggml_element_size(attn_scale));
     }
 }
 
@@ -270,24 +284,7 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
 
         // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
         for (uint32_t i = 0; i < n_kv; ++i) {
-            const uint32_t cell_id = i + kv_self->head;
-
-            //////////////////////////////////////////////
-            // TODO: this should not mutate the KV cache !
-            llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
-
-            // prevent out-of-bound sources
-            if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) {
-                kv_cell.src = cell_id;
-            }
-
-            data[i] = kv_cell.src;
-
-            // TODO: do not mutate the KV cache
-            // ensure copy only happens once
-            if (kv_cell.src != (int32_t) cell_id) {
-                kv_cell.src = cell_id;
-            }
+            data[i] = kv_self->s_copy(i);
         }
     }
 }
@@ -303,18 +300,7 @@ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
 
         // clear unused states
         for (int i = 0; i < n_kv; ++i) {
-            const uint32_t cell_id = i + kv_self->head;
-
-            //////////////////////////////////////////////
-            // TODO: this should not mutate the KV cache !
-            llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
-
-            data[i] = (float) (kv_cell.src >= 0);
-
-            // only clear once
-            if (kv_cell.src < 0) {
-                kv_cell.src = cell_id;
-            }
+            data[i] = kv_self->s_mask(i);
         }
     }
 }
@@ -592,7 +578,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res (std::make_unique<llm_graph_result>()) {
 }
 
-int64_t llm_graph_context::n_pos_per_token() const {
+int64_t llm_graph_context::n_pos_per_embd() const {
     return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }
 
@@ -803,6 +789,10 @@ lm_ggml_tensor * llm_graph_context::build_ffn(
 
     if (down) {
         cur = build_lora_mm(down, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            lm_ggml_mul_mat_set_prec(cur, LM_GGML_PREC_F32);
+        }
     }
 
     if (down_b) {
@@ -910,28 +900,35 @@ lm_ggml_tensor * llm_graph_context::build_moe_ffn(
     lm_ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);
 
-    lm_ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(gate, "ffn_moe_gate", il);
+    lm_ggml_tensor * experts = nullptr;
+    if (gate_exps) {
+        cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate", il);
+    } else {
+        cur = up;
+    }
 
     switch (type_op) {
         case LLM_FFN_SILU:
             {
-                gate = lm_ggml_silu(ctx0, gate);
-                cb(gate, "ffn_moe_silu", il);
+                cur = lm_ggml_silu(ctx0, cur);
+                cb(cur, "ffn_moe_silu", il);
             } break;
         case LLM_FFN_GELU:
            {
-                gate = lm_ggml_gelu(ctx0, gate);
-                cb(gate, "ffn_moe_gelu", il);
+                cur = lm_ggml_gelu(ctx0, cur);
+                cb(cur, "ffn_moe_gelu", il);
            } break;
         default:
            LM_GGML_ABORT("fatal error");
     }
 
-    lm_ggml_tensor * par = lm_ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
-    cb(par, "ffn_moe_gate_par", il);
+    if (gate_exps) {
+        cur = lm_ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate_par", il);
+    }
 
-    lm_ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
     if (!weight_before_ffn) {
@@ -1014,11 +1011,11 @@ lm_ggml_tensor * llm_graph_context::build_inp_embd(lm_ggml_tensor * tok_embd) co
 }
 
 lm_ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
 
     auto & cur = inp->pos;
 
-    cur = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens*n_pos_per_token());
+    cur = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens*n_pos_per_embd());
     lm_ggml_set_input(cur);
 
     res->add_input(std::move(inp));
@@ -1027,11 +1024,12 @@ lm_ggml_tensor * llm_graph_context::build_inp_pos() const {
 }
 
 lm_ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
 
     auto & cur = inp->attn_scale;
 
-    cur = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    // this need to be 1x1xN for broadcasting
+    cur = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, 1, 1, n_tokens);
     lm_ggml_set_input(cur);
 
     res->add_input(std::move(inp));
@@ -1079,7 +1077,7 @@ lm_ggml_tensor * llm_graph_context::build_inp_cls() const {
 }
 
 lm_ggml_tensor * llm_graph_context::build_inp_s_copy() const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
     auto inp = std::make_unique<llm_graph_input_s_copy>(kv_self);
 
@@ -1096,7 +1094,7 @@ lm_ggml_tensor * llm_graph_context::build_inp_s_copy() const {
 }
 
 lm_ggml_tensor * llm_graph_context::build_inp_s_mask() const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
     auto inp = std::make_unique<llm_graph_input_s_mask>(kv_self);
 
@@ -1188,6 +1186,7 @@ lm_ggml_tensor * llm_graph_context::build_attn_mha(
         lm_ggml_tensor * v,
         lm_ggml_tensor * kq_b,
         lm_ggml_tensor * kq_mask,
+        lm_ggml_tensor * v_mla,
         bool v_trans,
         float kq_scale) const {
     //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
@@ -1199,8 +1198,6 @@ lm_ggml_tensor * llm_graph_context::build_attn_mha(
     //const auto & n_embd_head_k = hparams.n_embd_head_k;
     //const auto & n_embd_head_v = hparams.n_embd_head_v;
 
-    const auto n_embd_head_v = v_trans ? v->ne[1] : v->ne[0];
-
     const auto n_tokens = q->ne[1];
     const auto n_head = q->ne[2];
     const auto n_kv = k->ne[1];
@@ -1229,7 +1226,12 @@ lm_ggml_tensor * llm_graph_context::build_attn_mha(
 
         lm_ggml_flash_attn_ext_set_prec(cur, LM_GGML_PREC_F32);
 
-        cur = lm_ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens);
+        if (v_mla) {
+            cur = lm_ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
+            cur = lm_ggml_mul_mat(ctx0, v_mla, cur);
+        }
+
+        cur = lm_ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
     } else {
         lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
 
@@ -1267,9 +1269,14 @@ lm_ggml_tensor * llm_graph_context::build_attn_mha(
 
         lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, v, kq);
 
-        lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+        // for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
+        if (v_mla) {
+            kqv = lm_ggml_mul_mat(ctx0, v_mla, kqv);
+        }
+
+        cur = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
 
-        cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
+        cur = lm_ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
 
         if (!cparams.offload_kqv) {
             // all nodes between the KV store and the attention output are run on the CPU
@@ -1304,6 +1311,7 @@ lm_ggml_tensor * llm_graph_context::build_attn(
         lm_ggml_tensor * k_cur,
         lm_ggml_tensor * v_cur,
        lm_ggml_tensor * kq_b,
+        lm_ggml_tensor * v_mla,
        float kq_scale,
        int il) const {
     LM_GGML_UNUSED(n_tokens);
@@ -1325,7 +1333,7 @@ lm_ggml_tensor * llm_graph_context::build_attn(
     lm_ggml_tensor * v = lm_ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
     //cb(k, "v", il);
 
-    lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale);
+    lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
 
     cb(cur, "kqv_out", il);
 
@@ -1379,6 +1387,7 @@ lm_ggml_tensor * llm_graph_context::build_attn(
         lm_ggml_tensor * k_cur,
         lm_ggml_tensor * v_cur,
         lm_ggml_tensor * kq_b,
+        lm_ggml_tensor * v_mla,
         float kq_scale,
         int il) const {
     // these nodes are added to the graph together so that they are not reordered
@@ -1399,8 +1408,6 @@ lm_ggml_tensor * llm_graph_context::build_attn(
 
     // store to KV cache
     {
-        LM_GGML_ASSERT(!kv_self->recurrent);
-
         const auto kv_head = kv_self->head;
 
         LM_GGML_ASSERT(kv_self->size == n_ctx);
@@ -1464,7 +1471,7 @@ lm_ggml_tensor * llm_graph_context::build_attn(
                 lm_ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
                 0);
 
-    lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_trans, kq_scale);
+    lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -1504,6 +1511,7 @@ lm_ggml_tensor * llm_graph_context::build_attn(
         lm_ggml_tensor * k_cur,
         lm_ggml_tensor * v_cur,
         lm_ggml_tensor * kq_b,
+        lm_ggml_tensor * v_mla,
         float kq_scale,
         int il) const {
     // these nodes are added to the graph together so that they are not reordered
@@ -1523,7 +1531,7 @@ lm_ggml_tensor * llm_graph_context::build_attn(
     lm_ggml_tensor * v = lm_ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
     //cb(k, "v", il);
 
-    lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale);
+    lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
 
     cb(cur, "kqv_out", il);
 
@@ -1549,7 +1557,7 @@ lm_ggml_tensor * llm_graph_context::build_copy_mask_state(
         lm_ggml_tensor * state_mask,
         int32_t n_state,
         int32_t n_seqs) const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
     const auto n_kv = kv_self->n;
     const auto kv_head = kv_self->head;
@@ -1581,7 +1589,7 @@ lm_ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
         lm_ggml_tensor * state_mask,
         const llama_ubatch & ubatch,
         int il) const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
     const auto token_shift_count = hparams.token_shift_count;
 
@@ -1602,7 +1610,7 @@ lm_ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
         lm_ggml_tensor * token_shift,
         const llama_ubatch & ubatch,
         int il) const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
     const auto token_shift_count = hparams.token_shift_count;
     const auto n_embd = hparams.n_embd;
@@ -1692,4 +1700,3 @@ void llm_graph_context::build_pooling(
 
     lm_ggml_build_forward_expand(gf, cur);
 }
-
package/cpp/llama-graph.h CHANGED
@@ -19,6 +19,7 @@ struct llama_cparams;
 
 class llama_memory_i;
 class llama_kv_cache_unified;
+class llama_kv_cache_recurrent;
 
 // certain models (typically multi-modal) can produce different types of graphs
 enum llm_graph_type {
@@ -90,29 +91,27 @@ public:
 
 class llm_graph_input_pos : public llm_graph_input_i {
 public:
-    llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
+    llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
     virtual ~llm_graph_input_pos() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
     lm_ggml_tensor * pos = nullptr; // I32 [n_batch]
 
-    const int64_t n_pos_per_token = 1;
+    const int64_t n_pos_per_embd = 1;
 };
 
 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
     virtual ~llm_graph_input_attn_temp() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
 
     lm_ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
 
-    const int64_t n_pos_per_token = 1;
-
     const uint32_t n_attn_temp_floor_scale;
     const float f_attn_temp_scale;
 };
@@ -188,26 +187,26 @@ public:
 
 class llm_graph_input_s_copy : public llm_graph_input_i {
 public:
-    llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
    virtual ~llm_graph_input_s_copy() = default;
 
    void set_input(const llama_ubatch * ubatch) override;
 
    lm_ggml_tensor * s_copy; // I32 [kv_size]
 
-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_recurrent * kv_self;
 };
 
 class llm_graph_input_s_mask : public llm_graph_input_i {
 public:
-    llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+    llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
    virtual ~llm_graph_input_s_mask() = default;
 
    void set_input(const llama_ubatch * ubatch) override;
 
    lm_ggml_tensor * s_mask; // F32 [1, n_kv]
 
-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_recurrent * kv_self;
 };
 
 class llm_graph_input_cross_embd : public llm_graph_input_i {
@@ -352,8 +351,8 @@ struct llm_graph_params {
     const llama_cparams & cparams;
     const llama_ubatch & ubatch;
 
-    lm_ggml_backend_sched * sched;
-    lm_ggml_backend * backend_cpu;
+    lm_ggml_backend_sched_t sched;
+    lm_ggml_backend_t backend_cpu;
 
     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;
@@ -404,9 +403,9 @@ struct llm_graph_context {
 
     lm_ggml_context * ctx0 = nullptr;
 
-    lm_ggml_backend_sched * sched;
+    lm_ggml_backend_sched_t sched;
 
-    lm_ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
+    lm_ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
 
     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;
@@ -419,7 +418,7 @@ struct llm_graph_context {
 
     llm_graph_context(const llm_graph_params & params);
 
-    int64_t n_pos_per_token() const;
+    int64_t n_pos_per_embd() const;
 
     void cb(lm_ggml_tensor * cur, const char * name, int il) const;
 
@@ -505,11 +504,12 @@ struct llm_graph_context {
 
     lm_ggml_tensor * build_attn_mha(
             lm_ggml_cgraph * gf,
-            lm_ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q]
-            lm_ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k]
-            lm_ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
+            lm_ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q]
+            lm_ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k]
+            lm_ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
             lm_ggml_tensor * kq_b,
             lm_ggml_tensor * kq_mask,
+            lm_ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
             bool v_trans,
             float kq_scale) const;
 
@@ -524,6 +524,7 @@ struct llm_graph_context {
             lm_ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
             lm_ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
             lm_ggml_tensor * kq_b,
+            lm_ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
             float kq_scale,
             int il) const;
 
@@ -538,6 +539,7 @@ struct llm_graph_context {
             lm_ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
             lm_ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
             lm_ggml_tensor * kq_b,
+            lm_ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
             float kq_scale,
             int il) const;
 
@@ -552,6 +554,7 @@ struct llm_graph_context {
             lm_ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
             lm_ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
             lm_ggml_tensor * kq_b,
+            lm_ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
             float kq_scale,
             int il) const;
 
package/cpp/llama-hparams.h CHANGED
@@ -43,6 +43,10 @@ struct llama_hparams {
     uint32_t n_expert_used = 0;
     uint32_t n_rel_attn_bkts = 0;
 
+    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
+    uint32_t n_embd_head_k_mla = 0;
+    uint32_t n_embd_head_v_mla = 0;
+
     // for WavTokenizer
     struct llama_hparams_posnet posnet;
     struct llama_hparams_convnext convnext;
@@ -62,6 +66,7 @@ struct llama_hparams {
     float expert_weights_scale = 0.0;
     bool expert_weights_norm = false;
     uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+    uint32_t moe_every_n_layers = 0;
 
     float f_norm_eps;
     float f_norm_rms_eps;