cui-llama.rn 1.6.0 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +16 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +4 -1
  4. package/android/src/main/jni.cpp +20 -4
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  13. package/cpp/LICENSE +21 -0
  14. package/cpp/chat.cpp +1 -1
  15. package/cpp/common.cpp +17 -2
  16. package/cpp/common.h +7 -3
  17. package/cpp/ggml-alloc.c +4 -1
  18. package/cpp/ggml-cpp.h +1 -1
  19. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  20. package/cpp/ggml-cpu/amx/amx.h +8 -0
  21. package/cpp/ggml-cpu/amx/common.h +91 -0
  22. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  23. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  24. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  25. package/cpp/ggml-cpu/common.h +72 -0
  26. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -101
  27. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +109 -42
  28. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +3 -0
  29. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +246 -160
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  31. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  32. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  33. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  34. package/cpp/ggml-cpu.h +5 -0
  35. package/cpp/ggml-impl.h +16 -9
  36. package/cpp/ggml-llama-sim.metallib +0 -0
  37. package/cpp/ggml-llama.metallib +0 -0
  38. package/cpp/ggml-metal.m +492 -47
  39. package/cpp/ggml.c +134 -244
  40. package/cpp/ggml.h +61 -94
  41. package/cpp/json-schema-to-grammar.cpp +3 -0
  42. package/cpp/llama-arch.cpp +46 -17
  43. package/cpp/llama-arch.h +9 -0
  44. package/cpp/llama-batch.cpp +5 -1
  45. package/cpp/llama-batch.h +2 -1
  46. package/cpp/llama-chat.cpp +31 -10
  47. package/cpp/llama-chat.h +3 -2
  48. package/cpp/llama-context.cpp +104 -489
  49. package/cpp/llama-context.h +14 -30
  50. package/cpp/llama-graph.cpp +69 -62
  51. package/cpp/llama-graph.h +21 -18
  52. package/cpp/llama-hparams.h +5 -0
  53. package/cpp/llama-kv-cache.cpp +1497 -391
  54. package/cpp/llama-kv-cache.h +272 -80
  55. package/cpp/llama-memory.h +11 -1
  56. package/cpp/llama-model.cpp +502 -176
  57. package/cpp/llama-model.h +13 -3
  58. package/cpp/llama-sampling.cpp +2 -1
  59. package/cpp/llama-vocab.cpp +8 -1
  60. package/cpp/llama.h +14 -11
  61. package/cpp/rn-llama.cpp +20 -172
  62. package/cpp/rn-llama.h +1 -5
  63. package/ios/CMakeLists.txt +13 -10
  64. package/ios/RNLlama.h +6 -0
  65. package/ios/RNLlama.mm +5 -0
  66. package/ios/RNLlamaContext.mm +26 -28
  67. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +7 -3
  68. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  69. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  70. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  71. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +61 -94
  72. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  73. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  74. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
  75. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +14 -30
  76. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
  77. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
  78. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  79. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
  80. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +13 -3
  81. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +14 -11
  82. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
  83. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  84. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  85. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
  86. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  87. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  88. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  89. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
  90. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  91. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  92. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
  93. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
  94. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
  95. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
  96. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  97. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
  98. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
  99. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
  100. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
  101. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  102. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  103. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +7 -3
  104. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  105. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  106. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  107. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +61 -94
  108. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  109. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  110. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
  111. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +14 -30
  112. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
  113. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
  114. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  115. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
  116. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +13 -3
  117. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +14 -11
  118. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
  119. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  120. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  121. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
  122. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  123. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  124. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  125. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
  126. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  127. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  128. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
  129. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
  130. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
  131. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
  132. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  133. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
  134. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
  135. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
  136. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
  137. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  138. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  139. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  140. package/lib/module/NativeRNLlama.js.map +1 -1
  141. package/lib/typescript/NativeRNLlama.d.ts +4 -0
  142. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  143. package/package.json +1 -1
  144. package/src/NativeRNLlama.ts +5 -0
  145. package/cpp/binary-ops.h +0 -16
  146. package/cpp/ops.h +0 -128
  147. package/cpp/simd-mappings.h +0 -888
  148. package/cpp/unary-ops.h +0 -28
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  157. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/vec.h +0 -802
  175. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  176. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  177. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  178. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  179. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  180. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  181. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  182. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  183. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  184. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  185. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  186. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  187. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  188. /package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +0 -0
  189. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  190. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  191. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  192. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  193. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
  194. /package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -0
  195. /package/cpp/{vec.h → ggml-cpu/vec.h} +0 -0
ggml-cpu.h

@@ -133,6 +133,11 @@ extern "C" {
 
  LM_GGML_BACKEND_API lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void);
 
+ LM_GGML_BACKEND_API void lm_ggml_cpu_fp32_to_fp16(const float *, lm_ggml_fp16_t *, int64_t);
+ LM_GGML_BACKEND_API void lm_ggml_cpu_fp16_to_fp32(const lm_ggml_fp16_t *, float *, int64_t);
+ LM_GGML_BACKEND_API void lm_ggml_cpu_fp32_to_bf16(const float *, lm_ggml_bf16_t *, int64_t);
+ LM_GGML_BACKEND_API void lm_ggml_cpu_bf16_to_fp32(const lm_ggml_bf16_t *, float *, int64_t);
+
  #ifdef __cplusplus
  }
  #endif
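The four declarations added here expose the CPU backend's bulk float conversion routines. As a rough sketch only (the wrapper names below are invented and nothing beyond the signatures shown in the hunk is assumed), they can be called on plain buffers like this:

    // Hedged usage sketch of the new CPU conversion helpers; assumes only the
    // declarations shown in the ggml-cpu.h hunk above.
    #include "ggml.h"
    #include "ggml-cpu.h"
    #include <vector>

    static std::vector<lm_ggml_fp16_t> to_fp16(const std::vector<float> & src) {
        std::vector<lm_ggml_fp16_t> dst(src.size());
        // convert src.size() values from fp32 to fp16 in one call
        lm_ggml_cpu_fp32_to_fp16(src.data(), dst.data(), (int64_t) src.size());
        return dst;
    }

    static std::vector<float> to_fp32(const std::vector<lm_ggml_fp16_t> & src) {
        std::vector<float> dst(src.size());
        lm_ggml_cpu_fp16_to_fp32(src.data(), dst.data(), (int64_t) src.size());
        return dst;
    }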
ggml-impl.h

@@ -16,6 +16,14 @@
  #include <arm_sve.h>
  #endif // __ARM_FEATURE_SVE
 
+ #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
+ // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+ //
+ // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+ //
+ #include <arm_neon.h>
+ #endif
+
  #if defined(__F16C__)
  #include <immintrin.h>
  #endif
@@ -140,8 +148,14 @@ struct lm_ggml_map_custom2_op_params {
 
  struct lm_ggml_map_custom3_op_params {
  lm_ggml_custom3_op_t fun;
- int n_tasks;
- void * userdata;
+ int n_tasks;
+ void * userdata;
+ };
+
+ struct lm_ggml_custom_op_params {
+ lm_ggml_custom_op_t fun;
+ int n_tasks;
+ void * userdata;
  };
 
  // bitset
@@ -311,13 +325,6 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
  // for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
  //
  #if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
-
- // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
- //
- // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
- //
- #include <arm_neon.h>
-
  #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
  #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
 
ggml.h

@@ -394,8 +394,8 @@ extern "C" {
 
  // precision
  enum lm_ggml_prec {
- LM_GGML_PREC_DEFAULT,
- LM_GGML_PREC_F32,
+ LM_GGML_PREC_DEFAULT = 0, // stored as lm_ggml_tensor.op_params, 0 by default
+ LM_GGML_PREC_F32 = 10,
  };
 
  // model file types
@@ -482,6 +482,7 @@ extern "C" {
  LM_GGML_OP_CONV_TRANSPOSE_1D,
  LM_GGML_OP_IM2COL,
  LM_GGML_OP_IM2COL_BACK,
+ LM_GGML_OP_CONV_2D_DW,
  LM_GGML_OP_CONV_TRANSPOSE_2D,
  LM_GGML_OP_POOL_1D,
  LM_GGML_OP_POOL_2D,
@@ -508,17 +509,12 @@ extern "C" {
 
  LM_GGML_OP_UNARY,
 
- LM_GGML_OP_MAP_UNARY,
- LM_GGML_OP_MAP_BINARY,
-
- LM_GGML_OP_MAP_CUSTOM1_F32,
- LM_GGML_OP_MAP_CUSTOM2_F32,
- LM_GGML_OP_MAP_CUSTOM3_F32,
-
  LM_GGML_OP_MAP_CUSTOM1,
  LM_GGML_OP_MAP_CUSTOM2,
  LM_GGML_OP_MAP_CUSTOM3,
 
+ LM_GGML_OP_CUSTOM,
+
  LM_GGML_OP_CROSS_ENTROPY_LOSS,
  LM_GGML_OP_CROSS_ENTROPY_LOSS_BACK,
  LM_GGML_OP_OPT_STEP_ADAMW,
@@ -683,6 +679,9 @@ extern "C" {
  LM_GGML_API bool lm_ggml_is_contiguous_1(const struct lm_ggml_tensor * tensor); // contiguous for dims >= 1
  LM_GGML_API bool lm_ggml_is_contiguous_2(const struct lm_ggml_tensor * tensor); // contiguous for dims >= 2
 
+ // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
+ LM_GGML_API bool lm_ggml_is_contiguous_channels(const struct lm_ggml_tensor * tensor);
+
  LM_GGML_API bool lm_ggml_are_same_shape (const struct lm_ggml_tensor * t0, const struct lm_ggml_tensor * t1);
  LM_GGML_API bool lm_ggml_are_same_stride(const struct lm_ggml_tensor * t0, const struct lm_ggml_tensor * t1);
 
@@ -1666,7 +1665,7 @@ extern "C" {
  struct lm_ggml_tensor * a,
  struct lm_ggml_tensor * b);
 
- // depthwise
+ // depthwise (via im2col and mul_mat)
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_2d_dw(
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * a, // convolution kernel
@@ -1678,6 +1677,22 @@ extern "C" {
  int d0, // dilation dimension 0
  int d1); // dilation dimension 1
 
+ // Depthwise 2D convolution
+ // may be faster than lm_ggml_conv_2d_dw, but not available in all backends
+ // a: KW KH 1 C convolution kernel
+ // b: W H C N input data
+ // res: W_out H_out C N
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_2d_dw_direct(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * a,
+ struct lm_ggml_tensor * b,
+ int stride0,
+ int stride1,
+ int pad0,
+ int pad1,
+ int dilation0,
+ int dilation1);
+
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_2d_p0(
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * a,
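A minimal sketch of how the new direct depthwise op might be wired into a graph, following the shape comments in the header above (kernel a: KW x KH x 1 x C, input b: W x H x C x N); the helper name, kernel size and parameter values are illustrative, not taken from the package:

    #include "ggml.h"

    // Build a 3x3 depthwise convolution over `input` (layout W x H x C x N).
    static lm_ggml_tensor * build_dw_conv(lm_ggml_context * ctx, lm_ggml_tensor * input) {
        const int C = (int) input->ne[2];
        // one 3x3 filter per channel: KW = 3, KH = 3, 1, C
        lm_ggml_tensor * kernel = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, 3, 3, 1, C);
        // stride 1, padding 1, dilation 1 keeps the spatial size unchanged
        return lm_ggml_conv_2d_dw_direct(ctx, kernel, input,
                                         /*stride0*/ 1, /*stride1*/ 1,
                                         /*pad0*/ 1, /*pad1*/ 1,
                                         /*dilation0*/ 1, /*dilation1*/ 1);
    }

Note the header's own caveat: the direct op may be faster than lm_ggml_conv_2d_dw but is not available in every backend.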
@@ -1723,24 +1738,29 @@ extern "C" {
  float p0,
  float p1);
 
- // nearest interpolate
+ enum lm_ggml_scale_mode {
+ LM_GGML_SCALE_MODE_NEAREST = 0,
+ LM_GGML_SCALE_MODE_BILINEAR = 1,
+ };
+
+ // interpolate
  // multiplies ne0 and ne1 by scale factor
- // used in stable-diffusion
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_upscale(
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * a,
- int scale_factor);
+ int scale_factor,
+ enum lm_ggml_scale_mode mode);
 
- // nearest interpolate
- // nearest interpolate to specified dimensions
- // used in tortoise.cpp
+ // interpolate
+ // interpolate scale to specified dimensions
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_upscale_ext(
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * a,
  int ne0,
  int ne1,
  int ne2,
- int ne3);
+ int ne3,
+ enum lm_ggml_scale_mode mode);
 
  // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_pad(
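Because lm_ggml_upscale and lm_ggml_upscale_ext now take a scale mode, existing call sites gain one argument; LM_GGML_SCALE_MODE_NEAREST reproduces the old behaviour and BILINEAR is the new option. A minimal before/after sketch (helper name invented, only the signatures shown above are assumed):

    #include "ggml.h"

    static lm_ggml_tensor * upscale_2x(lm_ggml_context * ctx, lm_ggml_tensor * a) {
        // before 1.6.1: return lm_ggml_upscale(ctx, a, 2);
        return lm_ggml_upscale(ctx, a, 2, LM_GGML_SCALE_MODE_BILINEAR);
    }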
@@ -1917,83 +1937,6 @@ extern "C" {
 
  // custom operators
 
- typedef void (*lm_ggml_unary_op_f32_t) (const int, float *, const float *);
- typedef void (*lm_ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
- typedef void (*lm_ggml_custom1_op_f32_t)(struct lm_ggml_tensor *, const struct lm_ggml_tensor *);
- typedef void (*lm_ggml_custom2_op_f32_t)(struct lm_ggml_tensor *, const struct lm_ggml_tensor *, const struct lm_ggml_tensor *);
- typedef void (*lm_ggml_custom3_op_f32_t)(struct lm_ggml_tensor *, const struct lm_ggml_tensor *, const struct lm_ggml_tensor *, const struct lm_ggml_tensor *);
-
- LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_unary_f32(
- struct lm_ggml_context * ctx,
- struct lm_ggml_tensor * a,
- lm_ggml_unary_op_f32_t fun),
- "use lm_ggml_map_custom1 instead");
-
- LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_unary_inplace_f32(
- struct lm_ggml_context * ctx,
- struct lm_ggml_tensor * a,
- lm_ggml_unary_op_f32_t fun),
- "use lm_ggml_map_custom1_inplace instead");
-
- LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_binary_f32(
- struct lm_ggml_context * ctx,
- struct lm_ggml_tensor * a,
- struct lm_ggml_tensor * b,
- lm_ggml_binary_op_f32_t fun),
- "use lm_ggml_map_custom2 instead");
-
- LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_binary_inplace_f32(
- struct lm_ggml_context * ctx,
- struct lm_ggml_tensor * a,
- struct lm_ggml_tensor * b,
- lm_ggml_binary_op_f32_t fun),
- "use lm_ggml_map_custom2_inplace instead");
-
- LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom1_f32(
- struct lm_ggml_context * ctx,
- struct lm_ggml_tensor * a,
- lm_ggml_custom1_op_f32_t fun),
- "use lm_ggml_map_custom1 instead");
-
- LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom1_inplace_f32(
- struct lm_ggml_context * ctx,
- struct lm_ggml_tensor * a,
- lm_ggml_custom1_op_f32_t fun),
- "use lm_ggml_map_custom1_inplace instead");
-
- LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom2_f32(
- struct lm_ggml_context * ctx,
- struct lm_ggml_tensor * a,
- struct lm_ggml_tensor * b,
- lm_ggml_custom2_op_f32_t fun),
- "use lm_ggml_map_custom2 instead");
-
- LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom2_inplace_f32(
- struct lm_ggml_context * ctx,
- struct lm_ggml_tensor * a,
- struct lm_ggml_tensor * b,
- lm_ggml_custom2_op_f32_t fun),
- "use lm_ggml_map_custom2_inplace instead");
-
- LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom3_f32(
- struct lm_ggml_context * ctx,
- struct lm_ggml_tensor * a,
- struct lm_ggml_tensor * b,
- struct lm_ggml_tensor * c,
- lm_ggml_custom3_op_f32_t fun),
- "use lm_ggml_map_custom3 instead");
-
- LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom3_inplace_f32(
- struct lm_ggml_context * ctx,
- struct lm_ggml_tensor * a,
- struct lm_ggml_tensor * b,
- struct lm_ggml_tensor * c,
- lm_ggml_custom3_op_f32_t fun),
- "use lm_ggml_map_custom3_inplace instead");
-
- // custom operators v2
-
  typedef void (*lm_ggml_custom1_op_t)(struct lm_ggml_tensor * dst , const struct lm_ggml_tensor * a, int ith, int nth, void * userdata);
  typedef void (*lm_ggml_custom2_op_t)(struct lm_ggml_tensor * dst , const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b, int ith, int nth, void * userdata);
  typedef void (*lm_ggml_custom3_op_t)(struct lm_ggml_tensor * dst , const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b, const struct lm_ggml_tensor * c, int ith, int nth, void * userdata);
@@ -2049,6 +1992,30 @@ extern "C" {
  int n_tasks,
  void * userdata);
 
+ typedef void (*lm_ggml_custom_op_t)(struct lm_ggml_tensor * dst , int ith, int nth, void * userdata);
+
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_custom_4d(
+ struct lm_ggml_context * ctx,
+ enum lm_ggml_type type,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3,
+ struct lm_ggml_tensor ** args,
+ int n_args,
+ lm_ggml_custom_op_t fun,
+ int n_tasks,
+ void * userdata);
+
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_custom_inplace(
+ struct lm_ggml_context * ctx,
+ struct lm_ggml_tensor * a,
+ struct lm_ggml_tensor ** args,
+ int n_args,
+ lm_ggml_custom_op_t fun,
+ int n_tasks,
+ void * userdata);
+
  // loss function
 
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_cross_entropy_loss(
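A hedged sketch of the new generic custom op, assuming only the lm_ggml_custom_op_t typedef and the lm_ggml_custom_4d signature shown above; the callback body, names and sizes are illustrative (a real callback would split its work across the nth threads and only runs when the graph is computed):

    #include "ggml.h"

    // Fill the destination tensor with a constant passed through userdata.
    static void fill_op(lm_ggml_tensor * dst, int ith, int nth, void * userdata) {
        (void) ith; (void) nth; // single-task sketch; no work splitting
        const float value = *(const float *) userdata;
        float * out = (float *) dst->data;
        const int64_t n = lm_ggml_nelements(dst);
        for (int64_t i = 0; i < n; ++i) {
            out[i] = value;
        }
    }

    static lm_ggml_tensor * build_filled(lm_ggml_context * ctx, float * value) {
        struct lm_ggml_tensor * args[1] = { NULL }; // no tensor inputs in this sketch
        return lm_ggml_custom_4d(ctx, LM_GGML_TYPE_F32, 8, 8, 1, 1,
                                 args, /*n_args*/ 0, fill_op, /*n_tasks*/ 1, value);
    }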
llama-arch.h

@@ -23,6 +23,7 @@ enum llm_arch {
  LLM_ARCH_REFACT,
  LLM_ARCH_BERT,
  LLM_ARCH_NOMIC_BERT,
+ LLM_ARCH_NOMIC_BERT_MOE,
  LLM_ARCH_JINA_BERT_V2,
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
@@ -58,6 +59,7 @@ enum llm_arch {
  LLM_ARCH_DEEPSEEK,
  LLM_ARCH_DEEPSEEK2,
  LLM_ARCH_CHATGLM,
+ LLM_ARCH_GLM4,
  LLM_ARCH_BITNET,
  LLM_ARCH_T5,
  LLM_ARCH_T5ENCODER,
@@ -109,6 +111,7 @@ enum llm_kv {
  LLM_KV_EXPERT_WEIGHTS_SCALE,
  LLM_KV_EXPERT_WEIGHTS_NORM,
  LLM_KV_EXPERT_GATING_FUNC,
+ LLM_KV_MOE_EVERY_N_LAYERS,
  LLM_KV_POOLING_TYPE,
  LLM_KV_LOGIT_SCALE,
  LLM_KV_DECODER_START_TOKEN_ID,
@@ -143,6 +146,8 @@ enum llm_kv {
  LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
  LLM_KV_ATTENTION_SLIDING_WINDOW,
  LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_KEY_LENGTH_MLA,
+ LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
  LLM_KV_ROPE_DIMENSION_COUNT,
  LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -256,6 +261,8 @@ enum llm_tensor {
  LLM_TENSOR_ATTN_Q_NORM,
  LLM_TENSOR_ATTN_K_NORM,
  LLM_TENSOR_LAYER_OUT_NORM,
+ LLM_TENSOR_POST_ATTN_NORM,
+ LLM_TENSOR_POST_MLP_NORM,
  LLM_TENSOR_SSM_IN,
  LLM_TENSOR_SSM_CONV1D,
  LLM_TENSOR_SSM_X,
@@ -303,6 +310,8 @@ enum llm_tensor {
  LLM_TENSOR_ATTN_Q_B,
  LLM_TENSOR_ATTN_KV_A_MQA,
  LLM_TENSOR_ATTN_KV_B,
+ LLM_TENSOR_ATTN_K_B,
+ LLM_TENSOR_ATTN_V_B,
  LLM_TENSOR_ATTN_Q_A_NORM,
  LLM_TENSOR_ATTN_KV_A_NORM,
  LLM_TENSOR_ATTN_SUB_NORM,
llama-batch.h

@@ -70,7 +70,8 @@ struct llama_sbatch {
  // sequence-wise split
  llama_ubatch split_seq(size_t n_ubatch);
 
- void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+ llama_sbatch() = default;
+ llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
  };
 
  // temporary allocate memory for the input batch if needed
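The practical effect of this hunk is that an sbatch is now constructed directly from a llama_batch instead of being populated through from_batch(). A before/after sketch of the internal C++ call, shown only to illustrate the signature change (argument values are illustrative; the defaults are as declared above):

    #include "llama-batch.h"

    static llama_sbatch make_sbatch(const llama_batch & batch, size_t n_embd) {
        // before 1.6.1:
        //   llama_sbatch sbatch;
        //   sbatch.from_batch(batch, n_embd, /*simple_split*/ true, /*logits_all*/ false);
        // from 1.6.1:
        return llama_sbatch(batch, n_embd, /*simple_split*/ true, /*logits_all*/ false);
    }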
llama-chat.h

@@ -29,8 +29,8 @@ enum llm_chat_template {
  LLM_CHAT_TEMPLATE_DEEPSEEK_3,
  LLM_CHAT_TEMPLATE_COMMAND_R,
  LLM_CHAT_TEMPLATE_LLAMA_3,
- LLM_CHAT_TEMPLATE_CHATGML_3,
- LLM_CHAT_TEMPLATE_CHATGML_4,
+ LLM_CHAT_TEMPLATE_CHATGLM_3,
+ LLM_CHAT_TEMPLATE_CHATGLM_4,
  LLM_CHAT_TEMPLATE_GLMEDGE,
  LLM_CHAT_TEMPLATE_MINICPM,
  LLM_CHAT_TEMPLATE_EXAONE_3,
@@ -41,6 +41,7 @@ enum llm_chat_template {
  LLM_CHAT_TEMPLATE_YANDEX,
  LLM_CHAT_TEMPLATE_BAILING,
  LLM_CHAT_TEMPLATE_LLAMA4,
+ LLM_CHAT_TEMPLATE_SMOLVLM,
  LLM_CHAT_TEMPLATE_UNKNOWN,
  };
 
llama-context.h

@@ -27,7 +27,12 @@ struct llama_context {
 
  void synchronize();
 
- const llama_model & get_model() const;
+ const llama_model & get_model() const;
+ const llama_cparams & get_cparams() const;
+
+ lm_ggml_backend_sched_t get_sched() const;
+
+ lm_ggml_context * get_ctx_compute() const;
 
  uint32_t n_ctx() const;
  uint32_t n_ctx_per_seq() const;
@@ -137,50 +142,30 @@ private:
  // Returns max number of outputs for which space was reserved.
  int32_t output_reserve(int32_t n_outputs);
 
- // make the outputs have the same order they had in the user-provided batch
- // TODO: maybe remove this
- void output_reorder();
-
  //
  // graph
  //
 
+ public:
  int32_t graph_max_nodes() const;
 
  // zero-out inputs and create the ctx_compute for the compute graph
  lm_ggml_cgraph * graph_init();
 
+ // returns the result of lm_ggml_backend_sched_graph_compute_async execution
+ lm_ggml_status graph_compute(
+ lm_ggml_cgraph * gf,
+ bool batched);
+
+ private:
  llm_graph_result_ptr graph_build(
  lm_ggml_context * ctx,
  lm_ggml_cgraph * gf,
  const llama_ubatch & ubatch,
  llm_graph_type gtype);
 
- // returns the result of lm_ggml_backend_sched_graph_compute_async execution
- lm_ggml_status graph_compute(
- lm_ggml_cgraph * gf,
- bool batched);
-
  llm_graph_cb graph_get_cb() const;
 
- // used by kv_self_update()
- lm_ggml_tensor * build_rope_shift(
- lm_ggml_context * ctx0,
- lm_ggml_tensor * cur,
- lm_ggml_tensor * shift,
- lm_ggml_tensor * factors,
- float freq_base,
- float freq_scale,
- lm_ggml_backend_buffer * bbuf) const;
-
- llm_graph_result_ptr build_kv_self_shift(
- lm_ggml_context * ctx0,
- lm_ggml_cgraph * gf) const;
-
- llm_graph_result_ptr build_kv_self_defrag(
- lm_ggml_context * ctx0,
- lm_ggml_cgraph * gf) const;
-
  // TODO: read/write lora adapters and cvec
  size_t state_write_data(llama_io_write_i & io);
  size_t state_read_data (llama_io_read_i & io);
@@ -197,11 +182,10 @@ private:
  llama_cparams cparams;
  llama_adapter_cvec cvec;
  llama_adapter_loras loras;
- llama_sbatch sbatch;
 
  llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably
 
- std::unique_ptr<llama_kv_cache_unified> kv_self;
+ std::unique_ptr<llama_memory_i> memory;
 
  // TODO: remove
  bool logits_all = false;
llama-graph.h

@@ -19,6 +19,7 @@ struct llama_cparams;
 
  class llama_memory_i;
  class llama_kv_cache_unified;
+ class llama_kv_cache_recurrent;
 
  // certain models (typically multi-modal) can produce different types of graphs
  enum llm_graph_type {
@@ -90,29 +91,27 @@ public:
 
  class llm_graph_input_pos : public llm_graph_input_i {
  public:
- llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
+ llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
  virtual ~llm_graph_input_pos() = default;
 
  void set_input(const llama_ubatch * ubatch) override;
 
  lm_ggml_tensor * pos = nullptr; // I32 [n_batch]
 
- const int64_t n_pos_per_token = 1;
+ const int64_t n_pos_per_embd = 1;
  };
 
  // temperature tuning, used by llama4
  class llm_graph_input_attn_temp : public llm_graph_input_i {
  public:
- llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
- : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+ llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+ : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
  virtual ~llm_graph_input_attn_temp() = default;
 
  void set_input(const llama_ubatch * ubatch) override;
 
  lm_ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
 
- const int64_t n_pos_per_token = 1;
-
  const uint32_t n_attn_temp_floor_scale;
  const float f_attn_temp_scale;
  };
@@ -188,26 +187,26 @@ public:
 
  class llm_graph_input_s_copy : public llm_graph_input_i {
  public:
- llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+ llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
  virtual ~llm_graph_input_s_copy() = default;
 
  void set_input(const llama_ubatch * ubatch) override;
 
  lm_ggml_tensor * s_copy; // I32 [kv_size]
 
- const llama_kv_cache_unified * kv_self;
+ const llama_kv_cache_recurrent * kv_self;
  };
 
  class llm_graph_input_s_mask : public llm_graph_input_i {
  public:
- llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+ llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
  virtual ~llm_graph_input_s_mask() = default;
 
  void set_input(const llama_ubatch * ubatch) override;
 
  lm_ggml_tensor * s_mask; // F32 [1, n_kv]
 
- const llama_kv_cache_unified * kv_self;
+ const llama_kv_cache_recurrent * kv_self;
  };
 
  class llm_graph_input_cross_embd : public llm_graph_input_i {
@@ -352,8 +351,8 @@ struct llm_graph_params {
  const llama_cparams & cparams;
  const llama_ubatch & ubatch;
 
- lm_ggml_backend_sched * sched;
- lm_ggml_backend * backend_cpu;
+ lm_ggml_backend_sched_t sched;
+ lm_ggml_backend_t backend_cpu;
 
  const llama_adapter_cvec * cvec;
  const llama_adapter_loras * loras;
@@ -404,9 +403,9 @@ struct llm_graph_context {
 
  lm_ggml_context * ctx0 = nullptr;
 
- lm_ggml_backend_sched * sched;
+ lm_ggml_backend_sched_t sched;
 
- lm_ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
+ lm_ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
 
  const llama_adapter_cvec * cvec;
  const llama_adapter_loras * loras;
@@ -419,7 +418,7 @@ struct llm_graph_context {
 
  llm_graph_context(const llm_graph_params & params);
 
- int64_t n_pos_per_token() const;
+ int64_t n_pos_per_embd() const;
 
  void cb(lm_ggml_tensor * cur, const char * name, int il) const;
 
@@ -505,11 +504,12 @@ struct llm_graph_context {
 
  lm_ggml_tensor * build_attn_mha(
  lm_ggml_cgraph * gf,
- lm_ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q]
- lm_ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k]
- lm_ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
+ lm_ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q]
+ lm_ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k]
+ lm_ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
  lm_ggml_tensor * kq_b,
  lm_ggml_tensor * kq_mask,
+ lm_ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
  bool v_trans,
  float kq_scale) const;
 
@@ -524,6 +524,7 @@ struct llm_graph_context {
  lm_ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
  lm_ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
  lm_ggml_tensor * kq_b,
+ lm_ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
  float kq_scale,
  int il) const;
 
@@ -538,6 +539,7 @@ struct llm_graph_context {
  lm_ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
  lm_ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
  lm_ggml_tensor * kq_b,
+ lm_ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
  float kq_scale,
  int il) const;
 
@@ -552,6 +554,7 @@ struct llm_graph_context {
  lm_ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
  lm_ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
  lm_ggml_tensor * kq_b,
+ lm_ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
  float kq_scale,
  int il) const;
 
llama-hparams.h

@@ -43,6 +43,10 @@ struct llama_hparams {
  uint32_t n_expert_used = 0;
  uint32_t n_rel_attn_bkts = 0;
 
+ // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
+ uint32_t n_embd_head_k_mla = 0;
+ uint32_t n_embd_head_v_mla = 0;
+
  // for WavTokenizer
  struct llama_hparams_posnet posnet;
  struct llama_hparams_convnext convnext;
@@ -62,6 +66,7 @@ struct llama_hparams {
  float expert_weights_scale = 0.0;
  bool expert_weights_norm = false;
  uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+ uint32_t moe_every_n_layers = 0;
 
  float f_norm_eps;
  float f_norm_rms_eps;