cui-llama.rn 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (285)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +22 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +42 -6
  4. package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
  5. package/android/src/main/jni.cpp +173 -18
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
  16. package/cpp/LICENSE +21 -0
  17. package/cpp/chat.cpp +129 -107
  18. package/cpp/chat.h +2 -0
  19. package/cpp/common.cpp +58 -78
  20. package/cpp/common.h +29 -21
  21. package/cpp/ggml-alloc.c +4 -1
  22. package/cpp/ggml-backend.cpp +9 -5
  23. package/cpp/ggml-backend.h +4 -4
  24. package/cpp/ggml-cpp.h +1 -1
  25. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  26. package/cpp/ggml-cpu/amx/amx.h +8 -0
  27. package/cpp/ggml-cpu/amx/common.h +91 -0
  28. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  29. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  31. package/cpp/ggml-cpu/common.h +72 -0
  32. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -103
  33. package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +306 -6
  34. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +114 -55
  35. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +32 -16
  36. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +353 -173
  37. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  38. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  39. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  40. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  41. package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -6
  42. package/{ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/vec.h +16 -0
  43. package/cpp/ggml-cpu.h +5 -0
  44. package/cpp/ggml-impl.h +16 -9
  45. package/cpp/ggml-llama-sim.metallib +0 -0
  46. package/cpp/ggml-llama.metallib +0 -0
  47. package/cpp/ggml-metal-impl.h +36 -11
  48. package/cpp/ggml-metal.m +810 -176
  49. package/cpp/ggml-opt.cpp +373 -190
  50. package/cpp/ggml-opt.h +49 -28
  51. package/cpp/ggml-quants.c +0 -6
  52. package/cpp/ggml.c +227 -282
  53. package/cpp/ggml.h +82 -101
  54. package/cpp/gguf.cpp +33 -33
  55. package/cpp/json-schema-to-grammar.cpp +3 -0
  56. package/cpp/llama-adapter.cpp +6 -0
  57. package/cpp/llama-arch.cpp +49 -17
  58. package/cpp/llama-arch.h +9 -0
  59. package/cpp/llama-batch.cpp +8 -2
  60. package/cpp/llama-batch.h +2 -1
  61. package/cpp/llama-chat.cpp +39 -16
  62. package/cpp/llama-chat.h +4 -2
  63. package/cpp/llama-context.cpp +440 -611
  64. package/cpp/llama-context.h +44 -33
  65. package/cpp/llama-cparams.h +1 -0
  66. package/cpp/llama-graph.cpp +214 -291
  67. package/cpp/llama-graph.h +69 -21
  68. package/cpp/llama-hparams.cpp +17 -1
  69. package/cpp/llama-hparams.h +39 -5
  70. package/cpp/llama-kv-cache.cpp +2067 -620
  71. package/cpp/llama-kv-cache.h +410 -108
  72. package/cpp/llama-memory.h +12 -1
  73. package/cpp/llama-model-loader.cpp +24 -15
  74. package/cpp/llama-model-saver.cpp +281 -0
  75. package/cpp/llama-model-saver.h +37 -0
  76. package/cpp/llama-model.cpp +1089 -359
  77. package/cpp/llama-model.h +19 -3
  78. package/cpp/llama-sampling.cpp +20 -7
  79. package/cpp/llama-vocab.cpp +54 -9
  80. package/cpp/llama-vocab.h +6 -0
  81. package/cpp/llama.cpp +14 -0
  82. package/cpp/llama.h +86 -142
  83. package/cpp/minja/chat-template.hpp +9 -5
  84. package/cpp/minja/minja.hpp +69 -36
  85. package/cpp/rn-llama.cpp +602 -190
  86. package/cpp/rn-llama.h +34 -8
  87. package/cpp/sampling.cpp +57 -50
  88. package/cpp/tools/mtmd/clip-impl.h +462 -0
  89. package/cpp/tools/mtmd/clip.cpp +4024 -0
  90. package/cpp/tools/mtmd/clip.h +101 -0
  91. package/cpp/tools/mtmd/miniaudio.h +93468 -0
  92. package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  93. package/cpp/tools/mtmd/mtmd-audio.h +62 -0
  94. package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
  95. package/cpp/tools/mtmd/mtmd.cpp +942 -0
  96. package/cpp/tools/mtmd/mtmd.h +362 -0
  97. package/cpp/tools/mtmd/stb_image.h +7988 -0
  98. package/ios/CMakeLists.txt +20 -10
  99. package/ios/RNLlama.h +6 -0
  100. package/ios/RNLlama.mm +82 -3
  101. package/ios/RNLlamaContext.h +5 -1
  102. package/ios/RNLlamaContext.mm +131 -38
  103. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
  104. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +29 -21
  105. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  106. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  107. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  108. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  109. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  110. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  111. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +82 -101
  112. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  113. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  114. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  115. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  116. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  117. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  118. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  119. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  120. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  121. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  122. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +86 -142
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  131. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  132. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  133. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  134. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  135. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  136. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  137. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  138. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  139. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  140. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  141. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  142. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  143. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  144. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  145. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  146. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  147. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  148. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  149. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  150. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  151. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  152. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  153. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  154. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  155. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  156. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  160. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
  161. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +29 -21
  162. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  163. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  164. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +82 -101
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  175. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  176. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  177. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  178. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  179. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  180. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  181. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +86 -142
  182. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  183. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  184. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  185. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  186. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  187. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  188. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  189. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  190. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  191. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  192. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  193. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  194. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  195. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  196. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  197. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  198. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  199. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  200. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  201. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  202. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  203. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  204. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  205. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  206. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  207. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  208. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  209. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  210. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  211. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  212. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  213. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  214. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  215. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  216. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  217. package/jest/mock.js +33 -7
  218. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  219. package/lib/commonjs/index.js +153 -21
  220. package/lib/commonjs/index.js.map +1 -1
  221. package/lib/module/NativeRNLlama.js.map +1 -1
  222. package/lib/module/index.js +152 -20
  223. package/lib/module/index.js.map +1 -1
  224. package/lib/typescript/NativeRNLlama.d.ts +54 -4
  225. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  226. package/lib/typescript/index.d.ts +72 -6
  227. package/lib/typescript/index.d.ts.map +1 -1
  228. package/package.json +1 -1
  229. package/src/NativeRNLlama.ts +72 -4
  230. package/src/index.ts +212 -38
  231. package/cpp/binary-ops.h +0 -16
  232. package/cpp/ops.h +0 -128
  233. package/cpp/simd-mappings.h +0 -888
  234. package/cpp/unary-ops.h +0 -28
  235. package/cpp/vec.h +0 -802
  236. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  237. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  238. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  239. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  240. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  241. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  242. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  243. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  244. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  245. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  246. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  247. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  248. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  249. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  250. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  251. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  252. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  253. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  254. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  255. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  256. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  257. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  258. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  259. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  260. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  261. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  262. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  263. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  264. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  265. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  266. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  267. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  268. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  269. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  270. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  271. package/lib/commonjs/chat.js +0 -37
  272. package/lib/commonjs/chat.js.map +0 -1
  273. package/lib/module/chat.js +0 -33
  274. package/lib/module/chat.js.map +0 -1
  275. package/lib/typescript/chat.d.ts +0 -10
  276. package/lib/typescript/chat.d.ts.map +0 -1
  277. package/src/chat.ts +0 -44
  278. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  279. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  280. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  281. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  282. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  283. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  284. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  285. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
package/cpp/llama-arch.h
@@ -23,6 +23,7 @@ enum llm_arch {
  LLM_ARCH_REFACT,
  LLM_ARCH_BERT,
  LLM_ARCH_NOMIC_BERT,
+ LLM_ARCH_NOMIC_BERT_MOE,
  LLM_ARCH_JINA_BERT_V2,
  LLM_ARCH_BLOOM,
  LLM_ARCH_STABLELM,
@@ -58,6 +59,7 @@ enum llm_arch {
  LLM_ARCH_DEEPSEEK,
  LLM_ARCH_DEEPSEEK2,
  LLM_ARCH_CHATGLM,
+ LLM_ARCH_GLM4,
  LLM_ARCH_BITNET,
  LLM_ARCH_T5,
  LLM_ARCH_T5ENCODER,
@@ -109,6 +111,7 @@ enum llm_kv {
  LLM_KV_EXPERT_WEIGHTS_SCALE,
  LLM_KV_EXPERT_WEIGHTS_NORM,
  LLM_KV_EXPERT_GATING_FUNC,
+ LLM_KV_MOE_EVERY_N_LAYERS,
  LLM_KV_POOLING_TYPE,
  LLM_KV_LOGIT_SCALE,
  LLM_KV_DECODER_START_TOKEN_ID,
@@ -143,6 +146,8 @@ enum llm_kv {
  LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
  LLM_KV_ATTENTION_SLIDING_WINDOW,
  LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_KEY_LENGTH_MLA,
+ LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

  LLM_KV_ROPE_DIMENSION_COUNT,
  LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -256,6 +261,8 @@ enum llm_tensor {
  LLM_TENSOR_ATTN_Q_NORM,
  LLM_TENSOR_ATTN_K_NORM,
  LLM_TENSOR_LAYER_OUT_NORM,
+ LLM_TENSOR_POST_ATTN_NORM,
+ LLM_TENSOR_POST_MLP_NORM,
  LLM_TENSOR_SSM_IN,
  LLM_TENSOR_SSM_CONV1D,
  LLM_TENSOR_SSM_X,
@@ -303,6 +310,8 @@ enum llm_tensor {
  LLM_TENSOR_ATTN_Q_B,
  LLM_TENSOR_ATTN_KV_A_MQA,
  LLM_TENSOR_ATTN_KV_B,
+ LLM_TENSOR_ATTN_K_B,
+ LLM_TENSOR_ATTN_V_B,
  LLM_TENSOR_ATTN_Q_A_NORM,
  LLM_TENSOR_ATTN_KV_A_NORM,
  LLM_TENSOR_ATTN_SUB_NORM,
package/cpp/llama-batch.h
@@ -70,7 +70,8 @@ struct llama_sbatch {
  // sequence-wise split
  llama_ubatch split_seq(size_t n_ubatch);

- void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+ llama_sbatch() = default;
+ llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
  };

  // temporary allocate memory for the input batch if needed
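Note: the former two-step from_batch() initialization becomes a constructor, with a default constructor kept so an empty llama_sbatch can still be declared. A hypothetical call-site sketch (the batch and n_embd values are placeholders, not code from this package):

    // 1.6.0: llama_sbatch sbatch; sbatch.from_batch(batch, n_embd, /*simple_split=*/true);
    // 1.7.0: construct directly with the same arguments
    llama_sbatch sbatch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);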
package/cpp/llama-chat.h
@@ -14,6 +14,7 @@ enum llm_chat_template {
  LLM_CHAT_TEMPLATE_MISTRAL_V3,
  LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
  LLM_CHAT_TEMPLATE_MISTRAL_V7,
+ LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
  LLM_CHAT_TEMPLATE_PHI_3,
  LLM_CHAT_TEMPLATE_PHI_4,
  LLM_CHAT_TEMPLATE_FALCON_3,
@@ -29,8 +30,8 @@ enum llm_chat_template {
  LLM_CHAT_TEMPLATE_DEEPSEEK_3,
  LLM_CHAT_TEMPLATE_COMMAND_R,
  LLM_CHAT_TEMPLATE_LLAMA_3,
- LLM_CHAT_TEMPLATE_CHATGML_3,
- LLM_CHAT_TEMPLATE_CHATGML_4,
+ LLM_CHAT_TEMPLATE_CHATGLM_3,
+ LLM_CHAT_TEMPLATE_CHATGLM_4,
  LLM_CHAT_TEMPLATE_GLMEDGE,
  LLM_CHAT_TEMPLATE_MINICPM,
  LLM_CHAT_TEMPLATE_EXAONE_3,
@@ -41,6 +42,7 @@ enum llm_chat_template {
  LLM_CHAT_TEMPLATE_YANDEX,
  LLM_CHAT_TEMPLATE_BAILING,
  LLM_CHAT_TEMPLATE_LLAMA4,
+ LLM_CHAT_TEMPLATE_SMOLVLM,
  LLM_CHAT_TEMPLATE_UNKNOWN,
  };

package/cpp/llama-context.h
@@ -7,6 +7,7 @@
  #include "llama-adapter.h"

  #include "ggml-cpp.h"
+ #include "ggml-opt.h"

  #include <map>
  #include <vector>
@@ -27,7 +28,12 @@ struct llama_context {

  void synchronize();

- const llama_model & get_model() const;
+ const llama_model & get_model() const;
+ const llama_cparams & get_cparams() const;
+
+ lm_ggml_backend_sched_t get_sched() const;
+
+ lm_ggml_context * get_ctx_compute() const;

  uint32_t n_ctx() const;
  uint32_t n_ctx_per_seq() const;
@@ -128,6 +134,32 @@ struct llama_context {
  llama_perf_context_data perf_get_data() const;
  void perf_reset();

+ //
+ // training
+ //
+
+ void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
+
+ void opt_epoch(
+     lm_ggml_opt_dataset_t dataset,
+     lm_ggml_opt_result_t result_train,
+     lm_ggml_opt_result_t result_eval,
+     int64_t idata_split,
+     lm_ggml_opt_epoch_callback callback_train,
+     lm_ggml_opt_epoch_callback callback_eval);
+
+ void opt_epoch_iter(
+     lm_ggml_opt_dataset_t dataset,
+     lm_ggml_opt_result_t result,
+     const std::vector<llama_token> & tokens,
+     const std::vector<llama_token> & labels_sparse,
+     llama_batch & batch,
+     lm_ggml_opt_epoch_callback callback,
+     bool train,
+     int64_t idata_in_loop,
+     int64_t ndata_in_loop,
+     int64_t t_loop_start);
+
  private:
  //
  // output
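llama_context thus gains a training entry point backed by ggml-opt. A minimal hypothetical usage sketch based only on the declarations above (the model, lopt_params, dataset, result, and callback values are assumed to be prepared through the ggml-opt API; this is not code shipped in the package):

    ctx->opt_init(model, lopt_params);              // allocate optimizer state for the model
    ctx->opt_epoch(dataset,                         // tokenized training data
                   result_train, result_eval,       // accumulated loss/metrics per split
                   idata_split,                     // index separating train from eval batches
                   callback_train, callback_eval);  // per-batch progress callbacks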
@@ -137,50 +169,30 @@ private:
  // Returns max number of outputs for which space was reserved.
  int32_t output_reserve(int32_t n_outputs);

- // make the outputs have the same order they had in the user-provided batch
- // TODO: maybe remove this
- void output_reorder();
-
  //
  // graph
  //

+ public:
  int32_t graph_max_nodes() const;

  // zero-out inputs and create the ctx_compute for the compute graph
  lm_ggml_cgraph * graph_init();

+ // returns the result of lm_ggml_backend_sched_graph_compute_async execution
+ lm_ggml_status graph_compute(
+     lm_ggml_cgraph * gf,
+     bool batched);
+
+ private:
  llm_graph_result_ptr graph_build(
      lm_ggml_context * ctx,
      lm_ggml_cgraph * gf,
      const llama_ubatch & ubatch,
      llm_graph_type gtype);

- // returns the result of lm_ggml_backend_sched_graph_compute_async execution
- lm_ggml_status graph_compute(
-     lm_ggml_cgraph * gf,
-     bool batched);
-
  llm_graph_cb graph_get_cb() const;

- // used by kv_self_update()
- lm_ggml_tensor * build_rope_shift(
-     lm_ggml_context * ctx0,
-     lm_ggml_tensor * cur,
-     lm_ggml_tensor * shift,
-     lm_ggml_tensor * factors,
-     float freq_base,
-     float freq_scale,
-     lm_ggml_backend_buffer * bbuf) const;
-
- llm_graph_result_ptr build_kv_self_shift(
-     lm_ggml_context * ctx0,
-     lm_ggml_cgraph * gf) const;
-
- llm_graph_result_ptr build_kv_self_defrag(
-     lm_ggml_context * ctx0,
-     lm_ggml_cgraph * gf) const;
-
  // TODO: read/write lora adapters and cvec
  size_t state_write_data(llama_io_write_i & io);
  size_t state_read_data (llama_io_read_i & io);
@@ -197,14 +209,10 @@ private:
  llama_cparams cparams;
  llama_adapter_cvec cvec;
  llama_adapter_loras loras;
- llama_sbatch sbatch;

  llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably

- std::unique_ptr<llama_kv_cache_unified> kv_self;
-
- // TODO: remove
- bool logits_all = false;
+ std::unique_ptr<llama_memory_i> memory;

  // decode output (2-dimensional array: [n_outputs][n_vocab])
  size_t logits_size = 0; // capacity (of floats) for logits
@@ -231,6 +239,9 @@ private:

  lm_ggml_context_ptr ctx_compute;

+ // training
+ lm_ggml_opt_context_t opt_ctx = nullptr;
+
  lm_ggml_threadpool_t threadpool = nullptr;
  lm_ggml_threadpool_t threadpool_batch = nullptr;

package/cpp/llama-cparams.h
@@ -30,6 +30,7 @@ struct llama_cparams {
  bool flash_attn;
  bool no_perf;
  bool warmup;
+ bool op_offload;

  enum llama_pooling_type pooling_type;

package/cpp/llama-graph.h
@@ -19,6 +19,8 @@ struct llama_cparams;

  class llama_memory_i;
  class llama_kv_cache_unified;
+ class llama_kv_cache_unified_iswa;
+ class llama_kv_cache_recurrent;

  // certain models (typically multi-modal) can produce different types of graphs
  enum llm_graph_type {
@@ -90,29 +92,27 @@ public:

  class llm_graph_input_pos : public llm_graph_input_i {
  public:
- llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
+ llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
  virtual ~llm_graph_input_pos() = default;

  void set_input(const llama_ubatch * ubatch) override;

  lm_ggml_tensor * pos = nullptr; // I32 [n_batch]

- const int64_t n_pos_per_token = 1;
+ const int64_t n_pos_per_embd = 1;
  };

  // temperature tuning, used by llama4
  class llm_graph_input_attn_temp : public llm_graph_input_i {
  public:
- llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-     : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+ llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+     : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
  virtual ~llm_graph_input_attn_temp() = default;

  void set_input(const llama_ubatch * ubatch) override;

  lm_ggml_tensor * attn_scale = nullptr; // F32 [n_batch]

- const int64_t n_pos_per_token = 1;
-
  const uint32_t n_attn_temp_floor_scale;
  const float f_attn_temp_scale;
  };
@@ -188,26 +188,26 @@ public:

  class llm_graph_input_s_copy : public llm_graph_input_i {
  public:
- llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+ llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
  virtual ~llm_graph_input_s_copy() = default;

  void set_input(const llama_ubatch * ubatch) override;

  lm_ggml_tensor * s_copy; // I32 [kv_size]

- const llama_kv_cache_unified * kv_self;
+ const llama_kv_cache_recurrent * kv_self;
  };

  class llm_graph_input_s_mask : public llm_graph_input_i {
  public:
- llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
+ llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
  virtual ~llm_graph_input_s_mask() = default;

  void set_input(const llama_ubatch * ubatch) override;

  lm_ggml_tensor * s_mask; // F32 [1, n_kv]

- const llama_kv_cache_unified * kv_self;
+ const llama_kv_cache_recurrent * kv_self;
  };

  class llm_graph_input_cross_embd : public llm_graph_input_i {
@@ -256,6 +256,31 @@ public:

  void set_input(const llama_ubatch * ubatch) override;

+ lm_ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+ lm_ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch]
+ lm_ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch]
+
+ const llama_hparams & hparams;
+ const llama_cparams & cparams;
+
+ const llama_kv_cache_unified * kv_self;
+ };
+
+ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
+ public:
+ llm_graph_input_attn_kv_unified_iswa(
+         const llama_hparams & hparams,
+         const llama_cparams & cparams,
+         const llama_kv_cache_unified_iswa * kv_self) :
+     hparams(hparams),
+     cparams(cparams),
+     kv_self(kv_self) {
+ }
+ ~llm_graph_input_attn_kv_unified_iswa() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
  lm_ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
  lm_ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }

@@ -267,7 +292,7 @@ public:
  const llama_hparams & hparams;
  const llama_cparams & cparams;

- const llama_kv_cache_unified * kv_self;
+ const llama_kv_cache_unified_iswa * kv_self;
  };

  class llm_graph_input_attn_cross : public llm_graph_input_i {
@@ -299,6 +324,7 @@ class llm_graph_result_i {
  public:
  virtual ~llm_graph_result_i() = default;

+ virtual lm_ggml_tensor * get_tokens() = 0;
  virtual lm_ggml_tensor * get_logits() = 0;
  virtual lm_ggml_tensor * get_embd() = 0;
  virtual lm_ggml_tensor * get_embd_pooled() = 0;
@@ -313,6 +339,7 @@ class llm_graph_result : public llm_graph_result_i {
  public:
  virtual ~llm_graph_result() = default;

+ lm_ggml_tensor * get_tokens() override { return t_tokens; }
  lm_ggml_tensor * get_logits() override { return t_logits; }
  lm_ggml_tensor * get_embd() override { return t_embd; }
  lm_ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
@@ -329,6 +356,7 @@ public:
  }

  // important graph nodes
+ lm_ggml_tensor * t_tokens = nullptr;
  lm_ggml_tensor * t_logits = nullptr;
  lm_ggml_tensor * t_embd = nullptr;
  lm_ggml_tensor * t_embd_pooled = nullptr;
@@ -352,8 +380,8 @@ struct llm_graph_params {
  const llama_cparams & cparams;
  const llama_ubatch & ubatch;

- lm_ggml_backend_sched * sched;
- lm_ggml_backend * backend_cpu;
+ lm_ggml_backend_sched_t sched;
+ lm_ggml_backend_t backend_cpu;

  const llama_adapter_cvec * cvec;
  const llama_adapter_loras * loras;
@@ -376,7 +404,6 @@ struct llm_graph_context {
  const int64_t n_layer;
  const int64_t n_rot;
  const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
- const int64_t n_ctx_per_seq;
  const int64_t n_head;
  const int64_t n_head_kv;
  const int64_t n_embd_head_k;
@@ -404,9 +431,9 @@ struct llm_graph_context {

  lm_ggml_context * ctx0 = nullptr;

- lm_ggml_backend_sched * sched;
+ lm_ggml_backend_sched_t sched;

- lm_ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
+ lm_ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?

  const llama_adapter_cvec * cvec;
  const llama_adapter_loras * loras;
@@ -419,7 +446,7 @@ struct llm_graph_context {

  llm_graph_context(const llm_graph_params & params);

- int64_t n_pos_per_token() const;
+ int64_t n_pos_per_embd() const;

  void cb(lm_ggml_tensor * cur, const char * name, int il) const;

@@ -505,12 +532,12 @@ struct llm_graph_context {

  lm_ggml_tensor * build_attn_mha(
      lm_ggml_cgraph * gf,
-     lm_ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q]
-     lm_ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k]
-     lm_ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
+     lm_ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
+     lm_ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
+     lm_ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
      lm_ggml_tensor * kq_b,
      lm_ggml_tensor * kq_mask,
-     bool v_trans,
+     lm_ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
      float kq_scale) const;

  llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
@@ -524,6 +551,7 @@ struct llm_graph_context {
      lm_ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
      lm_ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
      lm_ggml_tensor * kq_b,
+     lm_ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
      float kq_scale,
      int il) const;

@@ -538,6 +566,22 @@ struct llm_graph_context {
      lm_ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
      lm_ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
      lm_ggml_tensor * kq_b,
+     lm_ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+     float kq_scale,
+     int il) const;
+
+ llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
+
+ lm_ggml_tensor * build_attn(
+     llm_graph_input_attn_kv_unified_iswa * inp,
+     lm_ggml_cgraph * gf,
+     lm_ggml_tensor * wo,
+     lm_ggml_tensor * wo_b,
+     lm_ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+     lm_ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+     lm_ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+     lm_ggml_tensor * kq_b,
+     lm_ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
      float kq_scale,
      int il) const;

@@ -552,6 +596,7 @@ struct llm_graph_context {
      lm_ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
      lm_ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
      lm_ggml_tensor * kq_b,
+     lm_ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
      float kq_scale,
      int il) const;

@@ -590,3 +635,6 @@ struct llm_graph_context {
      lm_ggml_tensor * cls_out,
      lm_ggml_tensor * cls_out_b) const;
  };
+
+ // TODO: better name
+ int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);
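The newly exported llama_relative_position_bucket presumably computes T5-style relative-position buckets for the relative-attention-bias path. For reference, a self-contained sketch of that standard scheme, assuming the conventional max_distance of 128 (an illustration, not the package's code):

    #include <algorithm> // std::min
    #include <cmath>     // floorf, logf
    #include <cstdint>
    #include <cstdlib>   // abs

    int32_t relative_bucket_sketch(int32_t x, int32_t y, uint64_t n_buckets, bool bidirectional) {
        const int64_t max_distance = 128;         // fixed in the common T5 formulation
        if (bidirectional) {
            n_buckets >>= 1;                      // half the buckets per direction
        }
        const int64_t max_exact = n_buckets >> 1; // small distances get one bucket each
        int32_t rel = x - y;
        int32_t bucket = 0;
        if (bidirectional) {
            bucket += (rel > 0) * n_buckets;      // sign selects the direction's bucket range
            rel = abs(rel);
        } else {
            rel = -std::min<int32_t>(rel, 0);     // causal: only past positions matter
        }
        // distances >= max_exact map logarithmically onto the remaining buckets
        int32_t rel_large = max_exact + (int32_t) floorf(
            logf(1.0f * rel / max_exact) / logf(1.0f * max_distance / max_exact) * (n_buckets - max_exact));
        rel_large = std::min<int32_t>(rel_large, (int32_t) n_buckets - 1);
        return bucket + (rel < max_exact ? rel : rel_large);
    }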
package/cpp/llama-hparams.h
@@ -14,6 +14,12 @@ enum llama_expert_gating_func_type {
  LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
  };

+ enum llama_swa_type {
+     LLAMA_SWA_TYPE_NONE = 0,
+     LLAMA_SWA_TYPE_STANDARD = 1,
+     LLAMA_SWA_TYPE_CHUNKED = 2,
+ };
+
  struct llama_hparams_posnet {
  uint32_t n_embd;
  uint32_t n_layer;
@@ -35,14 +41,16 @@ struct llama_hparams {
  uint32_t n_embd_features = 0;
  uint32_t n_layer;
  uint32_t n_rot;
- uint32_t n_swa = 0; // sliding window attention (SWA)
- uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
  uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
  uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
  uint32_t n_expert = 0;
  uint32_t n_expert_used = 0;
  uint32_t n_rel_attn_bkts = 0;

+ // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
+ uint32_t n_embd_head_k_mla = 0;
+ uint32_t n_embd_head_v_mla = 0;
+
  // for WavTokenizer
  struct llama_hparams_posnet posnet;
  struct llama_hparams_convnext convnext;
@@ -62,6 +70,7 @@ struct llama_hparams {
  float expert_weights_scale = 0.0;
  bool expert_weights_norm = false;
  uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+ uint32_t moe_every_n_layers = 0;

  float f_norm_eps;
  float f_norm_rms_eps;
@@ -91,6 +100,15 @@ struct llama_hparams {

  std::array<int, 4> rope_sections;

+ // Sliding Window Attention (SWA)
+ llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+ // the size of the sliding window (0 - no SWA)
+ uint32_t n_swa = 0;
+ // if swa_layers[il] == true, then layer il is SWA
+ // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
+ // by default, all layers are dense
+ std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
+
  // for State Space Models
  uint32_t ssm_d_conv = 0;
  uint32_t ssm_d_inner = 0;
@@ -111,11 +129,10 @@ struct llama_hparams {
  bool causal_attn = true;
  bool use_alibi = false;
  bool attn_soft_cap = false;
+ bool use_kq_norm = true;

+ // llama4
  uint32_t n_moe_layer_step = 0;
- bool use_kq_norm = true;
- uint32_t n_attn_chunk = 0;
- // values below seems to be fixed on llama4
  uint32_t n_no_rope_layer_step = 4;
  uint32_t n_attn_temp_floor_scale = 8192;
  float f_attn_temp_scale = 0.1;
@@ -128,6 +145,23 @@ struct llama_hparams {
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
  enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;

+ // this value n_pattern means that every nth layer is dense (i.e. non-SWA)
+ // note that if n_pattern == 0, all layers are SWA
+ // if n_pattern == 1, all layers are dense
+ // example: n_pattern = 3
+ //   il == 0: swa
+ //   il == 1: swa
+ //   il == 2: dense
+ //   il == 3: swa
+ //   il == 4: swa
+ //   il == 5: dense
+ //   il == 6: swa
+ //   etc ...
+ void set_swa_pattern(uint32_t n_pattern);
+
+ // return true if one of the layers is SWA
+ bool is_swa_any() const;
+
  uint32_t n_head(uint32_t il = 0) const;

  uint32_t n_head_kv(uint32_t il = 0) const;
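The comments above fully pin down the SWA layer pattern. A minimal sketch of how set_swa_pattern could populate swa_layers accordingly (an illustration consistent with the documented semantics, not necessarily the package's implementation):

    void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
        for (uint32_t il = 0; il < n_layer; ++il) {
            // n_pattern == 0: every layer SWA; n_pattern == 1: every layer dense;
            // otherwise layers with il % n_pattern == n_pattern - 1 are dense
            swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
        }
    }

With n_pattern = 3 this marks layers 0 and 1 as SWA and layer 2 as dense, repeating, which matches the example in the comment.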