cui-llama.rn 1.1.2 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/llama.cpp CHANGED
@@ -1,6 +1,5 @@
  #include "llama-impl.h"
  #include "llama-vocab.h"
- #include "llama-grammar.h"
  #include "llama-sampling.h"
 
  #include "unicode.h"
@@ -223,6 +222,7 @@ enum llm_arch {
  LLM_ARCH_JAIS,
  LLM_ARCH_NEMOTRON,
  LLM_ARCH_EXAONE,
+ LLM_ARCH_RWKV6,
  LLM_ARCH_UNKNOWN,
  };
 
@@ -270,6 +270,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_JAIS, "jais" },
  { LLM_ARCH_NEMOTRON, "nemotron" },
  { LLM_ARCH_EXAONE, "exaone" },
+ { LLM_ARCH_RWKV6, "rwkv6" },
  { LLM_ARCH_UNKNOWN, "(unknown)" },
  };
 
@@ -306,6 +307,9 @@ enum llm_kv {
  LLM_KV_DECODER_START_TOKEN_ID,
  LLM_KV_ATTN_LOGIT_SOFTCAPPING,
  LLM_KV_FINAL_LOGIT_SOFTCAPPING,
+ LLM_KV_RESCALE_EVERY_N_LAYERS,
+ LLM_KV_TIME_MIX_EXTRA_DIM,
+ LLM_KV_TIME_DECAY_EXTRA_DIM,
 
  LLM_KV_ATTENTION_HEAD_COUNT,
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -341,6 +345,8 @@ enum llm_kv {
  LLM_KV_SSM_TIME_STEP_RANK,
  LLM_KV_SSM_DT_B_C_RMS,
 
+ LLM_KV_WKV_HEAD_SIZE,
+
  LLM_KV_TOKENIZER_MODEL,
  LLM_KV_TOKENIZER_PRE,
  LLM_KV_TOKENIZER_LIST,
@@ -400,11 +406,14 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
  { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
  { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
- { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+ { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
  { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
  { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
  { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
  { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
+ { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
+ { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
+ { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
 
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -440,6 +449,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
  { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
 
+ { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
+
  { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
  { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
  { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -529,6 +540,29 @@ enum llm_tensor {
  LLM_TENSOR_SSM_A,
  LLM_TENSOR_SSM_D,
  LLM_TENSOR_SSM_OUT,
+ LLM_TENSOR_TIME_MIX_W1,
+ LLM_TENSOR_TIME_MIX_W2,
+ LLM_TENSOR_TIME_MIX_LERP_X,
+ LLM_TENSOR_TIME_MIX_LERP_W,
+ LLM_TENSOR_TIME_MIX_LERP_K,
+ LLM_TENSOR_TIME_MIX_LERP_V,
+ LLM_TENSOR_TIME_MIX_LERP_R,
+ LLM_TENSOR_TIME_MIX_LERP_G,
+ LLM_TENSOR_TIME_MIX_FIRST,
+ LLM_TENSOR_TIME_MIX_DECAY,
+ LLM_TENSOR_TIME_MIX_DECAY_W1,
+ LLM_TENSOR_TIME_MIX_DECAY_W2,
+ LLM_TENSOR_TIME_MIX_KEY,
+ LLM_TENSOR_TIME_MIX_VALUE,
+ LLM_TENSOR_TIME_MIX_RECEPTANCE,
+ LLM_TENSOR_TIME_MIX_GATE,
+ LLM_TENSOR_TIME_MIX_LN,
+ LLM_TENSOR_TIME_MIX_OUTPUT,
+ LLM_TENSOR_CHANNEL_MIX_LERP_K,
+ LLM_TENSOR_CHANNEL_MIX_LERP_R,
+ LLM_TENSOR_CHANNEL_MIX_KEY,
+ LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
+ LLM_TENSOR_CHANNEL_MIX_VALUE,
  LLM_TENSOR_ATTN_Q_A,
  LLM_TENSOR_ATTN_Q_B,
  LLM_TENSOR_ATTN_KV_A_MQA,
@@ -1350,6 +1384,40 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_RWKV6,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
+ { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
+ { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
+ { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
+ { LLM_TENSOR_TIME_MIX_LERP_W, "blk.%d.time_mix_lerp_w" },
+ { LLM_TENSOR_TIME_MIX_LERP_K, "blk.%d.time_mix_lerp_k" },
+ { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
+ { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
+ { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
+ { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
+ { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
+ { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
+ { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
+ { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
+ { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
+ { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
+ { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
+ { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" },
+ { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
+ { LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" },
+ { LLM_TENSOR_CHANNEL_MIX_LERP_R, "blk.%d.channel_mix_lerp_r" },
+ { LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" },
+ { LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" },
+ { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
+ },
+ },
  {
  LLM_ARCH_UNKNOWN,
  {
@@ -2099,6 +2167,10 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buf
  if (host_buffer) {
  buft = lm_ggml_backend_sycl_host_buffer_type();
  }
+ #elif defined(LM_GGML_USE_CANN)
+ if (host_buffer) {
+ buft = lm_ggml_backend_cann_host_buffer_type();
+ }
  #elif defined(LM_GGML_USE_CPU_HBM)
  buft = lm_ggml_backend_cpu_hbm_buffer_type();
  #elif defined(LM_GGML_USE_VULKAN)
@@ -2162,6 +2234,7 @@ enum e_model {
  MODEL_1B,
  MODEL_1_3B,
  MODEL_1_4B,
+ MODEL_1_6B,
  MODEL_2B,
  MODEL_2_8B,
  MODEL_3B,
@@ -2239,6 +2312,12 @@ struct llama_hparams {
  float f_attn_logit_softcapping = 50.0f;
  float f_final_logit_softcapping = 30.0f;
 
+ // for RWKV
+ uint32_t rescale_every_n_layers = 0;
+ uint32_t time_mix_extra_dim = 0;
+ uint32_t time_decay_extra_dim = 0;
+ uint32_t wkv_head_size = 0;
+
  float rope_attn_factor = 1.0f;
  float rope_freq_base_train;
  float rope_freq_scale_train;
@@ -2302,6 +2381,11 @@ struct llama_hparams {
  if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
  if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
 
+ if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
+ if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true;
+ if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
+ if (this->wkv_head_size != other.wkv_head_size) return true;
+
  if (this->dec_start_token_id != other.dec_start_token_id) return true;
 
  const float EPSILON = 1e-9f;
@@ -2365,15 +2449,25 @@ struct llama_hparams {
  }
 
  uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings
- // corresponds to Mamba's conv_states size
- // TODO: maybe support other convolution strides than 1
- // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
- return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
+ // corresponds to Mamba's conv_states size or RWKV's token_shift states size
+ if (wkv_head_size != 0) {
+ // for RWKV models
+ return 2 * n_embd;
+ } else {
+ // TODO: maybe support other convolution strides than 1
+ // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+ return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
+ }
  }
 
  uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
- // corresponds to Mamba's ssm_states size
- return ssm_d_state * ssm_d_inner;
+ if (wkv_head_size != 0) {
+ // corresponds to RWKV's wkv_states size
+ return n_embd * wkv_head_size;
+ } else {
+ // corresponds to Mamba's ssm_states size
+ return ssm_d_state * ssm_d_inner;
+ }
  }
  };
 
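Note on the hunk above: these two accessors decide how much recurrent state the cache reserves per layer, and the new wkv_head_size branch routes RWKV models away from the Mamba sizing. A minimal standalone sketch of that sizing rule follows (illustrative only, not part of the diff; the struct name and the sample sizes are assumptions):

    // Illustrative sketch, not part of the package: mirrors the state-sizing logic above.
    #include <cstdint>
    #include <cstdio>

    struct recurrent_hparams {
        uint32_t n_embd;
        uint32_t wkv_head_size; // non-zero only for RWKV models
        uint32_t ssm_d_conv;
        uint32_t ssm_d_inner;
        uint32_t ssm_d_state;

        uint32_t n_embd_k_s() const {
            // RWKV: two token-shift vectors per layer; Mamba: (d_conv - 1) columns of conv state
            return wkv_head_size != 0 ? 2 * n_embd
                                      : (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
        }
        uint32_t n_embd_v_s() const {
            // RWKV: wkv state of n_embd * head_size; Mamba: ssm state of d_state * d_inner
            return wkv_head_size != 0 ? n_embd * wkv_head_size : ssm_d_state * ssm_d_inner;
        }
    };

    int main() {
        recurrent_hparams rwkv = {2048, 64, 0, 0, 0}; // hypothetical RWKV-style sizes
        std::printf("k_s = %u, v_s = %u\n", (unsigned) rwkv.n_embd_k_s(), (unsigned) rwkv.n_embd_v_s());
        return 0;
    }
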
@@ -2384,8 +2478,8 @@ struct llama_cparams {
  uint32_t n_batch;
  uint32_t n_ubatch;
  uint32_t n_seq_max;
- uint32_t n_threads; // number of threads to use for generation
- uint32_t n_threads_batch; // number of threads to use for batch processing
+ int n_threads; // number of threads to use for generation
+ int n_threads_batch; // number of threads to use for batch processing
 
  float rope_freq_base;
  float rope_freq_scale;
@@ -2403,6 +2497,7 @@ struct llama_cparams {
  bool causal_attn;
  bool offload_kqv;
  bool flash_attn;
+ bool no_perf;
 
  enum llama_pooling_type pooling_type;
 
@@ -2512,6 +2607,36 @@ struct llama_layer {
  struct lm_ggml_tensor * ssm_conv1d_b;
  struct lm_ggml_tensor * ssm_dt_b;
 
+ // rwkv
+ struct lm_ggml_tensor * time_mix_w1;
+ struct lm_ggml_tensor * time_mix_w2;
+ struct lm_ggml_tensor * time_mix_lerp_x;
+ struct lm_ggml_tensor * time_mix_lerp_w;
+ struct lm_ggml_tensor * time_mix_lerp_k;
+ struct lm_ggml_tensor * time_mix_lerp_v;
+ struct lm_ggml_tensor * time_mix_lerp_r;
+ struct lm_ggml_tensor * time_mix_lerp_g;
+
+ struct lm_ggml_tensor * time_mix_first;
+ struct lm_ggml_tensor * time_mix_decay;
+ struct lm_ggml_tensor * time_mix_decay_w1;
+ struct lm_ggml_tensor * time_mix_decay_w2;
+ struct lm_ggml_tensor * time_mix_key;
+ struct lm_ggml_tensor * time_mix_value;
+ struct lm_ggml_tensor * time_mix_receptance;
+ struct lm_ggml_tensor * time_mix_gate;
+
+ struct lm_ggml_tensor * time_mix_ln;
+ struct lm_ggml_tensor * time_mix_ln_b;
+ struct lm_ggml_tensor * time_mix_output;
+
+ struct lm_ggml_tensor * channel_mix_lerp_k;
+ struct lm_ggml_tensor * channel_mix_lerp_r;
+
+ struct lm_ggml_tensor * channel_mix_key;
+ struct lm_ggml_tensor * channel_mix_receptance;
+ struct lm_ggml_tensor * channel_mix_value;
+
  // long rope factors
  struct lm_ggml_tensor * rope_long = nullptr;
  struct lm_ggml_tensor * rope_short = nullptr;
@@ -3069,7 +3194,6 @@ struct llama_sbatch {
  struct llama_context {
  llama_context(const llama_model & model)
  : model(model)
- , sampling(llama_n_vocab(&model))
  , t_start_us(model.t_start_us)
  , t_load_us(model.t_load_us) {}
 
@@ -3086,7 +3210,6 @@ struct llama_context {
  const struct llama_model & model;
 
  struct llama_cparams cparams;
- struct llama_sampling sampling;
  struct llama_sbatch sbatch;
  struct llama_kv_cache kv_self;
  struct llama_control_vector cvec;
@@ -3102,18 +3225,21 @@ struct llama_context {
  #endif
  lm_ggml_backend_t backend_cpu = nullptr;
 
+ lm_ggml_threadpool_t threadpool = nullptr;
+ lm_ggml_threadpool_t threadpool_batch = nullptr;
+
  bool has_evaluated_once = false;
 
- int64_t t_start_us;
- int64_t t_load_us;
- int64_t t_p_eval_us = 0;
- int64_t t_eval_us = 0;
+ mutable int64_t t_start_us;
+ mutable int64_t t_load_us;
+ mutable int64_t t_p_eval_us = 0;
+ mutable int64_t t_eval_us = 0;
 
- int64_t t_compute_start_us = 0;
- int64_t n_queued_tokens = 0;
+ mutable int64_t t_compute_start_us = 0;
+ mutable int64_t n_queued_tokens = 0;
 
- int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
- int32_t n_eval = 0; // number of eval calls
+ mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
+ mutable int32_t n_eval = 0; // number of eval calls
 
  // host buffer for the model output (logits and embeddings)
  lm_ggml_backend_buffer_t buf_output = nullptr;
@@ -3233,29 +3359,33 @@ static size_t llama_get_device_count(const llama_model & model) {
  static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
  lm_ggml_backend_buffer_type_t buft = nullptr;
 
- #if defined(LM_GGML_USE_RPC)
- int dev_count = (int)llama_get_device_count(model);
+ #ifdef LM_GGML_USE_RPC
  int rpc_count = (int)model.rpc_servers.size();
- if (gpu >= dev_count - rpc_count) {
- const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+ #else
+ int rpc_count = 0;
+ #endif
+ int local_gpu = gpu - rpc_count;
+ #if defined(LM_GGML_USE_RPC)
+ if (gpu < rpc_count) {
+ const char * endpoint = model.rpc_servers[gpu].c_str();
  return lm_ggml_backend_rpc_buffer_type(endpoint);
  }
  #endif
  #if defined(LM_GGML_USE_METAL)
  buft = lm_ggml_backend_metal_buffer_type();
  #elif defined(LM_GGML_USE_CUDA)
- buft = lm_ggml_backend_cuda_buffer_type(gpu);
+ buft = lm_ggml_backend_cuda_buffer_type(local_gpu);
  #elif defined(LM_GGML_USE_VULKAN)
- buft = lm_ggml_backend_vk_buffer_type(gpu);
+ buft = lm_ggml_backend_vk_buffer_type(local_gpu);
  #elif defined(LM_GGML_USE_SYCL)
- buft = lm_ggml_backend_sycl_buffer_type(gpu);
+ buft = lm_ggml_backend_sycl_buffer_type(local_gpu);
  #elif defined(LM_GGML_USE_KOMPUTE)
- buft = lm_ggml_backend_kompute_buffer_type(gpu);
+ buft = lm_ggml_backend_kompute_buffer_type(local_gpu);
  if (buft == nullptr) {
- LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+ LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
  }
  #elif defined(LM_GGML_USE_CANN)
- buft = lm_ggml_backend_cann_buffer_type(gpu);
+ buft = lm_ggml_backend_cann_buffer_type(local_gpu);
  #endif
 
  if (buft == nullptr) {
@@ -3263,7 +3393,7 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(const lla
  }
  return buft;
  LM_GGML_UNUSED(model);
- LM_GGML_UNUSED(gpu);
+ LM_GGML_UNUSED(local_gpu);
  }
 
  static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
@@ -3290,13 +3420,17 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama
  }
 
  static size_t llama_get_device_memory(const llama_model & model, int device) {
- #if defined(LM_GGML_USE_RPC)
- int dev_count = (int)llama_get_device_count(model);
+ #ifdef LM_GGML_USE_RPC
  int rpc_count = (int)model.rpc_servers.size();
- if (device >= dev_count - rpc_count) {
+ #else
+ int rpc_count = 0;
+ #endif
+ int local_device = device - rpc_count;
+ #if defined(LM_GGML_USE_RPC)
+ if (device < rpc_count) {
  size_t total;
  size_t free;
- const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+ const char * endpoint = model.rpc_servers[device].c_str();
  lm_ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
  return free;
  }
@@ -3304,28 +3438,28 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
  #if defined(LM_GGML_USE_CUDA)
  size_t total;
  size_t free;
- lm_ggml_backend_cuda_get_device_memory(device, &free, &total);
+ lm_ggml_backend_cuda_get_device_memory(local_device, &free, &total);
  return free;
  #elif defined(LM_GGML_USE_SYCL)
  size_t total;
  size_t free;
- lm_ggml_backend_sycl_get_device_memory(device, &free, &total);
+ lm_ggml_backend_sycl_get_device_memory(local_device, &free, &total);
  return free;
  #elif defined(LM_GGML_USE_VULKAN)
  size_t total;
  size_t free;
- lm_ggml_backend_vk_get_device_memory(device, &free, &total);
+ lm_ggml_backend_vk_get_device_memory(local_device, &free, &total);
  return free;
  #elif defined(LM_GGML_USE_CANN)
  size_t total;
  size_t free;
- lm_ggml_backend_cann_get_device_memory(device, &free, &total);
+ lm_ggml_backend_cann_get_device_memory(local_device, &free, &total);
  return free;
  #else
  return 1;
  #endif
  LM_GGML_UNUSED(model);
- LM_GGML_UNUSED(device);
+ LM_GGML_UNUSED(local_device);
  }
 
  //
@@ -3434,7 +3568,7 @@ static bool llama_kv_cache_find_slot(
  const uint32_t n_seq_tokens = batch.n_seq_tokens;
 
  if (cache.recurrent) {
- // For recurrent state architectures (like Mamba),
+ // For recurrent state architectures (like Mamba or RWKV),
  // each cache cell can store the state for a whole sequence.
  // A slot should be always be contiguous.
 
@@ -3683,7 +3817,7 @@ static bool llama_kv_cache_seq_rm(
  if (p0 < 0) p0 = 0;
  if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
- // models like Mamba can't have a state partially erased
+ // models like Mamba or RWKV can't have a state partially erased
  if (cache.recurrent) {
  if (seq_id >= (int64_t) cache.size) {
  // could be fatal
@@ -3697,7 +3831,8 @@ static bool llama_kv_cache_seq_rm(
  if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
  return false;
  }
- if (p0 <= cell.pos && p1 < cell.pos) {
+ // invalidate tails which will be cleared
+ if (p0 <= cell.pos && cell.pos < p1) {
  tail_id = -1;
  }
  }
@@ -3819,7 +3954,7 @@ static void llama_kv_cache_seq_add(
  if (p0 == p1) return;
 
  if (cache.recurrent) {
- // for Mamba-like models, only the pos needs to be shifted
+ // for Mamba-like or RWKV models, only the pos needs to be shifted
  if (0 <= seq_id && seq_id < (int64_t) cache.size) {
  const int32_t tail_id = cache.cells[seq_id].tail;
  if (tail_id >= 0) {
@@ -3868,7 +4003,7 @@ static void llama_kv_cache_seq_div(
  if (p0 == p1) return;
 
  if (cache.recurrent) {
- // for Mamba-like models, only the pos needs to be changed
+ // for Mamba-like or RWKV models, only the pos needs to be changed
  if (0 <= seq_id && seq_id < (int64_t) cache.size) {
  const int32_t tail_id = cache.cells[seq_id].tail;
  if (tail_id >= 0) {
@@ -4322,6 +4457,8 @@ struct llama_model_loader {
  case LM_GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
  case LM_GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
  case LM_GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+ case LM_GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break;
+ case LM_GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break;
  case LM_GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
  case LM_GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
  case LM_GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
@@ -5015,6 +5152,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
  case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+ case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
+ case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
  case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
@@ -5059,6 +5198,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_1B: return "1B";
  case MODEL_1_3B: return "1.3B";
  case MODEL_1_4B: return "1.4B";
+ case MODEL_1_6B: return "1.6B";
  case MODEL_2B: return "2B";
  case MODEL_2_8B: return "2.8B";
  case MODEL_3B: return "3B";
@@ -5105,6 +5245,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
  case LLAMA_VOCAB_TYPE_BPE: return "BPE";
  case LLAMA_VOCAB_TYPE_WPM: return "WPM";
  case LLAMA_VOCAB_TYPE_UGM: return "UGM";
+ case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
  default: return "unknown";
  }
  }
@@ -5801,6 +5942,26 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_RWKV6:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
+ ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
+ ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
+ ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
+
+ switch (hparams.n_layer) {
+ case 24: model.type = e_model::MODEL_1_6B; break;
+ case 32:
+ switch (hparams.n_embd) {
+ case 2560: model.type = e_model::MODEL_3B; break;
+ case 4096: model.type = e_model::MODEL_7B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ } break;
+ case 61: model.type = e_model::MODEL_14B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  }
 
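Note on the hunk above: the new RWKV6 case infers the size label from the layer count and embedding width. The same mapping, written out as a standalone function for reference (illustrative only, not part of the diff; the function name is an assumption):

    // Illustrative sketch, not part of the package: the RWKV6 size mapping above.
    #include <cstdint>
    #include <string>

    static std::string rwkv6_model_type(uint32_t n_layer, uint32_t n_embd) {
        if (n_layer == 24) return "1.6B";
        if (n_layer == 61) return "14B";
        if (n_layer == 32) {
            if (n_embd == 2560) return "3B";
            if (n_embd == 4096) return "7B";
        }
        return "unknown";
    }

    int main() { return rwkv6_model_type(24, 2048) == "1.6B" ? 0 : 1; }
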
@@ -5930,6 +6091,15 @@ static void llm_load_vocab(
  }
  #endif
  }
+ } else if (tokenizer_model == "rwkv") {
+ vocab.type = LLAMA_VOCAB_TYPE_RWKV;
+
+ // default special tokens
+ vocab.special_bos_id = -1;
+ vocab.special_eos_id = -1;
+ vocab.special_unk_id = -1;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = -1;
  } else {
  throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
  }
@@ -6061,6 +6231,12 @@ static void llm_load_vocab(
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  vocab.tokenizer_add_bos = false;
  vocab.tokenizer_add_eos = true;
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ vocab.tokenizer_add_space_prefix = false;
+ vocab.tokenizer_clean_spaces = false;
+ vocab.tokenizer_add_bos = false;
+ vocab.tokenizer_add_eos = false;
  } else {
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  }
@@ -6088,6 +6264,7 @@ static void llm_load_vocab(
 
  const uint32_t n_vocab = lm_gguf_get_arr_n(ctx, token_idx);
 
+ vocab.n_vocab = n_vocab;
  vocab.id_to_token.resize(n_vocab);
 
  for (uint32_t i = 0; i < n_vocab; i++) {
@@ -6165,6 +6342,10 @@ static void llm_load_vocab(
  }
  } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
  vocab.linefeed_id = vocab.special_pad_id;
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
+ const std::vector<int> ids = llama_tokenize_internal(vocab, "\n", false);
+ LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+ vocab.linefeed_id = ids[0];
  } else {
  const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
  LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
@@ -6234,6 +6415,11 @@ static void llm_load_vocab(
  )
  ) {
  vocab.special_eot_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
  break;
  }
  }
@@ -6247,6 +6433,11 @@ static void llm_load_vocab(
  const auto & t = vocab.token_to_id.find("<|eom_id|>");
  if (t != vocab.token_to_id.end()) {
  vocab.special_eom_id = t->second;
+ if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t->first.c_str());
+ vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
  }
  }
  }
@@ -6482,8 +6673,6 @@ static bool llm_load_tensors(
  bool use_mlock,
  llama_progress_callback progress_callback,
  void * progress_callback_user_data) {
- model.t_start_us = lm_ggml_time_us();
-
  auto & hparams = model.hparams;
 
  model.split_mode = split_mode;
@@ -7955,23 +8144,23 @@ static bool llm_load_tensors(
  layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});
 
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
- layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1});
+ layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
- layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1});
+ layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
- layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1});
+ layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1});
+ layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
  layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
  layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});
 
  layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
- layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1});
+ layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
- layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1});
+ layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
- layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
+ layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  }
  } break;
  case LLM_ARCH_T5:
@@ -8211,6 +8400,68 @@ static bool llm_load_tensors(
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  }
  } break;
+ case LLM_ARCH_RWKV6:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // Block 0, LN0
+ model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+ model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+
+ // output
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+
+ const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+ const int time_decay_extra_dim = hparams.time_decay_extra_dim;
+ const int head_size = hparams.wkv_head_size;
+ const int attn_hidden_size = n_embd;
+ const int ffn_size = hparams.n_ff_arr[0];
+
+ for (int i = 0; i < n_layer; ++i) {
+ lm_ggml_context * ctx_layer = ctx_for_layer(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
+
+ layer.time_mix_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5});
+ layer.time_mix_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5});
+
+ layer.time_mix_lerp_x = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1});
+ layer.time_mix_lerp_w = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1});
+ layer.time_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
+ layer.time_mix_lerp_v = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1});
+ layer.time_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
+ layer.time_mix_lerp_g = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1});
+
+ layer.time_mix_first = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size});
+ layer.time_mix_decay = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd});
+ layer.time_mix_decay_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim});
+ layer.time_mix_decay_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size});
+ layer.time_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd});
+ layer.time_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd});
+ layer.time_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd});
+ layer.time_mix_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd});
+
+ layer.time_mix_ln = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd});
+ layer.time_mix_ln_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd});
+ layer.time_mix_output = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size});
+
+ layer.channel_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
+ layer.channel_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
+
+ layer.channel_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size});
+ layer.channel_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd});
+ layer.channel_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd});
+ }
+
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -8352,14 +8603,13 @@ static bool llm_load_tensors(
  }
  }
 
- // loading time will be recalculate after the first eval, so
- // we take page faults deferred by mmap() into consideration
- model.t_load_us = lm_ggml_time_us() - model.t_start_us;
  return true;
  }
 
  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
  static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+ model.t_start_us = lm_ggml_time_us();
+
  try {
  llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
@@ -8421,6 +8671,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
  return -1;
  }
 
+ // loading time will be recalculate after the first eval, so
+ // we take page faults deferred by mmap() into consideration
+ model.t_load_us = lm_ggml_time_us() - model.t_start_us;
+
  return 0;
  }
 
@@ -8495,8 +8749,7 @@ static void llm_build_kv_store(
 
  LM_GGML_ASSERT(kv.size == n_ctx);
 
- struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
- (lm_ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
+ struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, lm_ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head);
  cb(k_cache_view, "k_cache_view", il);
 
  // note: storing RoPE-ed version of K in the KV cache
@@ -8507,8 +8760,7 @@ static void llm_build_kv_store(
  struct lm_ggml_tensor * v_cache_view = nullptr;
 
  if (cparams.flash_attn) {
- v_cache_view = lm_ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
- (kv_head)*lm_ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
+ v_cache_view = lm_ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, lm_ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head);
  } else {
  // note: the V cache is transposed when not using flash attention
  v_cache_view = lm_ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
@@ -8995,8 +9247,7 @@ static struct lm_ggml_tensor * llm_build_kv(
 
  struct lm_ggml_tensor * cur;
 
- cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b,
- q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
+ cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
  cb(cur, "kqv_out", il);
 
  return cur;
@@ -9024,7 +9275,7 @@ static struct lm_ggml_tensor * llm_build_copy_mask_state(
  // FIXME: zero-out NANs?
  states = lm_ggml_mul(ctx, states, state_mask);
 
- // copy states which won't be changed further (between n_seqs and n_rs)
+ // copy states which won't be changed further (between n_seqs and n_kv)
  lm_ggml_build_forward_expand(graph,
  lm_ggml_cpy(ctx,
  lm_ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*lm_ggml_element_size(states)),
@@ -9170,6 +9421,171 @@ static struct lm_ggml_tensor * llm_build_mamba(
  return cur;
  }
 
+ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
+ struct llama_context & lctx,
+ struct lm_ggml_context * ctx,
+ const struct llama_layer * layer,
+ struct lm_ggml_tensor * cur,
+ struct lm_ggml_tensor * x_prev,
+ struct lm_ggml_tensor ** wkv_state) {
+ size_t n_embed = cur->ne[0];
+ size_t n_seq_tokens = cur->ne[1];
+ size_t n_seqs = cur->ne[2];
+
+ size_t head_size = layer->time_mix_first->ne[0];
+ size_t head_count = layer->time_mix_first->ne[1];
+
+ size_t n_tokens = n_seqs * n_seq_tokens;
+
+ struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);
+
+ sx = lm_ggml_reshape_2d(ctx, sx, n_embed, n_tokens);
+ cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
+
+ struct lm_ggml_tensor * xxx = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);
+
+ xxx = lm_ggml_reshape_4d(
+ ctx,
+ lm_ggml_tanh(
+ ctx,
+ lm_ggml_mul_mat(ctx, layer->time_mix_w1, xxx)
+ ),
+ layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens
+ );
+
+ xxx = lm_ggml_cont(ctx, lm_ggml_permute(ctx, xxx, 0, 1, 3, 2));
+
+ xxx = lm_ggml_mul_mat(
+ ctx,
+ lm_ggml_reshape_4d(
+ ctx,
+ layer->time_mix_w2,
+ layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5
+ ),
+ xxx
+ );
+
+ struct lm_ggml_tensor *mw = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0);
+ struct lm_ggml_tensor *mk = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float));
+ struct lm_ggml_tensor *mv = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float));
+ struct lm_ggml_tensor *mr = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float));
+ struct lm_ggml_tensor *mg = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float));
+
+ struct lm_ggml_tensor * xw = lm_ggml_add(
+ ctx,
+ lm_ggml_mul(
+ ctx,
+ lm_ggml_add(ctx, mw, layer->time_mix_lerp_w),
+ sx
+ ),
+ cur
+ );
+
+ struct lm_ggml_tensor * xk = lm_ggml_add(
+ ctx,
+ lm_ggml_mul(
+ ctx,
+ lm_ggml_add(ctx, mk, layer->time_mix_lerp_k),
+ sx
+ ),
+ cur
+ );
+
+ struct lm_ggml_tensor * xv = lm_ggml_add(
+ ctx,
+ lm_ggml_mul(
+ ctx,
+ lm_ggml_add(ctx, mv, layer->time_mix_lerp_v),
+ sx
+ ),
+ cur
+ );
+
+ struct lm_ggml_tensor * xr = lm_ggml_add(
+ ctx,
+ lm_ggml_mul(
+ ctx,
+ lm_ggml_add(ctx, mr, layer->time_mix_lerp_r),
+ sx
+ ),
+ cur
+ );
+
+ struct lm_ggml_tensor * xg = lm_ggml_add(
+ ctx,
+ lm_ggml_mul(
+ ctx,
+ lm_ggml_add(ctx, mg, layer->time_mix_lerp_g),
+ sx
+ ),
+ cur
+ );
+
+ struct lm_ggml_tensor * r = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens);
+ struct lm_ggml_tensor * k = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens);
+ struct lm_ggml_tensor * v = lm_ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens);
+ struct lm_ggml_tensor * g = lm_ggml_silu(
+ ctx,
+ llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg)
+ );
+
+ struct lm_ggml_tensor * w = lm_ggml_mul_mat(
+ ctx,
+ layer->time_mix_decay_w2,
+ lm_ggml_tanh(
+ ctx,
+ lm_ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw)
+ )
+ );
+
+ w = lm_ggml_add(ctx, w, lm_ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed));
+ w = lm_ggml_exp(ctx, lm_ggml_neg(ctx, lm_ggml_exp(ctx, w)));
+ w = lm_ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
+
+ k = lm_ggml_transpose(ctx, k);
+ v = lm_ggml_transpose(ctx, v);
+ r = lm_ggml_transpose(ctx, r);
+
+ struct lm_ggml_tensor * wkv_output = lm_ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
+ cur = lm_ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0);
+ *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float));
+
+ // group norm with head_count groups
+ cur = lm_ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens);
+ cur = lm_ggml_norm(ctx, cur, 64e-5f);
+
+ // Convert back to regular vectors.
+ cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
+ cur = lm_ggml_add(ctx, lm_ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
+
+ cur = lm_ggml_mul(ctx, cur, g);
+ cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
+
+ return lm_ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs);
+ }
+
+ static struct lm_ggml_tensor * llm_build_rwkv6_channel_mix(
+ struct llama_context & lctx,
+ struct lm_ggml_context * ctx,
+ const struct llama_layer * layer,
+ struct lm_ggml_tensor * cur,
+ struct lm_ggml_tensor * x_prev) {
+ struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);
+ struct lm_ggml_tensor * xk = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur);
+ struct lm_ggml_tensor * xr = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur);
+
+ struct lm_ggml_tensor * r = lm_ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr));
+ struct lm_ggml_tensor * k = lm_ggml_sqr(
+ ctx,
+ lm_ggml_relu(
+ ctx,
+ llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk)
+ )
+ );
+
+ return lm_ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
+ }
+
  struct llm_build_context {
  const llama_model & model;
  llama_context & lctx;
@@ -9478,8 +9894,8 @@ struct llm_build_context {
  struct lm_ggml_cgraph * append_pooling(struct lm_ggml_cgraph * gf) {
  // find result_norm tensor for input
  struct lm_ggml_tensor * inp = nullptr;
- for (int i = gf->n_nodes - 1; i >= 0; --i) {
- inp = gf->nodes[i];
+ for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+ inp = lm_ggml_graph_node(gf, i);
  if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
  break;
  } else {
@@ -13790,7 +14206,9 @@ struct llm_build_context {
  {
  // compute Q and K and RoPE them
  struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
- Qcur = lm_ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+ if (model.layers[il].wq_scale) {
+ Qcur = lm_ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
+ }
  cb(Qcur, "Qcur", il);
  if (model.layers[il].bq) {
  Qcur = lm_ggml_add(ctx0, Qcur, model.layers[il].bq);
@@ -13799,7 +14217,9 @@ struct llm_build_context {
 
  // B1.K
  struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
- Kcur = lm_ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+ if (model.layers[il].wk_scale) {
+ Kcur = lm_ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
+ }
  cb(Kcur, "Kcur", il);
  if (model.layers[il].bk) {
  Kcur = lm_ggml_add(ctx0, Kcur, model.layers[il].bk);
@@ -13808,7 +14228,9 @@ struct llm_build_context {
 
  // B1.V
  struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
- Vcur = lm_ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+ if (model.layers[il].wv_scale) {
+ Vcur = lm_ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
+ }
  cb(Vcur, "Vcur", il);
  if (model.layers[il].bv) {
  Vcur = lm_ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -13839,7 +14261,9 @@ struct llm_build_context {
  cb(cur, "attn_sub_norm", il);
 
  cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
- cur = lm_ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+ if (model.layers[il].wo_scale) {
+ cur = lm_ggml_mul(ctx0, cur, model.layers[il].wo_scale);
+ }
  if (model.layers[il].bo) {
  cur = lm_ggml_add(ctx0, cur, model.layers[il].bo);
  }
@@ -13876,7 +14300,9 @@ struct llm_build_context {
  cb(cur, "ffn_sub_norm", il);
 
  cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur);
- cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+ if (model.layers[il].ffn_down_scale) {
+ cur = lm_ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
+ }
  cb(cur, "ffn_down", il);
 
  cur = lm_ggml_add(ctx0, cur, ffn_inp);
@@ -14691,6 +15117,117 @@ struct llm_build_context {
 
  return gf;
  }
+
+ lm_ggml_cgraph * build_rwkv6() {
+ lm_ggml_cgraph *gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+ // Token shift state dimensions should be 2 * n_emb
+ LM_GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
+
+ const int64_t n_seqs = batch.n_seqs;
+ const int64_t n_seq_tokens = batch.n_seq_tokens;
+ const int64_t n_tokens = batch.n_tokens;
+ LM_GGML_ASSERT(n_seqs != 0);
+ LM_GGML_ASSERT(batch.equal_seqs);
+ LM_GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
+
+ struct lm_ggml_tensor * cur;
+ struct lm_ggml_tensor * inpL;
+ struct lm_ggml_tensor * state_copy = build_inp_s_copy();
+ struct lm_ggml_tensor * state_mask = build_inp_s_mask();
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
+
+ for (int il = 0; il < n_layer; ++il) {
+ const llama_layer * layer = &model.layers[il];
+
+ // (ab)using the KV cache to store the states
+ struct lm_ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
+ gf, kv_self.k_l[il], state_copy, state_mask,
+ hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
+ struct lm_ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
+ gf, kv_self.v_l[il], state_copy, state_mask,
+ hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);
+
+ cur = lm_ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+ token_shift = lm_ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs);
+
+ struct lm_ggml_tensor * att_shift = lm_ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
+ struct lm_ggml_tensor * ffn_shift = lm_ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * lm_ggml_element_size(token_shift));
+
+ struct lm_ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il);
+ struct lm_ggml_tensor * x_prev = lm_ggml_concat(
+ ctx0,
+ att_shift,
+ lm_ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
+ 1
+ );
+
+ cur = lm_ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states));
+ lm_ggml_build_forward_expand(gf, cur);
+ lm_ggml_build_forward_expand(
+ gf,
+ lm_ggml_cpy(
+ ctx0,
+ wkv_states,
+ lm_ggml_view_1d(
+ ctx0,
+ kv_self.v_l[il],
+ hparams.n_embd_v_s() * n_seqs,
+ hparams.n_embd_v_s() * kv_head * lm_ggml_element_size(kv_self.v_l[il])
+ )
+ )
+ );
+
+ struct lm_ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il);
+ x_prev = lm_ggml_concat(
+ ctx0,
+ ffn_shift,
+ lm_ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
+ 1
+ );
+ cur = lm_ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev));
+ lm_ggml_build_forward_expand(gf, cur);
+
+ struct lm_ggml_tensor * last_norm_att = lm_ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*lm_ggml_element_size(x_norm_att));
+ struct lm_ggml_tensor * last_norm_ffn = lm_ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*lm_ggml_element_size(x_norm_ffn));
+
+ token_shift = lm_ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1);
+
+ lm_ggml_build_forward_expand(
+ gf,
+ lm_ggml_cpy(
+ ctx0,
+ lm_ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0),
+ lm_ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * lm_ggml_element_size(kv_self.k_l[il]))
+ )
+ );
+
+ if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
+ cur = lm_ggml_scale(ctx0, cur, 0.5F);
+ }
+
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = lm_ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+
+ cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+ cb(cur, "result_output", -1);
+ lm_ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
  };
 
  static struct lm_ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -14937,6 +15474,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
  {
  result = llm.build_exaone();
  } break;
+ case LLM_ARCH_RWKV6:
+ {
+ result = llm.build_rwkv6();
+ } break;
  default:
  LM_GGML_ABORT("fatal error");
  }
@@ -15296,7 +15837,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
 
  // clear unused states
  for (int i = 0; i < n_kv; ++i) {
- uint32_t cell_id = i + kv_self.head;
+ const uint32_t cell_id = i + kv_self.head;
  llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];
 
  data[i] = (float) (kv_cell.src >= 0);
@@ -15505,9 +16046,10 @@ static void llama_output_reorder(struct llama_context * ctx) {
  }
 
  static void llama_graph_compute(
- llama_context & lctx,
- lm_ggml_cgraph * gf,
- int n_threads) {
+ llama_context & lctx,
+ lm_ggml_cgraph * gf,
+ int n_threads,
+ lm_ggml_threadpool * threadpool) {
  #ifdef LM_GGML_USE_METAL
  if (lm_ggml_backend_is_metal(lctx.backend_metal)) {
  lm_ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -15516,6 +16058,7 @@ static void llama_graph_compute(
 
  if (lctx.backend_cpu != nullptr) {
  lm_ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+ lm_ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
  lm_ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
  }
  #ifdef LM_GGML_USE_BLAS
@@ -15556,6 +16099,15 @@ static int llama_decode_internal(
 
  LM_GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
 
+ if (batch_all.token) {
+ for (uint32_t i = 0; i < n_tokens_all; ++i) {
+ if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
+ return -1;
+ }
+ }
+ }
+
  LM_GGML_ASSERT(n_tokens_all <= cparams.n_batch);
 
  LM_GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
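Note on the hunk above: the new guard rejects a batch as soon as any token id falls outside the vocabulary, rather than failing later inside the compute graph. A minimal standalone sketch of the same range check follows (illustrative only, not part of the diff; the helper name and the std::vector input are assumptions):

    // Illustrative sketch, not part of the package: pre-decode token range check.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static bool tokens_in_vocab(const std::vector<int32_t> & tokens, uint32_t n_vocab) {
        for (size_t i = 0; i < tokens.size(); ++i) {
            if (tokens[i] < 0 || (uint32_t) tokens[i] >= n_vocab) {
                std::fprintf(stderr, "invalid token[%zu] = %d\n", i, tokens[i]);
                return false; // mirrors the early "return -1" in the hunk above
            }
        }
        return true;
    }

    int main() {
        std::vector<int32_t> batch = {1, 42, 65536}; // hypothetical ids, last one out of range
        std::printf("ok = %d\n", tokens_in_vocab(batch, 32000) ? 1 : 0);
        return 0;
    }
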
@@ -15636,6 +16188,8 @@ static int llama_decode_internal(
15636
16188
  }
15637
16189
 
15638
16190
  int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
16191
+ lm_ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
16192
+
15639
16193
  LM_GGML_ASSERT(n_threads > 0);
15640
16194
 
15641
16195
  // non-causal masks do not use the KV cache
@@ -15670,8 +16224,8 @@ static int llama_decode_internal(
15670
16224
  lm_ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
15671
16225
 
15672
16226
  // the output is always the last tensor in the graph
15673
- struct lm_ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
15674
- struct lm_ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
16227
+ struct lm_ggml_tensor * res = lm_ggml_graph_node(gf, -1);
16228
+ struct lm_ggml_tensor * embd = lm_ggml_graph_node(gf, -2);
15675
16229
 
15676
16230
  if (lctx.n_outputs == 0) {
15677
16231
  // no output
@@ -15680,9 +16234,9 @@ static int llama_decode_internal(
15680
16234
  } else if (cparams.embeddings) {
15681
16235
  res = nullptr; // do not extract logits for embedding case
15682
16236
  embd = nullptr;
15683
- for (int i = gf->n_nodes - 1; i >= 0; --i) {
15684
- if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
15685
- embd = gf->nodes[i];
16237
+ for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
16238
+ if (strcmp(lm_ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
16239
+ embd = lm_ggml_graph_node(gf, i);
15686
16240
  break;
15687
16241
  }
15688
16242
  }
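Editor's note: the direct gf->nodes[...] field accesses above are replaced with the lm_ggml_graph_node()/lm_ggml_graph_n_nodes() accessors; negative indices count back from the end of the graph. A short sketch of the idiom used in this hunk:

    // Last two nodes of a built graph, plus a reverse search by tensor name.
    struct lm_ggml_tensor * res  = lm_ggml_graph_node(gf, -1);   // same node as index n_nodes - 1
    struct lm_ggml_tensor * embd = lm_ggml_graph_node(gf, -2);
    for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
        if (strcmp(lm_ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
            embd = lm_ggml_graph_node(gf, i);
            break;
        }
    }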
@@ -15697,7 +16251,7 @@ static int llama_decode_internal(
15697
16251
 
15698
16252
  llama_set_inputs(lctx, ubatch);
15699
16253
 
15700
- llama_graph_compute(lctx, gf, n_threads);
16254
+ llama_graph_compute(lctx, gf, n_threads, threadpool);
15701
16255
 
15702
16256
  // update the kv ring buffer
15703
16257
  {
@@ -15846,6 +16400,15 @@ static int llama_encode_internal(
15846
16400
 
15847
16401
  LM_GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
15848
16402
 
16403
+ if (batch.token) {
16404
+ for (uint32_t i = 0; i < n_tokens; ++i) {
16405
+ if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
16406
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
16407
+ return -1;
16408
+ }
16409
+ }
16410
+ }
16411
+
15849
16412
  // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
15850
16413
  LM_GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
15851
16414
 
@@ -15874,7 +16437,9 @@ static int llama_encode_internal(
15874
16437
  lctx.inp_embd_enc = NULL;
15875
16438
  lctx.n_outputs = n_tokens;
15876
16439
 
15877
- const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
16440
+ int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
16441
+ lm_ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
16442
+
15878
16443
  LM_GGML_ASSERT(n_threads > 0);
15879
16444
 
15880
16445
  lm_ggml_backend_sched_reset(lctx.sched);
@@ -15888,15 +16453,15 @@ static int llama_encode_internal(
15888
16453
  // there are two cases here
15889
16454
  if (llama_model_has_decoder(&lctx.model)) {
15890
16455
  // first case is an encoder-decoder T5 model where embeddings are passed to decoder
15891
- embd = gf->nodes[gf->n_nodes - 1];
16456
+ embd = lm_ggml_graph_node(gf, -1);
15892
16457
  LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
15893
16458
  } else {
15894
16459
  // second case is an encoder-only T5 model
15895
16460
  if (cparams.embeddings) {
15896
16461
  // only output embeddings if required
15897
- embd = gf->nodes[gf->n_nodes - 1];
16462
+ embd = lm_ggml_graph_node(gf, -1);
15898
16463
  if (strcmp(embd->name, "result_embd_pooled") != 0) {
15899
- embd = gf->nodes[gf->n_nodes - 2];
16464
+ embd = lm_ggml_graph_node(gf, -2);
15900
16465
  }
15901
16466
  LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
15902
16467
  }
@@ -15906,7 +16471,7 @@ static int llama_encode_internal(
15906
16471
 
15907
16472
  llama_set_inputs(lctx, ubatch);
15908
16473
 
15909
- llama_graph_compute(lctx, gf, n_threads);
16474
+ llama_graph_compute(lctx, gf, n_threads, threadpool);
15910
16475
 
15911
16476
  // extract embeddings
15912
16477
  if (embd) {
@@ -16188,7 +16753,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
16188
16753
 
16189
16754
  lm_ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
16190
16755
 
16191
- llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
16756
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
16192
16757
  #endif
16193
16758
 
16194
16759
  //const int64_t t_end = lm_ggml_time_us();
@@ -16214,7 +16779,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
16214
16779
 
16215
16780
  llama_set_k_shift(lctx);
16216
16781
 
16217
- llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
16782
+ llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
16218
16783
 
16219
16784
  need_reserve = true;
16220
16785
  }
@@ -16425,6 +16990,9 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
16425
16990
  new_type == LM_GGML_TYPE_Q4_0_8_8) {
16426
16991
  new_type = LM_GGML_TYPE_Q4_0;
16427
16992
  }
16993
+ else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
16994
+ new_type = LM_GGML_TYPE_Q4_K;
16995
+ }
16428
16996
  }
16429
16997
  } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
16430
16998
  ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@@ -16624,6 +17192,8 @@ static lm_ggml_type llama_tensor_get_type(quantize_state_internal & qs, lm_ggml_
16624
17192
  }
16625
17193
  if (convert_incompatible_tensor) {
16626
17194
  switch (new_type) {
17195
+ case LM_GGML_TYPE_TQ1_0:
17196
+ case LM_GGML_TYPE_TQ2_0: new_type = LM_GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
16627
17197
  case LM_GGML_TYPE_IQ2_XXS:
16628
17198
  case LM_GGML_TYPE_IQ2_XS:
16629
17199
  case LM_GGML_TYPE_IQ2_S:
@@ -16729,6 +17299,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
16729
17299
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
16730
17300
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = LM_GGML_TYPE_Q5_K; break;
16731
17301
  case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = LM_GGML_TYPE_Q6_K; break;
17302
+ case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = LM_GGML_TYPE_TQ1_0; break;
17303
+ case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = LM_GGML_TYPE_TQ2_0; break;
16732
17304
  case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = LM_GGML_TYPE_IQ2_XXS; break;
16733
17305
  case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = LM_GGML_TYPE_IQ2_XS; break;
16734
17306
  case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = LM_GGML_TYPE_IQ2_XS; break;
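Editor's note: the new ternary formats plug into the regular quantization entry point through the LLAMA_FTYPE_MOSTLY_TQ1_0/TQ2_0 ftypes mapped above. A hedged sketch (the file names are placeholders):

    // Quantize an f16 GGUF to the ternary TQ1_0 format.
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_TQ1_0;   // or LLAMA_FTYPE_MOSTLY_TQ2_0
    if (llama_model_quantize("model-f16.gguf", "model-tq1_0.gguf", &qparams) != 0) {
        fprintf(stderr, "quantization failed\n");
    }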
@@ -16833,7 +17405,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
16833
17405
 
16834
17406
  // TODO: avoid hardcoded tensor names - use the TN_* constants
16835
17407
  if (name.find("attn_v.weight") != std::string::npos ||
16836
- name.find("attn_qkv.weight") != std::string::npos) {
17408
+ name.find("attn_qkv.weight") != std::string::npos ||
17409
+ name.find("attn_kv_b.weight")!= std::string::npos) {
16837
17410
  ++qs.n_attention_wv;
16838
17411
  } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
16839
17412
  qs.has_output = true;
@@ -16974,6 +17547,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
16974
17547
  // NOTE: can't use LLM_TN here because the layer number is not known
16975
17548
  quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
16976
17549
 
17550
+ // do not quantize RWKV's time_mix_first tensors
17551
+ quantize &= name.find("time_mix_first.weight") == std::string::npos;
17552
+ quantize &= name.find("time_mix_w1.weight") == std::string::npos;
17553
+ quantize &= name.find("time_mix_w2.weight") == std::string::npos;
17554
+ quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
17555
+ quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
17556
+
16977
17557
  // do not quantize relative position bias (T5)
16978
17558
  quantize &= name.find("attn_rel_b.weight") == std::string::npos;
16979
17559
 
@@ -17357,7 +17937,6 @@ struct llama_model_params llama_model_default_params() {
17357
17937
 
17358
17938
  struct llama_context_params llama_context_default_params() {
17359
17939
  struct llama_context_params result = {
17360
- /*.seed =*/ LLAMA_DEFAULT_SEED,
17361
17940
  /*.n_ctx =*/ 512,
17362
17941
  /*.n_batch =*/ 2048,
17363
17942
  /*.n_ubatch =*/ 512,
@@ -17383,6 +17962,7 @@ struct llama_context_params llama_context_default_params() {
17383
17962
  /*.embeddings =*/ false,
17384
17963
  /*.offload_kqv =*/ true,
17385
17964
  /*.flash_attn =*/ false,
17965
+ /*.no_perf =*/ true,
17386
17966
  /*.abort_callback =*/ nullptr,
17387
17967
  /*.abort_callback_data =*/ nullptr,
17388
17968
  };
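Editor's note: with the seed field dropped from llama_context_params and no_perf defaulting to true, per-context timing is now opt-in. A minimal sketch of turning it back on at context creation:

    // Collect prompt/eval timings for this context (skipped by default in this release).
    llama_context_params cparams = llama_context_default_params();
    cparams.no_perf = false;
    llama_context * ctx = llama_new_context_with_model(model, cparams);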
@@ -17390,6 +17970,14 @@ struct llama_context_params llama_context_default_params() {
17390
17970
  return result;
17391
17971
  }
17392
17972
 
17973
+ struct llama_sampler_chain_params llama_sampler_chain_default_params() {
17974
+ struct llama_sampler_chain_params result = {
17975
+ /*.no_perf =*/ true,
17976
+ };
17977
+
17978
+ return result;
17979
+ }
17980
+
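Editor's note: llama_sampler_chain_default_params() is the entry point of the new sampler-chain API. A hedged sketch, assuming the llama_sampler_chain_init/_add and llama_sampler_init_* constructors that ship with the same sampling refactor (verify against llama.h in this release):

    // Build a small sampling chain: top-k -> temperature -> seeded final pick.
    llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
    sparams.no_perf = false;                                        // keep sampler timing counters
    llama_sampler * chain = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(1234)); // seed
    llama_token tok = llama_sampler_sample(chain, ctx, -1);        // sample from the last logits
    llama_sampler_free(chain);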
17393
17981
  struct llama_model_quantize_params llama_model_quantize_default_params() {
17394
17982
  struct llama_model_quantize_params result = {
17395
17983
  /*.nthread =*/ 0,
@@ -17461,6 +18049,19 @@ void llama_numa_init(enum lm_ggml_numa_strategy numa) {
17461
18049
  }
17462
18050
  }
17463
18051
 
18052
+ void llama_attach_threadpool(
18053
+ struct llama_context * ctx,
18054
+ lm_ggml_threadpool_t threadpool,
18055
+ lm_ggml_threadpool_t threadpool_batch) {
18056
+ ctx->threadpool = threadpool;
18057
+ ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
18058
+ }
18059
+
18060
+ void llama_detach_threadpool(struct llama_context * ctx) {
18061
+ ctx->threadpool = nullptr;
18062
+ ctx->threadpool_batch = nullptr;
18063
+ }
18064
+
17464
18065
  void llama_backend_free(void) {
17465
18066
  lm_ggml_quantize_free();
17466
18067
  }
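Editor's note: llama_attach_threadpool/llama_detach_threadpool let callers pin decoding to an explicit ggml threadpool; a null batch pool falls back to the single-token pool, as the implementation above shows. A hedged sketch, assuming the lm_ggml_threadpool_* helpers added alongside this change:

    // Create a dedicated threadpool and hand it to the context.
    struct lm_ggml_threadpool_params tpp = lm_ggml_threadpool_params_default(8);  // 8 workers (assumed helper)
    lm_ggml_threadpool_t tp = lm_ggml_threadpool_new(&tpp);
    llama_attach_threadpool(ctx, tp, /*threadpool_batch =*/ nullptr);  // nullptr -> reuse tp for batch decoding
    // ... llama_decode()/llama_encode() calls now run on tp ...
    llama_detach_threadpool(ctx);
    lm_ggml_threadpool_free(tp);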
@@ -17572,6 +18173,7 @@ struct llama_context * llama_new_context_with_model(
17572
18173
  cparams.embeddings = params.embeddings;
17573
18174
  cparams.offload_kqv = params.offload_kqv;
17574
18175
  cparams.flash_attn = params.flash_attn;
18176
+ cparams.no_perf = params.no_perf;
17575
18177
  cparams.pooling_type = params.pooling_type;
17576
18178
 
17577
18179
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -17630,10 +18232,6 @@ struct llama_context * llama_new_context_with_model(
17630
18232
  cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
17631
18233
  }
17632
18234
 
17633
- if (params.seed == LLAMA_DEFAULT_SEED) {
17634
- params.seed = time(NULL);
17635
- }
17636
-
17637
18235
  LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
17638
18236
  LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
17639
18237
  LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
@@ -17644,10 +18242,10 @@ struct llama_context * llama_new_context_with_model(
17644
18242
  ctx->abort_callback = params.abort_callback;
17645
18243
  ctx->abort_callback_data = params.abort_callback_data;
17646
18244
 
17647
- ctx->sampling.rng = std::mt19937(params.seed);
17648
- ctx->logits_all = params.logits_all;
18245
+ ctx->logits_all = params.logits_all;
18246
+
17649
18247
  // build worst-case graph for encoder if a model contains encoder
17650
- ctx->is_encoding = llama_model_has_encoder(model);
18248
+ ctx->is_encoding = llama_model_has_encoder(model);
17651
18249
 
17652
18250
  uint32_t kv_size = cparams.n_ctx;
17653
18251
  lm_ggml_type type_k = params.type_k;
@@ -17667,6 +18265,20 @@ struct llama_context * llama_new_context_with_model(
17667
18265
 
17668
18266
  if (!hparams.vocab_only) {
17669
18267
  // initialize backends
18268
+ #if defined(LM_GGML_USE_RPC)
18269
+ if (model->n_gpu_layers > 0) {
18270
+ for (const auto & endpoint : model->rpc_servers) {
18271
+ lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(endpoint.c_str());
18272
+ if (backend == nullptr) {
18273
+ LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
18274
+ llama_free(ctx);
18275
+ return nullptr;
18276
+ }
18277
+ ctx->backends.push_back(backend);
18278
+ }
18279
+ }
18280
+ #endif
18281
+
17670
18282
  #if defined(LM_GGML_USE_METAL)
17671
18283
  if (model->n_gpu_layers > 0) {
17672
18284
  ctx->backend_metal = lm_ggml_backend_metal_init();
@@ -17791,19 +18403,6 @@ struct llama_context * llama_new_context_with_model(
17791
18403
  }
17792
18404
  #endif
17793
18405
 
17794
- #if defined(LM_GGML_USE_RPC)
17795
- if (model->n_gpu_layers > 0) {
17796
- for (const auto & endpoint : model->rpc_servers) {
17797
- lm_ggml_backend_t backend = lm_ggml_backend_rpc_init(endpoint.c_str());
17798
- if (backend == nullptr) {
17799
- LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
17800
- llama_free(ctx);
17801
- return nullptr;
17802
- }
17803
- ctx->backends.push_back(backend);
17804
- }
17805
- }
17806
- #endif
17807
18406
  ctx->backend_cpu = lm_ggml_backend_cpu_init();
17808
18407
  if (ctx->backend_cpu == nullptr) {
17809
18408
  LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
@@ -17912,7 +18511,7 @@ struct llama_context * llama_new_context_with_model(
17912
18511
 
17913
18512
  // note: the number of splits during measure is higher than during inference due to the kv shift
17914
18513
  int n_splits = lm_ggml_backend_sched_get_n_splits(ctx->sched);
17915
- LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
18514
+ LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, lm_ggml_graph_n_nodes(gf));
17916
18515
  LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
17917
18516
  }
17918
18517
  }
@@ -17924,14 +18523,6 @@ void llama_free(struct llama_context * ctx) {
17924
18523
  delete ctx;
17925
18524
  }
17926
18525
 
17927
- const struct llama_model * llama_get_model(const struct llama_context * ctx) {
17928
- return &ctx->model;
17929
- }
17930
-
17931
- const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
17932
- return &ctx->model.vocab;
17933
- }
17934
-
17935
18526
  uint32_t llama_n_ctx(const struct llama_context * ctx) {
17936
18527
  return ctx->cparams.n_ctx;
17937
18528
  }
@@ -17952,6 +18543,30 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
17952
18543
  return model->vocab.type;
17953
18544
  }
17954
18545
 
18546
+ int32_t llama_n_vocab(const struct llama_model * model) {
18547
+ return model->hparams.n_vocab;
18548
+ }
18549
+
18550
+ int32_t llama_n_ctx_train(const struct llama_model * model) {
18551
+ return model->hparams.n_ctx_train;
18552
+ }
18553
+
18554
+ int32_t llama_n_embd(const struct llama_model * model) {
18555
+ return model->hparams.n_embd;
18556
+ }
18557
+
18558
+ int32_t llama_n_layer(const struct llama_model * model) {
18559
+ return model->hparams.n_layer;
18560
+ }
18561
+
18562
+ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
18563
+ return &ctx->model;
18564
+ }
18565
+
18566
+ enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
18567
+ return ctx->cparams.pooling_type;
18568
+ }
18569
+
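Editor's note: the getters added here are only relocated from further down in the file, not changed; they remain the supported way to query basic model/context shapes. A tiny usage sketch:

    const struct llama_model * model = llama_get_model(ctx);
    const int32_t n_vocab = llama_n_vocab(model);        // vocabulary size
    const int32_t n_embd  = llama_n_embd(model);         // embedding width
    const int32_t n_layer = llama_n_layer(model);        // transformer layers
    const enum llama_pooling_type pt = llama_pooling_type(ctx);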
17955
18570
  enum llama_rope_type llama_rope_type(const struct llama_model * model) {
17956
18571
  switch (model->arch) {
17957
18572
  // these models do not use RoPE
@@ -17965,6 +18580,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
17965
18580
  case LLM_ARCH_T5:
17966
18581
  case LLM_ARCH_T5ENCODER:
17967
18582
  case LLM_ARCH_JAIS:
18583
+ case LLM_ARCH_RWKV6:
17968
18584
  return LLAMA_ROPE_TYPE_NONE;
17969
18585
 
17970
18586
  // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -18014,26 +18630,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
18014
18630
  return LLAMA_ROPE_TYPE_NONE;
18015
18631
  }
18016
18632
 
18017
- enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
18018
- return ctx->cparams.pooling_type;
18019
- }
18020
-
18021
- int32_t llama_n_vocab(const struct llama_model * model) {
18022
- return model->hparams.n_vocab;
18023
- }
18024
-
18025
- int32_t llama_n_ctx_train(const struct llama_model * model) {
18026
- return model->hparams.n_ctx_train;
18027
- }
18028
-
18029
- int32_t llama_n_embd(const struct llama_model * model) {
18030
- return model->hparams.n_embd;
18031
- }
18032
-
18033
- int32_t llama_n_layer(const struct llama_model * model) {
18034
- return model->hparams.n_layer;
18035
- }
18036
-
18037
18633
  float llama_rope_freq_scale_train(const struct llama_model * model) {
18038
18634
  return model->hparams.rope_freq_scale_train;
18039
18635
  }
@@ -18133,6 +18729,7 @@ llama_token llama_model_decoder_start_token(const struct llama_model * model) {
18133
18729
  bool llama_model_is_recurrent(const struct llama_model * model) {
18134
18730
  switch (model->arch) {
18135
18731
  case LLM_ARCH_MAMBA: return true;
18732
+ case LLM_ARCH_RWKV6: return true;
18136
18733
  default: return false;
18137
18734
  }
18138
18735
  }
@@ -18449,14 +19046,14 @@ struct llama_data_write {
18449
19046
  // TODO: add more model-specific info which should prevent loading the session file if not identical
18450
19047
  }
18451
19048
 
18452
- void write_rng(const std::mt19937 & rng) {
18453
- std::ostringstream rng_ss;
18454
- rng_ss << rng;
19049
+ //void write_rng(const std::mt19937 & rng) {
19050
+ // std::ostringstream rng_ss;
19051
+ // rng_ss << rng;
18455
19052
 
18456
- const std::string & rng_str = rng_ss.str();
19053
+ // const std::string & rng_str = rng_ss.str();
18457
19054
 
18458
- write_string(rng_str);
18459
- }
19055
+ // write_string(rng_str);
19056
+ //}
18460
19057
 
18461
19058
  void write_output_ids(struct llama_context * ctx) {
18462
19059
  llama_output_reorder(ctx);
@@ -18676,17 +19273,17 @@ struct llama_data_read {
18676
19273
  // TODO: add more info which needs to be identical but which is not verified otherwise
18677
19274
  }
18678
19275
 
18679
- void read_rng(std::mt19937 & rng) {
18680
- std::string rng_str;
18681
- read_string(rng_str);
19276
+ //void read_rng(std::mt19937 & rng) {
19277
+ // std::string rng_str;
19278
+ // read_string(rng_str);
18682
19279
 
18683
- std::istringstream rng_ss(rng_str);
18684
- rng_ss >> rng;
19280
+ // std::istringstream rng_ss(rng_str);
19281
+ // rng_ss >> rng;
18685
19282
 
18686
- if (rng_ss.fail()) {
18687
- throw std::runtime_error("failed to load RNG state");
18688
- }
18689
- }
19283
+ // if (rng_ss.fail()) {
19284
+ // throw std::runtime_error("failed to load RNG state");
19285
+ // }
19286
+ //}
18690
19287
 
18691
19288
  void read_output_ids(struct llama_context * ctx) {
18692
19289
  std::vector<int32_t> output_pos;
@@ -19116,8 +19713,6 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da
19116
19713
 
19117
19714
  data_ctx.write_model_info(ctx);
19118
19715
 
19119
- data_ctx.write_rng(ctx->sampling.rng);
19120
-
19121
19716
  // copy outputs
19122
19717
  data_ctx.write_output_ids(ctx);
19123
19718
  data_ctx.write_logits(ctx);
@@ -19155,9 +19750,6 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da
19155
19750
 
19156
19751
  data_ctx.read_model_info(ctx);
19157
19752
 
19158
- // set rng
19159
- data_ctx.read_rng(ctx->sampling.rng);
19160
-
19161
19753
  // set outputs
19162
19754
  data_ctx.read_output_ids(ctx);
19163
19755
  data_ctx.read_logits(ctx);
@@ -19377,16 +19969,16 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
19377
19969
  }
19378
19970
  }
19379
19971
 
19380
- void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
19972
+ void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
19381
19973
  ctx->cparams.n_threads = n_threads;
19382
19974
  ctx->cparams.n_threads_batch = n_threads_batch;
19383
19975
  }
19384
19976
 
19385
- uint32_t llama_n_threads(struct llama_context * ctx) {
19977
+ int32_t llama_n_threads(struct llama_context * ctx) {
19386
19978
  return ctx->cparams.n_threads;
19387
19979
  }
19388
19980
 
19389
- uint32_t llama_n_threads_batch(struct llama_context * ctx) {
19981
+ int32_t llama_n_threads_batch(struct llama_context * ctx) {
19390
19982
  return ctx->cparams.n_threads_batch;
19391
19983
  }
19392
19984
 
@@ -19500,10 +20092,14 @@ void llama_synchronize(struct llama_context * ctx) {
19500
20092
 
19501
20093
  // add the evaluation to the stats
19502
20094
  if (ctx->n_queued_tokens == 1) {
19503
- ctx->t_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
20095
+ if (!ctx->cparams.no_perf) {
20096
+ ctx->t_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
20097
+ }
19504
20098
  ctx->n_eval++;
19505
20099
  } else if (ctx->n_queued_tokens > 1) {
19506
- ctx->t_p_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
20100
+ if (!ctx->cparams.no_perf) {
20101
+ ctx->t_p_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
20102
+ }
19507
20103
  ctx->n_p_eval += ctx->n_queued_tokens;
19508
20104
  }
19509
20105
 
@@ -19560,8 +20156,9 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
19560
20156
  LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
19561
20157
  #ifndef NDEBUG
19562
20158
  LM_GGML_ABORT("fatal error");
19563
- #endif
20159
+ #else
19564
20160
  return nullptr;
20161
+ #endif
19565
20162
  }
19566
20163
  }
19567
20164
 
@@ -19609,8 +20206,9 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
19609
20206
  LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
19610
20207
  #ifndef NDEBUG
19611
20208
  LM_GGML_ABORT("fatal error");
19612
- #endif
20209
+ #else
19613
20210
  return nullptr;
20211
+ #endif
19614
20212
  }
19615
20213
  }
19616
20214
 
@@ -20044,128 +20642,18 @@ int32_t llama_chat_apply_template(
20044
20642
  }
20045
20643
 
20046
20644
  //
20047
- // grammar
20645
+ // sampling
20048
20646
  //
20049
20647
 
20050
- struct llama_grammar * llama_grammar_init(
20051
- const llama_grammar_element ** rules,
20052
- size_t n_rules,
20053
- size_t start_rule_index) {
20054
- return llama_grammar_init_impl(rules, n_rules, start_rule_index);
20055
- }
20056
-
20057
- void llama_grammar_free(struct llama_grammar * grammar) {
20058
- llama_grammar_free_impl(grammar);
20059
- }
20060
-
20061
- struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
20062
- return llama_grammar_copy_impl(grammar);
20063
- }
20064
-
20065
- void llama_grammar_sample(
20066
- const struct llama_grammar * grammar,
20067
- const struct llama_context * ctx,
20068
- llama_token_data_array * candidates) {
20069
- llama_grammar_sample_impl(grammar, &ctx->model.vocab, &ctx->sampling, candidates);
20070
- }
20071
-
20072
- void llama_sample_grammar(
20073
- struct llama_context * ctx,
20074
- llama_token_data_array * candidates,
20075
- const struct llama_grammar * grammar) {
20076
- llama_grammar_sample(grammar, ctx, candidates);
20077
- }
20078
-
20079
- void llama_grammar_accept_token(
20080
- struct llama_grammar * grammar,
20081
- struct llama_context * ctx,
20082
- llama_token token) {
20083
- llama_grammar_accept_token_impl(grammar, &ctx->model.vocab, &ctx->sampling, token);
20648
+ // TODO: remove indirection when vocab becomes accesible in llama-sampling.cpp
20649
+ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * model, const char * grammar_str, const char * grammar_root) {
20650
+ return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
20084
20651
  }
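Editor's note: the standalone llama_grammar_* entry points removed in this hunk are replaced by a single sampler constructor. A hedged sketch of constraining output with it (the GBNF string is a placeholder; the chaining call assumes the llama_sampler_chain_* API sketched earlier):

    // Grammar-constrained sampling via the new sampler object.
    const char * gbnf = "root ::= \"yes\" | \"no\"";                 // placeholder grammar
    llama_sampler * grammar = llama_sampler_init_grammar(model, gbnf, "root");
    llama_sampler_chain_add(chain, grammar);                         // the chain owns the sampler afterwards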
20085
20652
 
20086
20653
  //
20087
- // sampling
20654
+ // model split
20088
20655
  //
20089
20656
 
20090
- void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
20091
- llama_set_rng_seed_impl(&ctx->sampling, seed);
20092
- }
20093
-
20094
- void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
20095
- llama_sample_softmax_impl(ctx ? &ctx->sampling : nullptr, candidates);
20096
- }
20097
-
20098
- void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
20099
- llama_sample_top_k_impl(ctx ? &ctx->sampling : nullptr, candidates, k, min_keep);
20100
- }
20101
-
20102
- void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
20103
- llama_sample_top_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
20104
- }
20105
-
20106
- void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
20107
- llama_sample_min_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
20108
- }
20109
-
20110
- void llama_sample_xtc(struct llama_context * ctx, llama_token_data_array * candidates, float xtc_threshold, float xtc_probability, size_t min_keep, std::mt19937 rng){
20111
- llama_sample_xtc_impl(ctx ? &ctx-> sampling: nullptr, candidates, xtc_threshold, xtc_probability, min_keep, rng);
20112
- }
20113
-
20114
- void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
20115
- llama_sample_tail_free_impl(ctx ? &ctx->sampling : nullptr, candidates, z, min_keep);
20116
- }
20117
-
20118
- void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
20119
- llama_sample_typical_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
20120
- }
20121
-
20122
- void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
20123
- llama_sample_entropy_impl(ctx ? &ctx->sampling : nullptr, candidates_p, min_temp, max_temp, exponent_val);
20124
- }
20125
-
20126
- void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
20127
- llama_sample_temp_impl(ctx ? &ctx->sampling : nullptr, candidates_p, temp);
20128
- }
20129
-
20130
- void llama_sample_repetition_penalties(
20131
- struct llama_context * ctx,
20132
- llama_token_data_array * candidates,
20133
- const llama_token * last_tokens,
20134
- size_t penalty_last_n,
20135
- float penalty_repeat,
20136
- float penalty_freq,
20137
- float penalty_present) {
20138
- llama_sample_repetition_penalties_impl(ctx ? &ctx->sampling : nullptr, candidates, last_tokens, penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
20139
- }
20140
-
20141
- void llama_sample_apply_guidance(
20142
- struct llama_context * ctx,
20143
- float * logits,
20144
- float * logits_guidance,
20145
- float scale) {
20146
- llama_sample_apply_guidance_impl(&ctx->sampling, logits, logits_guidance, scale);
20147
- }
20148
-
20149
- llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
20150
- return llama_sample_token_mirostat_impl(&ctx->sampling, candidates, tau, eta, m, mu);
20151
- }
20152
-
20153
- llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
20154
- return llama_sample_token_mirostat_v2_impl(ctx ? &ctx->sampling : nullptr, candidates, tau, eta, mu);
20155
- }
20156
-
20157
- llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
20158
- return llama_sample_token_greedy_impl(ctx ? &ctx->sampling : nullptr, candidates);
20159
- }
20160
-
20161
- llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
20162
- return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, rng);
20163
- }
20164
-
20165
- llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
20166
- return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, ctx->sampling.rng);
20167
- }
20168
-
20169
20657
  int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
20170
20658
  static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
20171
20659
  if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
@@ -20190,45 +20678,6 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
20190
20678
  return 0;
20191
20679
  }
20192
20680
 
20193
- struct llama_timings llama_get_timings(struct llama_context * ctx) {
20194
- struct llama_timings result = {
20195
- /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
20196
- /*.t_end_ms =*/ 1.00 * lm_ggml_time_ms(),
20197
- /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
20198
- /*.t_sample_ms =*/ 1e-3 * ctx->sampling.t_sample_us,
20199
- /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
20200
- /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
20201
-
20202
- /*.n_sample =*/ std::max(1, ctx->sampling.n_sample),
20203
- /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
20204
- /*.n_eval =*/ std::max(1, ctx->n_eval),
20205
- };
20206
-
20207
- return result;
20208
- }
20209
-
20210
- void llama_print_timings(struct llama_context * ctx) {
20211
- const llama_timings timings = llama_get_timings(ctx);
20212
-
20213
- LLAMA_LOG_INFO("\n");
20214
- LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
20215
- LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
20216
- __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
20217
- LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
20218
- __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
20219
- LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
20220
- __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
20221
- LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
20222
- }
20223
-
20224
- void llama_reset_timings(struct llama_context * ctx) {
20225
- ctx->t_start_us = lm_ggml_time_us();
20226
- ctx->t_eval_us = ctx->n_eval = 0;
20227
- ctx->t_p_eval_us = ctx->n_p_eval = 0;
20228
-
20229
- ctx->sampling.reset_timings();
20230
- }
20231
-
20232
20681
  const char * llama_print_system_info(void) {
20233
20682
  static std::string s;
20234
20683
 
@@ -20246,6 +20695,7 @@ const char * llama_print_system_info(void) {
20246
20695
  s += "ARM_FMA = " + std::to_string(lm_ggml_cpu_has_arm_fma()) + " | ";
20247
20696
  s += "F16C = " + std::to_string(lm_ggml_cpu_has_f16c()) + " | ";
20248
20697
  s += "FP16_VA = " + std::to_string(lm_ggml_cpu_has_fp16_va()) + " | ";
20698
+ s += "RISCV_VECT = " + std::to_string(lm_ggml_cpu_has_riscv_v()) + " | ";
20249
20699
  s += "WASM_SIMD = " + std::to_string(lm_ggml_cpu_has_wasm_simd()) + " | ";
20250
20700
  s += "BLAS = " + std::to_string(lm_ggml_cpu_has_blas()) + " | ";
20251
20701
  s += "SSE3 = " + std::to_string(lm_ggml_cpu_has_sse3()) + " | ";
@@ -20257,7 +20707,43 @@ const char * llama_print_system_info(void) {
20257
20707
  return s.c_str();
20258
20708
  }
20259
20709
 
20260
- void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
20710
+ struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
20711
+ struct llama_perf_context_data data = {};
20712
+
20713
+ if (ctx == nullptr) {
20714
+ return data;
20715
+ }
20716
+
20717
+ data.t_start_ms = 1e-3 * ctx->t_start_us;
20718
+ data.t_load_ms = 1e-3 * ctx->t_load_us;
20719
+ data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
20720
+ data.t_eval_ms = 1e-3 * ctx->t_eval_us;
20721
+ data.n_p_eval = std::max(1, ctx->n_p_eval);
20722
+ data.n_eval = std::max(1, ctx->n_eval);
20723
+
20724
+ return data;
20725
+ }
20726
+
20727
+ void llama_perf_context_print(const struct llama_context * ctx) {
20728
+ const auto data = llama_perf_context(ctx);
20729
+
20730
+ const double t_end_ms = 1e-3 * lm_ggml_time_us();
20731
+
20732
+ LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
20733
+ LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
20734
+ __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
20735
+ LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
20736
+ __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
20737
+ LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
20738
+ }
20739
+
20740
+ void llama_perf_context_reset(struct llama_context * ctx) {
20741
+ ctx->t_start_us = lm_ggml_time_us();
20742
+ ctx->t_eval_us = ctx->n_eval = 0;
20743
+ ctx->t_p_eval_us = ctx->n_p_eval = 0;
20744
+ }
20745
+
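Editor's note: llama_perf_context, llama_perf_context_print, and llama_perf_context_reset replace the removed llama_get_timings family. A minimal sketch (remember that cparams.no_perf must be false for the counters to accumulate):

    // Print and reset the per-context performance counters.
    llama_perf_context_print(ctx);
    struct llama_perf_context_data perf = llama_perf_context(ctx);
    fprintf(stderr, "eval: %d tokens in %.2f ms\n", perf.n_eval, perf.t_eval_ms);
    llama_perf_context_reset(ctx);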
20746
+ void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
20261
20747
  fprintf(stream, "\n");
20262
20748
  fprintf(stream, "###########\n");
20263
20749
  fprintf(stream, "# Timings #\n");
@@ -20268,21 +20754,15 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
20268
20754
  1.0e-3 * ctx->t_eval_us / ctx->n_eval);
20269
20755
  fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
20270
20756
  1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
20271
- fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n",
20272
- 1.0e-3 * ctx->sampling.t_sample_us / ctx->sampling.n_sample);
20273
20757
  fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
20274
20758
  fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
20275
- fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->sampling.n_sample);
20276
20759
  fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
20277
20760
  fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
20278
20761
  fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
20279
- fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->sampling.t_sample_us);
20280
20762
  fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
20281
20763
  1.0e6 * ctx->n_eval / ctx->t_eval_us);
20282
20764
  fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
20283
20765
  1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
20284
- fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n",
20285
- 1.0e6 * ctx->sampling.n_sample / ctx->sampling.t_sample_us);
20286
20766
  }
20287
20767
 
20288
20768
  // For internal test use
@@ -20334,3 +20814,20 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
20334
20814
  fputs(text, stderr);
20335
20815
  fflush(stderr);
20336
20816
  }
20817
+
20818
+ struct llama_token_timings llama_get_token_timings(const void * v_ctx) {
20819
+ const auto * ctx = (llama_context *) v_ctx;
20820
+ struct llama_token_timings result = {
20821
+ /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
20822
+ /*.t_end_ms =*/ 1.00 * lm_ggml_time_ms(),
20823
+ /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
20824
+ /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
20825
+ /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
20826
+
20827
+ /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
20828
+ /*.n_eval =*/ std::max(1, ctx->n_eval),
20829
+ };
20830
+
20831
+ return result;
20832
+ }
20833
+
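Editor's note: llama_get_token_timings is a fork-specific helper that exposes the same eval counters through an opaque context pointer. A minimal sketch of reading it (field names taken from the struct initializer above):

    // Fetch coarse token timings from an existing context.
    struct llama_token_timings tt = llama_get_token_timings(ctx);   // ctx passed as const void *
    fprintf(stderr, "prompt: %.2f ms (%d tokens), eval: %.2f ms (%d tokens)\n",
            tt.t_p_eval_ms, tt.n_p_eval, tt.t_eval_ms, tt.n_eval);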