cui-llama.rn 1.1.4 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jni.cpp +2 -2
- package/cpp/common.cpp +35 -1946
- package/cpp/common.h +91 -128
- package/cpp/ggml-impl.h +32 -0
- package/cpp/ggml-metal.m +5 -6
- package/cpp/ggml-quants.c +242 -48
- package/cpp/ggml.c +89 -35
- package/cpp/ggml.h +25 -63
- package/cpp/llama-sampling.cpp +218 -94
- package/cpp/llama.cpp +80 -86
- package/cpp/llama.h +36 -11
- package/cpp/rn-llama.hpp +2 -1
- package/cpp/sampling.cpp +11 -4
- package/cpp/sampling.h +4 -56
- package/package.json +1 -1
package/cpp/llama.cpp
CHANGED
@@ -2167,6 +2167,10 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buf
     if (host_buffer) {
         buft = lm_ggml_backend_sycl_host_buffer_type();
     }
+#elif defined(LM_GGML_USE_CANN)
+    if (host_buffer) {
+        buft = lm_ggml_backend_cann_host_buffer_type();
+    }
 #elif defined(LM_GGML_USE_CPU_HBM)
     buft = lm_ggml_backend_cpu_hbm_buffer_type();
 #elif defined(LM_GGML_USE_VULKAN)
@@ -2493,6 +2497,7 @@ struct llama_cparams {
     bool causal_attn;
     bool offload_kqv;
     bool flash_attn;
+    bool no_perf;
 
     enum llama_pooling_type pooling_type;
 
@@ -6668,8 +6673,6 @@ static bool llm_load_tensors(
     bool use_mlock,
     llama_progress_callback progress_callback,
     void * progress_callback_user_data) {
-    model.t_start_us = lm_ggml_time_us();
-
     auto & hparams = model.hparams;
 
     model.split_mode = split_mode;
@@ -8600,14 +8603,13 @@ static bool llm_load_tensors(
         }
     }
 
-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = lm_ggml_time_us() - model.t_start_us;
     return true;
 }
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+    model.t_start_us = lm_ggml_time_us();
+
     try {
         llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
@@ -8669,6 +8671,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         return -1;
     }
 
+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = lm_ggml_time_us() - model.t_start_us;
+
     return 0;
 }
 
@@ -9269,7 +9275,7 @@ static struct lm_ggml_tensor * llm_build_copy_mask_state(
     // FIXME: zero-out NANs?
     states = lm_ggml_mul(ctx, states, state_mask);
 
-    // copy states which won't be changed further (between n_seqs and
+    // copy states which won't be changed further (between n_seqs and n_kv)
     lm_ggml_build_forward_expand(graph,
         lm_ggml_cpy(ctx,
             lm_ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*lm_ggml_element_size(states)),
@@ -9888,8 +9894,8 @@ struct llm_build_context {
     struct lm_ggml_cgraph * append_pooling(struct lm_ggml_cgraph * gf) {
         // find result_norm tensor for input
         struct lm_ggml_tensor * inp = nullptr;
-        for (int i = gf->n_nodes - 1; i >= 0; --i) {
-            inp = gf->nodes[i];
+        for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+            inp = lm_ggml_graph_node(gf, i);
             if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
                 break;
             } else {
|
|
15831
15837
|
|
15832
15838
|
// clear unused states
|
15833
15839
|
for (int i = 0; i < n_kv; ++i) {
|
15834
|
-
uint32_t
|
15840
|
+
const uint32_t cell_id = i + kv_self.head;
|
15835
15841
|
llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];
|
15836
15842
|
|
15837
15843
|
data[i] = (float) (kv_cell.src >= 0);
|
@@ -16087,19 +16093,21 @@ static int llama_decode_internal(
         return -1;
     }
 
-    for (uint32_t i = 0; i < n_tokens_all; ++i) {
-        if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
-            return -1;
-        }
-    }
-
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
 
     LM_GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
 
+    if (batch_all.token) {
+        for (uint32_t i = 0; i < n_tokens_all; ++i) {
+            if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
+                return -1;
+            }
+        }
+    }
+
     LM_GGML_ASSERT(n_tokens_all <= cparams.n_batch);
 
     LM_GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
@@ -16216,8 +16224,8 @@ static int llama_decode_internal(
         lm_ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
 
         // the output is always the last tensor in the graph
-        struct lm_ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
-        struct lm_ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
+        struct lm_ggml_tensor * res  = lm_ggml_graph_node(gf, -1);
+        struct lm_ggml_tensor * embd = lm_ggml_graph_node(gf, -2);
 
         if (lctx.n_outputs == 0) {
             // no output
@@ -16226,9 +16234,9 @@ static int llama_decode_internal(
         } else if (cparams.embeddings) {
             res  = nullptr; // do not extract logits for embedding case
            embd = nullptr;
-            for (int i = gf->n_nodes - 1; i >= 0; --i) {
-                if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
-                    embd = gf->nodes[i];
+            for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+                if (strcmp(lm_ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
+                    embd = lm_ggml_graph_node(gf, i);
                     break;
                 }
             }
@@ -16386,19 +16394,21 @@ static int llama_encode_internal(
         return -1;
     }
 
-    for (uint32_t i = 0; i < n_tokens; ++i) {
-        if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
-            return -1;
-        }
-    }
-
     const auto & model   = lctx.model;
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
 
     LM_GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
+    if (batch.token) {
+        for (uint32_t i = 0; i < n_tokens; ++i) {
+            if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
+                return -1;
+            }
+        }
+    }
+
     // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
     LM_GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
 
@@ -16443,15 +16453,15 @@ static int llama_encode_internal(
         // there are two cases here
         if (llama_model_has_decoder(&lctx.model)) {
             // first case is an encoder-decoder T5 model where embeddings are passed to decoder
-            embd = gf->nodes[gf->n_nodes - 1];
+            embd = lm_ggml_graph_node(gf, -1);
             LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
         } else {
             // second case is an encoder-only T5 model
             if (cparams.embeddings) {
                 // only output embeddings if required
-                embd = gf->nodes[gf->n_nodes - 1];
+                embd = lm_ggml_graph_node(gf, -1);
                 if (strcmp(embd->name, "result_embd_pooled") != 0) {
-                    embd = gf->nodes[gf->n_nodes - 2];
+                    embd = lm_ggml_graph_node(gf, -2);
                 }
                 LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
             }
@@ -17541,6 +17551,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= name.find("time_mix_first.weight") == std::string::npos;
         quantize &= name.find("time_mix_w1.weight") == std::string::npos;
         quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
 
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
@@ -17950,6 +17962,7 @@ struct llama_context_params llama_context_default_params() {
         /*.embeddings =*/ false,
         /*.offload_kqv =*/ true,
         /*.flash_attn =*/ false,
+        /*.no_perf =*/ true,
         /*.abort_callback =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
     };
@@ -18160,6 +18173,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.embeddings   = params.embeddings;
     cparams.offload_kqv  = params.offload_kqv;
     cparams.flash_attn   = params.flash_attn;
+    cparams.no_perf      = params.no_perf;
     cparams.pooling_type = params.pooling_type;
 
     cparams.n_ctx        = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -18497,7 +18511,7 @@ struct llama_context * llama_new_context_with_model(
 
             // note: the number of splits during measure is higher than during inference due to the kv shift
             int n_splits = lm_ggml_backend_sched_get_n_splits(ctx->sched);
-            LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, gf->n_nodes);
+            LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, lm_ggml_graph_n_nodes(gf));
             LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
         }
     }
@@ -20078,10 +20092,14 @@ void llama_synchronize(struct llama_context * ctx) {
 
     // add the evaluation to the stats
     if (ctx->n_queued_tokens == 1) {
-        ctx->t_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_eval++;
     } else if (ctx->n_queued_tokens > 1) {
-        ctx->t_p_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_p_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_p_eval += ctx->n_queued_tokens;
     }
 
@@ -20677,6 +20695,7 @@ const char * llama_print_system_info(void) {
     s += "ARM_FMA = "    + std::to_string(lm_ggml_cpu_has_arm_fma())    + " | ";
     s += "F16C = "       + std::to_string(lm_ggml_cpu_has_f16c())       + " | ";
     s += "FP16_VA = "    + std::to_string(lm_ggml_cpu_has_fp16_va())    + " | ";
+    s += "RISCV_VECT = " + std::to_string(lm_ggml_cpu_has_riscv_v())    + " | ";
     s += "WASM_SIMD = "  + std::to_string(lm_ggml_cpu_has_wasm_simd())  + " | ";
     s += "BLAS = "       + std::to_string(lm_ggml_cpu_has_blas())       + " | ";
     s += "SSE3 = "       + std::to_string(lm_ggml_cpu_has_sse3())       + " | ";
@@ -20688,65 +20707,40 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-void llama_perf_print(const void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                const auto * p = (const struct llama_context *) ctx;
-
-                const double t_start_ms  = 1e-3 * p->t_start_us;
-                const double t_end_ms    = 1.00 * lm_ggml_time_ms();
-                const double t_load_ms   = 1e-3 * p->t_load_us;
-                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
-                const double t_eval_ms   = 1e-3 * p->t_eval_us;
-
-                const int32_t n_p_eval = std::max(0, p->n_p_eval);
-                const int32_t n_eval   = std::max(1, p->n_eval);
-
-                LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
-                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
-                LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
-                LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                const auto * smpl = (const struct llama_sampler *) ctx;
-                const auto * p    = (const struct llama_sampler_chain *) smpl->ctx;
+struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
+    struct llama_perf_context_data data = {};
 
-                const double t_sampler_ms = 1e-3 * p->t_sample_us;
+    if (ctx == nullptr) {
+        return data;
+    }
 
-                const int32_t n_sampler = std::max(0, p->n_sample);
+    data.t_start_ms  = 1e-3 * ctx->t_start_us;
+    data.t_load_ms   = 1e-3 * ctx->t_load_us;
+    data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
+    data.t_eval_ms   = 1e-3 * ctx->t_eval_us;
+    data.n_p_eval    = std::max(1, ctx->n_p_eval);
+    data.n_eval      = std::max(1, ctx->n_eval);
 
-                LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
-            } break;
-        default:
-            LM_GGML_ABORT("invalid perf type");
-    }
+    return data;
 }
 
-void llama_perf_reset(void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                auto * p = (struct llama_context *) ctx;
+void llama_perf_context_print(const struct llama_context * ctx) {
+    const auto data = llama_perf_context(ctx);
 
-                p->t_start_us  = lm_ggml_time_us();
-                p->t_eval_us   = p->n_eval = 0;
-                p->t_p_eval_us = p->n_p_eval = 0;
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                auto * smpl = (struct llama_sampler *) ctx;
-                auto * p    = (struct llama_sampler_chain *) smpl->ctx;
+    const double t_end_ms = 1e-3 * lm_ggml_time_us();
 
-                p->t_sample_us = p->n_sample = 0;
-            } break;
-        default:
-            LM_GGML_ABORT("invalid perf type");
-    }
+    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+}
+
+void llama_perf_context_reset(struct llama_context * ctx) {
+    ctx->t_start_us  = lm_ggml_time_us();
+    ctx->t_eval_us   = ctx->n_eval = 0;
+    ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
 
 void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
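The hunk above replaces the type-switched llama_perf_print / llama_perf_reset pair with per-object helpers. A minimal sketch of how the new context-side API is meant to be driven from caller code, assuming an already created llama_context * ctx (the variable name is illustrative; the functions and fields are the ones added above, and fprintf needs <cstdio>):

    struct llama_perf_context_data pd = llama_perf_context(ctx);                 // snapshot the counters
    fprintf(stderr, "prompt eval: %.2f ms over %d tokens\n", pd.t_p_eval_ms, pd.n_p_eval);
    llama_perf_context_print(ctx);                                               // or let llama.cpp format the full report
    llama_perf_context_reset(ctx);                                               // zero the counters before the next run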
package/cpp/llama.h
CHANGED
@@ -344,7 +344,7 @@ extern "C" {
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-
+        bool no_perf;     // whether to measure performance timings
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
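Because llama_context_default_params() initializes no_perf to true (see the llama.cpp hunks above), timing collection is now opt-in. A hedged sketch of enabling it at context creation, assuming a loaded llama_model * model:

    llama_context_params cp = llama_context_default_params();
    cp.no_perf = false;   // collect t_eval_us / t_p_eval_us during llama_synchronize
    llama_context * ctx = llama_new_context_with_model(model, cp);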
@@ -1057,6 +1057,9 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
     LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);
 
+    // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
+    LLAMA_API struct llama_sampler * llama_sampler_chain_remove(   struct llama_sampler * chain, int32_t i);
+
     // available samplers:
 
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
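As the new comment states, llama_sampler_chain_remove transfers ownership to the caller. A small sketch of detaching and disposing of the i-th link yourself; chain and i are assumed to exist, and llama_sampler_free is assumed to be the usual sampler destructor in this header:

    struct llama_sampler * detached = llama_sampler_chain_remove(chain, i);
    // the chain will no longer free it, so the caller must
    llama_sampler_free(detached);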
@@ -1131,15 +1134,20 @@ extern "C" {
                              int32_t   n_logit_bias,
               const llama_logit_bias * logit_bias);
 
-
+
+    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
+    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
+
+    /// @details Sample and accept a token from the idx-th output of the last evaluation
     //
+    // Shorthand for:
     //    const auto * logits = llama_get_logits_ith(ctx, idx);
     //    llama_token_data_array cur_p = { ... init from logits ... };
     //    llama_sampler_apply(smpl, &cur_p);
-    //
-    //
-    //
-    //
+    //    auto token = cur_p.data[cur_p.selected].id;
+    //    llama_sampler_accept(smpl, token);
+    //    return token;
+    // Returns the sampled token
     LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
 
     // TODO: extend in the future
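A sketch of the documented shorthand from caller code, assuming a sampler chain smpl and a context ctx that has just been decoded; passing idx = -1 refers to the last output, as elsewhere in this header:

    const uint32_t seed = llama_sampler_get_seed(smpl);          // LLAMA_DEFAULT_SEED if no seeded sampler is present
    const llama_token tok = llama_sampler_sample(smpl, ctx, -1); // samples and accepts in one call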
@@ -1172,13 +1180,30 @@ extern "C" {
     // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
     //
 
-    enum llama_perf_type {
-        LLAMA_PERF_TYPE_CONTEXT       = 0,
-        LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
+    struct llama_perf_context_data {
+        double t_start_ms;
+        double t_load_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_p_eval;
+        int32_t n_eval;
     };
 
-    LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
-    LLAMA_API void llama_perf_reset(      void * ctx, enum llama_perf_type type);
+    struct llama_perf_sampler_data {
+        double t_sample_ms;
+
+        int32_t n_sample;
+    };
+
+    LLAMA_API struct llama_perf_context_data llama_perf_context      (const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_print(const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_reset(      struct llama_context * ctx);
+
+    // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
+    LLAMA_API struct llama_perf_sampler_data llama_perf_sampler      (const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
 
     LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
 
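The sampler-side helpers mirror the context ones but, per the NOTE above, are only valid for samplers built with llama_sampler_chain_init. A hedged sketch, with chain assumed to be such a chain (fprintf needs <cstdio>):

    struct llama_perf_sampler_data sd = llama_perf_sampler(chain);
    fprintf(stderr, "sampling: %.2f ms over %d samples\n", sd.t_sample_ms, sd.n_sample);
    llama_perf_sampler_reset(chain);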
package/cpp/rn-llama.hpp
CHANGED
@@ -7,6 +7,7 @@
 #include "llama.h"
 
 #include <android/log.h>
+#include "sampling.h"
 #define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
 #define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
 
@@ -334,7 +335,7 @@ struct llama_rn_context
     {
         // number of tokens to keep when resetting context
         n_remain = params.n_predict;
-        llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_reset(ctx);
         is_predicting = true;
     }
 
package/cpp/sampling.cpp
CHANGED
@@ -2,6 +2,9 @@
 
 #include "common.h"
 
+#include <cmath>
+#include <unordered_map>
+
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
 template<typename T>
@@ -139,7 +142,7 @@ std::string gpt_sampler_params::print() const {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
-    lparams.no_perf = false; // TODO: control via params
+    lparams.no_perf = params.no_perf;
 
     auto * result = new gpt_sampler {
         /* .params = */ params,
@@ -257,10 +260,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
     // TODO: measure grammar performance
 
     if (gsmpl) {
-        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+        llama_perf_sampler_print(gsmpl->chain);
     }
     if (ctx) {
-        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_print(ctx);
     }
 }
 
@@ -310,6 +313,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
     return cur_p.data[cur_p.selected].id;
 }
 
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+    return llama_sampler_get_seed(gsmpl->chain);
+}
+
 // helpers
 
 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
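gpt_sampler_get_seed simply forwards to llama_sampler_get_seed on the wrapped chain, which is useful for logging the effective seed when gpt_sampler_params leaves it at LLAMA_DEFAULT_SEED. A sketch, with model and sparams (a gpt_sampler_params instance) assumed to exist:

    struct gpt_sampler * gsmpl = gpt_sampler_init(model, sparams);
    const uint32_t seed = gpt_sampler_get_seed(gsmpl); // seed actually used by the underlying chain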
@@ -432,7 +439,7 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
 }
 
 std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, gpt_sampler_type> sampler_name_map {
+    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),     GPT_SAMPLER_TYPE_TOP_K },
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),     GPT_SAMPLER_TYPE_TFS_Z },
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
package/cpp/sampling.h
CHANGED
@@ -2,65 +2,11 @@
 
 #include "llama.h"
 
+#include "common.h"
+
 #include <string>
 #include <vector>
 
-enum gpt_sampler_type {
-    GPT_SAMPLER_TYPE_NONE        = 0,
-    GPT_SAMPLER_TYPE_TOP_K       = 1,
-    GPT_SAMPLER_TYPE_TOP_P       = 2,
-    GPT_SAMPLER_TYPE_MIN_P       = 3,
-    GPT_SAMPLER_TYPE_TFS_Z       = 4,
-    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
-    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
-    GPT_SAMPLER_TYPE_XTC         = 7,
-};
-
-// sampling parameters
-struct gpt_sampler_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
-
-    int32_t n_prev            = 64;    // number of previous tokens to remember
-    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   min_p             = 0.05f; // 0.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   xtc_t             = 0.0f;  // 0.0 = disabled
-    float   xtc_p             = 0.0f;
-    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
-    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float   dynatemp_range    = 0.00f; // 0.0 = disabled
-    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.00f; // 1.0 = disabled
-    float   penalty_freq      = 0.00f; // 0.0 = disabled
-    float   penalty_present   = 0.00f; // 0.0 = disabled
-    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-    bool    penalize_nl       = false; // consider newlines as a repeatable token
-    bool    ignore_eos        = false;
-
-    std::vector<enum gpt_sampler_type> samplers = {
-        GPT_SAMPLER_TYPE_TOP_K,
-        GPT_SAMPLER_TYPE_TFS_Z,
-        GPT_SAMPLER_TYPE_TYPICAL_P,
-        GPT_SAMPLER_TYPE_TOP_P,
-        GPT_SAMPLER_TYPE_MIN_P,
-        GPT_SAMPLER_TYPE_XTC,
-        GPT_SAMPLER_TYPE_TEMPERATURE
-    };
-
-    std::string grammar; // optional BNF-like grammar to constrain sampling
-
-    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
-
-    // print the parameters into a string
-    std::string print() const;
-};
-
 // gpt_sampler extends llama_sampler with additional functionality:
 //
 // - grammar support
@@ -114,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
 //
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+
 // helpers
 
 // access the internal list of current candidate tokens