cui-llama.rn 1.2.2 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/llama.h CHANGED
@@ -218,6 +218,7 @@ extern "C" {
 
     typedef struct llama_token_data_array {
         // TODO: consider SoA
+        // NOTE: this pointer can be modified by the samplers
        llama_token_data * data;
        size_t size;
        int64_t selected; // this is the index in the data array (i.e. not the token id)
@@ -233,8 +234,11 @@ extern "C" {
    // - token  : the token ids of the input (used when embd is NULL)
    // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
    // - pos    : the positions of the respective token in the sequence
+   //            (if set to NULL, the token position will be tracked automatically by llama_decode)
    // - seq_id : the sequence to which the respective token belongs
+   //            (if set to NULL, the sequence ID will be assumed to be 0)
    // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
+   //            (if set to NULL, only the logits for last token will be returned)
    //
    typedef struct llama_batch {
        int32_t n_tokens;
@@ -245,15 +249,6 @@ extern "C" {
        int32_t * n_seq_id;
        llama_seq_id ** seq_id;
        int8_t * logits; // TODO: rename this to "output"
-
-       // NOTE: helpers for smooth API transition - can be deprecated in the future
-       // for future-proof code, use the above fields instead and ignore everything below
-       //
-       // pos[i] = all_pos_0 + i*all_pos_1
-       //
-       llama_pos    all_pos_0;  // used if pos == NULL
-       llama_pos    all_pos_1;  // used if pos == NULL
-       llama_seq_id all_seq_id; // used if seq_id == NULL
    } llama_batch;
 
    enum llama_model_kv_override_type {
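With the all_pos_0 / all_pos_1 / all_seq_id transition helpers removed, code that fills llama_batch by hand can rely on the NULL defaults documented in the comments above. A minimal sketch follows; it assumes the token/embd/pos members of llama_batch declared in the unchanged part of this header, and the helper name is purely illustrative:

    // sketch: a single-sequence batch that leans on the new NULL defaults
    llama_batch make_simple_batch(llama_token * tokens, int32_t n_tokens) {
        llama_batch batch = {};
        batch.n_tokens = n_tokens;
        batch.token    = tokens;  // token ids are used because embd stays NULL
        batch.pos      = nullptr; // positions tracked automatically by llama_decode
        batch.n_seq_id = nullptr;
        batch.seq_id   = nullptr; // sequence ID assumed to be 0
        batch.logits   = nullptr; // only the logits for the last token are returned
        return batch;
    }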
@@ -434,6 +429,7 @@ extern "C" {
    LLAMA_API bool llama_supports_mmap       (void);
    LLAMA_API bool llama_supports_mlock      (void);
    LLAMA_API bool llama_supports_gpu_offload(void);
+   LLAMA_API bool llama_supports_rpc        (void);
 
    LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
@@ -776,15 +772,15 @@ extern "C" {
    // Decoding
    //
 
-   // Return batch for single sequence of tokens starting at pos_0
+   // Return batch for single sequence of tokens
+   // The sequence ID will be fixed to 0
+   // The position of the tokens will be tracked automatically by llama_decode
    //
    // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
    //
    LLAMA_API struct llama_batch llama_batch_get_one(
            llama_token * tokens,
-           int32_t n_tokens,
-           llama_pos pos_0,
-           llama_seq_id seq_id);
+           int32_t n_tokens);
 
    // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
    // Each token can be assigned up to n_seq_max sequence ids
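Call sites migrating from 1.2.2 now pass only the token pointer and count; the bookkeeping that used to come from pos_0 and seq_id happens inside llama_decode. A hedged sketch of the updated decode call, mirroring the change made to rn-llama.hpp later in this diff (variable names are illustrative):

    // before (1.2.2): llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)
    // after  (1.2.4): positions are tracked by llama_decode, sequence ID is fixed to 0
    if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)) != 0) {
        // handle the decode failure (application-specific)
    }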
@@ -897,6 +893,7 @@ extern "C" {
    // Special tokens
    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+   LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
@@ -905,11 +902,17 @@ extern "C" {
    LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
    LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
 
-   // Codellama infill tokens
-   LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
-   LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
-   LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
-   LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle
+   // infill tokens
+   DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
+
+   LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
+   LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
+   LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
+   LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
+   LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
+   LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
 
    //
    // Tokenization
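The deprecated prefix/middle/suffix getters are superseded by the llama_token_fim_* accessors. Below is a sketch of assembling a fill-in-the-middle prompt with the new getters; the prefix-suffix-middle layout and the use of common_tokenize (the helper this package uses elsewhere in this diff) are assumptions rather than something this header prescribes, and models without FIM support may not define these tokens:

    // sketch: "<fim_pre> prefix <fim_suf> suffix <fim_mid>" token sequence (illustrative layout)
    std::vector<llama_token> build_fim_prompt(llama_context * ctx, const llama_model * model,
                                              const std::string & prefix, const std::string & suffix) {
        std::vector<llama_token> out;
        out.push_back(llama_token_fim_pre(model));
        const auto pre = common_tokenize(ctx, prefix, /*add_special*/ false, /*parse_special*/ false);
        out.insert(out.end(), pre.begin(), pre.end());
        out.push_back(llama_token_fim_suf(model));
        const auto suf = common_tokenize(ctx, suffix, false, false);
        out.insert(out.end(), suf.begin(), suf.end());
        out.push_back(llama_token_fim_mid(model)); // generation then fills in the middle part
        return out;
    }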
@@ -1068,12 +1071,13 @@ extern "C" {
 
    // available samplers:
 
-   LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
-   LLAMA_API struct llama_sampler * llama_sampler_init_dist   (uint32_t seed);
+   LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
+   LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
 
    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-   LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void);
+   DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
+       "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
 
    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1092,11 +1096,16 @@ extern "C" {
 
    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
    LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
+
+   /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf
    LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);
 
    /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
    LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
 
+   /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
+   LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
+
    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
    /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -1141,6 +1150,28 @@ extern "C" {
            int32_t n_logit_bias,
            const llama_logit_bias * logit_bias);
 
+   // this sampler is meant to be used for fill-in-the-middle infilling
+   // it's supposed to be used after top_k + top_p sampling
+   //
+   // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
+   // 2. combine probs of tokens that have the same prefix
+   //
+   // example:
+   //
+   // - before:
+   //   "hel":   0.5
+   //   "hell":  0.2
+   //   "hello": 0.1
+   //   "dummy": 0.1
+   //
+   // - after:
+   //   "hel":   0.8
+   //   "dummy": 0.1
+   //
+   // 3. discard non-EOG tokens with low prob
+   // 4. if no tokens are left -> pick EOT
+   //
+   LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
 
    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
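Per the comment above, llama_sampler_init_infill is intended to run after top-k/top-p. A minimal sketch of such a chain; the llama_sampler_chain_* helpers and llama_sampler_init_top_p are assumed from the unchanged parts of this header, and the parameter values are arbitrary:

    // sketch: top_k -> top_p -> infill -> dist, as suggested for fill-in-the-middle generation
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_top_p(0.95f, /*min_keep*/ 1));
    llama_sampler_chain_add(chain, llama_sampler_init_infill(model));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(/*seed*/ 1234));

    llama_token tok = llama_sampler_sample(chain, ctx, /*idx*/ -1);
    llama_sampler_free(chain);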
package/cpp/log.cpp CHANGED
@@ -8,10 +8,10 @@
 #include <thread>
 #include <vector>
 
-int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
+int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
 
-void gpt_log_set_verbosity_thold(int verbosity) {
-    gpt_log_verbosity_thold = verbosity;
+void common_log_set_verbosity_thold(int verbosity) {
+    common_log_verbosity_thold = verbosity;
 }
 
 #define LOG_COL_DEFAULT "\033[0m"
@@ -29,16 +29,16 @@ static int64_t t_us() {
 }
 
 // colors
-enum gpt_log_col : int {
-    GPT_LOG_COL_DEFAULT = 0,
-    GPT_LOG_COL_BOLD,
-    GPT_LOG_COL_RED,
-    GPT_LOG_COL_GREEN,
-    GPT_LOG_COL_YELLOW,
-    GPT_LOG_COL_BLUE,
-    GPT_LOG_COL_MAGENTA,
-    GPT_LOG_COL_CYAN,
-    GPT_LOG_COL_WHITE,
+enum common_log_col : int {
+    COMMON_LOG_COL_DEFAULT = 0,
+    COMMON_LOG_COL_BOLD,
+    COMMON_LOG_COL_RED,
+    COMMON_LOG_COL_GREEN,
+    COMMON_LOG_COL_YELLOW,
+    COMMON_LOG_COL_BLUE,
+    COMMON_LOG_COL_MAGENTA,
+    COMMON_LOG_COL_CYAN,
+    COMMON_LOG_COL_WHITE,
 };
 
 // disable colors by default
@@ -54,7 +54,7 @@ static std::vector<const char *> g_col = {
    "",
 };
 
-struct gpt_log_entry {
+struct common_log_entry {
    enum lm_ggml_log_level level;
 
    bool prefix;
@@ -71,7 +71,7 @@ struct gpt_log_entry {
        if (!fcur) {
            // stderr displays DBG messages only when their verbosity level is not higher than the threshold
            // these messages will still be logged to a file
-           if (level == LM_GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
+           if (level == LM_GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
                return;
            }
 
@@ -86,19 +86,19 @@ struct gpt_log_entry {
        if (timestamp) {
            // [M.s.ms.us]
            fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
-               g_col[GPT_LOG_COL_BLUE],
+               g_col[COMMON_LOG_COL_BLUE],
                (int) (timestamp / 1000000 / 60),
                (int) (timestamp / 1000000 % 60),
                (int) (timestamp / 1000 % 1000),
                (int) (timestamp % 1000),
-               g_col[GPT_LOG_COL_DEFAULT]);
+               g_col[COMMON_LOG_COL_DEFAULT]);
        }
 
        switch (level) {
-           case LM_GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN],   g_col[GPT_LOG_COL_DEFAULT]); break;
-           case LM_GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], ""                        ); break;
-           case LM_GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED],     ""                        ); break;
-           case LM_GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW],  ""                        ); break;
+           case LM_GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN],   g_col[COMMON_LOG_COL_DEFAULT]); break;
+           case LM_GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], ""                           ); break;
+           case LM_GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED],     ""                           ); break;
+           case LM_GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW],  ""                           ); break;
            default:
                break;
        }
@@ -107,18 +107,18 @@ struct gpt_log_entry {
        fprintf(fcur, "%s", msg.data());
 
        if (level == LM_GGML_LOG_LEVEL_WARN || level == LM_GGML_LOG_LEVEL_ERROR || level == LM_GGML_LOG_LEVEL_DEBUG) {
-           fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
+           fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
        }
 
        fflush(fcur);
    }
 };
 
-struct gpt_log {
+struct common_log {
    // default capacity - will be expanded if needed
-   gpt_log() : gpt_log(256) {}
+   common_log() : common_log(256) {}
 
-   gpt_log(size_t capacity) {
+   common_log(size_t capacity) {
        file = nullptr;
        prefix = false;
        timestamps = false;
@@ -137,7 +137,7 @@ struct gpt_log {
        resume();
    }
 
-   ~gpt_log() {
+   ~common_log() {
        pause();
        if (file) {
            fclose(file);
@@ -158,12 +158,12 @@ private:
    int64_t t_start;
 
    // ring buffer of entries
-   std::vector<gpt_log_entry> entries;
+   std::vector<common_log_entry> entries;
    size_t head;
    size_t tail;
 
    // worker thread copies into this
-   gpt_log_entry cur;
+   common_log_entry cur;
 
 public:
    void add(enum lm_ggml_log_level level, const char * fmt, va_list args) {
@@ -219,7 +219,7 @@ public:
        tail = (tail + 1) % entries.size();
        if (tail == head) {
            // expand the buffer
-           std::vector<gpt_log_entry> new_entries(2*entries.size());
+           std::vector<common_log_entry> new_entries(2*entries.size());
 
            size_t new_tail = 0;
 
@@ -320,15 +320,15 @@ public:
        pause();
 
        if (colors) {
-           g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
-           g_col[GPT_LOG_COL_BOLD]    = LOG_COL_BOLD;
-           g_col[GPT_LOG_COL_RED]     = LOG_COL_RED;
-           g_col[GPT_LOG_COL_GREEN]   = LOG_COL_GREEN;
-           g_col[GPT_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
-           g_col[GPT_LOG_COL_BLUE]    = LOG_COL_BLUE;
-           g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
-           g_col[GPT_LOG_COL_CYAN]    = LOG_COL_CYAN;
-           g_col[GPT_LOG_COL_WHITE]   = LOG_COL_WHITE;
+           g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
+           g_col[COMMON_LOG_COL_BOLD]    = LOG_COL_BOLD;
+           g_col[COMMON_LOG_COL_RED]     = LOG_COL_RED;
+           g_col[COMMON_LOG_COL_GREEN]   = LOG_COL_GREEN;
+           g_col[COMMON_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
+           g_col[COMMON_LOG_COL_BLUE]    = LOG_COL_BLUE;
+           g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
+           g_col[COMMON_LOG_COL_CYAN]    = LOG_COL_CYAN;
+           g_col[COMMON_LOG_COL_WHITE]   = LOG_COL_WHITE;
        } else {
            for (size_t i = 0; i < g_col.size(); i++) {
                g_col[i] = "";
@@ -355,47 +355,47 @@ public:
 // public API
 //
 
-struct gpt_log * gpt_log_init() {
-    return new gpt_log;
+struct common_log * common_log_init() {
+    return new common_log;
 }
 
-struct gpt_log * gpt_log_main() {
-    static struct gpt_log log;
+struct common_log * common_log_main() {
+    static struct common_log log;
 
    return &log;
 }
 
-void gpt_log_pause(struct gpt_log * log) {
+void common_log_pause(struct common_log * log) {
    log->pause();
 }
 
-void gpt_log_resume(struct gpt_log * log) {
+void common_log_resume(struct common_log * log) {
    log->resume();
 }
 
-void gpt_log_free(struct gpt_log * log) {
+void common_log_free(struct common_log * log) {
    delete log;
 }
 
-void gpt_log_add(struct gpt_log * log, enum lm_ggml_log_level level, const char * fmt, ...) {
+void common_log_add(struct common_log * log, enum lm_ggml_log_level level, const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    log->add(level, fmt, args);
    va_end(args);
 }
 
-void gpt_log_set_file(struct gpt_log * log, const char * file) {
+void common_log_set_file(struct common_log * log, const char * file) {
    log->set_file(file);
 }
 
-void gpt_log_set_colors(struct gpt_log * log, bool colors) {
+void common_log_set_colors(struct common_log * log, bool colors) {
    log->set_colors(colors);
 }
 
-void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
+void common_log_set_prefix(struct common_log * log, bool prefix) {
    log->set_prefix(prefix);
 }
 
-void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
+void common_log_set_timestamps(struct common_log * log, bool timestamps) {
    log->set_timestamps(timestamps);
 }
package/cpp/log.h CHANGED
@@ -14,23 +14,23 @@
 #define LOG_DEFAULT_LLAMA 0
 
 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
-// set via gpt_log_set_verbosity()
-extern int gpt_log_verbosity_thold;
+// set via common_log_set_verbosity()
+extern int common_log_verbosity_thold;
 
-void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
+void common_log_set_verbosity_thold(int verbosity); // not thread-safe
 
-// the gpt_log uses an internal worker thread to print/write log messages
+// the common_log uses an internal worker thread to print/write log messages
 // when the worker thread is paused, incoming log messages are discarded
-struct gpt_log;
+struct common_log;
 
-struct gpt_log * gpt_log_init();
-struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
-void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe
-void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
-void gpt_log_free  (struct gpt_log * log);
+struct common_log * common_log_init();
+struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
+void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe
+void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
+void common_log_free  (struct common_log * log);
 
 LOG_ATTRIBUTE_FORMAT(3, 4)
-void gpt_log_add(struct gpt_log * log, enum lm_ggml_log_level level, const char * fmt, ...);
+void common_log_add(struct common_log * log, enum lm_ggml_log_level level, const char * fmt, ...);
 
 // defaults: file = NULL, colors = false, prefix = false, timestamps = false
 //
@@ -54,10 +54,10 @@ void gpt_log_add(struct gpt_log * log, enum lm_ggml_log_level level, const char
 // D - debug (stderr, V = LOG_DEFAULT_DEBUG)
 //
 
-void gpt_log_set_file      (struct gpt_log * log, const char * file); // not thread-safe
-void gpt_log_set_colors    (struct gpt_log * log, bool colors);       // not thread-safe
-void gpt_log_set_prefix    (struct gpt_log * log, bool prefix);       // whether to output prefix to each log
-void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps);   // whether to output timestamps in the prefix
+void common_log_set_file      (struct common_log * log, const char * file); // not thread-safe
+void common_log_set_colors    (struct common_log * log, bool colors);       // not thread-safe
+void common_log_set_prefix    (struct common_log * log, bool prefix);       // whether to output prefix to each log
+void common_log_set_timestamps(struct common_log * log, bool timestamps);   // whether to output timestamps in the prefix
 
 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
@@ -66,7 +66,7 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w
 //
 // LOG_DBG("this is a debug message: %d\n", expensive_function());
 //
-// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
+// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold
 //
 
 
@@ -98,8 +98,8 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w
 
 #define LOG_TMPL(level, verbosity, ...) \
    do { \
-       if ((verbosity) <= gpt_log_verbosity_thold) { \
-           gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
+       if ((verbosity) <= common_log_verbosity_thold) { \
+           common_log_add(common_log_main(), (level), __VA_ARGS__); \
        } \
    } while (0)
 
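For downstream code affected by the gpt_log to common_log rename, a minimal usage sketch built only from the declarations above (values are illustrative):

    // sketch: configure the singleton logger through the renamed API
    common_log * log = common_log_main();              // singleton, destroyed on exit
    common_log_set_colors(log, true);                  // not thread-safe; configure before logging starts
    common_log_set_timestamps(log, true);
    common_log_set_verbosity_thold(LOG_DEFAULT_LLAMA);

    common_log_add(log, LM_GGML_LOG_LEVEL_INFO, "model loaded in %d ms\n", 1234);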
package/cpp/rn-llama.hpp CHANGED
@@ -117,7 +117,7 @@ static size_t find_partial_stop_string(const std::string &stop,
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
-    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
    // if the size is 1 and first bit is 1, meaning it's a partial character
    // (size > 1 meaning it's already a known token)
    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
@@ -136,7 +136,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
    std::string ret;
    for (; begin != end; ++begin)
    {
-       ret += llama_token_to_piece(ctx, *begin);
+       ret += common_token_to_piece(ctx, *begin);
    }
    return ret;
 }
@@ -157,11 +157,11 @@ struct llama_rn_context
 
    std::vector<llama_token> embd;
 
-   gpt_params params;
+   common_params params;
 
    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
-   gpt_sampler *ctx_sampling = nullptr;
+   common_sampler *ctx_sampling = nullptr;
 
    int n_ctx;
 
@@ -186,7 +186,7 @@ struct llama_rn_context
        }
        if (ctx_sampling != nullptr)
        {
-           gpt_sampler_free(ctx_sampling);
+           common_sampler_free(ctx_sampling);
        }
    }
 
@@ -213,16 +213,16 @@ struct llama_rn_context
 
    bool initSampling() {
        if (ctx_sampling != nullptr) {
-           gpt_sampler_free(ctx_sampling);
+           common_sampler_free(ctx_sampling);
        }
-       ctx_sampling = gpt_sampler_init(model, params.sparams);
+       ctx_sampling = common_sampler_init(model, params.sparams);
        return ctx_sampling != nullptr;
    }
 
-   bool loadModel(gpt_params &params_)
+   bool loadModel(common_params &params_)
    {
        params = params_;
-       llama_init_result result = llama_init_from_gpt_params(params);
+       common_init_result result = common_init_from_params(params);
        model = result.model;
        ctx = result.context;
        if (model == nullptr)
@@ -268,7 +268,7 @@ struct llama_rn_context
 
    void loadPrompt()
    {
-       std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true, true);
+       std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true);
        num_prompt_tokens = prompt_tokens.size();
 
        // LOG tokens
@@ -302,7 +302,7 @@ struct llama_rn_context
        // push the prompt into the sampling context (do not apply grammar)
        for (auto & token : prompt_tokens)
        {
-           gpt_sampler_accept(ctx_sampling, token, false);
+           common_sampler_accept(ctx_sampling, token, false);
        }
        // compare the evaluated prompt with the new prompt
        n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
@@ -375,8 +375,8 @@ struct llama_rn_context
        {
            n_eval = params.n_batch;
        }
-       if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
-       {
+       if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)))
+       {
            LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
                n_eval,
                n_past,
@@ -408,18 +408,19 @@ struct llama_rn_context
        std::vector<llama_token_data> candidates;
        candidates.reserve(llama_n_vocab(model));
 
-       result.tok = gpt_sampler_sample(ctx_sampling, ctx, -1);
+       result.tok = common_sampler_sample(ctx_sampling, ctx, -1);
 
-       llama_token_data_array cur_p = *gpt_sampler_get_candidates(ctx_sampling);
+       llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling);
 
        const int32_t n_probs = params.sparams.n_probs;
 
-
-       if (params.sparams.temp <= 0 && n_probs > 0)
+       // deprecated
+       /*if (params.sparams.temp <= 0 && n_probs > 0)
        {
            // For llama_sample_token_greedy we need to sort candidates
            llama_sampler_init_softmax();
-       }
+
+       }*/
 
 
        for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
@@ -427,7 +428,7 @@ struct llama_rn_context
            result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
        }
 
-       gpt_sampler_accept(ctx_sampling, result.tok, true);
+       common_sampler_accept(ctx_sampling, result.tok, true);
        if (tg) {
            num_tokens_predicted++;
        }
@@ -487,7 +488,7 @@ struct llama_rn_context
    {
        const completion_token_output token_with_probs = nextToken();
 
-       const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
+       const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok);
        generated_text += token_text;
 
        if (params.sparams.n_probs > 0)
@@ -528,7 +529,7 @@ struct llama_rn_context
    }
 
    LOG_VERBOSE("next token, token: %s, token_text: %s, has_next_token: %d, n_remain: %d, num_tokens_predicted: %d, stopped_eos: %d, stopped_word: %d, stopped_limit: %d, stopping_word: %s",
-       llama_token_to_piece(ctx, token_with_probs.tok),
+       common_token_to_piece(ctx, token_with_probs.tok),
        tokens_to_output_formatted_string(ctx, token_with_probs.tok).c_str(),
        has_next_token,
        n_remain,
@@ -562,7 +563,7 @@ struct llama_rn_context
            return std::vector<float>(n_embd, 0.0f);
        }
        std::vector<float> embedding(data, data + n_embd), out(data, data + n_embd);
-       llama_embd_normalize(embedding.data(), out.data(), n_embd, params.embd_normalize);
+       common_embd_normalize(embedding.data(), out.data(), n_embd, params.embd_normalize);
        return out;
    }
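Taken together, the rn-llama.hpp changes are a mechanical move to the renamed common_* helpers. A condensed sketch of the updated load/sample flow as it appears above (error handling and the surrounding struct omitted; values are illustrative):

    // sketch: the 1.2.4 spellings of the common helpers used by llama_rn_context
    common_params params;                                         // was gpt_params
    common_init_result result = common_init_from_params(params);  // was llama_init_from_gpt_params
    llama_model *   model = result.model;
    llama_context * ctx   = result.context;

    common_sampler * smpl = common_sampler_init(model, params.sparams); // was gpt_sampler_init
    const llama_token tok = common_sampler_sample(smpl, ctx, -1);       // was gpt_sampler_sample
    common_sampler_accept(smpl, tok, true);                             // was gpt_sampler_accept
    const std::string piece = common_token_to_piece(ctx, tok);          // was llama_token_to_piece
    common_sampler_free(smpl);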