cui-llama.rn 1.2.3 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -2
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/java/com/rnllama/LlamaContext.java +0 -3
- package/android/src/main/jni.cpp +9 -11
- package/cpp/common.cpp +85 -75
- package/cpp/common.h +127 -91
- package/cpp/ggml-aarch64.c +269 -0
- package/cpp/ggml-alloc.c +17 -19
- package/cpp/ggml-backend-impl.h +4 -15
- package/cpp/ggml-backend.cpp +1697 -1626
- package/cpp/ggml-backend.h +13 -25
- package/cpp/ggml-cpp.h +38 -0
- package/cpp/ggml-cpu.c +13720 -0
- package/cpp/ggml-cpu.h +150 -0
- package/cpp/ggml-impl.h +95 -0
- package/cpp/ggml-metal.m +185 -71
- package/cpp/ggml-quants.c +38 -51
- package/cpp/ggml.c +4468 -19500
- package/cpp/ggml.h +26 -146
- package/cpp/json-schema-to-grammar.cpp +1 -1
- package/cpp/llama-sampling.cpp +742 -249
- package/cpp/llama-sampling.h +21 -2
- package/cpp/llama-vocab.cpp +49 -9
- package/cpp/llama-vocab.h +35 -11
- package/cpp/llama.cpp +2468 -2307
- package/cpp/llama.h +65 -32
- package/cpp/log.cpp +50 -50
- package/cpp/log.h +18 -18
- package/cpp/rn-llama.hpp +23 -22
- package/cpp/sampling.cpp +117 -118
- package/cpp/sampling.h +20 -20
- package/cpp/sgemm.cpp +57 -0
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +0 -1
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +0 -1
package/README.md
CHANGED
@@ -11,8 +11,6 @@ The following features have been added for Android:
 - `vocab_only` mode: utilize the llama.cpp tokenizer
 - tokenizeSync: non-blocking, synchronous tokenizer function
 - Context Shift taken from [kobold.cpp](https://github.com/LostRuins/koboldcpp)
-- XTC sampling
-- Progress callback
 - Retrieving CPU Features to check for i8mm and dotprod flags
 
 Original repo README.md below.
package/android/src/main/java/com/rnllama/LlamaContext.java
CHANGED
@@ -248,8 +248,6 @@ public class LlamaContext {
 params.hasKey("xtc_t") ? (float) params.getDouble("xtc_t") : 0.00f,
 // float xtc_p,
 params.hasKey("xtc_p") ? (float) params.getDouble("xtc_p") : 0.00f,
-// float tfs_z,
-params.hasKey("tfs_z") ? (float) params.getDouble("tfs_z") : 1.00f,
 // float typical_p,
 params.hasKey("typical_p") ? (float) params.getDouble("typical_p") : 1.00f,
 // int seed,
@@ -438,7 +436,6 @@ public class LlamaContext {
 float min_p,
 float xtc_t,
 float xtc_p,
-float tfs_z,
 float typical_p,
 int seed,
 String[] stop,
package/android/src/main/jni.cpp
CHANGED
@@ -156,7 +156,7 @@ Java_com_rnllama_LlamaContext_initContext(
 ) {
 UNUSED(thiz);
 
-
+common_params defaultParams;
 
 defaultParams.vocab_only = vocab_only;
 if(vocab_only) {
@@ -268,7 +268,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
 UNUSED(thiz);
 auto llama = context_map[(long) context_ptr];
 
-std::vector<
+std::vector<common_chat_msg> chat;
 
 int messages_len = env->GetArrayLength(messages);
 for (int i = 0; i < messages_len; i++) {
@@ -292,7 +292,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
 }
 
 const char *tmpl_chars = env->GetStringUTFChars(chat_template, nullptr);
-std::string formatted_chat =
+std::string formatted_chat = common_chat_apply_template(llama->model, tmpl_chars, chat, true);
 
 return env->NewStringUTF(formatted_chat.c_str());
 }
@@ -399,7 +399,6 @@ Java_com_rnllama_LlamaContext_doCompletion(
 jfloat min_p,
 jfloat xtc_t,
 jfloat xtc_p,
-jfloat tfs_z,
 jfloat typical_p,
 jint seed,
 jobjectArray stop,
@@ -438,12 +437,11 @@ Java_com_rnllama_LlamaContext_doCompletion(
 sparams.top_k = top_k;
 sparams.top_p = top_p;
 sparams.min_p = min_p;
-sparams.tfs_z = tfs_z;
 sparams.typ_p = typical_p;
 sparams.n_probs = n_probs;
 sparams.grammar = env->GetStringUTFChars(grammar, nullptr);
-sparams.
-sparams.
+sparams.xtc_threshold = xtc_t;
+sparams.xtc_probability = xtc_p;
 
 sparams.logit_bias.clear();
 if (ignore_eos) {
@@ -497,7 +495,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 if (token_with_probs.tok == -1 || llama->incomplete) {
 continue;
 }
-const std::string token_text =
+const std::string token_text = common_token_to_piece(llama->ctx, token_with_probs.tok);
 
 size_t pos = std::min(sent_count, llama->generated_text.size());
 
@@ -532,7 +530,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 putString(env, tokenResult, "token", to_send.c_str());
 
 if (llama->params.sparams.n_probs > 0) {
-const std::vector<llama_token> to_send_toks =
+const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx, to_send, false);
 size_t probs_pos = std::min(sent_token_probs_index, llama->generated_token_probs.size());
 size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama->generated_token_probs.size());
 if (probs_pos < probs_stop_pos) {
@@ -607,7 +605,7 @@ Java_com_rnllama_LlamaContext_tokenize(
 
 const char *text_chars = env->GetStringUTFChars(text, nullptr);
 
-const std::vector<llama_token> toks =
+const std::vector<llama_token> toks = common_tokenize(
 llama->ctx,
 text_chars,
 false
@@ -719,7 +717,7 @@ Java_com_rnllama_LlamaContext_freeContext(
 }
 if (llama->ctx_sampling != nullptr)
 {
-
+common_sampler_free(llama->ctx_sampling);
 }
 context_map.erase((long) llama->ctx);
 }
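The completion hunks above drop the tail-free sampling argument (`tfs_z`) and wire the two XTC knobs into the sampler as `sparams.xtc_threshold` and `sparams.xtc_probability`. As a rough illustration of what those two values control, here is a standalone sketch of an XTC-style cutoff over a toy candidate list; it is not this package's sampler code, and the `candidate` struct is invented for the example.

```cpp
#include <cstdio>
#include <random>
#include <vector>

// Toy stand-in for a sampler candidate: a token id plus its probability.
struct candidate { int id; float p; };

// XTC-style cutoff: with probability xtc_probability, drop every candidate
// whose probability is >= xtc_threshold except the least likely of them,
// nudging the sampler away from the most predictable continuations.
static void xtc_filter(std::vector<candidate> & cands,
                       float xtc_threshold, float xtc_probability,
                       std::mt19937 & rng) {
    std::uniform_real_distribution<float> coin(0.0f, 1.0f);
    if (coin(rng) >= xtc_probability) {
        return; // XTC not applied on this step
    }
    // candidates are assumed sorted by descending probability
    size_t above = 0;
    while (above < cands.size() && cands[above].p >= xtc_threshold) {
        above++;
    }
    if (above >= 2) {
        // erase all "too likely" candidates except the last one above the threshold
        cands.erase(cands.begin(), cands.begin() + (above - 1));
    }
}

int main() {
    std::mt19937 rng(42);
    std::vector<candidate> cands = {{7, 0.50f}, {3, 0.30f}, {9, 0.15f}, {1, 0.05f}};
    xtc_filter(cands, 0.10f, 1.0f, rng); // threshold 0.1, always apply for the demo
    for (const auto & c : cands) std::printf("token %d p=%.2f\n", c.id, c.p);
    return 0;
}
```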
package/cpp/common.cpp
CHANGED
@@ -12,6 +12,7 @@
 
 #include <algorithm>
 #include <cinttypes>
+#include <climits>
 #include <cmath>
 #include <codecvt>
 #include <cstdarg>
@@ -23,10 +24,10 @@
 #include <regex>
 #include <sstream>
 #include <string>
+#include <thread>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include <thread>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -368,10 +369,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[LM_GGML_MAX_N_THR
 return true;
 }
 
-void
+void common_init() {
 llama_log_set([](lm_ggml_log_level level, const char * text, void * /*user_data*/) {
-if (LOG_DEFAULT_LLAMA <=
-
+if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+common_log_add(common_log_main(), level, "%s", text);
 }
 }, NULL);
 
@@ -384,7 +385,7 @@ void gpt_init() {
 LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
 }
 
-std::string
+std::string common_params_get_system_info(const common_params & params) {
 std::ostringstream os;
 
 os << "system_info: n_threads = " << params.cpuparams.n_threads;
@@ -406,17 +407,19 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
 // String utils
 //
 
-std::
-
-
-
-
-
-
-
-
-
-
+std::string string_format(const char * fmt, ...) {
+va_list ap;
+va_list ap2;
+va_start(ap, fmt);
+va_copy(ap2, ap);
+int size = vsnprintf(NULL, 0, fmt, ap);
+LM_GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+std::vector<char> buf(size + 1);
+int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+LM_GGML_ASSERT(size2 == size);
+va_end(ap2);
+va_end(ap);
+return std::string(buf.data(), size);
 }
 
 std::string string_strip(const std::string & str) {
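The new `string_format` helper added above uses the classic two-pass `vsnprintf` idiom: measure the required length first, then format into an exactly-sized buffer. A self-contained sketch of the same pattern, re-implemented here for illustration rather than calling the package's helper:

```cpp
#include <cassert>
#include <climits>
#include <cstdarg>
#include <cstdio>
#include <string>
#include <vector>

// Two-pass vsnprintf: the first call only measures, the second writes
// into a buffer of exactly the measured size (plus the terminating NUL).
static std::string format_sketch(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    const int size = vsnprintf(nullptr, 0, fmt, ap);              // measure only
    assert(size >= 0 && size < INT_MAX);
    std::vector<char> buf(size + 1);
    const int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);  // actual write
    assert(size2 == size);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
}

int main() {
    // e.g. building a log line without a fixed-size stack buffer
    const std::string msg = format_sketch("loaded %d tensors in %.2f s", 291, 1.37);
    std::puts(msg.c_str());
    return 0;
}
```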
@@ -499,7 +502,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
 first = false;
 }
 
-auto detokenized =
+auto detokenized = common_token_to_piece(ctx, token);
 
 detokenized.erase(
 std::remove_if(
@@ -530,7 +533,7 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
 first = false;
 }
 
-auto detokenized =
+auto detokenized = common_token_to_piece(ctx, batch.token[i]);
 
 detokenized.erase(
 std::remove_if(
@@ -825,16 +828,16 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-struct
-
-auto mparams =
+struct common_init_result common_init_from_params(common_params & params) {
+common_init_result iparams;
+auto mparams = common_model_params_to_llama(params);
 
 llama_model * model = nullptr;
 
 if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-model =
+model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
 } else if (!params.model_url.empty()) {
-model =
+model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
 } else {
 model = llama_load_model_from_file(params.model.c_str(), mparams);
 }
@@ -869,7 +872,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 }
 }
 
-auto cparams =
+auto cparams = common_context_params_to_llama(params);
 
 llama_context * lctx = llama_new_context_with_model(model, cparams);
 if (lctx == NULL) {
@@ -882,7 +885,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
 if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
 
-const auto cvec =
+const auto cvec = common_control_vector_load(params.control_vectors);
 if (cvec.n_embd == -1) {
 llama_free(lctx);
 llama_free_model(model);
@@ -906,7 +909,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
 // load and optionally apply lora adapters
 for (auto & la : params.lora_adapters) {
-
+common_lora_adapter_container loaded_la;
 loaded_la.path = la.path;
 loaded_la.scale = la.scale;
 loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
@@ -919,7 +922,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
 }
 if (!params.lora_init_without_apply) {
-
+common_lora_adapters_apply(lctx, iparams.lora_adapters);
 }
 
 if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -945,7 +948,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 }
 
 if (llama_model_has_encoder(model)) {
-llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()
+llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
 llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
 if (decoder_start_token_id == -1) {
 decoder_start_token_id = bos;
@@ -954,7 +957,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 tmp.push_back(decoder_start_token_id);
 }
 if (llama_model_has_decoder(model)) {
-llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)
+llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
 }
 llama_kv_cache_clear(lctx);
 llama_synchronize(lctx);
@@ -967,7 +970,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 return iparams;
 }
 
-void
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
 llama_lora_adapter_clear(ctx);
 for (auto & la : lora_adapters) {
 if (la.scale != 0.0f) {
@@ -976,7 +979,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
 }
 }
 
-struct llama_model_params
+struct llama_model_params common_model_params_to_llama(const common_params & params) {
 auto mparams = llama_model_default_params();
 
 if (params.n_gpu_layers != -1) {
@@ -1029,10 +1032,10 @@ static lm_ggml_type kv_cache_type_from_str(const std::string & s) {
 return LM_GGML_TYPE_Q5_1;
 }
 
-throw std::runtime_error("
+throw std::runtime_error("Unsupported cache type: " + s);
 }
 
-struct llama_context_params
+struct llama_context_params common_context_params_to_llama(const common_params & params) {
 auto cparams = llama_context_default_params();
 
 cparams.n_ctx = params.n_ctx;
@@ -1041,7 +1044,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 cparams.n_ubatch = params.n_ubatch;
 cparams.n_threads = params.cpuparams.n_threads;
 cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
-
+params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
 cparams.logits_all = params.logits_all;
 cparams.embeddings = params.embedding;
 cparams.rope_scaling_type = params.rope_scaling_type;
@@ -1122,7 +1125,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
 return false;
 }
 
-static bool
+static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
 
 // Initialize libcurl
 std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@@ -1192,15 +1195,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 }
 
 // Send a HEAD request to retrieve the etag and last-modified headers
-struct
+struct common_load_model_from_url_headers {
 std::string etag;
 std::string last_modified;
 };
-
+common_load_model_from_url_headers headers;
 {
 typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
 auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-
+common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
 
 static std::regex header_regex("([^:]+): (.*)\r\n");
 static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1336,7 +1339,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 return true;
 }
 
-struct llama_model *
+struct llama_model * common_load_model_from_url(
 const char * model_url,
 const char * path_model,
 const char * hf_token,
@@ -1347,7 +1350,7 @@ struct llama_model * llama_load_model_from_url(
 return NULL;
 }
 
-if (!
+if (!common_download_file(model_url, path_model, hf_token)) {
 return NULL;
 }
 
@@ -1400,7 +1403,7 @@ struct llama_model * llama_load_model_from_url(
 char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
 llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
 
-return
+return common_download_file(split_url, split_path, hf_token);
 }, idx));
 }
 
@@ -1415,7 +1418,7 @@ struct llama_model * llama_load_model_from_url(
 return llama_load_model_from_file(path_model, params);
 }
 
-struct llama_model *
+struct llama_model * common_load_model_from_hf(
 const char * repo,
 const char * model,
 const char * path_model,
@@ -1435,12 +1438,12 @@ struct llama_model * llama_load_model_from_hf(
 model_url += "/resolve/main/";
 model_url += model;
 
-return
+return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
 }
 
 #else
 
-struct llama_model *
+struct llama_model * common_load_model_from_url(
 const char * /*model_url*/,
 const char * /*path_model*/,
 const char * /*hf_token*/,
@@ -1449,7 +1452,7 @@ struct llama_model * llama_load_model_from_url(
 return nullptr;
 }
 
-struct llama_model *
+struct llama_model * common_load_model_from_hf(
 const char * /*repo*/,
 const char * /*model*/,
 const char * /*path_model*/,
@@ -1465,11 +1468,11 @@ struct llama_model * llama_load_model_from_hf(
 // Batch utils
 //
 
-void
+void common_batch_clear(struct llama_batch & batch) {
 batch.n_tokens = 0;
 }
 
-void
+void common_batch_add(
 struct llama_batch & batch,
 llama_token id,
 llama_pos pos,
@@ -1492,15 +1495,15 @@ void llama_batch_add(
 // Vocab utils
 //
 
-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
 const struct llama_context * ctx,
 const std::string & text,
 bool add_special,
 bool parse_special) {
-return
+return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
 }
 
-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
 const struct llama_model * model,
 const std::string & text,
 bool add_special,
@@ -1519,7 +1522,7 @@ std::vector<llama_token> llama_tokenize(
 return result;
 }
 
-std::string
+std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
 std::string piece;
 piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
 const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
@@ -1535,7 +1538,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
 return piece;
 }
 
-std::string
+std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
 std::string text;
 text.resize(std::max(text.capacity(), tokens.size()));
 int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
@@ -1555,15 +1558,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
 // Chat template utils
 //
 
-bool
+bool common_chat_verify_template(const std::string & tmpl) {
 llama_chat_message chat[] = {{"user", "test"}};
 int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
 return res >= 0;
 }
 
-std::string
+std::string common_chat_apply_template(const struct llama_model * model,
 const std::string & tmpl,
-const std::vector<
+const std::vector<common_chat_msg> & msgs,
 bool add_ass) {
 int alloc_size = 0;
 bool fallback = false; // indicate if we must fallback to default chatml
@@ -1605,42 +1608,42 @@ std::string llama_chat_apply_template(const struct llama_model * model,
 return formatted_chat;
 }
 
-std::string
+std::string common_chat_format_single(const struct llama_model * model,
 const std::string & tmpl,
-const std::vector<
-const
+const std::vector<common_chat_msg> & past_msg,
+const common_chat_msg & new_msg,
 bool add_ass) {
 std::ostringstream ss;
-auto fmt_past_msg = past_msg.empty() ? "" :
-std::vector<
+auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
+std::vector<common_chat_msg> chat_new(past_msg);
 // if the past_msg ends with a newline, we must preserve it in the formatted version
 if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
 ss << "\n";
 };
 // format chat with new_msg
 chat_new.push_back(new_msg);
-auto fmt_new_msg =
+auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
 // get the diff part
 ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
 return ss.str();
 }
 
-std::string
+std::string common_chat_format_example(const struct llama_model * model,
 const std::string & tmpl) {
-std::vector<
+std::vector<common_chat_msg> msgs = {
 {"system", "You are a helpful assistant"},
 {"user", "Hello"},
 {"assistant", "Hi there"},
 {"user", "How are you?"},
 };
-return
+return common_chat_apply_template(model, tmpl, msgs, true);
 }
 
 //
 // KV cache utils
 //
 
-void
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
 static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
 
 printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
@@ -1663,7 +1666,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
 printf("\n=== Done dumping\n");
 }
 
-void
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
 static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 
 printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@@ -1715,7 +1718,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
 // Embedding utils
 //
 
-void
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
 double sum = 0.0;
 
 switch (embd_norm) {
@@ -1749,7 +1752,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
 }
 }
 
-float
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
 double sum = 0.0;
 double sum1 = 0.0;
 double sum2 = 0.0;
@@ -1775,8 +1778,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 // Control vector utils
 //
 
-static
-
+static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
+common_control_vector_data result = { -1, {} };
 
 lm_ggml_context * ctx = nullptr;
 struct lm_gguf_init_params meta_lm_gguf_params = {
@@ -1860,11 +1863,11 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
 return result;
 }
 
-
-
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
+common_control_vector_data result = { -1, {} };
 
 for (const auto & info : load_infos) {
-auto cur =
+auto cur = common_control_vector_load_one(info);
 
 if (cur.n_embd == -1) {
 result.n_embd = -1;
@@ -1956,8 +1959,10 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
 }
 }
 
-void yaml_dump_non_result_info(FILE * stream, const
+void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
 const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
+lm_ggml_cpu_init(); // some ARM features are detected at runtime
+
 const auto & sparams = params.sparams;
 
 fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
@@ -2013,6 +2018,10 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
 fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
 fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
 fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
+fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
+fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
+fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
+fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
 fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
 fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
 fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
@@ -2093,11 +2102,12 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
 const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
 yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
 
-fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
 fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
 fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
 fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
 fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
+fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
+fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
 fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
 fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");