cui-llama.rn 1.3.4 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +14 -8
- package/android/src/main/jni.cpp +38 -37
- package/cpp/common.cpp +50 -30
- package/cpp/common.h +32 -13
- package/cpp/ggml-alloc.c +0 -1
- package/cpp/ggml-backend-reg.cpp +79 -49
- package/cpp/ggml-backend.cpp +5 -2
- package/cpp/ggml-cpp.h +1 -0
- package/cpp/ggml-cpu-aarch64.cpp +57 -72
- package/cpp/ggml-cpu-quants.c +5 -1
- package/cpp/ggml-cpu.c +6 -6
- package/cpp/ggml-cpu.cpp +9 -0
- package/cpp/ggml-impl.h +11 -0
- package/cpp/ggml-metal.m +2 -2
- package/cpp/ggml.c +129 -1388
- package/cpp/ggml.h +29 -152
- package/cpp/gguf.cpp +1325 -0
- package/cpp/gguf.h +202 -0
- package/cpp/llama-adapter.cpp +346 -0
- package/cpp/llama-adapter.h +73 -0
- package/cpp/llama-arch.cpp +1434 -0
- package/cpp/llama-arch.h +395 -0
- package/cpp/llama-batch.cpp +368 -0
- package/cpp/llama-batch.h +88 -0
- package/cpp/llama-chat.cpp +567 -0
- package/cpp/llama-chat.h +51 -0
- package/cpp/llama-context.cpp +1771 -0
- package/cpp/llama-context.h +128 -0
- package/cpp/llama-cparams.cpp +1 -0
- package/cpp/llama-cparams.h +37 -0
- package/cpp/llama-cpp.h +30 -0
- package/cpp/llama-grammar.cpp +16 -15
- package/cpp/llama-grammar.h +5 -6
- package/cpp/llama-hparams.cpp +71 -0
- package/cpp/llama-hparams.h +140 -0
- package/cpp/llama-impl.cpp +167 -0
- package/cpp/llama-impl.h +16 -136
- package/cpp/llama-kv-cache.cpp +718 -0
- package/cpp/llama-kv-cache.h +218 -0
- package/cpp/llama-mmap.cpp +589 -0
- package/cpp/llama-mmap.h +67 -0
- package/cpp/llama-model-loader.cpp +1011 -0
- package/cpp/llama-model-loader.h +158 -0
- package/cpp/llama-model.cpp +2202 -0
- package/cpp/llama-model.h +391 -0
- package/cpp/llama-sampling.cpp +117 -4
- package/cpp/llama-vocab.cpp +26 -29
- package/cpp/llama-vocab.h +14 -2
- package/cpp/llama.cpp +8839 -19131
- package/cpp/llama.cpp.rej +23 -0
- package/cpp/llama.h +31 -9
- package/cpp/rn-llama.hpp +39 -37
- package/cpp/sgemm.cpp +1091 -378
- package/cpp/sgemm.h +2 -2
- package/cpp/unicode.cpp +6 -0
- package/package.json +1 -1
package/android/src/main/CMakeLists.txt CHANGED
@@ -9,17 +9,23 @@ include_directories(${RNLLAMA_LIB_DIR})

 set(
 SOURCE_FILES
-${RNLLAMA_LIB_DIR}/llama-grammar.cpp
-${RNLLAMA_LIB_DIR}/llama-sampling.cpp
-${RNLLAMA_LIB_DIR}/llama-vocab.cpp
-${RNLLAMA_LIB_DIR}/log.cpp
-
-#${RNLLAMA_LIB_DIR}/amx/amx.cpp
-#${RNLLAMA_LIB_DIR}/amx/mmq.cpp

+${RNLLAMA_LIB_DIR}/common.cpp
 ${RNLLAMA_LIB_DIR}/llama-grammar.cpp
 ${RNLLAMA_LIB_DIR}/llama-sampling.cpp
 ${RNLLAMA_LIB_DIR}/llama-vocab.cpp
+${RNLLAMA_LIB_DIR}/llama-chat.cpp
+${RNLLAMA_LIB_DIR}/llama-mmap.cpp
+${RNLLAMA_LIB_DIR}/llama-context.cpp
+${RNLLAMA_LIB_DIR}/llama-kv-cache.cpp
+${RNLLAMA_LIB_DIR}/llama-model-loader.cpp
+${RNLLAMA_LIB_DIR}/llama-model.cpp
+${RNLLAMA_LIB_DIR}/llama-batch.cpp
+${RNLLAMA_LIB_DIR}/llama-arch.cpp
+${RNLLAMA_LIB_DIR}/llama-cparams.cpp
+${RNLLAMA_LIB_DIR}/llama-hparams.cpp
+${RNLLAMA_LIB_DIR}/llama-adapter.cpp
+${RNLLAMA_LIB_DIR}/llama-impl.cpp
 ${RNLLAMA_LIB_DIR}/log.cpp
 ${RNLLAMA_LIB_DIR}/json.hpp
 ${RNLLAMA_LIB_DIR}/json-schema-to-grammar.cpp
@@ -28,6 +34,7 @@ set(
 ${RNLLAMA_LIB_DIR}/ggml-backend.cpp
 ${RNLLAMA_LIB_DIR}/ggml-backend-reg.cpp
 ${RNLLAMA_LIB_DIR}/ggml.c
+${RNLLAMA_LIB_DIR}/gguf.cpp
 ${RNLLAMA_LIB_DIR}/ggml-cpu.c
 ${RNLLAMA_LIB_DIR}/ggml-cpu.cpp
 ${RNLLAMA_LIB_DIR}/ggml-cpu-aarch64.cpp
@@ -35,7 +42,6 @@ set(
 ${RNLLAMA_LIB_DIR}/ggml-cpu-quants.c
 ${RNLLAMA_LIB_DIR}/ggml-threading.cpp
 ${RNLLAMA_LIB_DIR}/ggml-quants.c
-${RNLLAMA_LIB_DIR}/common.cpp
 ${RNLLAMA_LIB_DIR}/sampling.cpp
 ${RNLLAMA_LIB_DIR}/unicode-data.cpp
 ${RNLLAMA_LIB_DIR}/unicode.cpp
package/android/src/main/jni.cpp CHANGED
@@ -11,7 +11,8 @@
 #include <unordered_map>
 #include "llama.h"
 #include "llama-impl.h"
-#include "
+#include "llama-context.h"
+#include "gguf.h"
 #include "rn-llama.hpp"

 #define UNUSED(x) (void)(x)
@@ -336,17 +337,17 @@ Java_com_rnllama_LlamaContext_initContext(

 LOGI("[RNLlama] is_model_loaded %s", (is_model_loaded ? "true" : "false"));
 if (is_model_loaded) {
-if (embedding && llama_model_has_encoder(llama->model) && llama_model_has_decoder(llama->model)) {
+if (embedding && llama_model_has_encoder(llama->model.get()) && llama_model_has_decoder(llama->model.get())) {
 LOGI("[RNLlama] computing embeddings in encoder-decoder models is not supported");
-llama_free(llama->ctx);
+llama_free(llama->ctx.get());
 return -1;
 }
-context_map[(long) llama->ctx] = llama;
+context_map[(long) llama->ctx.get()] = llama;
 } else {
-llama_free(llama->ctx);
+llama_free(llama->ctx.get());
 }

-return reinterpret_cast<jlong>(llama->ctx);
+return reinterpret_cast<jlong>(llama->ctx.get());
 }


@@ -372,13 +373,13 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
 UNUSED(thiz);
 auto llama = context_map[(long) context_ptr];

-int count = llama_model_meta_count(llama->model);
+int count = llama_model_meta_count(llama->model.get());
 auto meta = createWriteableMap(env);
 for (int i = 0; i < count; i++) {
 char key[256];
-llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
+llama_model_meta_key_by_index(llama->model.get(), i, key, sizeof(key));
 char val[2048];
-llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));
+llama_model_meta_val_str_by_index(llama->model.get(), i, val, sizeof(val));

 putString(env, meta, key, val);
 }
@@ -386,10 +387,10 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
 auto result = createWriteableMap(env);

 char desc[1024];
-llama_model_desc(llama->model, desc, sizeof(desc));
+llama_model_desc(llama->model.get(), desc, sizeof(desc));
 putString(env, result, "desc", desc);
-putDouble(env, result, "size", llama_model_size(llama->model));
-putDouble(env, result, "nParams", llama_model_n_params(llama->model));
+putDouble(env, result, "size", llama_model_size(llama->model.get()));
+putDouble(env, result, "nParams", llama_model_n_params(llama->model.get()));
 putBoolean(env, result, "isChatTemplateSupported", llama->validateModelChatTemplate());
 putMap(env, result, "metadata", meta);

@@ -431,7 +432,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
 }

 const char *tmpl_chars = env->GetStringUTFChars(chat_template, nullptr);
-std::string formatted_chat = common_chat_apply_template(llama->model, tmpl_chars, chat, true);
+std::string formatted_chat = common_chat_apply_template(llama->model.get(), tmpl_chars, chat, true);

 return env->NewStringUTF(formatted_chat.c_str());
 }
@@ -450,7 +451,7 @@ Java_com_rnllama_LlamaContext_loadSession(
 auto result = createWriteableMap(env);
 size_t n_token_count_out = 0;
 llama->embd.resize(llama->params.n_ctx);
-if (!llama_state_load_file(llama->ctx, path_chars, llama->embd.data(), llama->embd.capacity(), &n_token_count_out)) {
+if (!llama_state_load_file(llama->ctx.get(), path_chars, llama->embd.data(), llama->embd.capacity(), &n_token_count_out)) {
 env->ReleaseStringUTFChars(path, path_chars);

 putString(env, result, "error", "Failed to load session");
@@ -459,7 +460,7 @@ Java_com_rnllama_LlamaContext_loadSession(
 llama->embd.resize(n_token_count_out);
 env->ReleaseStringUTFChars(path, path_chars);

-const std::string text = rnllama::tokens_to_str(llama->ctx, llama->embd.cbegin(), llama->embd.cend());
+const std::string text = rnllama::tokens_to_str(llama->ctx.get(), llama->embd.cbegin(), llama->embd.cend());
 putInt(env, result, "tokens_loaded", n_token_count_out);
 putString(env, result, "prompt", text.c_str());
 return reinterpret_cast<jobject>(result);
@@ -481,7 +482,7 @@ Java_com_rnllama_LlamaContext_saveSession(
 std::vector<llama_token> session_tokens = llama->embd;
 int default_size = session_tokens.size();
 int save_size = size > 0 && size <= default_size ? size : default_size;
-if (!llama_state_save_file(llama->ctx, path_chars, session_tokens.data(), save_size)) {
+if (!llama_state_save_file(llama->ctx.get(), path_chars, session_tokens.data(), save_size)) {
 env->ReleaseStringUTFChars(path, path_chars);
 return -1;
 }
@@ -499,13 +500,13 @@ static inline jobject tokenProbsToMap(
 for (const auto &prob : probs) {
 auto probsForToken = createWritableArray(env);
 for (const auto &p : prob.probs) {
-std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, p.tok);
+std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx.get(), p.tok);
 auto probResult = createWriteableMap(env);
 putString(env, probResult, "tok_str", tokStr.c_str());
 putDouble(env, probResult, "prob", p.prob);
 pushMap(env, probsForToken, probResult);
 }
-std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, prob.tok);
+std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx.get(), prob.tok);
 auto tokenResult = createWriteableMap(env);
 putString(env, tokenResult, "content", tokStr.c_str());
 putArray(env, tokenResult, "probs", probsForToken);
@@ -555,7 +556,7 @@ Java_com_rnllama_LlamaContext_doCompletion(

 llama->rewind();

-//llama_reset_timings(llama->ctx);
+//llama_reset_timings(llama->ctx.get());

 llama->params.prompt = env->GetStringUTFChars(prompt, nullptr);
 llama->params.sampling.seed = (seed == -1) ? time(NULL) : seed;
@@ -593,7 +594,7 @@ Java_com_rnllama_LlamaContext_doCompletion(

 sparams.logit_bias.clear();
 if (ignore_eos) {
-sparams.logit_bias[llama_token_eos(llama->model)].bias = -INFINITY;
+sparams.logit_bias[llama_token_eos(llama->model.get())].bias = -INFINITY;
 }

 // dry break seq
@@ -612,7 +613,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 sparams.dry_sequence_breakers = dry_sequence_breakers_vector;

 // logit bias
-const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx));
+const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx.get()));
 jsize logit_bias_len = env->GetArrayLength(logit_bias);

 for (jsize i = 0; i < logit_bias_len; i++) {
@@ -659,7 +660,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 if (token_with_probs.tok == -1 || llama->incomplete) {
 continue;
 }
-const std::string token_text = common_token_to_piece(llama->ctx, token_with_probs.tok);
+const std::string token_text = common_token_to_piece(llama->ctx.get(), token_with_probs.tok);

 size_t pos = std::min(sent_count, llama->generated_text.size());

@@ -694,7 +695,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 putString(env, tokenResult, "token", to_send.c_str());

 if (llama->params.sampling.n_probs > 0) {
-const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx, to_send, false);
+const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx.get(), to_send, false);
 size_t probs_pos = std::min(sent_token_probs_index, llama->generated_token_probs.size());
 size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama->generated_token_probs.size());
 if (probs_pos < probs_stop_pos) {
@@ -711,7 +712,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 }
 }

-llama_perf_context_print(llama->ctx);
+llama_perf_context_print(llama->ctx.get());
 llama->is_predicting = false;

 auto result = createWriteableMap(env);
@@ -726,7 +727,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 putString(env, result, "stopping_word", llama->stopping_word.c_str());
 putInt(env, result, "tokens_cached", llama->n_past);

-const auto timings_token = llama_perf_context(llama -> ctx);
+const auto timings_token = llama_perf_context(llama -> ctx.get());

 auto timingsResult = createWriteableMap(env);
 putInt(env, timingsResult, "prompt_n", timings_token.n_p_eval);
@@ -770,7 +771,7 @@ Java_com_rnllama_LlamaContext_tokenize(
 const char *text_chars = env->GetStringUTFChars(text, nullptr);

 const std::vector<llama_token> toks = common_tokenize(
-llama->ctx,
+llama->ctx.get(),
 text_chars,
 false
 );
@@ -797,7 +798,7 @@ Java_com_rnllama_LlamaContext_detokenize(
 toks.push_back(tokens_ptr[i]);
 }

-auto text = rnllama::tokens_to_str(llama->ctx, toks.cbegin(), toks.cend());
+auto text = rnllama::tokens_to_str(llama->ctx.get(), toks.cbegin(), toks.cend());

 env->ReleaseIntArrayElements(tokens, tokens_ptr, 0);

@@ -834,7 +835,7 @@ Java_com_rnllama_LlamaContext_embedding(

 llama->rewind();

-llama_perf_context_reset(llama->ctx);
+llama_perf_context_reset(llama->ctx.get());

 llama->params.prompt = text_chars;

@@ -860,7 +861,7 @@ Java_com_rnllama_LlamaContext_embedding(

 auto promptTokens = createWritableArray(env);
 for (const auto &tok : llama->embd) {
-pushString(env, promptTokens, common_token_to_piece(llama->ctx, tok).c_str());
+pushString(env, promptTokens, common_token_to_piece(llama->ctx.get(), tok).c_str());
 }
 putArray(env, result, "prompt_tokens", promptTokens);

@@ -890,17 +891,17 @@ Java_com_rnllama_LlamaContext_freeContext(
 UNUSED(env);
 UNUSED(thiz);
 auto llama = context_map[(long) context_ptr];
-if (llama->model) {
-
+if (llama->model.get()) {
+llama_model_free(llama->model.get());
 }
-if (llama->ctx) {
-llama_free(llama->ctx);
+if (llama->ctx.get()) {
+llama_free(llama->ctx.get());
 }
-if (llama->
+/*if (llama->ctx.get()-> != nullptr)
 {
-common_sampler_free(llama->
-}
-context_map.erase((long) llama->ctx);
+common_sampler_free(llama->ctx.get() -> _sampling);
+}*/
+context_map.erase((long) llama->ctx.get());
 }

 JNIEXPORT void JNICALL
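Note: the recurring edit in jni.cpp is mechanical. The rn-llama context appears to now hold its llama.cpp objects through the smart-pointer wrappers shipped in the new llama-cpp.h (llama_model_ptr / llama_context_ptr), so every call into the C API passes the raw pointer via .get(). A minimal sketch of that pattern, assuming those wrapper types; the holder struct and helper below are hypothetical names for illustration, not the actual rn-llama.hpp definitions:

    #include "llama.h"
    #include "llama-cpp.h" // llama_model_ptr / llama_context_ptr (unique_ptr aliases with llama deleters)

    struct rn_ctx_sketch {            // hypothetical holder, stands in for the rn-llama context
        llama_model_ptr   model;      // owns the llama_model
        llama_context_ptr ctx;        // owns the llama_context
    };

    static int vocab_size(const rn_ctx_sketch & rn) {
        // the llama C API still takes raw pointers, hence .get(), exactly as in the hunks above
        return llama_n_vocab(llama_get_model(rn.ctx.get()));
    }

The raw pointer is only borrowed; ownership stays with the wrapper object.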
package/cpp/common.cpp CHANGED
@@ -2,6 +2,9 @@
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif

+#include "ggml.h"
+#include "gguf.h"
+
 #include "common.h"
 #include "log.h"
 // Change JSON_ASSERT from assert() to LM_GGML_ASSERT:
@@ -18,6 +21,7 @@
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <iterator>
@@ -68,7 +72,9 @@ char const *LLAMA_BUILD_TARGET = "unknown";
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
-#
+# if !defined(PATH_MAX)
+# define PATH_MAX MAX_PATH
+# endif
 #else
 #include <sys/syslimits.h>
 #endif
@@ -849,7 +855,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 } else if (!params.model_url.empty()) {
 model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
 } else {
-model =
+model = llama_model_load_from_file(params.model.c_str(), mparams);
 }

 if (model == NULL) {
@@ -876,7 +882,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 }

 if (!ok) {
-
+llama_model_free(model);

 return iparams;
 }
@@ -887,14 +893,13 @@ struct common_init_result common_init_from_params(common_params & params) {
 llama_context * lctx = llama_new_context_with_model(model, cparams);
 if (lctx == NULL) {
 LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
-
+llama_model_free(model);
 return iparams;
 }

 if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-
-
-return iparams;
+LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+params.ctx_shift = false;
 }

 if (!params.control_vectors.empty()) {
@@ -904,7 +909,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 const auto cvec = common_control_vector_load(params.control_vectors);
 if (cvec.n_embd == -1) {
 llama_free(lctx);
-
+llama_model_free(model);

 return iparams;
 }
@@ -917,7 +922,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 params.control_vector_layer_end);
 if (err) {
 llama_free(lctx);
-
+llama_model_free(model);

 return iparams;
 }
@@ -925,20 +930,21 @@ struct common_init_result common_init_from_params(common_params & params) {

 // load and optionally apply lora adapters
 for (auto & la : params.lora_adapters) {
-
-
-
-loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-if (loaded_la.adapter == nullptr) {
+llama_lora_adapter_ptr lora;
+lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+if (lora == nullptr) {
 LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
 llama_free(lctx);
-
+llama_model_free(model);
 return iparams;
 }
-
+
+la.ptr = lora.get();
+iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
 }
+
 if (!params.lora_init_without_apply) {
-common_lora_adapters_apply(lctx,
+common_lora_adapters_apply(lctx, params.lora_adapters);
 }

 if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -985,7 +991,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 if (llama_model_has_encoder(model)) {
 llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
 llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-if (decoder_start_token_id ==
+if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
 decoder_start_token_id = bos;
 }
 tmp.clear();
@@ -999,17 +1005,17 @@ struct common_init_result common_init_from_params(common_params & params) {
 llama_perf_context_reset(lctx);
 }

-iparams.model
-iparams.context
+iparams.model.reset(model);
+iparams.context.reset(lctx);

 return iparams;
 }

-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
 llama_lora_adapter_clear(ctx);
-for (auto & la :
+for (auto & la : lora) {
 if (la.scale != 0.0f) {
-llama_lora_adapter_set(ctx, la.
+llama_lora_adapter_set(ctx, la.ptr, la.scale);
 }
 }
 }
@@ -1105,7 +1111,7 @@ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2

-static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
 int remaining_attempts = max_attempts;

 while (remaining_attempts > 0) {
@@ -1129,7 +1135,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
 }

 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-
 // Initialize libcurl
 std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
 if (!curl) {
@@ -1159,8 +1164,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
 #endif

 // Check if the file already exists locally
-
-auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+auto file_exists = std::filesystem::exists(path);

 // If the file exists, check its JSON metadata companion file.
 std::string metadata_path = path + ".json";
@@ -1202,11 +1206,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
 std::string etag;
 std::string last_modified;
 };
+
 common_load_model_from_url_headers headers;
+
 {
 typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
 auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
+common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;

 static std::regex header_regex("([^:]+): (.*)\r\n");
 static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1418,7 +1424,7 @@ struct llama_model * common_load_model_from_url(
 }
 }

-return
+return llama_model_load_from_file(local_path.c_str(), params);
 }

 struct llama_model * common_load_model_from_hf(
@@ -1621,6 +1627,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 // Chat template utils
 //

+std::string common_get_builtin_chat_template(const struct llama_model * model) {
+static const char * template_key = "tokenizer.chat_template";
+// call with NULL buffer to get the total size of the string
+int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+if (res > 0) {
+std::vector<char> model_template(res + 1, 0);
+llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+return std::string(model_template.data(), model_template.size() - 1);
+}
+return "";
+}
+
 bool common_chat_verify_template(const std::string & tmpl) {
 llama_chat_message chat[] = {{"user", "test"}};
 int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
@@ -1790,7 +1808,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
 break;
 case 0: // max absolute
 for (int i = 0; i < n; i++) {
-if (sum < std::abs(inp[i]))
+if (sum < std::abs(inp[i])) {
+sum = std::abs(inp[i]);
+}
 }
 sum /= 32760.0; // make an int16 range
 break;
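Note: common_get_builtin_chat_template(), added in the hunk above, reads the tokenizer.chat_template key from the model's GGUF metadata and returns an empty string when the key is absent. A hedged usage sketch that combines it with the existing common_chat_verify_template(); the function and variable names below (pick_chat_template, user_template) are hypothetical, not part of the package:

    #include <string>
    #include "common.h"

    // Prefer a caller-supplied template when it passes verification, otherwise
    // fall back to the template embedded in the model metadata (may be "").
    static std::string pick_chat_template(const llama_model * model, const std::string & user_template) {
        if (!user_template.empty() && common_chat_verify_template(user_template)) {
            return user_template;
        }
        return common_get_builtin_chat_template(model);
    }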
package/cpp/common.h CHANGED
@@ -2,7 +2,7 @@

 #pragma once

-#include "llama.h"
+#include "llama-cpp.h"

 #include <string>
 #include <vector>
@@ -27,10 +27,8 @@
 struct common_lora_adapter_info {
 std::string path;
 float scale;
-};

-struct
-struct llama_lora_adapter * adapter;
+struct llama_lora_adapter * ptr;
 };

 using llama_tokens = std::vector<llama_token>;
@@ -91,6 +89,7 @@ enum llama_example {
 LLAMA_EXAMPLE_LLAVA,
 LLAMA_EXAMPLE_LOOKUP,
 LLAMA_EXAMPLE_PARALLEL,
+LLAMA_EXAMPLE_TTS,

 LLAMA_EXAMPLE_COUNT,
 };
@@ -170,6 +169,7 @@ struct common_params_sampling {

 struct common_params_speculative {
 std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
+
 int32_t n_ctx = 0; // draft context size
 int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
 int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
@@ -183,6 +183,14 @@ struct common_params_speculative {
 std::string model = ""; // draft model for speculative decoding // NOLINT
 };

+struct common_params_vocoder {
+std::string hf_repo = ""; // HF repo // NOLINT
+std::string hf_file = ""; // HF file // NOLINT
+
+std::string model = ""; // model path // NOLINT
+std::string model_url = ""; // model url to download // NOLINT
+};
+
 struct common_params {

 void * progress_callback_user_data = nullptr;
@@ -229,8 +237,9 @@ struct common_params {
 enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
 enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-struct common_params_sampling
+struct common_params_sampling sampling;
 struct common_params_speculative speculative;
+struct common_params_vocoder vocoder;

 std::string model = ""; // model path // NOLINT
 std::string model_alias = ""; // model alias // NOLINT
@@ -482,10 +491,12 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //

+// note: defines object's lifetime
 struct common_init_result {
-
-
-
+llama_model_ptr model;
+llama_context_ptr context;
+
+std::vector<llama_lora_adapter_ptr> lora;
 };

 struct common_init_result common_init_from_params(common_params & params);
@@ -507,7 +518,7 @@ struct llama_model * common_load_model_from_hf(
 const struct llama_model_params & params);

 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);

 //
 // Batch utils
@@ -575,6 +586,9 @@ struct common_chat_msg {
 std::string content;
 };

+// Get the built-in chat template for the model. Return empty string if not present.
+std::string common_get_builtin_chat_template(const struct llama_model * model);
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl);

@@ -611,7 +625,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
 // Embedding utils
 //

-
+// TODO: repace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);

 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

@@ -640,6 +655,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //

-
-
-
+namespace {
+
+const char * const LLM_KV_SPLIT_NO = "split.no";
+const char * const LLM_KV_SPLIT_COUNT = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
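Note: common_init_result now owns the loaded model, context and LoRA adapters through llama_model_ptr, llama_context_ptr and llama_lora_adapter_ptr (the unique_ptr wrappers from llama-cpp.h), which is what the "note: defines object's lifetime" comment refers to. A minimal consumption sketch under that assumption; run_once is a hypothetical function and the common_params setup is elided:

    #include "common.h"

    static void run_once(common_params & params) {
        common_init_result init = common_init_from_params(params);

        // raw pointers are only borrowed; ownership stays with `init`
        llama_model   * model = init.model.get();
        llama_context * lctx  = init.context.get();
        if (model == nullptr || lctx == nullptr) {
            return; // loading failed; nothing to release by hand
        }

        // example borrows: query vocab/context size through the C API
        const auto n_vocab = llama_n_vocab(model);
        const auto n_ctx   = llama_n_ctx(lctx);
        (void) n_vocab; (void) n_ctx;
    }   // `init` going out of scope releases the adapters, then the context, then the model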
package/cpp/ggml-alloc.c CHANGED