cui-llama.rn 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/android/src/main/CMakeLists.txt +14 -8
  2. package/android/src/main/jni.cpp +38 -37
  3. package/cpp/common.cpp +43 -26
  4. package/cpp/common.h +18 -11
  5. package/cpp/ggml-backend-reg.cpp +5 -0
  6. package/cpp/ggml-backend.cpp +5 -2
  7. package/cpp/ggml-cpp.h +1 -0
  8. package/cpp/ggml-cpu-aarch64.cpp +6 -1
  9. package/cpp/ggml-cpu-quants.c +5 -1
  10. package/cpp/ggml-impl.h +11 -16
  11. package/cpp/ggml-metal.m +2 -2
  12. package/cpp/ggml.c +0 -1276
  13. package/cpp/ggml.h +0 -140
  14. package/cpp/gguf.cpp +1325 -0
  15. package/cpp/gguf.h +202 -0
  16. package/cpp/llama-adapter.cpp +346 -0
  17. package/cpp/llama-adapter.h +73 -0
  18. package/cpp/llama-arch.cpp +1434 -0
  19. package/cpp/llama-arch.h +395 -0
  20. package/cpp/llama-batch.cpp +368 -0
  21. package/cpp/llama-batch.h +88 -0
  22. package/cpp/llama-chat.cpp +567 -0
  23. package/cpp/llama-chat.h +51 -0
  24. package/cpp/llama-context.cpp +1771 -0
  25. package/cpp/llama-context.h +128 -0
  26. package/cpp/llama-cparams.cpp +1 -0
  27. package/cpp/llama-cparams.h +37 -0
  28. package/cpp/llama-cpp.h +30 -0
  29. package/cpp/llama-grammar.cpp +1 -0
  30. package/cpp/llama-grammar.h +3 -1
  31. package/cpp/llama-hparams.cpp +71 -0
  32. package/cpp/llama-hparams.h +140 -0
  33. package/cpp/llama-impl.cpp +167 -0
  34. package/cpp/llama-impl.h +16 -136
  35. package/cpp/llama-kv-cache.cpp +718 -0
  36. package/cpp/llama-kv-cache.h +218 -0
  37. package/cpp/llama-mmap.cpp +589 -0
  38. package/cpp/llama-mmap.h +67 -0
  39. package/cpp/llama-model-loader.cpp +1011 -0
  40. package/cpp/llama-model-loader.h +158 -0
  41. package/cpp/llama-model.cpp +2202 -0
  42. package/cpp/llama-model.h +391 -0
  43. package/cpp/llama-sampling.cpp +117 -4
  44. package/cpp/llama-vocab.cpp +21 -28
  45. package/cpp/llama-vocab.h +13 -1
  46. package/cpp/llama.cpp +8437 -19421
  47. package/cpp/llama.cpp.rej +23 -0
  48. package/cpp/llama.h +31 -6
  49. package/cpp/rn-llama.hpp +39 -37
  50. package/cpp/sgemm.cpp +776 -70
  51. package/cpp/unicode.cpp +6 -0
  52. package/package.json +1 -1
package/android/src/main/CMakeLists.txt CHANGED
@@ -9,17 +9,23 @@ include_directories(${RNLLAMA_LIB_DIR})
 
  set(
  SOURCE_FILES
- ${RNLLAMA_LIB_DIR}/llama-grammar.cpp
- ${RNLLAMA_LIB_DIR}/llama-sampling.cpp
- ${RNLLAMA_LIB_DIR}/llama-vocab.cpp
- ${RNLLAMA_LIB_DIR}/log.cpp
-
- #${RNLLAMA_LIB_DIR}/amx/amx.cpp
- #${RNLLAMA_LIB_DIR}/amx/mmq.cpp
 
+ ${RNLLAMA_LIB_DIR}/common.cpp
  ${RNLLAMA_LIB_DIR}/llama-grammar.cpp
  ${RNLLAMA_LIB_DIR}/llama-sampling.cpp
  ${RNLLAMA_LIB_DIR}/llama-vocab.cpp
+ ${RNLLAMA_LIB_DIR}/llama-chat.cpp
+ ${RNLLAMA_LIB_DIR}/llama-mmap.cpp
+ ${RNLLAMA_LIB_DIR}/llama-context.cpp
+ ${RNLLAMA_LIB_DIR}/llama-kv-cache.cpp
+ ${RNLLAMA_LIB_DIR}/llama-model-loader.cpp
+ ${RNLLAMA_LIB_DIR}/llama-model.cpp
+ ${RNLLAMA_LIB_DIR}/llama-batch.cpp
+ ${RNLLAMA_LIB_DIR}/llama-arch.cpp
+ ${RNLLAMA_LIB_DIR}/llama-cparams.cpp
+ ${RNLLAMA_LIB_DIR}/llama-hparams.cpp
+ ${RNLLAMA_LIB_DIR}/llama-adapter.cpp
+ ${RNLLAMA_LIB_DIR}/llama-impl.cpp
  ${RNLLAMA_LIB_DIR}/log.cpp
  ${RNLLAMA_LIB_DIR}/json.hpp
  ${RNLLAMA_LIB_DIR}/json-schema-to-grammar.cpp
@@ -28,6 +34,7 @@ set(
  ${RNLLAMA_LIB_DIR}/ggml-backend.cpp
  ${RNLLAMA_LIB_DIR}/ggml-backend-reg.cpp
  ${RNLLAMA_LIB_DIR}/ggml.c
+ ${RNLLAMA_LIB_DIR}/gguf.cpp
  ${RNLLAMA_LIB_DIR}/ggml-cpu.c
  ${RNLLAMA_LIB_DIR}/ggml-cpu.cpp
  ${RNLLAMA_LIB_DIR}/ggml-cpu-aarch64.cpp
@@ -35,7 +42,6 @@ set(
  ${RNLLAMA_LIB_DIR}/ggml-cpu-quants.c
  ${RNLLAMA_LIB_DIR}/ggml-threading.cpp
  ${RNLLAMA_LIB_DIR}/ggml-quants.c
- ${RNLLAMA_LIB_DIR}/common.cpp
  ${RNLLAMA_LIB_DIR}/sampling.cpp
  ${RNLLAMA_LIB_DIR}/unicode-data.cpp
  ${RNLLAMA_LIB_DIR}/unicode.cpp
package/android/src/main/jni.cpp CHANGED
@@ -11,7 +11,8 @@
  #include <unordered_map>
  #include "llama.h"
  #include "llama-impl.h"
- #include "ggml.h"
+ #include "llama-context.h"
+ #include "gguf.h"
  #include "rn-llama.hpp"
 
  #define UNUSED(x) (void)(x)
@@ -336,17 +337,17 @@ Java_com_rnllama_LlamaContext_initContext(
 
  LOGI("[RNLlama] is_model_loaded %s", (is_model_loaded ? "true" : "false"));
  if (is_model_loaded) {
- if (embedding && llama_model_has_encoder(llama->model) && llama_model_has_decoder(llama->model)) {
+ if (embedding && llama_model_has_encoder(llama->model.get()) && llama_model_has_decoder(llama->model.get())) {
  LOGI("[RNLlama] computing embeddings in encoder-decoder models is not supported");
- llama_free(llama->ctx);
+ llama_free(llama->ctx.get());
  return -1;
  }
- context_map[(long) llama->ctx] = llama;
+ context_map[(long) llama->ctx.get()] = llama;
  } else {
- llama_free(llama->ctx);
+ llama_free(llama->ctx.get());
  }
 
- return reinterpret_cast<jlong>(llama->ctx);
+ return reinterpret_cast<jlong>(llama->ctx.get());
  }
 
 
@@ -372,13 +373,13 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
  UNUSED(thiz);
  auto llama = context_map[(long) context_ptr];
 
- int count = llama_model_meta_count(llama->model);
+ int count = llama_model_meta_count(llama->model.get());
  auto meta = createWriteableMap(env);
  for (int i = 0; i < count; i++) {
  char key[256];
- llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
+ llama_model_meta_key_by_index(llama->model.get(), i, key, sizeof(key));
  char val[2048];
- llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));
+ llama_model_meta_val_str_by_index(llama->model.get(), i, val, sizeof(val));
 
  putString(env, meta, key, val);
  }
@@ -386,10 +387,10 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
  auto result = createWriteableMap(env);
 
  char desc[1024];
- llama_model_desc(llama->model, desc, sizeof(desc));
+ llama_model_desc(llama->model.get(), desc, sizeof(desc));
  putString(env, result, "desc", desc);
- putDouble(env, result, "size", llama_model_size(llama->model));
- putDouble(env, result, "nParams", llama_model_n_params(llama->model));
+ putDouble(env, result, "size", llama_model_size(llama->model.get()));
+ putDouble(env, result, "nParams", llama_model_n_params(llama->model.get()));
  putBoolean(env, result, "isChatTemplateSupported", llama->validateModelChatTemplate());
  putMap(env, result, "metadata", meta);
 
@@ -431,7 +432,7 @@ Java_com_rnllama_LlamaContext_getFormattedChat(
  }
 
  const char *tmpl_chars = env->GetStringUTFChars(chat_template, nullptr);
- std::string formatted_chat = common_chat_apply_template(llama->model, tmpl_chars, chat, true);
+ std::string formatted_chat = common_chat_apply_template(llama->model.get(), tmpl_chars, chat, true);
 
  return env->NewStringUTF(formatted_chat.c_str());
  }
@@ -450,7 +451,7 @@ Java_com_rnllama_LlamaContext_loadSession(
  auto result = createWriteableMap(env);
  size_t n_token_count_out = 0;
  llama->embd.resize(llama->params.n_ctx);
- if (!llama_state_load_file(llama->ctx, path_chars, llama->embd.data(), llama->embd.capacity(), &n_token_count_out)) {
+ if (!llama_state_load_file(llama->ctx.get(), path_chars, llama->embd.data(), llama->embd.capacity(), &n_token_count_out)) {
  env->ReleaseStringUTFChars(path, path_chars);
 
  putString(env, result, "error", "Failed to load session");
@@ -459,7 +460,7 @@ Java_com_rnllama_LlamaContext_loadSession(
  llama->embd.resize(n_token_count_out);
  env->ReleaseStringUTFChars(path, path_chars);
 
- const std::string text = rnllama::tokens_to_str(llama->ctx, llama->embd.cbegin(), llama->embd.cend());
+ const std::string text = rnllama::tokens_to_str(llama->ctx.get(), llama->embd.cbegin(), llama->embd.cend());
  putInt(env, result, "tokens_loaded", n_token_count_out);
  putString(env, result, "prompt", text.c_str());
  return reinterpret_cast<jobject>(result);
@@ -481,7 +482,7 @@ Java_com_rnllama_LlamaContext_saveSession(
  std::vector<llama_token> session_tokens = llama->embd;
  int default_size = session_tokens.size();
  int save_size = size > 0 && size <= default_size ? size : default_size;
- if (!llama_state_save_file(llama->ctx, path_chars, session_tokens.data(), save_size)) {
+ if (!llama_state_save_file(llama->ctx.get(), path_chars, session_tokens.data(), save_size)) {
  env->ReleaseStringUTFChars(path, path_chars);
  return -1;
  }
@@ -499,13 +500,13 @@ static inline jobject tokenProbsToMap(
  for (const auto &prob : probs) {
  auto probsForToken = createWritableArray(env);
  for (const auto &p : prob.probs) {
- std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, p.tok);
+ std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx.get(), p.tok);
  auto probResult = createWriteableMap(env);
  putString(env, probResult, "tok_str", tokStr.c_str());
  putDouble(env, probResult, "prob", p.prob);
  pushMap(env, probsForToken, probResult);
  }
- std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, prob.tok);
+ std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx.get(), prob.tok);
  auto tokenResult = createWriteableMap(env);
  putString(env, tokenResult, "content", tokStr.c_str());
  putArray(env, tokenResult, "probs", probsForToken);
@@ -555,7 +556,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 
  llama->rewind();
 
- //llama_reset_timings(llama->ctx);
+ //llama_reset_timings(llama->ctx.get());
 
  llama->params.prompt = env->GetStringUTFChars(prompt, nullptr);
  llama->params.sampling.seed = (seed == -1) ? time(NULL) : seed;
@@ -593,7 +594,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 
  sparams.logit_bias.clear();
  if (ignore_eos) {
- sparams.logit_bias[llama_token_eos(llama->model)].bias = -INFINITY;
+ sparams.logit_bias[llama_token_eos(llama->model.get())].bias = -INFINITY;
  }
 
  // dry break seq
@@ -612,7 +613,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  sparams.dry_sequence_breakers = dry_sequence_breakers_vector;
 
  // logit bias
- const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx));
+ const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx.get()));
  jsize logit_bias_len = env->GetArrayLength(logit_bias);
 
  for (jsize i = 0; i < logit_bias_len; i++) {
@@ -659,7 +660,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  if (token_with_probs.tok == -1 || llama->incomplete) {
  continue;
  }
- const std::string token_text = common_token_to_piece(llama->ctx, token_with_probs.tok);
+ const std::string token_text = common_token_to_piece(llama->ctx.get(), token_with_probs.tok);
 
  size_t pos = std::min(sent_count, llama->generated_text.size());
 
@@ -694,7 +695,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  putString(env, tokenResult, "token", to_send.c_str());
 
  if (llama->params.sampling.n_probs > 0) {
- const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx, to_send, false);
+ const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx.get(), to_send, false);
  size_t probs_pos = std::min(sent_token_probs_index, llama->generated_token_probs.size());
  size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama->generated_token_probs.size());
  if (probs_pos < probs_stop_pos) {
@@ -711,7 +712,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  }
  }
 
- llama_perf_context_print(llama->ctx);
+ llama_perf_context_print(llama->ctx.get());
  llama->is_predicting = false;
 
  auto result = createWriteableMap(env);
@@ -726,7 +727,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
  putString(env, result, "stopping_word", llama->stopping_word.c_str());
  putInt(env, result, "tokens_cached", llama->n_past);
 
- const auto timings_token = llama_perf_context(llama -> ctx);
+ const auto timings_token = llama_perf_context(llama -> ctx.get());
 
  auto timingsResult = createWriteableMap(env);
  putInt(env, timingsResult, "prompt_n", timings_token.n_p_eval);
@@ -770,7 +771,7 @@ Java_com_rnllama_LlamaContext_tokenize(
  const char *text_chars = env->GetStringUTFChars(text, nullptr);
 
  const std::vector<llama_token> toks = common_tokenize(
- llama->ctx,
+ llama->ctx.get(),
  text_chars,
  false
  );
@@ -797,7 +798,7 @@ Java_com_rnllama_LlamaContext_detokenize(
  toks.push_back(tokens_ptr[i]);
  }
 
- auto text = rnllama::tokens_to_str(llama->ctx, toks.cbegin(), toks.cend());
+ auto text = rnllama::tokens_to_str(llama->ctx.get(), toks.cbegin(), toks.cend());
 
  env->ReleaseIntArrayElements(tokens, tokens_ptr, 0);
 
@@ -834,7 +835,7 @@ Java_com_rnllama_LlamaContext_embedding(
 
  llama->rewind();
 
- llama_perf_context_reset(llama->ctx);
+ llama_perf_context_reset(llama->ctx.get());
 
  llama->params.prompt = text_chars;
 
@@ -860,7 +861,7 @@ Java_com_rnllama_LlamaContext_embedding(
 
  auto promptTokens = createWritableArray(env);
  for (const auto &tok : llama->embd) {
- pushString(env, promptTokens, common_token_to_piece(llama->ctx, tok).c_str());
+ pushString(env, promptTokens, common_token_to_piece(llama->ctx.get(), tok).c_str());
  }
  putArray(env, result, "prompt_tokens", promptTokens);
 
@@ -890,17 +891,17 @@ Java_com_rnllama_LlamaContext_freeContext(
  UNUSED(env);
  UNUSED(thiz);
  auto llama = context_map[(long) context_ptr];
- if (llama->model) {
- llama_free_model(llama->model);
+ if (llama->model.get()) {
+ llama_model_free(llama->model.get());
  }
- if (llama->ctx) {
- llama_free(llama->ctx);
+ if (llama->ctx.get()) {
+ llama_free(llama->ctx.get());
  }
- if (llama->ctx_sampling != nullptr)
+ /*if (llama->ctx.get()-> != nullptr)
  {
- common_sampler_free(llama->ctx_sampling);
- }
- context_map.erase((long) llama->ctx);
+ common_sampler_free(llama->ctx.get() -> _sampling);
+ }*/
+ context_map.erase((long) llama->ctx.get());
  }
 
  JNIEXPORT void JNICALL
package/cpp/common.cpp CHANGED
@@ -2,6 +2,9 @@
  #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
  #endif
 
+ #include "ggml.h"
+ #include "gguf.h"
+
  #include "common.h"
  #include "log.h"
  // Change JSON_ASSERT from assert() to LM_GGML_ASSERT:
@@ -18,6 +21,7 @@
  #include <cstdarg>
  #include <cstring>
  #include <ctime>
+ #include <filesystem>
  #include <fstream>
  #include <iostream>
  #include <iterator>
@@ -68,7 +72,9 @@ char const *LLAMA_BUILD_TARGET = "unknown";
  #ifdef __linux__
  #include <linux/limits.h>
  #elif defined(_WIN32)
- #define PATH_MAX MAX_PATH
+ # if !defined(PATH_MAX)
+ # define PATH_MAX MAX_PATH
+ # endif
  #else
  #include <sys/syslimits.h>
  #endif
@@ -849,7 +855,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  } else if (!params.model_url.empty()) {
  model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
  } else {
- model = llama_load_model_from_file(params.model.c_str(), mparams);
+ model = llama_model_load_from_file(params.model.c_str(), mparams);
  }
 
  if (model == NULL) {
@@ -876,7 +882,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  }
 
  if (!ok) {
- llama_free_model(model);
+ llama_model_free(model);
 
  return iparams;
  }
@@ -887,14 +893,13 @@ struct common_init_result common_init_from_params(common_params & params) {
  llama_context * lctx = llama_new_context_with_model(model, cparams);
  if (lctx == NULL) {
  LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
- llama_free_model(model);
+ llama_model_free(model);
  return iparams;
  }
 
  if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
- LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
- llama_free_model(model);
- return iparams;
+ LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+ params.ctx_shift = false;
  }
 
  if (!params.control_vectors.empty()) {
@@ -904,7 +909,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  const auto cvec = common_control_vector_load(params.control_vectors);
  if (cvec.n_embd == -1) {
  llama_free(lctx);
- llama_free_model(model);
+ llama_model_free(model);
 
  return iparams;
  }
@@ -917,7 +922,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  params.control_vector_layer_end);
  if (err) {
  llama_free(lctx);
- llama_free_model(model);
+ llama_model_free(model);
 
  return iparams;
  }
@@ -925,20 +930,21 @@ struct common_init_result common_init_from_params(common_params & params) {
 
  // load and optionally apply lora adapters
  for (auto & la : params.lora_adapters) {
- common_lora_adapter_container loaded_la;
- loaded_la.path = la.path;
- loaded_la.scale = la.scale;
- loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
- if (loaded_la.adapter == nullptr) {
+ llama_lora_adapter_ptr lora;
+ lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+ if (lora == nullptr) {
  LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
  llama_free(lctx);
- llama_free_model(model);
+ llama_model_free(model);
  return iparams;
  }
- iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+
+ la.ptr = lora.get();
+ iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
  }
+
  if (!params.lora_init_without_apply) {
- common_lora_adapters_apply(lctx, iparams.lora_adapters);
+ common_lora_adapters_apply(lctx, params.lora_adapters);
  }
 
  if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -985,7 +991,7 @@ struct common_init_result common_init_from_params(common_params & params) {
  if (llama_model_has_encoder(model)) {
  llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
  llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
- if (decoder_start_token_id == -1) {
+ if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
  decoder_start_token_id = bos;
  }
  tmp.clear();
@@ -999,17 +1005,17 @@ struct common_init_result common_init_from_params(common_params & params) {
  llama_perf_context_reset(lctx);
  }
 
- iparams.model = model;
- iparams.context = lctx;
+ iparams.model.reset(model);
+ iparams.context.reset(lctx);
 
  return iparams;
  }
 
- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
  llama_lora_adapter_clear(ctx);
- for (auto & la : lora_adapters) {
+ for (auto & la : lora) {
  if (la.scale != 0.0f) {
- llama_lora_adapter_set(ctx, la.adapter, la.scale);
+ llama_lora_adapter_set(ctx, la.ptr, la.scale);
  }
  }
  }
@@ -1158,8 +1164,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
  #endif
 
  // Check if the file already exists locally
- struct stat model_file_info;
- auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+ auto file_exists = std::filesystem::exists(path);
 
  // If the file exists, check its JSON metadata companion file.
  std::string metadata_path = path + ".json";
@@ -1419,7 +1424,7 @@ struct llama_model * common_load_model_from_url(
  }
  }
 
- return llama_load_model_from_file(local_path.c_str(), params);
+ return llama_model_load_from_file(local_path.c_str(), params);
  }
 
  struct llama_model * common_load_model_from_hf(
@@ -1622,6 +1627,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
  // Chat template utils
  //
 
+ std::string common_get_builtin_chat_template(const struct llama_model * model) {
+ static const char * template_key = "tokenizer.chat_template";
+ // call with NULL buffer to get the total size of the string
+ int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+ if (res > 0) {
+ std::vector<char> model_template(res + 1, 0);
+ llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+ return std::string(model_template.data(), model_template.size() - 1);
+ }
+ return "";
+ }
+
  bool common_chat_verify_template(const std::string & tmpl) {
  llama_chat_message chat[] = {{"user", "test"}};
  int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
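
A minimal caller-side sketch of the common_get_builtin_chat_template helper introduced above, using the renamed llama_model_load_from_file / llama_model_free entry points from this release; the model path is hypothetical and the snippet is illustrative only, not part of the package:

    #include "common.h"
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_model_params mparams = llama_model_default_params();
        // hypothetical path, for illustration only
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }
        // returns "" when the GGUF metadata has no tokenizer.chat_template entry
        const std::string tmpl = common_get_builtin_chat_template(model);
        std::printf("built-in chat template:\n%s\n", tmpl.c_str());
        llama_model_free(model);
        return 0;
    }
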
package/cpp/common.h CHANGED
@@ -2,7 +2,7 @@
 
  #pragma once
 
- #include "llama.h"
+ #include "llama-cpp.h"
 
  #include <string>
  #include <vector>
@@ -27,10 +27,8 @@
  struct common_lora_adapter_info {
  std::string path;
  float scale;
- };
 
- struct common_lora_adapter_container : common_lora_adapter_info {
- struct llama_lora_adapter * adapter;
+ struct llama_lora_adapter * ptr;
  };
 
  using llama_tokens = std::vector<llama_token>;
@@ -493,10 +491,12 @@ std::string fs_get_cache_file(const std::string & filename);
  // Model utils
  //
 
+ // note: defines object's lifetime
  struct common_init_result {
- struct llama_model * model = nullptr;
- struct llama_context * context = nullptr;
- std::vector<common_lora_adapter_container> lora_adapters;
+ llama_model_ptr model;
+ llama_context_ptr context;
+
+ std::vector<llama_lora_adapter_ptr> lora;
  };
 
  struct common_init_result common_init_from_params(common_params & params);
@@ -518,7 +518,7 @@ struct llama_model * common_load_model_from_hf(
  const struct llama_model_params & params);
 
  // clear LoRA adapters from context, then apply new list of adapters
- void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
 
  //
  // Batch utils
@@ -586,6 +586,9 @@ struct common_chat_msg {
  std::string content;
  };
 
+ // Get the built-in chat template for the model. Return empty string if not present.
+ std::string common_get_builtin_chat_template(const struct llama_model * model);
+
  // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
  bool common_chat_verify_template(const std::string & tmpl);
 
@@ -652,6 +655,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
  // Split utils
  //
 
- static const char * const LLM_KV_SPLIT_NO = "split.no";
- static const char * const LLM_KV_SPLIT_COUNT = "split.count";
- static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+ namespace {
+
+ const char * const LLM_KV_SPLIT_NO = "split.no";
+ const char * const LLM_KV_SPLIT_COUNT = "split.count";
+ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+ }
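
With common_init_result now owning the model and context through llama_model_ptr / llama_context_ptr from llama-cpp.h, callers hold the result and pass raw handles via .get(), the same pattern jni.cpp uses with llama->model.get() and llama->ctx.get(). A minimal sketch under those assumptions (filling in common_params is omitted); illustrative only, not part of the package:

    #include "common.h"
    #include <cstdio>

    // illustrative helper; common_params is assumed to be populated elsewhere
    static void run(common_params & params) {
        common_init_result init = common_init_from_params(params);
        if (!init.model || !init.context) {
            std::fprintf(stderr, "failed to load model or create context\n");
            return;
        }
        llama_model   * model = init.model.get();   // raw handle, ownership stays with init
        llama_context * lctx  = init.context.get();
        std::printf("n_ctx = %u\n", llama_n_ctx(lctx));
        std::printf("n_params = %llu\n", (unsigned long long) llama_model_n_params(model));
        // model, context and any adapters in init.lora are released when init goes out of scope
    }
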
package/cpp/ggml-backend-reg.cpp CHANGED
@@ -574,4 +574,9 @@ void lm_ggml_backend_load_all_from_path(const char * dir_path) {
  lm_ggml_backend_load_best("opencl", silent, dir_path);
  lm_ggml_backend_load_best("musa", silent, dir_path);
  lm_ggml_backend_load_best("cpu", silent, dir_path);
+ // check the environment variable LM_GGML_BACKEND_PATH to load an out-of-tree backend
+ const char * backend_path = std::getenv("LM_GGML_BACKEND_PATH");
+ if (backend_path) {
+ lm_ggml_backend_load(backend_path);
+ }
  }
package/cpp/ggml-backend.cpp CHANGED
@@ -764,7 +764,7 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch
  if (tensor->op != LM_GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
  int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
  // check if a backend with higher prio wants to offload the op
- if (src_backend_id == sched->n_backends - 1) {
+ if (src_backend_id == sched->n_backends - 1 && lm_ggml_backend_buffer_is_host(src->buffer)) {
  for (int b = 0; b < src_backend_id; b++) {
  if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
  SET_CAUSE(tensor, "1.off");
@@ -795,9 +795,12 @@ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sche
  for (int i = 0; i < graph->n_nodes; i++) {
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
  lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
- LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
+ LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, lm_ggml_backend_name(split_backend),
  sched->splits[cur_split].n_inputs);
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+ if (j == 0) {
+ LM_GGML_LOG_DEBUG(": ");
+ }
  LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
  fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
  }
package/cpp/ggml-cpp.h CHANGED
@@ -7,6 +7,7 @@
  #include "ggml.h"
  #include "ggml-alloc.h"
  #include "ggml-backend.h"
+ #include "gguf.h"
  #include <memory>
 
  // Smart pointers for ggml types
package/cpp/ggml-cpu-aarch64.cpp CHANGED
@@ -194,9 +194,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
  }
 
  static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
- #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+ #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
  const __m256i zero = _mm256_setzero_si256();
  return _mm256_dpbusd_epi32(zero, ax, sy);
+ #elif defined(__AVXVNNI__)
+ const __m256i zero = _mm256_setzero_si256();
+ return _mm256_dpbusd_avx_epi32(zero, ax, sy);
  #else
  // Perform multiplication and create 16-bit values
  const __m256i dot = _mm256_maddubs_epi16(ax, sy);
@@ -4166,6 +4169,8 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_aarch64_buffer_type_alloc_bu
  buffer->buft = buft;
  buffer->iface.init_tensor = lm_ggml_backend_cpu_aarch64_buffer_init_tensor;
  buffer->iface.set_tensor = lm_ggml_backend_cpu_aarch64_buffer_set_tensor;
+ buffer->iface.get_tensor = nullptr;
+ buffer->iface.cpy_tensor = nullptr;
  return buffer;
  }
 
package/cpp/ggml-cpu-quants.c CHANGED
@@ -103,10 +103,14 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
  }
 
  static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
- #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+ #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
  const __m256i zero = _mm256_setzero_si256();
  const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
  return _mm256_cvtepi32_ps(summed_pairs);
+ #elif defined(__AVXVNNI__)
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
+ return _mm256_cvtepi32_ps(summed_pairs);
  #else
  // Perform multiplication and create 16-bit values
  const __m256i dot = _mm256_maddubs_epi16(ax, sy);
package/cpp/ggml-impl.h CHANGED
@@ -3,6 +3,8 @@
  // GGML internal header
 
  #include "ggml.h"
+ #include "gguf.h"
+
  #include <assert.h>
  #include <math.h>
  #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
@@ -551,22 +553,15 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
  #define LM_GGML_FP32_TO_BF16(x) lm_ggml_compute_fp32_to_bf16(x)
  #define LM_GGML_BF16_TO_FP32(x) lm_ggml_compute_bf16_to_fp32(x)
 
- // expose GGUF internals for test code
-
- LM_GGML_API size_t lm_gguf_type_size(enum lm_gguf_type type);
-
- LM_GGML_API struct lm_gguf_context * lm_gguf_init_from_file_impl(FILE * file, struct lm_gguf_init_params params);
-
- struct lm_gguf_buf {
- void * data;
- size_t size;
- size_t offset;
- };
- LM_GGML_API struct lm_gguf_buf lm_gguf_buf_init(size_t size);
- LM_GGML_API void lm_gguf_buf_free(struct lm_gguf_buf buf);
-
- LM_GGML_API void lm_gguf_write_to_buf(const struct lm_gguf_context * ctx, struct lm_gguf_buf * buf, bool only_meta);
-
  #ifdef __cplusplus
  }
  #endif
+
+ #ifdef __cplusplus
+ #include <vector>
+
+ // expose GGUF internals for test code
+ LM_GGML_API size_t lm_gguf_type_size(enum lm_gguf_type type);
+ LM_GGML_API struct lm_gguf_context * lm_gguf_init_from_file_impl(FILE * file, struct lm_gguf_init_params params);
+ LM_GGML_API void lm_gguf_write_to_buf(const struct lm_gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
+ #endif // __cplusplus
package/cpp/ggml-metal.m CHANGED
@@ -2067,8 +2067,8 @@ static void lm_ggml_metal_encode_node(
  LM_GGML_ASSERT(ne12 % ne02 == 0);
  LM_GGML_ASSERT(ne13 % ne03 == 0);
 
- const uint r2 = ne12/ne02;
- const uint r3 = ne13/ne03;
+ const uint32_t r2 = ne12/ne02;
+ const uint32_t r3 = ne13/ne03;
 
  // find the break-even point where the matrix-matrix kernel becomes more efficient compared
  // to the matrix-vector kernel