@fugood/llama.node 0.3.9 → 0.3.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.js +2 -2
  18. package/lib/binding.ts +47 -8
  19. package/lib/index.js +21 -1
  20. package/lib/index.ts +31 -1
  21. package/package.json +12 -3
  22. package/src/LlamaCompletionWorker.cpp +33 -6
  23. package/src/LlamaCompletionWorker.h +3 -1
  24. package/src/LlamaContext.cpp +336 -28
  25. package/src/LlamaContext.h +2 -0
  26. package/src/common.hpp +19 -2
  27. package/src/llama.cpp/.github/workflows/build.yml +289 -107
  28. package/src/llama.cpp/.github/workflows/close-issue.yml +1 -1
  29. package/src/llama.cpp/.github/workflows/docker.yml +2 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +25 -2
  31. package/src/llama.cpp/CMakeLists.txt +10 -19
  32. package/src/llama.cpp/cmake/build-info.cmake +1 -1
  33. package/src/llama.cpp/common/CMakeLists.txt +32 -0
  34. package/src/llama.cpp/common/arg.cpp +66 -16
  35. package/src/llama.cpp/common/chat-template.hpp +515 -0
  36. package/src/llama.cpp/common/chat.cpp +966 -0
  37. package/src/llama.cpp/common/chat.hpp +52 -0
  38. package/src/llama.cpp/common/common.cpp +159 -36
  39. package/src/llama.cpp/common/common.h +56 -14
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +46 -66
  41. package/src/llama.cpp/common/json-schema-to-grammar.h +15 -1
  42. package/src/llama.cpp/common/llguidance.cpp +270 -0
  43. package/src/llama.cpp/common/log.cpp +1 -10
  44. package/src/llama.cpp/common/log.h +10 -0
  45. package/src/llama.cpp/common/minja.hpp +2868 -0
  46. package/src/llama.cpp/common/sampling.cpp +22 -1
  47. package/src/llama.cpp/common/sampling.h +3 -0
  48. package/src/llama.cpp/docs/build.md +54 -9
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +12 -2
  50. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +1 -1
  51. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  52. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +59 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +133 -14
  54. package/src/llama.cpp/examples/llava/clip.h +2 -0
  55. package/src/llama.cpp/examples/llava/llava.cpp +22 -8
  56. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +9 -1
  57. package/src/llama.cpp/examples/main/main.cpp +26 -25
  58. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +136 -137
  59. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +18 -4
  60. package/src/llama.cpp/examples/run/run.cpp +224 -69
  61. package/src/llama.cpp/examples/server/server.cpp +252 -81
  62. package/src/llama.cpp/examples/server/utils.hpp +73 -21
  63. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +6 -4
  64. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +11 -0
  65. package/src/llama.cpp/ggml/CMakeLists.txt +78 -1
  66. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  67. package/src/llama.cpp/ggml/src/CMakeLists.txt +21 -4
  68. package/src/llama.cpp/ggml/src/ggml-alloc.c +1 -13
  69. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +91 -78
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +7 -7
  71. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -1
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  73. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +46 -0
  74. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +16 -1
  75. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +28 -8
  77. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +5 -7
  78. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +33 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +1 -5
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +323 -121
  81. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  82. package/src/llama.cpp/ggml/src/ggml.c +23 -13
  83. package/src/llama.cpp/include/llama.h +14 -1
  84. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +112 -0
  85. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +46 -0
  86. package/src/llama.cpp/src/CMakeLists.txt +1 -1
  87. package/src/llama.cpp/src/llama-arch.cpp +7 -2
  88. package/src/llama.cpp/src/llama-arch.h +3 -1
  89. package/src/llama.cpp/src/llama-chat.cpp +11 -2
  90. package/src/llama.cpp/src/llama-chat.h +1 -0
  91. package/src/llama.cpp/src/llama-grammar.cpp +86 -6
  92. package/src/llama.cpp/src/llama-grammar.h +22 -1
  93. package/src/llama.cpp/src/llama-mmap.cpp +1 -0
  94. package/src/llama.cpp/src/llama-model-loader.cpp +1 -1
  95. package/src/llama.cpp/src/llama-model.cpp +76 -6
  96. package/src/llama.cpp/src/llama-sampling.cpp +47 -4
  97. package/src/llama.cpp/src/llama-vocab.cpp +10 -4
  98. package/src/llama.cpp/src/llama.cpp +181 -123
  99. package/src/llama.cpp/tests/CMakeLists.txt +4 -0
  100. package/src/llama.cpp/tests/test-backend-ops.cpp +158 -57
  101. package/src/llama.cpp/tests/test-chat-template.cpp +154 -31
  102. package/src/llama.cpp/tests/test-chat.cpp +607 -0
  103. package/src/llama.cpp/tests/test-grammar-integration.cpp +2 -2
  104. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +1140 -0
  105. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -1
  106. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +0 -32
@@ -45,6 +45,7 @@ extern "C" {
45
45
  #endif
46
46
 
47
47
  #include <stddef.h> /* For size_t. */
48
+ #include <stdlib.h>
48
49
 
49
50
  extern const char *linenoiseEditMore;
50
51
 
@@ -69,10 +70,23 @@ struct linenoiseState {
69
70
  int history_index; /* The history index we are currently editing. */
70
71
  };
71
72
 
72
- typedef struct linenoiseCompletions {
73
- size_t len;
74
- char **cvec;
75
- } linenoiseCompletions;
73
+ struct linenoiseCompletions {
74
+ size_t len = 0;
75
+ char ** cvec = nullptr;
76
+ bool to_free = true;
77
+
78
+ ~linenoiseCompletions() {
79
+ if (!to_free) {
80
+ return;
81
+ }
82
+
83
+ for (size_t i = 0; i < len; ++i) {
84
+ free(cvec[i]);
85
+ }
86
+
87
+ free(cvec);
88
+ }
89
+ };
76
90
 
77
91
  /* Non blocking API. */
78
92
  int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt);
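Note that the completions container above changes from a plain C typedef into a C++ struct that releases its own entries in the destructor, so callers no longer free cvec by hand. A minimal standalone sketch of the same RAII pattern (the struct mirrors the header above; the main() usage is illustrative only, not the package's code):

    #include <cstddef>
    #include <cstdlib>
    #include <cstring>

    // Illustrative copy of the new self-freeing completions container.
    struct linenoiseCompletions {
        size_t len = 0;
        char ** cvec = nullptr;
        bool to_free = true; // set to false when ownership is handed elsewhere

        ~linenoiseCompletions() {
            if (!to_free) {
                return;
            }
            for (size_t i = 0; i < len; ++i) {
                free(cvec[i]);
            }
            free(cvec);
        }
    };

    int main() {
        const char * text = "example completion";

        linenoiseCompletions lc;
        lc.cvec    = (char **) malloc(sizeof(char *));
        lc.cvec[0] = (char *) malloc(strlen(text) + 1);
        memcpy(lc.cvec[0], text, strlen(text) + 1);
        lc.len     = 1;
        // no explicit cleanup: the destructor frees both the entries and the array
    }
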
package/src/llama.cpp/examples/run/run.cpp

@@ -24,14 +24,16 @@
 #include <string>
 #include <vector>
 
+#include "chat-template.hpp"
 #include "common.h"
 #include "json.hpp"
 #include "linenoise.cpp/linenoise.h"
 #include "llama-cpp.h"
+#include "log.h"
 
 #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
 [[noreturn]] static void sigint_handler(int) {
-    printf("\n\033[0m");
+    printf("\n" LOG_COL_DEFAULT);
     exit(0); // not ideal, but it's the only way to guarantee exit in all cases
 }
 #endif
@@ -64,6 +66,13 @@ static int printe(const char * fmt, ...) {
     return ret;
 }
 
+static std::string strftime_fmt(const char * fmt, const std::tm & tm) {
+    std::ostringstream oss;
+    oss << std::put_time(&tm, fmt);
+
+    return oss.str();
+}
+
 class Opt {
   public:
     int init(int argc, const char ** argv) {
@@ -105,6 +114,7 @@ class Opt {
     llama_model_params model_params;
     std::string model_;
     std::string user;
+    bool use_jinja = false;
     int context_size = -1, ngl = -1;
     float temperature = -1;
     bool verbose = false;
@@ -145,7 +155,8 @@
             if (handle_option_with_value(argc, argv, i, context_size) == 1) {
                 return 1;
             }
-        } else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) {
+        } else if (options_parsing &&
+                   (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
             if (handle_option_with_value(argc, argv, i, ngl) == 1) {
                 return 1;
             }
@@ -156,6 +167,8 @@
         } else if (options_parsing &&
                    (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
             verbose = true;
+        } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
+            use_jinja = true;
         } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
             help = true;
             return 0;
@@ -176,6 +189,10 @@
             }
         }
 
+        if (model_.empty()){
+            return 1;
+        }
+
         return 0;
     }
 
@@ -190,7 +207,7 @@
            "Options:\n"
            "  -c, --context-size <value>\n"
            "      Context size (default: %d)\n"
-           "  -n, --ngl <value>\n"
+           "  -n, -ngl, --ngl <value>\n"
            "      Number of GPU layers (default: %d)\n"
            "  --temp <value>\n"
            "      Temperature (default: %.1f)\n"
@@ -314,6 +331,10 @@
   public:
     int init(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
              const bool progress, std::string * response_str = nullptr) {
+        if (std::filesystem::exists(output_file)) {
+            return 0;
+        }
+
        std::string output_file_partial;
        curl = curl_easy_init();
        if (!curl) {
@@ -341,7 +362,11 @@
        data.file_size = set_resume_point(output_file_partial);
        set_progress_options(progress, data);
        set_headers(headers);
-        perform(url);
+        CURLcode res = perform(url);
+        if (res != CURLE_OK){
+            printe("Fetching resource '%s' failed: %s\n", url.c_str(), curl_easy_strerror(res));
+            return 1;
+        }
        if (!output_file.empty()) {
            std::filesystem::rename(output_file_partial, output_file);
        }
@@ -406,16 +431,12 @@
        }
    }
 
-    void perform(const std::string & url) {
-        CURLcode res;
+    CURLcode perform(const std::string & url) {
        curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
        curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
        curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https");
        curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
-        res = curl_easy_perform(curl);
-        if (res != CURLE_OK) {
-            printe("curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
-        }
+        return curl_easy_perform(curl);
    }
 
    static std::string human_readable_time(double seconds) {
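The refactor above makes perform() return the CURLcode instead of printing inside the helper, so init() can report which URL failed and abort the download. A standalone libcurl sketch of the same call-site error handling (the URL is illustrative):

    #include <cstdio>
    #include <curl/curl.h>

    // Let the caller decide how to report a failed transfer.
    static CURLcode perform(CURL * curl, const char * url) {
        curl_easy_setopt(curl, CURLOPT_URL, url);
        curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
        curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
        return curl_easy_perform(curl);
    }

    int main() {
        curl_global_init(CURL_GLOBAL_DEFAULT);
        CURL * curl = curl_easy_init();
        if (!curl) {
            return 1;
        }

        const char * url = "https://example.com/model.gguf"; // illustrative URL
        const CURLcode res = perform(curl, url);
        if (res != CURLE_OK) {
            fprintf(stderr, "Fetching resource '%s' failed: %s\n", url, curl_easy_strerror(res));
        }

        curl_easy_cleanup(curl);
        curl_global_cleanup();
        return res == CURLE_OK ? 0 : 1;
    }
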
@@ -553,13 +574,14 @@ class LlamaData {
         }
 
         sampler = initialize_sampler(opt);
+
         return 0;
     }
 
   private:
 #ifdef LLAMA_USE_CURL
-    int download(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
-                 const bool progress, std::string * response_str = nullptr) {
+    int download(const std::string & url, const std::string & output_file, const bool progress,
+                 const std::vector<std::string> & headers = {}, std::string * response_str = nullptr) {
         HttpClient http;
         if (http.init(url, headers, output_file, progress, response_str)) {
             return 1;
@@ -568,48 +590,85 @@ class LlamaData {
         return 0;
     }
 #else
-    int download(const std::string &, const std::vector<std::string> &, const std::string &, const bool,
+    int download(const std::string &, const std::string &, const bool, const std::vector<std::string> & = {},
                  std::string * = nullptr) {
         printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
+
         return 1;
     }
 #endif
 
-    int huggingface_dl(const std::string & model, const std::vector<std::string> headers, const std::string & bn) {
+    // Helper function to handle model tag extraction and URL construction
+    std::pair<std::string, std::string> extract_model_and_tag(std::string & model, const std::string & base_url) {
+        std::string model_tag = "latest";
+        const size_t colon_pos = model.find(':');
+        if (colon_pos != std::string::npos) {
+            model_tag = model.substr(colon_pos + 1);
+            model = model.substr(0, colon_pos);
+        }
+
+        std::string url = base_url + model + "/manifests/" + model_tag;
+
+        return { model, url };
+    }
+
+    // Helper function to download and parse the manifest
+    int download_and_parse_manifest(const std::string & url, const std::vector<std::string> & headers,
+                                    nlohmann::json & manifest) {
+        std::string manifest_str;
+        int ret = download(url, "", false, headers, &manifest_str);
+        if (ret) {
+            return ret;
+        }
+
+        manifest = nlohmann::json::parse(manifest_str);
+
+        return 0;
+    }
+
+    int huggingface_dl(std::string & model, const std::string & bn) {
         // Find the second occurrence of '/' after protocol string
         size_t pos = model.find('/');
         pos = model.find('/', pos + 1);
+        std::string hfr, hff;
+        std::vector<std::string> headers = { "User-Agent: llama-cpp", "Accept: application/json" };
+        std::string url;
+
         if (pos == std::string::npos) {
-            return 1;
+            auto [model_name, manifest_url] = extract_model_and_tag(model, "https://huggingface.co/v2/");
+            hfr = model_name;
+
+            nlohmann::json manifest;
+            int ret = download_and_parse_manifest(manifest_url, headers, manifest);
+            if (ret) {
+                return ret;
+            }
+
+            hff = manifest["ggufFile"]["rfilename"];
+        } else {
+            hfr = model.substr(0, pos);
+            hff = model.substr(pos + 1);
         }
 
-        const std::string hfr = model.substr(0, pos);
-        const std::string hff = model.substr(pos + 1);
-        const std::string url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
-        return download(url, headers, bn, true);
+        url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
+
+        return download(url, bn, true, headers);
     }
 
-    int ollama_dl(std::string & model, const std::vector<std::string> headers, const std::string & bn) {
+    int ollama_dl(std::string & model, const std::string & bn) {
+        const std::vector<std::string> headers = { "Accept: application/vnd.docker.distribution.manifest.v2+json" };
         if (model.find('/') == std::string::npos) {
             model = "library/" + model;
         }
 
-        std::string model_tag = "latest";
-        size_t colon_pos = model.find(':');
-        if (colon_pos != std::string::npos) {
-            model_tag = model.substr(colon_pos + 1);
-            model = model.substr(0, colon_pos);
-        }
-
-        std::string manifest_url = "https://registry.ollama.ai/v2/" + model + "/manifests/" + model_tag;
-        std::string manifest_str;
-        const int ret = download(manifest_url, headers, "", false, &manifest_str);
+        auto [model_name, manifest_url] = extract_model_and_tag(model, "https://registry.ollama.ai/v2/");
+        nlohmann::json manifest;
+        int ret = download_and_parse_manifest(manifest_url, {}, manifest);
         if (ret) {
            return ret;
        }
 
-        nlohmann::json manifest = nlohmann::json::parse(manifest_str);
-        std::string layer;
+        std::string layer;
        for (const auto & l : manifest["layers"]) {
            if (l["mediaType"] == "application/vnd.ollama.image.model") {
                layer = l["digest"];
@@ -617,8 +676,67 @@
             }
         }
 
-        std::string blob_url = "https://registry.ollama.ai/v2/" + model + "/blobs/" + layer;
-        return download(blob_url, headers, bn, true);
+        std::string blob_url = "https://registry.ollama.ai/v2/" + model_name + "/blobs/" + layer;
+
+        return download(blob_url, bn, true, headers);
+    }
+
+    int github_dl(const std::string & model, const std::string & bn) {
+        std::string repository = model;
+        std::string branch = "main";
+        const size_t at_pos = model.find('@');
+        if (at_pos != std::string::npos) {
+            repository = model.substr(0, at_pos);
+            branch = model.substr(at_pos + 1);
+        }
+
+        const std::vector<std::string> repo_parts = string_split(repository, "/");
+        if (repo_parts.size() < 3) {
+            printe("Invalid GitHub repository format\n");
+            return 1;
+        }
+
+        const std::string & org = repo_parts[0];
+        const std::string & project = repo_parts[1];
+        std::string url = "https://raw.githubusercontent.com/" + org + "/" + project + "/" + branch;
+        for (size_t i = 2; i < repo_parts.size(); ++i) {
+            url += "/" + repo_parts[i];
+        }
+
+        return download(url, bn, true);
+    }
+
+    int s3_dl(const std::string & model, const std::string & bn) {
+        const size_t slash_pos = model.find('/');
+        if (slash_pos == std::string::npos) {
+            return 1;
+        }
+
+        const std::string bucket = model.substr(0, slash_pos);
+        const std::string key = model.substr(slash_pos + 1);
+        const char * access_key = std::getenv("AWS_ACCESS_KEY_ID");
+        const char * secret_key = std::getenv("AWS_SECRET_ACCESS_KEY");
+        if (!access_key || !secret_key) {
+            printe("AWS credentials not found in environment\n");
+            return 1;
+        }
+
+        // Generate AWS Signature Version 4 headers
+        // (Implementation requires HMAC-SHA256 and date handling)
+        // Get current timestamp
+        const time_t now = time(nullptr);
+        const tm tm = *gmtime(&now);
+        const std::string date = strftime_fmt("%Y%m%d", tm);
+        const std::string datetime = strftime_fmt("%Y%m%dT%H%M%SZ", tm);
+        const std::vector<std::string> headers = {
+            "Authorization: AWS4-HMAC-SHA256 Credential=" + std::string(access_key) + "/" + date +
+                "/us-east-1/s3/aws4_request",
+            "x-amz-content-sha256: UNSIGNED-PAYLOAD", "x-amz-date: " + datetime
+        };
+
+        const std::string url = "https://" + bucket + ".s3.amazonaws.com/" + key;
+
+        return download(url, bn, true, headers);
     }
 
     std::string basename(const std::string & path) {
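The github_dl helper added above maps a reference of the form org/project/path[@branch] onto a raw.githubusercontent.com URL, defaulting the branch to main. A standalone sketch of that mapping (string_split comes from common.h in the real code, so a minimal stand-in is used here; the example references are illustrative):

    #include <cstddef>
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    // Minimal stand-in for common.h's string_split.
    static std::vector<std::string> split(const std::string & s, char sep) {
        std::vector<std::string> parts;
        std::string part;
        std::istringstream ss(s);
        while (std::getline(ss, part, sep)) {
            parts.push_back(part);
        }
        return parts;
    }

    // Mirrors github_dl's URL construction; returns an empty string on bad input.
    static std::string github_raw_url(const std::string & model) {
        std::string repository = model;
        std::string branch = "main";
        const size_t at_pos = model.find('@');
        if (at_pos != std::string::npos) {
            repository = model.substr(0, at_pos);
            branch     = model.substr(at_pos + 1);
        }

        const std::vector<std::string> repo_parts = split(repository, '/');
        if (repo_parts.size() < 3) {
            return "";
        }

        std::string url = "https://raw.githubusercontent.com/" + repo_parts[0] + "/" + repo_parts[1] + "/" + branch;
        for (size_t i = 2; i < repo_parts.size(); ++i) {
            url += "/" + repo_parts[i];
        }
        return url;
    }

    int main() {
        std::cout << github_raw_url("org/repo/models/model.gguf@dev") << "\n";
        std::cout << github_raw_url("org/repo/models/model.gguf") << "\n"; // branch defaults to main
    }
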
@@ -630,37 +748,44 @@ class LlamaData {
         return path.substr(pos + 1);
     }
 
-    int remove_proto(std::string & model_) {
-        const std::string::size_type pos = model_.find("://");
+    int rm_until_substring(std::string & model_, const std::string & substring) {
+        const std::string::size_type pos = model_.find(substring);
         if (pos == std::string::npos) {
             return 1;
         }
 
-        model_ = model_.substr(pos + 3); // Skip past "://"
+        model_ = model_.substr(pos + substring.size()); // Skip past the substring
         return 0;
     }
 
     int resolve_model(std::string & model_) {
         int ret = 0;
         if (string_starts_with(model_, "file://") || std::filesystem::exists(model_)) {
-            remove_proto(model_);
+            rm_until_substring(model_, "://");
 
             return ret;
         }
 
-        const std::string bn = basename(model_);
-        const std::vector<std::string> headers = { "--header",
-                                                   "Accept: application/vnd.docker.distribution.manifest.v2+json" };
-        if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) {
-            remove_proto(model_);
-            ret = huggingface_dl(model_, headers, bn);
-        } else if (string_starts_with(model_, "ollama://")) {
-            remove_proto(model_);
-            ret = ollama_dl(model_, headers, bn);
-        } else if (string_starts_with(model_, "https://")) {
-            download(model_, headers, bn, true);
-        } else {
-            ret = ollama_dl(model_, headers, bn);
+        const std::string bn = basename(model_);
+        if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://") ||
+            string_starts_with(model_, "hf.co/")) {
+            rm_until_substring(model_, "hf.co/");
+            rm_until_substring(model_, "://");
+            ret = huggingface_dl(model_, bn);
+        } else if ((string_starts_with(model_, "https://") || string_starts_with(model_, "http://")) &&
+                   !string_starts_with(model_, "https://ollama.com/library/")) {
+            ret = download(model_, bn, true);
+        } else if (string_starts_with(model_, "github:") || string_starts_with(model_, "github://")) {
+            rm_until_substring(model_, "github:");
+            rm_until_substring(model_, "://");
+            ret = github_dl(model_, bn);
+        } else if (string_starts_with(model_, "s3://")) {
+            rm_until_substring(model_, "://");
+            ret = s3_dl(model_, bn);
+        } else { // ollama:// or nothing
+            rm_until_substring(model_, "ollama.com/library/");
+            rm_until_substring(model_, "://");
+            ret = ollama_dl(model_, bn);
         }
 
         model_ = bn;
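After local file:// paths and existing files are handled, resolve_model now dispatches on the reference's prefix: hf://, huggingface:// and hf.co/ go through the Hugging Face manifest path, plain http(s):// URLs are fetched directly (unless they point at ollama.com/library/), github: pulls from raw.githubusercontent.com, s3:// signs a request with AWS credentials from the environment, and anything else falls back to the Ollama registry. A minimal sketch of that dispatch order, with stub names standing in for the real downloaders:

    #include <iostream>
    #include <string>

    static bool starts_with(const std::string & s, const std::string & prefix) {
        return s.rfind(prefix, 0) == 0;
    }

    // Mirrors the branch order of resolve_model above; returns which downloader would run.
    static std::string pick_source(const std::string & model) {
        if (starts_with(model, "hf://") || starts_with(model, "huggingface://") || starts_with(model, "hf.co/")) {
            return "huggingface";
        }
        if ((starts_with(model, "https://") || starts_with(model, "http://")) &&
            !starts_with(model, "https://ollama.com/library/")) {
            return "direct download";
        }
        if (starts_with(model, "github:") || starts_with(model, "github://")) {
            return "github";
        }
        if (starts_with(model, "s3://")) {
            return "s3";
        }
        return "ollama"; // ollama://, ollama.com/library/, or a bare model name
    }

    int main() {
        std::cout << pick_source("hf://org/some-model-GGUF") << "\n"; // huggingface
        std::cout << pick_source("s3://bucket/model.gguf") << "\n";   // s3
        std::cout << pick_source("smollm:135m") << "\n";              // ollama
    }
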
@@ -713,13 +838,39 @@ static void add_message(const char * role, const std::string & text, LlamaData &
 }
 
 // Function to apply the chat template and resize `formatted` if needed
-static int apply_chat_template(LlamaData & llama_data, const bool append) {
+static int apply_chat_template(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, bool use_jinja) {
+    if (use_jinja) {
+        json messages = json::array();
+        for (const auto & msg : llama_data.messages) {
+            messages.push_back({
+                {"role", msg.role},
+                {"content", msg.content},
+            });
+        }
+        try {
+            minja::chat_template_inputs tmpl_inputs;
+            tmpl_inputs.messages = messages;
+            tmpl_inputs.add_generation_prompt = append;
+
+            minja::chat_template_options tmpl_opts;
+            tmpl_opts.use_bos_token = false;
+            tmpl_opts.use_eos_token = false;
+
+            auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
+            llama_data.fmtted.resize(result.size() + 1);
+            memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1);
+            return result.size();
+        } catch (const std::exception & e) {
+            printe("failed to render the chat template: %s\n", e.what());
+            return -1;
+        }
+    }
     int result = llama_chat_apply_template(
-        llama_model_chat_template(llama_data.model.get()), llama_data.messages.data(), llama_data.messages.size(), append,
+        tmpl.source().c_str(), llama_data.messages.data(), llama_data.messages.size(), append,
         append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
     if (append && result > static_cast<int>(llama_data.fmtted.size())) {
         llama_data.fmtted.resize(result);
-        result = llama_chat_apply_template(llama_model_chat_template(llama_data.model.get()), llama_data.messages.data(),
+        result = llama_chat_apply_template(tmpl.source().c_str(), llama_data.messages.data(),
                                            llama_data.messages.size(), append, llama_data.fmtted.data(),
                                            llama_data.fmtted.size());
     }
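When --jinja is set, the prompt above is rendered through the minja-based common_chat_template rather than llama_chat_apply_template, which supports the full Jinja syntax of modern chat templates. A hedged standalone sketch of that rendering path, assuming the vendored common/chat-template.hpp and common/minja.hpp are on the include path and that the template constructor takes the template source plus BOS/EOS strings; the template text itself is illustrative, not a real model template:

    #include <iostream>
    #include <string>

    #include "chat-template.hpp" // vendored under common/ in this release
    #include "json.hpp"

    int main() {
        // Illustrative Jinja-style template; real templates come from the model's GGUF metadata.
        const std::string source =
            "{% for m in messages %}<|{{ m['role'] }}|>{{ m['content'] }}\n{% endfor %}"
            "{% if add_generation_prompt %}<|assistant|>{% endif %}";

        minja::chat_template tmpl(source, /* bos_token */ "", /* eos_token */ "");

        minja::chat_template_inputs inputs;
        inputs.messages = nlohmann::ordered_json::array({
            { { "role", "user" }, { "content", "Hello!" } },
        });
        inputs.add_generation_prompt = true;

        minja::chat_template_options opts;
        opts.use_bos_token = false;
        opts.use_eos_token = false;

        std::cout << tmpl.apply(inputs, opts) << std::endl;
    }
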
@@ -729,10 +880,12 @@ static int apply_chat_template(LlamaData & llama_data, const bool append) {
 
 // Function to tokenize the prompt
 static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
-                           std::vector<llama_token> & prompt_tokens) {
-    const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+                           std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
+    const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0;
+
+    const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
     prompt_tokens.resize(n_prompt_tokens);
-    if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
+    if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first,
                        true) < 0) {
         printe("failed to tokenize the prompt\n");
         return -1;
@@ -746,7 +899,7 @@ static int check_context_size(const llama_context_ptr & ctx, const llama_batch &
     const int n_ctx = llama_n_ctx(ctx.get());
     const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
     if (n_ctx_used + batch.n_tokens > n_ctx) {
-        printf("\033[0m\n");
+        printf(LOG_COL_DEFAULT "\n");
         printe("context size exceeded\n");
         return 1;
     }
@@ -778,7 +931,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
     const llama_vocab * vocab = llama_model_get_vocab(llama_data.model.get());
 
     std::vector<llama_token> tokens;
-    if (tokenize_prompt(vocab, prompt, tokens) < 0) {
+    if (tokenize_prompt(vocab, prompt, tokens, llama_data) < 0) {
         return 1;
     }
 
@@ -809,7 +962,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
         batch = llama_batch_get_one(&new_token_id, 1);
     }
 
-    printf("\033[0m");
+    printf(LOG_COL_DEFAULT);
     return 0;
 }
 
@@ -818,7 +971,7 @@ static int read_user_input(std::string & user_input) {
 #ifdef WIN32
     printf(
         "\r%*s"
-        "\r\033[0m%s",
+        "\r" LOG_COL_DEFAULT "%s",
         get_terminal_width(), " ", prompt_prefix);
 
     std::getline(std::cin, user_input);
@@ -855,7 +1008,7 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
                              const bool stdout_a_terminal) {
     // Set response color
     if (stdout_a_terminal) {
-        printf("\033[33m");
+        printf(LOG_COL_YELLOW);
     }
 
     if (generate(llama_data, prompt, response)) {
@@ -864,13 +1017,13 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
     }
 
     // End response with color reset and newline
-    printf("\n%s", stdout_a_terminal ? "\033[0m" : "");
+    printf("\n%s", stdout_a_terminal ? LOG_COL_DEFAULT : "");
     return 0;
 }
 
 // Helper function to apply the chat template and handle errors
-static int apply_chat_template_with_error_handling(LlamaData & llama_data, const bool append, int & output_length) {
-    const int new_len = apply_chat_template(llama_data, append);
+static int apply_chat_template_with_error_handling(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) {
+    const int new_len = apply_chat_template(tmpl, llama_data, append, use_jinja);
     if (new_len < 0) {
         printe("failed to apply the chat template\n");
         return -1;
@@ -929,9 +1082,11 @@ static int get_user_input(std::string & user_input, const std::string & user) {
 }
 
 // Main chat loop function
-static int chat_loop(LlamaData & llama_data, const std::string & user) {
+static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_jinja) {
     int prev_len = 0;
     llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
+    auto chat_templates = common_chat_templates_from_model(llama_data.model.get(), "");
+    GGML_ASSERT(chat_templates.template_default);
     static const bool stdout_a_terminal = is_stdout_a_terminal();
     while (true) {
         // Get user input
@@ -942,7 +1097,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user) {
 
         add_message("user", user.empty() ? user_input : user, llama_data);
         int new_len;
-        if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
+        if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, true, new_len, use_jinja) < 0) {
            return 1;
        }
 
@@ -957,7 +1112,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user) {
        }
 
        add_message("assistant", response, llama_data);
-        if (apply_chat_template_with_error_handling(llama_data, false, prev_len) < 0) {
+        if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, false, prev_len, use_jinja) < 0) {
            return 1;
        }
    }
@@ -1017,7 +1172,7 @@ int main(int argc, const char ** argv) {
        return 1;
    }
 
-    if (chat_loop(llama_data, opt.user)) {
+    if (chat_loop(llama_data, opt.user, opt.use_jinja)) {
        return 1;
    }