@novastera-oss/llamarn 0.1.5-beta.3 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. package/README.md +80 -14
  2. package/RNLlamaCpp.podspec +10 -3
  3. package/android/CMakeLists.txt +8 -0
  4. package/android/src/main/cpp/include/llama.h +62 -125
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  13. package/cpp/LlamaCppModel.cpp +148 -5
  14. package/cpp/LlamaCppModel.h +11 -2
  15. package/cpp/PureCppImpl.cpp +3 -3
  16. package/cpp/PureCppImpl.h +3 -0
  17. package/cpp/build-info.cpp +2 -2
  18. package/cpp/llama.cpp/README.md +11 -3
  19. package/cpp/llama.cpp/build-xcframework.sh +1 -0
  20. package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
  21. package/cpp/llama.cpp/common/arg.cpp +153 -113
  22. package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
  23. package/cpp/llama.cpp/common/chat-parser.h +117 -0
  24. package/cpp/llama.cpp/common/chat.cpp +847 -699
  25. package/cpp/llama.cpp/common/chat.h +73 -6
  26. package/cpp/llama.cpp/common/common.cpp +50 -82
  27. package/cpp/llama.cpp/common/common.h +21 -17
  28. package/cpp/llama.cpp/common/json-partial.cpp +255 -0
  29. package/cpp/llama.cpp/common/json-partial.h +37 -0
  30. package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
  31. package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
  32. package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
  33. package/cpp/llama.cpp/common/regex-partial.h +56 -0
  34. package/cpp/llama.cpp/common/sampling.cpp +7 -8
  35. package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
  36. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
  37. package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
  38. package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
  39. package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
  40. package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
  41. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
  77. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
  78. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  79. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  86. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  88. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
  117. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  118. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
  119. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  120. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  121. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
  122. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
  123. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
  124. package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
  125. package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
  126. package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
  127. package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
  128. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
  129. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
  130. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
  131. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
  132. package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
  133. package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
  134. package/cpp/llama.cpp/include/llama.h +62 -125
  135. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
  136. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
  137. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
  138. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
  139. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
  140. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
  141. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
  142. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
  143. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
  144. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
  145. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
  146. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
  147. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
  148. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
  149. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
  150. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
  151. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
  152. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
  153. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  154. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
  155. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
  156. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
  157. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
  158. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
  159. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
  160. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
  161. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
  162. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
  163. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
  164. package/cpp/llama.cpp/models/templates/README.md +2 -0
  165. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  166. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  167. package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  168. package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
  169. package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
  170. package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
  171. package/cpp/llama.cpp/src/llama-arch.h +2 -0
  172. package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
  173. package/cpp/llama.cpp/src/llama-context.cpp +340 -123
  174. package/cpp/llama.cpp/src/llama-context.h +30 -0
  175. package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
  176. package/cpp/llama.cpp/src/llama-cparams.h +2 -0
  177. package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
  178. package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
  179. package/cpp/llama.cpp/src/llama-graph.h +52 -7
  180. package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
  181. package/cpp/llama.cpp/src/llama-hparams.h +37 -5
  182. package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
  183. package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
  184. package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
  185. package/cpp/llama.cpp/src/llama-memory.h +4 -3
  186. package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
  187. package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
  188. package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
  189. package/cpp/llama.cpp/src/llama-model.cpp +529 -172
  190. package/cpp/llama.cpp/src/llama-model.h +6 -1
  191. package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
  192. package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
  193. package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
  194. package/cpp/llama.cpp/src/llama-vocab.h +6 -0
  195. package/cpp/llama.cpp/src/llama.cpp +14 -0
  196. package/cpp/rn-completion.cpp +4 -2
  197. package/ios/include/chat.h +73 -6
  198. package/ios/include/common/minja/chat-template.hpp +9 -5
  199. package/ios/include/common/minja/minja.hpp +69 -36
  200. package/ios/include/common.h +21 -17
  201. package/ios/include/llama.h +62 -125
  202. package/ios/libs/llama.xcframework/Info.plist +19 -19
  203. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  204. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
  205. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  206. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
  207. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
  208. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  209. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  210. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  211. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
  212. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  213. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  214. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  215. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
  219. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
  220. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
  221. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
  222. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
  223. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
  224. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
  225. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  226. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
  227. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
  228. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
  229. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  230. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  231. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  232. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
  233. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  234. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
  235. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
  236. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  237. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  238. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
  239. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
  240. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  241. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  242. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  243. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  244. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  245. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
  246. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  247. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
  248. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
  249. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  250. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  251. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
  252. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
  253. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  254. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  255. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  256. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  257. package/lib/module/NativeRNLlamaCpp.js.map +1 -1
  258. package/lib/typescript/src/NativeRNLlamaCpp.d.ts +3 -0
  259. package/lib/typescript/src/NativeRNLlamaCpp.d.ts.map +1 -1
  260. package/package.json +2 -1
  261. package/src/NativeRNLlamaCpp.ts +1 -0
  262. package/cpp/llama.cpp/common/stb_image.h +0 -7988
  263. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  264. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  265. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  266. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  267. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  268. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  269. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  270. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  271. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  272. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  273. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  274. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
package/cpp/LlamaCppModel.h CHANGED
@@ -8,6 +8,9 @@
 #include <unordered_map>
 #include <functional>
 
+// Add ReactCommon includes for proper async handling
+#include <ReactCommon/CallInvoker.h>
+
 // Include all necessary common headers from llama.cpp
 #include "common.h"
 #include "sampling.h"
@@ -70,13 +73,14 @@ struct ToolCall {
  * - Uses common_token_to_piece for token->text conversion
  * - Leverages the llama.cpp chat template system
  */
-class LlamaCppModel : public jsi::HostObject {
+class LlamaCppModel : public jsi::HostObject, public std::enable_shared_from_this<LlamaCppModel> {
 public:
   /**
    * Constructor
    * @param rn_ctx A pointer to an initialized rn_llama_context
+   * @param jsInvoker CallInvoker for async operations (optional, for async completion)
    */
-  LlamaCppModel(rn_llama_context* rn_ctx);
+  LlamaCppModel(rn_llama_context* rn_ctx, std::shared_ptr<CallInvoker> jsInvoker = nullptr);
   virtual ~LlamaCppModel();
 
   /**
@@ -124,6 +128,8 @@ private:
    * JSI method implementations
    */
   jsi::Value completionJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count);
+  jsi::Value completionAsyncJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count);
+  jsi::Value stopCompletionJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count);
   jsi::Value tokenizeJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count);
   jsi::Value detokenizeJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count);
   jsi::Value embeddingJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count);
@@ -157,6 +163,9 @@ private:
   // Completion state
   bool should_stop_completion_;
   bool is_predicting_;
+
+  // Add CallInvoker for async operations
+  std::shared_ptr<CallInvoker> jsInvoker_;
 };
 
 } // namespace facebook::react
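The hunks above wire a React Native `CallInvoker` into the model host object so a completion can run off the JS thread and still resolve a JS Promise safely. As a hedged illustration of that pattern only (the helper below is hypothetical and is not this package's actual `completionAsyncJsi`), the usual shape is: create the Promise on the JS thread, do the work on a worker thread, then hop back through `CallInvoker::invokeAsync` to call `resolve`:

```cpp
#include <jsi/jsi.h>
#include <ReactCommon/CallInvoker.h>
#include <memory>
#include <string>
#include <thread>

using namespace facebook;

// Hypothetical sketch of the async-completion pattern enabled by storing a
// CallInvoker: the heavy work runs on a detached worker thread, and the
// Promise is resolved back on the JS thread via invokeAsync. `self` stands in
// for shared_from_this(), keeping the host object alive until completion.
jsi::Value completionAsyncSketch(jsi::Runtime& rt,
                                 std::shared_ptr<react::CallInvoker> jsInvoker,
                                 std::shared_ptr<void> self) {
    auto promiseCtor = rt.global().getPropertyAsFunction(rt, "Promise");
    auto executor = jsi::Function::createFromHostFunction(
        rt, jsi::PropNameID::forAscii(rt, "executor"), 2,
        [jsInvoker, self](jsi::Runtime& rt2, const jsi::Value&,
                          const jsi::Value* args, size_t) -> jsi::Value {
            // Keep `resolve` (and a stable Runtime pointer) alive beyond this frame.
            jsi::Runtime* runtimePtr = &rt2;
            auto resolve = std::make_shared<jsi::Function>(
                args[0].asObject(rt2).asFunction(rt2));
            std::thread([runtimePtr, jsInvoker, self, resolve]() {
                std::string result = "(generated text)"; // run the llama.cpp completion here
                jsInvoker->invokeAsync([runtimePtr, resolve, result]() {
                    // Back on the JS thread: safe to touch the runtime again.
                    resolve->call(*runtimePtr,
                                  jsi::String::createFromUtf8(*runtimePtr, result));
                });
            }).detach();
            return jsi::Value::undefined();
        });
    return promiseCtor.callAsConstructor(rt, std::move(executor));
}
```

Deriving from `std::enable_shared_from_this` in the hunk above fits this pattern: the background thread can hold a `shared_ptr` to the model so the host object cannot be destroyed mid-completion.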
package/cpp/PureCppImpl.cpp CHANGED
@@ -35,7 +35,7 @@ std::shared_ptr<TurboModule> PureCppImpl::create(std::shared_ptr<CallInvoker> js
 }
 
 PureCppImpl::PureCppImpl(std::shared_ptr<CallInvoker> jsInvoker)
-  : NativeRNLlamaCppCxxSpec(std::move(jsInvoker)) {
+  : NativeRNLlamaCppCxxSpec(jsInvoker), jsInvoker_(jsInvoker) {
 }
 
 double PureCppImpl::multiply(jsi::Runtime& rt, double a, double b) {
@@ -325,8 +325,8 @@ jsi::Value PureCppImpl::initLlama(jsi::Runtime &runtime, jsi::Object options) {
 }
 
 jsi::Object PureCppImpl::createModelObject(jsi::Runtime& runtime, rn_llama_context* rn_ctx) {
-  // Create a shared_ptr to a new LlamaCppModel instance
-  auto llamaModel = std::make_shared<LlamaCppModel>(rn_ctx);
+  // Create a shared_ptr to a new LlamaCppModel instance with CallInvoker
+  auto llamaModel = std::make_shared<LlamaCppModel>(rn_ctx, jsInvoker_);
 
   // Create a host object from the LlamaCppModel instance
   return jsi::Object::createFromHostObject(runtime, std::move(llamaModel));
package/cpp/PureCppImpl.h CHANGED
@@ -54,6 +54,9 @@ private:
 
   // Mutex for thread safety when accessing rn_ctx_ or other shared resources
   std::mutex mutex_;
+
+  // CallInvoker for async operations
+  std::shared_ptr<CallInvoker> jsInvoker_;
 };
 
 } // namespace facebook::react
package/cpp/build-info.cpp CHANGED
@@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = 5349;
-char const *LLAMA_COMMIT = "9a390c48";
+int LLAMA_BUILD_NUMBER = 5541;
+char const *LLAMA_COMMIT = "07e4351c";
 char const *LLAMA_COMPILER = "unknown";
 char const *LLAMA_BUILD_TARGET = "unknown";
package/cpp/llama.cpp/README.md CHANGED
@@ -37,7 +37,7 @@ range of hardware - locally and in the cloud.
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2, AVX512 and AMX support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
-- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
 
@@ -237,7 +237,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [BLAS](docs/build.md#blas-build) | All |
 | [BLIS](docs/backend/BLIS.md) | All |
 | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
+| [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
 | [HIP](docs/build.md#hip) | AMD GPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
@@ -572,4 +572,12 @@ automatically. For example:
 $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 ```
 
-## References
+## Dependencies
+
+- [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license
+- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
+- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
+- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
+- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
+- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
+- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
package/cpp/llama.cpp/build-xcframework.sh CHANGED
@@ -117,6 +117,7 @@ setup_framework_structure() {
     # Copy all required headers (common for all platforms)
     cp include/llama.h ${header_path}
     cp ggml/include/ggml.h ${header_path}
+    cp ggml/include/ggml-opt.h ${header_path}
     cp ggml/include/ggml-alloc.h ${header_path}
     cp ggml/include/ggml-backend.h ${header_path}
     cp ggml/include/ggml-metal.h ${header_path}
package/cpp/llama.cpp/common/CMakeLists.txt CHANGED
@@ -60,12 +60,16 @@ add_library(${TARGET} STATIC
     base64.hpp
     chat.cpp
     chat.h
+    chat-parser.cpp
+    chat-parser.h
     common.cpp
     common.h
     console.cpp
     console.h
     json-schema-to-grammar.cpp
     json.hpp
+    json-partial.h
+    json-partial.cpp
     llguidance.cpp
     log.cpp
     log.h
@@ -73,6 +77,8 @@ add_library(${TARGET} STATIC
     minja/minja.hpp
     ngram-cache.cpp
     ngram-cache.h
+    regex-partial.cpp
+    regex-partial.h
     sampling.cpp
     sampling.h
     speculative.cpp
@@ -119,8 +125,8 @@ if (LLAMA_LLGUIDANCE)
 
     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.19 (+ fancy-regex build fix):
-        GIT_TAG b59f98f85269892a7de3d3641ad155366f13daa6
+        # v0.7.20 (+ fix to build on GCC 15):
+        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
        PREFIX ${CMAKE_BINARY_DIR}/llguidance
        SOURCE_DIR ${LLGUIDANCE_SRC}
        BUILD_IN_SOURCE TRUE
package/cpp/llama.cpp/common/arg.cpp CHANGED
@@ -39,7 +39,7 @@
 using json = nlohmann::ordered_json;
 
 std::initializer_list<enum llama_example> mmproj_examples = {
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_SERVER,
 };
 
@@ -242,33 +242,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
 }
 
 // download one single file from remote URL to local path
-static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
-    // Initialize libcurl
-    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    if (!curl) {
-        LOG_ERR("%s: error initializing libcurl\n", __func__);
-        return false;
-    }
-
-    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
-    // Check if hf-token or bearer-token was specified
-    if (!bearer_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + bearer_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-    }
-    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
-#if defined(_WIN32)
-    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
-    // operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-
+static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
     // Check if the file already exists locally
     auto file_exists = std::filesystem::exists(path);
 
@@ -279,6 +253,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
     std::string last_modified;
 
     if (file_exists) {
+        if (offline) {
+            LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+            return true; // skip verification/downloading
+        }
         // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
         std::ifstream metadata_in(metadata_path);
         if (metadata_in.good()) {
@@ -297,6 +275,10 @@ static bool common_download_file_single(const std::string & url, const std::stri
         }
         // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
     } else {
+        if (offline) {
+            LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
+            return false;
+        }
         LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
     }
 
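The two `offline` insertions above amount to a small decision table for the local cache. A condensed, self-contained restatement of that control flow (the names here are illustrative, not the real function):

```cpp
#include <filesystem>
#include <string>

// Condensed restatement of the offline checks added above: with --offline,
// an existing cached file is trusted as-is (no ETag verification), a missing
// file is a hard error, and otherwise the normal verify-then-download path runs.
enum class FetchDecision { UseCache, Fail, VerifyAndMaybeDownload };

FetchDecision decide_fetch(const std::string & path, bool offline) {
    const bool file_exists = std::filesystem::exists(path);
    if (offline) {
        return file_exists ? FetchDecision::UseCache : FetchDecision::Fail;
    }
    return FetchDecision::VerifyAndMaybeDownload;
}
```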
@@ -310,50 +292,73 @@ static bool common_download_file_single(const std::string & url, const std::stri
     bool head_request_ok = false;
     bool should_download = !file_exists; // by default, we should download if the file does not exist
 
-    // get ETag to see if the remote file has changed
-    {
-        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
-        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
+    // Initialize libcurl
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    if (!curl) {
+        LOG_ERR("%s: error initializing libcurl\n", __func__);
+        return false;
+    }
 
-            static std::regex header_regex("([^:]+): (.*)\r\n");
-            static std::regex etag_regex("ETag", std::regex_constants::icase);
-            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+    // Set the URL, allow to follow http redirection
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
 
-            std::string header(buffer, n_items);
-            std::smatch match;
-            if (std::regex_match(header, match, header_regex)) {
-                const std::string & key = match[1];
-                const std::string & value = match[2];
-                if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
-                } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
-                }
-            }
-            return n_items;
-        };
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    // Check if hf-token or bearer-token was specified
+    if (!bearer_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + bearer_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+#if defined(_WIN32)
+    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
+    // operating system. Currently implemented under MS-Windows.
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
 
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+    typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
+    auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
+        common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
 
-        // we only allow retrying once for HEAD requests
-        // this is for the use case of using running offline (no internet), retrying can be annoying
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
-        if (!was_perform_successful) {
-            head_request_ok = false;
-        }
+        static std::regex header_regex("([^:]+): (.*)\r\n");
+        static std::regex etag_regex("ETag", std::regex_constants::icase);
+        static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
 
-        long http_code = 0;
-        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code == 200) {
-            head_request_ok = true;
-        } else {
-            LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
-            head_request_ok = false;
+        std::string header(buffer, n_items);
+        std::smatch match;
+        if (std::regex_match(header, match, header_regex)) {
+            const std::string & key = match[1];
+            const std::string & value = match[2];
+            if (std::regex_match(key, match, etag_regex)) {
+                headers->etag = value;
+            } else if (std::regex_match(key, match, last_modified_regex)) {
+                headers->last_modified = value;
+            }
         }
+        return n_items;
+    };
+
+    curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+    curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+
+    // we only allow retrying once for HEAD requests
+    // this is for the use case of using running offline (no internet), retrying can be annoying
+    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
+    if (!was_perform_successful) {
+        head_request_ok = false;
+    }
+
+    long http_code = 0;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+    if (http_code == 200) {
+        head_request_ok = true;
+    } else {
+        LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+        head_request_ok = false;
     }
 
     // if head_request_ok is false, we don't have the etag or last-modified headers
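Note the design choice in the hunk above: none of the libcurl setup runs until after the cache check, so the offline early-returns added earlier never create a curl handle or touch the network. The HEAD-request/ETag logic itself is unchanged; it is only lifted out of its former block scope.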
@@ -460,12 +465,12 @@ static bool common_download_file_single(const std::string & url, const std::stri
 
 // download multiple files from remote URLs to local paths
 // the input is a vector of pairs <url, path>
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
     // Prepare download in parallel
     std::vector<std::future<bool>> futures_download;
     for (auto const & item : urls) {
-        futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
-            return common_download_file_single(it.first, it.second, bearer_token);
+        futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
+            return common_download_file_single(it.first, it.second, bearer_token, offline);
         }, item));
     }
 
@@ -481,14 +486,15 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
 
 static bool common_download_model(
         const common_params_model & model,
-        const std::string & bearer_token) {
+        const std::string & bearer_token,
+        bool offline) {
     // Basic validation of the model.url
     if (model.url.empty()) {
         LOG_ERR("%s: invalid model url\n", __func__);
         return false;
     }
 
-    if (!common_download_file_single(model.url, model.path, bearer_token)) {
+    if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
         return false;
     }
 
@@ -547,7 +553,7 @@ static bool common_download_model(
         }
 
         // Download in parallel
-        common_download_file_multiple(urls, bearer_token);
+        common_download_file_multiple(urls, bearer_token, offline);
     }
 
     return true;
@@ -608,7 +614,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
  *
  * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
  */
-static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
+static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
     auto parts = string_split<std::string>(hf_repo_with_tag, ':');
     std::string tag = parts.size() > 1 ? parts.back() : "latest";
     std::string hf_repo = parts[0];
@@ -638,20 +644,25 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
     long res_code = 0;
     std::string res_str;
     bool use_cache = false;
-    try {
-        auto res = common_remote_get_content(url, params);
-        res_code = res.first;
-        res_str = std::string(res.second.data(), res.second.size());
-    } catch (const std::exception & e) {
-        LOG_WRN("error: failed to get manifest: %s\n", e.what());
-        LOG_WRN("try reading from cache\n");
-        // try to read from cache
+    if (!offline) {
         try {
+            auto res = common_remote_get_content(url, params);
+            res_code = res.first;
+            res_str = std::string(res.second.data(), res.second.size());
+        } catch (const std::exception & e) {
+            LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
+        }
+    }
+    if (res_code == 0) {
+        if (std::filesystem::exists(cached_response_path)) {
+            LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
             res_str = read_file(cached_response_path);
             res_code = 200;
             use_cache = true;
-        } catch (const std::exception & e) {
-            throw std::runtime_error("error: failed to get manifest (check your internet connection)");
+        } else {
+            throw std::runtime_error(
+                offline ? "error: failed to get manifest (offline mode)"
+                        : "error: failed to get manifest (check your internet connection)");
         }
     }
     std::string ggufFile;
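With this change, `common_get_hf_file` resolves the model manifest network-first with a cache fallback. A minimal self-contained sketch of the same shape, under assumed stand-in helpers (`fetch_remote` below is a placeholder, not a llama.cpp API):

```cpp
#include <filesystem>
#include <fstream>
#include <optional>
#include <sstream>
#include <stdexcept>
#include <string>

// Stand-in for the real HTTP fetch; returns std::nullopt on any failure.
static std::optional<std::string> fetch_remote(const std::string & /*url*/) {
    return std::nullopt; // pretend the network is unreachable
}

static std::string read_file(const std::string & path) {
    std::ifstream in(path);
    std::ostringstream ss;
    ss << in.rdbuf();
    return ss.str();
}

// Network-first with cache fallback, mirroring common_get_hf_file above:
// skip the network entirely when offline, fall back to the cached copy,
// and fail with a mode-specific message when neither source is available.
std::string get_manifest(const std::string & url, const std::string & cache_path, bool offline) {
    if (!offline) {
        if (auto res = fetch_remote(url)) {
            return *res; // fresh manifest
        }
    }
    if (std::filesystem::exists(cache_path)) {
        return read_file(cache_path); // cached manifest
    }
    throw std::runtime_error(offline ? "failed to get manifest (offline mode)"
                                     : "failed to get manifest (check your internet connection)");
}
```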
@@ -698,24 +709,25 @@ bool common_has_curl() {
     return false;
 }
 
-static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
+static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from internet\n");
     return false;
 }
 
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }
 
 static bool common_download_model(
         const common_params_model &,
-        const std::string &) {
+        const std::string &,
+        bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }
 
-static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
+static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return {};
 }
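These stubs are the no-CURL side of a dual-definition pattern: every downloader helper exists twice behind a build-time guard, so threading the new `bool offline` parameter through has to touch both sides to keep the signatures identical. Schematically (a condensed, self-contained illustration, with the guard macro shown as `LLAMA_USE_CURL` by assumption and the real body elided):

```cpp
#include <cstdio>
#include <string>

// Condensed illustration of the dual-definition pattern used above: the
// curl-backed implementation and the no-CURL stub keep identical signatures,
// so a new parameter must be added to both branches of the guard.
#ifdef LLAMA_USE_CURL
static bool common_download_file_single(const std::string & url, const std::string & path,
                                        const std::string & bearer_token, bool offline) {
    // ... real libcurl implementation (see the hunks above) ...
    (void) url; (void) path; (void) bearer_token; (void) offline;
    return true;
}
#else
static bool common_download_file_single(const std::string &, const std::string &,
                                        const std::string &, bool) {
    std::fprintf(stderr, "error: built without CURL, cannot download model from internet\n");
    return false;
}
#endif
```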
@@ -742,7 +754,8 @@ struct handle_model_result {
 static handle_model_result common_params_handle_model(
         struct common_params_model & model,
         const std::string & bearer_token,
-        const std::string & model_path_default) {
+        const std::string & model_path_default,
+        bool offline) {
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
@@ -750,7 +763,7 @@ static handle_model_result common_params_handle_model(
         // short-hand to avoid specifying --hf-file -> default it to --model
         if (model.hf_file.empty()) {
             if (model.path.empty()) {
-                auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
+                auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                 if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                     exit(1); // built without CURL, error message already printed
                 }
@@ -791,7 +804,7 @@ static handle_model_result common_params_handle_model(
 
     // then, download it if needed
     if (!model.url.empty()) {
-        bool ok = common_download_model(model, bearer_token);
+        bool ok = common_download_model(model, bearer_token, offline);
         if (!ok) {
             LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
             exit(1);
@@ -934,7 +947,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 
     // handle model and download
     {
-        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
         if (params.no_mmproj) {
             params.mmproj = {};
         } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -944,12 +957,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         // only download mmproj if the current example is using it
         for (auto & ex : mmproj_examples) {
             if (ctx_arg.ex == ex) {
-                common_params_handle_model(params.mmproj, params.hf_token, "");
+                common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
                 break;
             }
         }
-        common_params_handle_model(params.speculative.model, params.hf_token, "");
-        common_params_handle_model(params.vocoder.model, params.hf_token, "");
+        common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
+        common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
     }
 
     if (params.escape) {
@@ -1445,6 +1458,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_keep = value;
         }
     ));
+    add_opt(common_arg(
+        {"--swa-full"},
+        string_format("use full-size SWA cache (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+        [](common_params & params) {
+            params.swa_full = true;
+        }
+    ).set_env("LLAMA_ARG_SWA_FULL"));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -1670,7 +1691,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -2057,13 +2078,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.grp_attn_w = value;
         }
     ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(common_arg(
-        {"-dkvc", "--dump-kv-cache"},
-        "verbose print of the KV cache",
-        [](common_params & params) {
-            params.dump_kv_cache = true;
-        }
-    ));
     add_opt(common_arg(
         {"-nkvo", "--no-kv-offload"},
         "disable KV offload",
@@ -2232,12 +2246,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
-        {"--image"}, "FILE",
-        "path to an image file. use with multimodal models. Specify multiple times for batching",
+        {"--image", "--audio"}, "FILE",
+        "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
         [](common_params & params, const std::string & value) {
             params.image.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples({LLAMA_EXAMPLE_MTMD}));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
@@ -2585,7 +2599,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_junk = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+    ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"--pos"}, "N",
         string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
@@ -2648,7 +2662,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.is_pp_shared = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"-npp"}, "n0,n1,...",
         "number of prompt tokens",
@@ -2847,15 +2861,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
-        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
-        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
-        "only supported for non-streamed responses",
+        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
+        "- none: leaves thoughts unparsed in `message.content`\n"
+        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "(default: deepseek)",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
             else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+    add_opt(common_arg(
+        {"--reasoning-budget"}, "N",
+        "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
+        [](common_params & params, int value) {
+            if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
+            params.reasoning_budget = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
@@ -2867,7 +2890,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
@@ -2880,6 +2903,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.chat_template = read_file(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2944,7 +2977,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
             else if (value == "md") { params.batched_bench_output_jsonl = false; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
     add_opt(common_arg(
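The `throw` additions in this hunk and in the `--reasoning-format` hunk above fix a real bug, not a style issue: `std::invalid_argument("invalid value")` on its own constructs an exception object and immediately discards it, so unrecognized values were silently accepted. In isolation:

```cpp
#include <stdexcept>
#include <string>

// Before: the exception object was constructed and immediately destroyed,
// so an unrecognized value fell through silently.
void validate_buggy(const std::string & value) {
    if (value != "jsonl" && value != "md") {
        std::invalid_argument("invalid value"); // no-op: never thrown
    }
}

// After: the value is actually rejected.
void validate_fixed(const std::string & value) {
    if (value != "jsonl" && value != "md") {
        throw std::invalid_argument("invalid value");
    }
}
```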
@@ -2976,6 +3009,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             common_log_set_verbosity_thold(INT_MAX);
         }
     ));
+    add_opt(common_arg(
+        {"--offline"},
+        "Offline mode: forces use of cache, prevents network access",
+        [](common_params & params) {
+            params.offline = true;
+        }
+    ).set_env("LLAMA_OFFLINE"));
     add_opt(common_arg(
         {"-lv", "--verbosity", "--log-verbosity"}, "N",
         "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",