@fugood/llama.node 1.4.14 → 1.4.15

package/lib/binding.ts CHANGED
@@ -515,9 +515,20 @@ export interface LlamaContext {
  /**
  * Initialize multimodal support with a mmproj file
  * @param options Object containing path and optional use_gpu flag
+ * @param options.path Path to the multimodal projector model file (mmproj)
+ * @param options.use_gpu Whether to use GPU for multimodal processing (default: true)
+ * @param options.image_min_tokens Minimum number of tokens for image input (for dynamic resolution models)
+ * @param options.image_max_tokens Maximum number of tokens for image input (for dynamic resolution models).
+ * Lower values reduce memory usage and improve speed for high-resolution images.
+ * Recommended: 256-512 for faster inference, up to 4096 for maximum detail.
  * @returns boolean indicating if initialization was successful
  */
- initMultimodal(options: { path: string; use_gpu?: boolean }): boolean
+ initMultimodal(options: {
+   path: string
+   use_gpu?: boolean
+   image_min_tokens?: number
+   image_max_tokens?: number
+ }): boolean

  /**
  * Check if multimodal support is enabled
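The widened initMultimodal signature above can be exercised like this. This is an illustrative sketch only: it assumes the package's loadModel entry point, uses placeholder model/mmproj paths, and the token values simply follow the recommendations in the JSDoc above.

```ts
import { loadModel } from '@fugood/llama.node'

// Placeholder paths; the loadModel call shape is assumed, adjust to your setup.
const ctx = await loadModel({ model: './model.gguf' })

const ok = ctx.initMultimodal({
  path: './mmproj.gguf',
  use_gpu: true,
  // New in 1.4.15: bound the image token budget for dynamic-resolution models.
  image_min_tokens: 64,
  image_max_tokens: 512, // 256-512 favors speed; up to 4096 for maximum detail
})

if (!ok) {
  throw new Error('Failed to initialize multimodal support')
}
```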
package/lib/index.ts CHANGED
@@ -254,7 +254,12 @@ class LlamaContextWrapper {
  return this.ctx.getLoadedLoraAdapters()
  }

- initMultimodal(options: { path: string; use_gpu?: boolean }): boolean {
+ initMultimodal(options: {
+   path: string
+   use_gpu?: boolean
+   image_min_tokens?: number
+   image_max_tokens?: number
+ }): boolean {
  return this.ctx.initMultimodal(options)
  }

package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.4.14",
+ "version": "1.4.15",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,20 +72,20 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-darwin-arm64": "1.4.14",
- "@fugood/node-llama-darwin-x64": "1.4.14",
- "@fugood/node-llama-linux-arm64": "1.4.14",
- "@fugood/node-llama-linux-arm64-cuda": "1.4.14",
- "@fugood/node-llama-linux-arm64-snapdragon": "1.4.14",
- "@fugood/node-llama-linux-arm64-vulkan": "1.4.14",
- "@fugood/node-llama-linux-x64": "1.4.14",
- "@fugood/node-llama-linux-x64-cuda": "1.4.14",
- "@fugood/node-llama-linux-x64-vulkan": "1.4.14",
- "@fugood/node-llama-win32-arm64": "1.4.14",
- "@fugood/node-llama-win32-arm64-vulkan": "1.4.14",
- "@fugood/node-llama-win32-x64": "1.4.14",
- "@fugood/node-llama-win32-x64-cuda": "1.4.14",
- "@fugood/node-llama-win32-x64-vulkan": "1.4.14"
+ "@fugood/node-llama-darwin-arm64": "1.4.15",
+ "@fugood/node-llama-darwin-x64": "1.4.15",
+ "@fugood/node-llama-linux-arm64": "1.4.15",
+ "@fugood/node-llama-linux-arm64-cuda": "1.4.15",
+ "@fugood/node-llama-linux-arm64-snapdragon": "1.4.15",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.4.15",
+ "@fugood/node-llama-linux-x64": "1.4.15",
+ "@fugood/node-llama-linux-x64-cuda": "1.4.15",
+ "@fugood/node-llama-linux-x64-vulkan": "1.4.15",
+ "@fugood/node-llama-win32-arm64": "1.4.15",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.4.15",
+ "@fugood/node-llama-win32-x64": "1.4.15",
+ "@fugood/node-llama-win32-x64-cuda": "1.4.15",
+ "@fugood/node-llama-win32-x64-vulkan": "1.4.15"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -1,8 +1,8 @@
  diff --git a/src/llama.cpp/common/CMakeLists.txt b/src/llama.cpp/common/CMakeLists.txt
- index f7b99159e..fa37fed19 100644
+ index 723973ed7..e4b2c6537 100644
  --- a/src/llama.cpp/common/CMakeLists.txt
  +++ b/src/llama.cpp/common/CMakeLists.txt
- @@ -154,8 +154,14 @@ if (LLAMA_LLGUIDANCE)
+ @@ -146,4 +146,11 @@ if (LLAMA_LLGUIDANCE)
  set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
  endif ()

@@ -13,11 +13,8 @@ index f7b99159e..fa37fed19 100644
  +else()
  + set(LLAMA_COMMON_WIN_LIBS "")
  +endif()
-
+ +
  +target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
-
- #
- # copy the license files
  diff --git a/src/llama.cpp/common/chat-peg-parser.cpp b/src/llama.cpp/common/chat-peg-parser.cpp
  index 1bcba9cd8..b7cd68734 100644
  --- a/src/llama.cpp/common/chat-peg-parser.cpp
@@ -32,7 +29,7 @@ index 1bcba9cd8..b7cd68734 100644
  static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
  int count = 0;
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index 22e527bab..c3d0affca 100644
+ index d531388bc..e6712b368 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
  @@ -7,9 +7,6 @@
@@ -62,7 +59,7 @@ index 22e527bab..c3d0affca 100644
  struct templates_params {
  json messages;
  json tools;
- @@ -752,7 +739,7 @@ static std::string apply(
+ @@ -753,7 +740,7 @@ static std::string apply(
  tmpl_inputs.extra_context.merge_patch(*additional_context);
  }
  // TODO: add flag to control date/time, if only for testing purposes.
@@ -72,7 +69,7 @@ index 22e527bab..c3d0affca 100644
  minja::chat_template_options tmpl_opts;
  // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
  diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
- index 8bd4a325f..333b3301f 100644
+ index 454085e90..e01390cf9 100644
  --- a/src/llama.cpp/common/chat.h
  +++ b/src/llama.cpp/common/chat.h
  @@ -10,7 +10,18 @@
@@ -108,10 +105,10 @@ index 744f0b4ee..04fcebb9e 100644
  mparams.main_gpu = params.main_gpu;
  mparams.split_mode = params.split_mode;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index 7794c0268..5b77ae0c3 100644
+ index e60087dea..c21797cd8 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
- @@ -310,6 +310,7 @@ struct lr_opt {
+ @@ -311,6 +311,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

  struct common_params {
@@ -1333,7 +1333,7 @@ extern "C" void cleanup_logging() {
  }


- // initMultimodal(options: { path: string, use_gpu?: boolean }): boolean
+ // initMultimodal(options: { path: string, use_gpu?: boolean, image_min_tokens?: number, image_max_tokens?: number }): boolean
  Napi::Value LlamaContext::InitMultimodal(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

@@ -1345,6 +1345,15 @@ Napi::Value LlamaContext::InitMultimodal(const Napi::CallbackInfo &info) {
  auto options = info[0].As<Napi::Object>();
  auto mmproj_path = options.Get("path").ToString().Utf8Value();
  auto use_gpu = options.Get("use_gpu").ToBoolean().Value();
+ int image_min_tokens = -1;
+ int image_max_tokens = -1;
+
+ if (options.Has("image_min_tokens") && options.Get("image_min_tokens").IsNumber()) {
+ image_min_tokens = options.Get("image_min_tokens").ToNumber().Int32Value();
+ }
+ if (options.Has("image_max_tokens") && options.Get("image_max_tokens").IsNumber()) {
+ image_max_tokens = options.Get("image_max_tokens").ToNumber().Int32Value();
+ }

  if (mmproj_path.empty()) {
  Napi::TypeError::New(env, "mmproj path is required")
@@ -1360,7 +1369,7 @@ Napi::Value LlamaContext::InitMultimodal(const Napi::CallbackInfo &info) {

  // Disable ctx_shift before initializing multimodal
  _rn_ctx->params.ctx_shift = false;
- bool result = _rn_ctx->initMultimodal(mmproj_path, use_gpu);
+ bool result = _rn_ctx->initMultimodal(mmproj_path, use_gpu, image_min_tokens, image_max_tokens);
  if (!result) {
  Napi::Error::New(env, "Failed to initialize multimodal context")
  .ThrowAsJavaScriptException();
@@ -111,11 +111,16 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
  option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})

  # 3rd party libs
- option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
- option(LLAMA_HTTPLIB "llama: if libcurl is disabled, use httplib to download model from an URL" ON)
- option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF)
+ option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON)
+ option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
  option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)

+ # deprecated
+ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
+ if (LLAMA_CURL)
+ message(WARNING "LLAMA_CURL option is deprecated and will be ignored")
+ endif()
+
  # Required for relocatable CMake package
  include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
  include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
@@ -182,6 +187,9 @@ if (NOT MSVC)
  endif()
  endif()

+ include("cmake/license.cmake")
+ license_add_file("llama.cpp" "LICENSE")
+
  #
  # 3rd-party
  #
@@ -209,11 +217,6 @@ add_subdirectory(src)
  # utils, programs, examples and tests
  #

- if (NOT LLAMA_BUILD_COMMON)
- message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
- set(LLAMA_CURL OFF)
- endif()
-
  if (LLAMA_BUILD_COMMON)
  add_subdirectory(common)
  if (LLAMA_HTTPLIB)
@@ -235,6 +238,19 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
  add_subdirectory(tools)
  endif()

+ # Automatically add all files from the 'licenses' directory
+ file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
+
+ foreach(FILE_PATH ${EXTRA_LICENSES})
+ get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
+ string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
+ license_add_file("${NAME}" "${FILE_PATH}")
+ endforeach()
+
+ if (LLAMA_BUILD_COMMON)
+ license_generate(common)
+ endif()
+
  #
  # install
  #
@@ -60,6 +60,8 @@ add_library(${TARGET} STATIC
  common.h
  console.cpp
  console.h
+ debug.cpp
+ debug.h
  download.cpp
  download.h
  http.h
@@ -95,17 +97,7 @@ endif()
  # TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
  set(LLAMA_COMMON_EXTRA_LIBS build_info)

- if (LLAMA_CURL)
- # Use curl to download model url
- find_package(CURL)
- if (NOT CURL_FOUND)
- message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
- endif()
- target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
- include_directories(${CURL_INCLUDE_DIRS})
- set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
- elseif (LLAMA_HTTPLIB)
- # otherwise, use cpp-httplib
+ if (LLAMA_HTTPLIB)
  target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
  set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
  endif()
@@ -162,26 +154,3 @@ else()
  endif()

  target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
-
- #
- # copy the license files
- #
-
- # Check if running in GitHub Actions
- if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
- message(STATUS "Running inside GitHub Actions - copying license files")
-
- # Copy all files from licenses/ to build/bin/
- file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
- foreach(LICENSE_FILE ${LICENSE_FILES})
- get_filename_component(FILENAME ${LICENSE_FILE} NAME)
- add_custom_command(
- POST_BUILD
- TARGET ${TARGET}
- COMMAND ${CMAKE_COMMAND} -E copy_if_different
- "${LICENSE_FILE}"
- "$<TARGET_FILE_DIR:llama>/${FILENAME}"
- COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
- message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
- endforeach()
- endif()
@@ -2,10 +2,10 @@

  #include "chat.h"
  #include "common.h"
+ #include "download.h"
  #include "json-schema-to-grammar.h"
  #include "log.h"
  #include "sampling.h"
- #include "download.h"
  #include "preset.h"

  // fix problem with std::min and std::max
@@ -48,6 +48,8 @@

  #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

+ extern const char * LICENSES[];
+
  using json = nlohmann::ordered_json;
  using namespace common_arg_utils;

@@ -279,12 +281,20 @@ static std::string clean_file_name(const std::string & fname) {
  static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
  GGML_ASSERT(!params.model.hf_repo.empty());

+ // the returned hf_repo is without tag
+ auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
+
+ // "latest" tag (default if not specified) is translated to "default" preset
+ if (hf_tag == "latest") {
+ hf_tag = "default";
+ }
+
  const bool offline = params.offline;
  std::string model_endpoint = get_model_endpoint();
- auto preset_url = model_endpoint + params.model.hf_repo + "/resolve/main/preset.ini";
+ auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";

  // prepare local path for caching
- auto preset_fname = clean_file_name(params.model.hf_repo + "_preset.ini");
+ auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
  auto preset_path = fs_get_cache_file(preset_fname);
  const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
  const bool has_preset = status >= 200 && status < 400;
@@ -293,14 +303,15 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
  if (has_preset) {
  LOG_INF("applying remote preset from %s\n", preset_url.c_str());
  common_preset_context ctx(ex, /* only_remote_allowed */ true);
- common_preset global; // unused for now
+ common_preset global;
  auto remote_presets = ctx.load_from_ini(preset_path, global);
- if (remote_presets.find(COMMON_PRESET_DEFAULT_NAME) != remote_presets.end()) {
- common_preset & preset = remote_presets.at(COMMON_PRESET_DEFAULT_NAME);
+ remote_presets = ctx.cascade(global, remote_presets);
+ if (remote_presets.find(hf_tag) != remote_presets.end()) {
+ common_preset preset = remote_presets.at(hf_tag);
  LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
  preset.apply_to_params(params);
  } else {
- throw std::runtime_error("Remote preset.ini does not contain [" + std::string(COMMON_PRESET_DEFAULT_NAME) + "] section");
+ throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
  }
  } else {
  LOG_INF("%s", "no remote preset found, skipping\n");
@@ -330,7 +341,7 @@ static handle_model_result common_params_handle_model(
  if (model.path.empty()) {
  auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
  if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
- exit(1); // built without CURL, error message already printed
+ exit(1); // error message already printed
  }
  model.name = model.hf_repo; // repo name with tag
  model.hf_repo = auto_detected.repo; // repo name without tag
@@ -1030,6 +1041,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  exit(0);
  }
  ));
+ add_opt(common_arg(
+ {"--license"},
+ "show source code license and dependencies",
+ [](common_params &) {
+ for (int i = 0; LICENSES[i]; ++i) {
+ printf("%s\n", LICENSES[i]);
+ }
+ exit(0);
+ }
+ ));
  add_opt(common_arg(
  {"-cl", "--cache-list"},
  "show list of models in cache",
@@ -1274,7 +1295,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params) {
  params.kv_unified = true;
  }
- ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
+ ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED}));
  add_opt(common_arg(
  {"--context-shift"},
  {"--no-context-shift"},
@@ -2856,10 +2877,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.n_threads_http = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
+ add_opt(common_arg(
+ {"--cache-prompt"},
+ {"--no-cache-prompt"},
+ string_format("whether to enable prompt caching (default: %s)", params.cache_prompt ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.cache_prompt = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_PROMPT"));
  add_opt(common_arg(
  {"--cache-reuse"}, "N",
  string_format(
- "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
+ "min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: %d)\n"
  "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
  ),
  [](common_params & params, int value) {
@@ -1403,6 +1403,118 @@ static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
  builder.add_content(builder.consume_rest());
  }

+ static void common_chat_parse_exaone_moe_content(common_chat_msg_parser & builder) {
+ // 1) <tool_call>{ "name": "...", "arguments": {...} }</tool_call>
+ // 2) <tool_call>{ "id": "...", "type": "function", "function": { "name": "...", "arguments": {...} } }</tool_call>
+ static const common_regex tool_call_open(R"(<tool_call[^>]*>)");
+
+ if (!builder.syntax().parse_tool_calls) {
+ LOG_DBG("%s: not parse_tool_calls\n", __func__);
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+ // Find all <tool_call></tool_call> blocks
+ while (auto first = builder.try_find_regex(tool_call_open, std::string::npos, /* add_prelude_to_content= */ true)) {
+ builder.move_to(first->groups[0].end);
+ builder.consume_spaces();
+
+ builder.try_consume_literal("```json");
+ builder.try_consume_literal("```");
+ builder.consume_spaces();
+
+ // Consume JSON object
+ auto data = builder.consume_json();
+
+ builder.consume_spaces();
+ builder.try_consume_literal("```");
+ builder.consume_spaces();
+
+ if (!builder.try_consume_literal("</tool_call>")) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ builder.consume_spaces();
+
+ // Extract name and arguments
+ std::string name;
+ std::string id;
+ nlohmann::ordered_json arguments;
+
+ const auto extract_args = [&](const nlohmann::ordered_json & obj) -> bool {
+ if (!obj.contains("name") || !obj.contains("arguments")) {
+ return false;
+ }
+ name = obj.at("name").get<std::string>();
+ arguments = obj.at("arguments");
+ if (obj.contains("id") && obj.at("id").is_string()) {
+ id = obj.at("id").get<std::string>();
+ }
+ return true;
+ };
+
+ if (!extract_args(data.json)) {
+ if (data.json.contains("function") && data.json.at("function").is_object()) {
+ auto fn = data.json.at("function");
+ extract_args(fn);
+ if (id.empty() && data.json.contains("id") && data.json.at("id").is_string()) {
+ id = data.json.at("id").get<std::string>();
+ }
+ }
+ }
+
+ // If name is empty, treat the JSON object as content
+ if (name.empty()) {
+ LOG_DBG("%s: tool call missing name, treating as content\n", __func__);
+ builder.add_content(data.json.dump());
+ continue;
+ }
+
+ std::string args_str = arguments.dump();
+ if (!builder.add_tool_call(name, id, args_str)) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ }
+
+ builder.add_content(builder.consume_rest());
+ }
+
+ static void common_chat_parse_exaone_moe(common_chat_msg_parser & builder) {
+ LOG_DBG("%s: parsing exaone_moe\n", __func__);
+ // EXAONE MoE outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
+ // First try to parse using the standard reasoning parsing method
+ LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+ auto start_pos = builder.pos();
+ auto found_end_think = builder.try_find_literal("</think>");
+ builder.move_to(start_pos);
+
+ if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+ LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+ common_chat_parse_exaone_moe_content(builder);
+ } else if (builder.try_parse_reasoning("<think>", "</think>")) {
+ // If reasoning was parsed successfully, the remaining content is regular content
+ LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+ common_chat_parse_exaone_moe_content(builder);
+ } else {
+ if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+ LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+ common_chat_parse_exaone_moe_content(builder);
+ return;
+ }
+ // If no reasoning tags found, check if we should treat everything as reasoning
+ if (builder.syntax().thinking_forced_open) {
+ // If thinking is forced open but no tags found, treat everything as reasoning
+ LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+ builder.add_reasoning_content(builder.consume_rest());
+ } else {
+ LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+ common_chat_parse_exaone_moe_content(builder);
+ }
+ }
+ }
+
  static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
  builder.try_parse_reasoning("<think>", "</think>");
  builder.add_content(builder.consume_rest());
@@ -1490,6 +1602,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
  case COMMON_CHAT_FORMAT_SOLAR_OPEN:
  common_chat_parse_solar_open(builder);
  break;
+ case COMMON_CHAT_FORMAT_EXAONE_MOE:
+ common_chat_parse_exaone_moe(builder);
+ break;
  default:
  throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
  }
@@ -657,6 +657,7 @@ const char * common_chat_format_name(common_chat_format format) {
  case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
  case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
  case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
+ case COMMON_CHAT_FORMAT_EXAONE_MOE: return "EXAONE MoE";
  case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
  case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
  case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
@@ -2526,6 +2527,65 @@ static common_chat_params common_chat_params_init_solar_open(const common_chat_t
  return data;
  }

+ static common_chat_params common_chat_params_init_exaone_moe(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ data.prompt = apply(tmpl, inputs);
+ data.format = COMMON_CHAT_FORMAT_EXAONE_MOE;
+ if (string_ends_with(data.prompt, "<think>\n")) {
+ if (!inputs.enable_thinking) {
+ data.prompt += "</think>\n\n";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+ if (inputs.tools.is_array() && !inputs.tools.empty()) {
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ std::vector<std::string> tool_rules;
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+ // Expect: <tool_call>{"name": "<name>", "arguments": {...}}</tool_call>
+ tool_rules.push_back(builder.add_rule(
+ name + "-call",
+ "\"<tool_call>\" space " +
+ builder.add_schema(name + "-obj", json{
+ {"type", "object"},
+ {"properties", {
+ {"name", json{{"const", name}}},
+ {"arguments", parameters},
+ }},
+ {"required", json::array({"name", "arguments"})},
+ }) +
+ " space \"</tool_call>\" space"));
+ });
+
+ auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
+ builder.add_rule("root",
+ std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+ (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
+
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+ std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)?" : "") +
+ "(<tool_call>)[\\s\\S]*"
+ });
+ data.preserved_tokens = {
+ "<think>",
+ "</think>",
+ "<tool_call>",
+ "</tool_call>",
+ };
+ });
+ }
+
+ return data;
+ }
+
  static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
  common_chat_params data;
  data.prompt = apply(tmpl, inputs);
@@ -2696,6 +2756,13 @@ static common_chat_params common_chat_templates_apply_jinja(
  return common_chat_params_init_xiaomi_mimo(tmpl, params);
  }

+ // EXAONE MoE format detection
+ if (src.find("<tool_call>") != std::string::npos &&
+ src.find("<tool_result>") != std::string::npos &&
+ src.find("<|tool_declare|>") != std::string::npos) {
+ return common_chat_params_init_exaone_moe(tmpl, params);
+ }
+
  // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
  if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
  return common_chat_params_init_hermes_2_pro(tmpl, params);
@@ -136,6 +136,7 @@ enum common_chat_format {
  COMMON_CHAT_FORMAT_APRIEL_1_5,
  COMMON_CHAT_FORMAT_XIAOMI_MIMO,
  COMMON_CHAT_FORMAT_SOLAR_OPEN,
+ COMMON_CHAT_FORMAT_EXAONE_MOE,

  // These are intended to be parsed by the PEG parser
  COMMON_CHAT_FORMAT_PEG_SIMPLE,
@@ -80,6 +80,7 @@ int32_t cpu_get_num_math();
  //

  enum llama_example {
+ LLAMA_EXAMPLE_BATCHED,
  LLAMA_EXAMPLE_DEBUG,
  LLAMA_EXAMPLE_COMMON,
  LLAMA_EXAMPLE_SPECULATIVE,
@@ -476,6 +477,7 @@ struct common_params {
  int32_t timeout_write = timeout_read; // http write timeout in seconds
  int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
  int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
+ bool cache_prompt = true; // whether to enable prompt caching
  int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
  int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.