@fugood/llama.node 1.4.2 → 1.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/CMakeLists.txt +1 -1
  2. package/lib/binding.js +3 -0
  3. package/lib/binding.ts +10 -0
  4. package/lib/index.js +9 -0
  5. package/lib/index.ts +10 -0
  6. package/package.json +15 -15
  7. package/scripts/llama.cpp.patch +25 -11
  8. package/src/LlamaContext.cpp +24 -0
  9. package/src/LlamaContext.h +3 -0
  10. package/src/llama.cpp/CMakeLists.txt +21 -6
  11. package/src/llama.cpp/common/CMakeLists.txt +6 -0
  12. package/src/llama.cpp/common/arg.cpp +83 -22
  13. package/src/llama.cpp/common/chat-parser.cpp +40 -0
  14. package/src/llama.cpp/common/chat-peg-parser.cpp +110 -0
  15. package/src/llama.cpp/common/chat-peg-parser.h +105 -0
  16. package/src/llama.cpp/common/chat.cpp +40 -29
  17. package/src/llama.cpp/common/chat.h +10 -1
  18. package/src/llama.cpp/common/common.cpp +70 -7
  19. package/src/llama.cpp/common/common.h +23 -5
  20. package/src/llama.cpp/common/download.cpp +18 -8
  21. package/src/llama.cpp/common/download.h +3 -1
  22. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  23. package/src/llama.cpp/common/log.cpp +18 -27
  24. package/src/llama.cpp/common/log.h +19 -12
  25. package/src/llama.cpp/common/peg-parser.cpp +1712 -0
  26. package/src/llama.cpp/common/peg-parser.h +459 -0
  27. package/src/llama.cpp/common/unicode.cpp +64 -0
  28. package/src/llama.cpp/common/unicode.h +22 -0
  29. package/src/llama.cpp/ggml/CMakeLists.txt +52 -48
  30. package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -2
  31. package/src/llama.cpp/ggml/include/ggml-zendnn.h +22 -0
  32. package/src/llama.cpp/ggml/include/ggml.h +29 -2
  33. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -4
  34. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +0 -2
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -13
  37. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +51 -125
  39. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
  40. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +98 -12
  41. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  42. package/src/llama.cpp/src/llama-arch.cpp +30 -1
  43. package/src/llama.cpp/src/llama-arch.h +3 -0
  44. package/src/llama.cpp/src/llama-graph.cpp +3 -6
  45. package/src/llama.cpp/src/llama-hparams.h +2 -2
  46. package/src/llama.cpp/src/llama-impl.h +1 -1
  47. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  48. package/src/llama.cpp/src/llama-model.cpp +54 -6
  49. package/src/llama.cpp/src/llama-quant.cpp +0 -29
  50. package/src/llama.cpp/src/llama-vocab.cpp +1 -2
  51. package/src/llama.cpp/src/models/deepseek2.cpp +18 -0
  52. package/src/llama.cpp/src/models/mistral3.cpp +160 -0
  53. package/src/llama.cpp/src/models/models.h +4 -0
  54. package/src/llama.cpp/src/unicode.cpp +2 -2
package/CMakeLists.txt CHANGED
@@ -99,7 +99,7 @@ endif()
  set(CMAKE_POSITION_INDEPENDENT_CODE ON)

  if (MINGW)
- add_definitions(-D_WIN32_WINNT=0x0601)
+ add_definitions(-D_WIN32_WINNT=0x0A00)
  endif()

  # VULKAN_SDK
package/lib/binding.js CHANGED
@@ -64,6 +64,9 @@ const loadModule = (variant) => __awaiter(void 0, void 0, void 0, function* () {
  /* no-op */
  }
  }
+ const nDev = process.env.GGML_HEXAGON_NDEV;
+ if (!nDev)
+ process.env.GGML_HEXAGON_NDEV = '16';
  }
  let module = yield loadPlatformPackage(packageName);
  if (module) {
package/lib/binding.ts CHANGED
@@ -565,6 +565,14 @@ export interface LlamaContext {
  */
  cancelRequest(requestId: number): void

+ /**
+ * Clear the KV and recurrent caches.
+ * This is faster than recreating the context and useful for preventing
+ * cache contamination between chat sessions.
+ * @param clearData If true, also clears the cache data (default: false)
+ */
+ clearCache(clearData?: boolean): void
+
  // static
  loadModelInfo(path: string, skip: string[]): Promise<GGUFModelInfo>
  toggleNativeLog(
@@ -616,6 +624,8 @@ export const loadModule = async (variant?: LibVariant): Promise<Module> => {
  /* no-op */
  }
  }
+ const nDev = process.env.GGML_HEXAGON_NDEV
+ if (!nDev) process.env.GGML_HEXAGON_NDEV = '16'
  }

  let module = await loadPlatformPackage(packageName)
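
Both lib/binding.js and lib/binding.ts now default GGML_HEXAGON_NDEV to '16' when the variable is unset, immediately before the platform package is loaded. Because the default only applies when the variable is missing, a caller can still choose its own value first. A minimal sketch in TypeScript (the import path and the variant string are illustrative assumptions, not taken from this diff):

  import { loadModule } from '@fugood/llama.node/lib/binding' // illustrative import path

  // An explicit value set before loading wins over the package default of '16'.
  process.env.GGML_HEXAGON_NDEV = '4'

  // 'snapdragon' is a placeholder; check LibVariant for the real variant identifiers.
  const mod = await loadModule('snapdragon')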
package/lib/index.js CHANGED
@@ -195,6 +195,15 @@ class LlamaContextWrapper {
  decodeAudioTokens(tokens) {
  return this.ctx.decodeAudioTokens(tokens);
  }
+ /**
+ * Clear the KV and recurrent caches.
+ * This is faster than recreating the context and useful for preventing
+ * cache contamination between chat sessions.
+ * @param clearData If true, also clears the cache data (default: false)
+ */
+ clearCache(clearData) {
+ this.ctx.clearCache(clearData);
+ }
  }
  const loadModel = (options, onProgress) => __awaiter(void 0, void 0, void 0, function* () {
  var _a, _b;
package/lib/index.ts CHANGED
@@ -299,6 +299,16 @@ class LlamaContextWrapper {
  decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array> {
  return this.ctx.decodeAudioTokens(tokens)
  }
+
+ /**
+ * Clear the KV and recurrent caches.
+ * This is faster than recreating the context and useful for preventing
+ * cache contamination between chat sessions.
+ * @param clearData If true, also clears the cache data (default: false)
+ */
+ clearCache(clearData?: boolean): void {
+ this.ctx.clearCache(clearData)
+ }
  }

  export const loadModel = async (
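
The clearCache addition spans lib/binding.ts, lib/index.js, lib/index.ts, and the native LlamaContext, letting a context drop its KV and recurrent cache state between chat sessions without being recreated; the native side rejects the call while a completion is still predicting. A minimal usage sketch in TypeScript (the loadModel option shape and model path are illustrative assumptions):

  import { loadModel } from '@fugood/llama.node'

  const context = await loadModel({ model: '/path/to/model.gguf' }) // option shape assumed

  // ... run and finish a completion for the first session ...

  context.clearCache()        // clear cache metadata only
  // context.clearCache(true) // also clear the cache data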
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.4.2",
+ "version": "1.4.4",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,20 +72,20 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-darwin-arm64": "1.4.2",
- "@fugood/node-llama-darwin-x64": "1.4.2",
- "@fugood/node-llama-linux-arm64": "1.4.2",
- "@fugood/node-llama-linux-arm64-cuda": "1.4.2",
- "@fugood/node-llama-linux-arm64-snapdragon": "1.4.2",
- "@fugood/node-llama-linux-arm64-vulkan": "1.4.2",
- "@fugood/node-llama-linux-x64": "1.4.2",
- "@fugood/node-llama-linux-x64-cuda": "1.4.2",
- "@fugood/node-llama-linux-x64-vulkan": "1.4.2",
- "@fugood/node-llama-win32-arm64": "1.4.2",
- "@fugood/node-llama-win32-arm64-vulkan": "1.4.2",
- "@fugood/node-llama-win32-x64": "1.4.2",
- "@fugood/node-llama-win32-x64-cuda": "1.4.2",
- "@fugood/node-llama-win32-x64-vulkan": "1.4.2"
+ "@fugood/node-llama-darwin-arm64": "1.4.4",
+ "@fugood/node-llama-darwin-x64": "1.4.4",
+ "@fugood/node-llama-linux-arm64": "1.4.4",
+ "@fugood/node-llama-linux-arm64-cuda": "1.4.4",
+ "@fugood/node-llama-linux-arm64-snapdragon": "1.4.4",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.4.4",
+ "@fugood/node-llama-linux-x64": "1.4.4",
+ "@fugood/node-llama-linux-x64-cuda": "1.4.4",
+ "@fugood/node-llama-linux-x64-vulkan": "1.4.4",
+ "@fugood/node-llama-win32-arm64": "1.4.4",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.4.4",
+ "@fugood/node-llama-win32-x64": "1.4.4",
+ "@fugood/node-llama-win32-x64-cuda": "1.4.4",
+ "@fugood/node-llama-win32-x64-vulkan": "1.4.4"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -1,8 +1,8 @@
  diff --git a/src/llama.cpp/common/CMakeLists.txt b/src/llama.cpp/common/CMakeLists.txt
- index bb168e835..cfc0e2c2e 100644
+ index 377b26846..1873b5206 100644
  --- a/src/llama.cpp/common/CMakeLists.txt
  +++ b/src/llama.cpp/common/CMakeLists.txt
- @@ -143,9 +143,16 @@ if (LLAMA_LLGUIDANCE)
+ @@ -149,9 +149,16 @@ if (LLAMA_LLGUIDANCE)
  set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
  endif ()

@@ -20,8 +20,22 @@ index bb168e835..cfc0e2c2e 100644


  #
+ diff --git a/src/llama.cpp/common/chat-peg-parser.cpp b/src/llama.cpp/common/chat-peg-parser.cpp
+ index 74a7b6a46..7b7a1bd50 100644
+ --- a/src/llama.cpp/common/chat-peg-parser.cpp
+ +++ b/src/llama.cpp/common/chat-peg-parser.cpp
+ @@ -1,9 +1,5 @@
+ #include "chat-peg-parser.h"
+
+ -#include <nlohmann/json.hpp>
+ -
+ -using json = nlohmann::json;
+ -
+ static std::string_view trim_trailing_space(std::string_view sv) {
+ while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
+ sv.remove_suffix(1);
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index b4a0f985e..2383d2ea9 100644
+ index 41a5bb42d..da5cf4b94 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
  @@ -6,9 +6,6 @@
@@ -34,7 +48,7 @@ index b4a0f985e..2383d2ea9 100644
  #include <algorithm>
  #include <cstdio>
  #include <cctype>
- @@ -126,16 +123,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
+ @@ -134,16 +131,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
  return diffs;
  }

@@ -51,7 +65,7 @@ index b4a0f985e..2383d2ea9 100644
  struct templates_params {
  json messages;
  json tools;
- @@ -709,7 +696,7 @@ static std::string apply(
+ @@ -720,7 +707,7 @@ static std::string apply(
  tmpl_inputs.extra_context.merge_patch(*additional_context);
  }
  // TODO: add flag to control date/time, if only for testing purposes.
@@ -61,10 +75,10 @@ index b4a0f985e..2383d2ea9 100644
  minja::chat_template_options tmpl_opts;
  // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
  diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
- index 754c411e2..71241a6cc 100644
+ index 6085510a4..263076ce2 100644
  --- a/src/llama.cpp/common/chat.h
  +++ b/src/llama.cpp/common/chat.h
- @@ -9,7 +9,18 @@
+ @@ -10,7 +10,18 @@
  #include <vector>
  #include <map>

@@ -85,10 +99,10 @@ index 754c411e2..71241a6cc 100644
  struct common_chat_tool_call {
  std::string name;
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
- index 0d7fd9a93..6bf3cc7ab 100644
+ index f07af1d86..1b10c7b13 100644
  --- a/src/llama.cpp/common/common.cpp
  +++ b/src/llama.cpp/common/common.cpp
- @@ -1217,6 +1217,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+ @@ -1236,6 +1236,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }

@@ -97,10 +111,10 @@ index 0d7fd9a93..6bf3cc7ab 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index 2f23d0baa..e4e6c795e 100644
+ index 179113a4d..78aa24bc3 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
- @@ -299,6 +299,7 @@ struct lr_opt {
+ @@ -302,6 +302,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

  struct common_params {
package/src/LlamaContext.cpp CHANGED
@@ -200,6 +200,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
  static_cast<napi_property_attributes>(napi_enumerable)),
  InstanceMethod<&LlamaContext::CancelRequest>(
  "cancelRequest",
+ static_cast<napi_property_attributes>(napi_enumerable)),
+ InstanceMethod<&LlamaContext::ClearCache>(
+ "clearCache",
  static_cast<napi_property_attributes>(napi_enumerable))});
  Napi::FunctionReference *constructor = new Napi::FunctionReference();
  *constructor = Napi::Persistent(func);
@@ -1505,3 +1508,24 @@ Napi::Value LlamaContext::DecodeAudioTokens(const Napi::CallbackInfo &info) {
  worker->Queue();
  return worker->Promise();
  }
+
+ // clearCache(clearData?: boolean): void
+ void LlamaContext::ClearCache(const Napi::CallbackInfo &info) {
+ Napi::Env env = info.Env();
+ if (!_rn_ctx) {
+ Napi::TypeError::New(env, "Context is disposed").ThrowAsJavaScriptException();
+ return;
+ }
+ if (_rn_ctx->completion != nullptr && _rn_ctx->completion->is_predicting) {
+ Napi::TypeError::New(env, "Cannot clear cache while completion is in progress")
+ .ThrowAsJavaScriptException();
+ return;
+ }
+
+ bool clear_data = false;
+ if (info.Length() >= 1 && info[0].IsBoolean()) {
+ clear_data = info[0].ToBoolean().Value();
+ }
+
+ _rn_ctx->clearCache(clear_data);
+ }
package/src/LlamaContext.h CHANGED
@@ -69,6 +69,9 @@ private:
  Napi::Value QueueRerank(const Napi::CallbackInfo &info);
  void CancelRequest(const Napi::CallbackInfo &info);

+ // Cache management
+ void ClearCache(const Napi::CallbackInfo &info);
+
  std::string _info;
  std::vector<std::string> _used_devices;
  Napi::Object _meta;
package/src/llama.cpp/CMakeLists.txt CHANGED
@@ -33,10 +33,24 @@ endif()

  option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)

+ option(LLAMA_WASM_MEM64 "llama: use 64-bit memory in WASM builds" ON)
+
  if (EMSCRIPTEN)
  set(BUILD_SHARED_LIBS_DEFAULT OFF)

- option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
+ # Use 64-bit memory to support backend_get_memory queries
+ # TODO: analyze performance impact, see https://spidermonkey.dev/blog/2025/01/15/is-memory64-actually-worth-using
+ if (LLAMA_WASM_MEM64)
+ add_compile_options("-sMEMORY64=1")
+ add_link_options("-sMEMORY64=1")
+ endif()
+ add_link_options("-sALLOW_MEMORY_GROWTH=1")
+
+ option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" OFF)
+ option(LLAMA_BUILD_HTML "llama: build HTML file" ON)
+ if (LLAMA_BUILD_HTML)
+ set(CMAKE_EXECUTABLE_SUFFIX ".html")
+ endif()
  else()
  if (MINGW)
  set(BUILD_SHARED_LIBS_DEFAULT OFF)
@@ -58,6 +72,12 @@ if (MSVC)
  add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
  endif()

+ if (LLAMA_STANDALONE)
+ # enable parallel builds for msbuild
+ list(APPEND CMAKE_VS_GLOBALS UseMultiToolTask=true)
+ list(APPEND CMAKE_VS_GLOBALS EnforceProcessCountAcrossBuilds=true)
+ endif()
+
  if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
  set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
  else()
@@ -179,11 +199,6 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
  # ... otherwise assume ggml is added by a parent CMakeLists.txt
  endif()

- if (MINGW)
- # Target Windows 8 for PrefetchVirtualMemory
- add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
- endif()
-
  #
  # build the library
  #
package/src/llama.cpp/common/CMakeLists.txt CHANGED
@@ -52,6 +52,8 @@ add_library(${TARGET} STATIC
  chat-parser.h
  chat-parser-xml-toolcall.h
  chat-parser-xml-toolcall.cpp
+ chat-peg-parser.cpp
+ chat-peg-parser.h
  chat.cpp
  chat.h
  common.cpp
@@ -69,12 +71,16 @@ add_library(${TARGET} STATIC
  log.h
  ngram-cache.cpp
  ngram-cache.h
+ peg-parser.cpp
+ peg-parser.h
  regex-partial.cpp
  regex-partial.h
  sampling.cpp
  sampling.h
  speculative.cpp
  speculative.h
+ unicode.cpp
+ unicode.h
  )

  if (BUILD_SHARED_LIBS)
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -30,6 +30,7 @@
  #include <thread> // for hardware_concurrency
  #include <vector>

+ #ifndef __EMSCRIPTEN__
  #ifdef __linux__
  #include <linux/limits.h>
  #elif defined(_WIN32)
@@ -41,6 +42,8 @@
  #else
  #include <sys/syslimits.h>
  #endif
+ #endif
+
  #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

  using json = nlohmann::ordered_json;
@@ -212,13 +215,13 @@ struct handle_model_result {
  static handle_model_result common_params_handle_model(
  struct common_params_model & model,
  const std::string & bearer_token,
- const std::string & model_path_default,
  bool offline) {
  handle_model_result result;
  // handle pre-fill default model path and url based on hf_repo and hf_file
  {
  if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
  model.path = common_docker_resolve_model(model.docker_repo);
+ model.name = model.docker_repo; // set name for consistency
  } else if (!model.hf_repo.empty()) {
  // short-hand to avoid specifying --hf-file -> default it to --model
  if (model.hf_file.empty()) {
@@ -227,7 +230,8 @@ static handle_model_result common_params_handle_model(
  if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
  exit(1); // built without CURL, error message already printed
  }
- model.hf_repo = auto_detected.repo;
+ model.name = model.hf_repo; // repo name with tag
+ model.hf_repo = auto_detected.repo; // repo name without tag
  model.hf_file = auto_detected.ggufFile;
  if (!auto_detected.mmprojFile.empty()) {
  result.found_mmproj = true;
@@ -257,8 +261,6 @@ static handle_model_result common_params_handle_model(
  model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
  }

- } else if (model.path.empty()) {
- model.path = model_path_default;
  }
  }

@@ -405,7 +407,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context

  // handle model and download
  {
- auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
+ auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
  if (params.no_mmproj) {
  params.mmproj = {};
  } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -415,12 +417,18 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
  // only download mmproj if the current example is using it
  for (auto & ex : mmproj_examples) {
  if (ctx_arg.ex == ex) {
- common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
+ common_params_handle_model(params.mmproj, params.hf_token, params.offline);
  break;
  }
  }
- common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
- common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
+ common_params_handle_model(params.speculative.model, params.hf_token, params.offline);
+ common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
+ }
+
+ // model is required (except for server)
+ // TODO @ngxson : maybe show a list of available models in CLI in this case
+ if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
+ throw std::invalid_argument("error: --model is required\n");
  }

  if (params.escape) {
@@ -700,6 +708,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.use_jinja = true;
  }

+ params.use_color = tty_can_use_colors();
+
  // load dynamic backends
  ggml_backend_load_all();

@@ -782,10 +792,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN}));
  add_opt(common_arg(
- {"-co", "--color"},
- string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
- [](common_params & params) {
- params.use_color = true;
+ {"-co", "--color"}, "[on|off|auto]",
+ "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
+ "'auto' enables colors when output is to a terminal",
+ [](common_params & params, const std::string & value) {
+ if (is_truthy(value)) {
+ params.use_color = true;
+ } else if (is_falsey(value)) {
+ params.use_color = false;
+ } else if (is_autoy(value)) {
+ params.use_color = tty_can_use_colors();
+ } else {
+ throw std::invalid_argument(
+ string_format("error: unknown value for --color: '%s'\n", value.c_str()));
+ }
  }
  ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
  add_opt(common_arg(
@@ -1014,7 +1034,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
  } else {
  throw std::runtime_error(
- string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
+ string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
  }
  }).set_env("LLAMA_ARG_FLASH_ATTN"));
  add_opt(common_arg(
@@ -1221,7 +1241,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params) {
  params.warmup = false;
  }
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
  add_opt(common_arg(
  {"--spm-infill"},
  string_format(
@@ -2090,11 +2110,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  add_opt(common_arg(
  {"-m", "--model"}, "FNAME",
  ex == LLAMA_EXAMPLE_EXPORT_LORA
- ? std::string("model path from which to load base model")
- : string_format(
- "model path (default: `models/$filename` with filename from `--hf-file` "
- "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
- ),
+ ? "model path from which to load base model"
+ : "model path to load",
  [](common_params & params, const std::string & value) {
  params.model.path = value;
  }
@@ -2486,12 +2503,50 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  "path to save slot kv cache (default: disabled)",
  [](common_params & params, const std::string & value) {
  params.slot_save_path = value;
+ if (!fs_is_directory(params.slot_save_path)) {
+ throw std::invalid_argument("not a directory: " + value);
+ }
  // if doesn't end with DIRECTORY_SEPARATOR, add it
  if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
  params.slot_save_path += DIRECTORY_SEPARATOR;
  }
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--media-path"}, "PATH",
+ "directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled)",
+ [](common_params & params, const std::string & value) {
+ params.media_path = value;
+ if (!fs_is_directory(params.media_path)) {
+ throw std::invalid_argument("not a directory: " + value);
+ }
+ // if doesn't end with DIRECTORY_SEPARATOR, add it
+ if (!params.media_path.empty() && params.media_path[params.media_path.size() - 1] != DIRECTORY_SEPARATOR) {
+ params.media_path += DIRECTORY_SEPARATOR;
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--models-dir"}, "PATH",
+ "directory containing models for the router server (default: disabled)",
+ [](common_params & params, const std::string & value) {
+ params.models_dir = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
+ add_opt(common_arg(
+ {"--models-max"}, "N",
+ string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
+ [](common_params & params, int value) {
+ params.models_max = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
+ add_opt(common_arg(
+ {"--no-models-autoload"},
+ "disables automatic loading of models (default: enabled)",
+ [](common_params & params) {
+ params.models_autoload = false;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
  add_opt(common_arg(
  {"--jinja"},
  string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
@@ -2639,7 +2694,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params &, const std::string & value) {
  common_log_set_file(common_log_main(), value.c_str());
  }
- ));
+ ).set_env("LLAMA_LOG_FILE"));
  add_opt(common_arg(
  {"--log-colors"}, "[on|off|auto]",
  "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
@@ -2653,7 +2708,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
  } else {
  throw std::invalid_argument(
- string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
+ string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
  }
  }
  ).set_env("LLAMA_LOG_COLORS"));
@@ -2674,7 +2729,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_env("LLAMA_OFFLINE"));
  add_opt(common_arg(
  {"-lv", "--verbosity", "--log-verbosity"}, "N",
- "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
+ string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
+ " - 0: generic output\n"
+ " - 1: error\n"
+ " - 2: warning\n"
+ " - 3: info\n"
+ " - 4: debug\n"
+ "(default: %d)\n", params.verbosity),
  [](common_params & params, int value) {
  params.verbosity = value;
  common_log_set_verbosity_thold(value);
package/src/llama.cpp/common/chat-parser.cpp CHANGED
@@ -1,6 +1,8 @@
  #include "chat-parser.h"
+ #include "chat-peg-parser.h"
  #include "common.h"
  #include "log.h"
+ #include "peg-parser.h"
  #include "regex-partial.h"

  #include <algorithm>
@@ -1483,6 +1485,11 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
  }

  common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+ if (syntax.format == COMMON_CHAT_FORMAT_PEG_SIMPLE ||
+ syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE ||
+ syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
+ return common_chat_peg_parse(syntax.parser, input, is_partial, syntax);
+ }
  common_chat_msg_parser builder(input, is_partial, syntax);
  try {
  common_chat_parse(builder);
@@ -1500,3 +1507,36 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
  }
  return msg;
  }
+
+ common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+ if (parser.empty()) {
+ throw std::runtime_error("Failed to parse due to missing parser definition.");
+ }
+
+ LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(syntax.format), input.c_str());
+
+ common_peg_parse_context ctx(input, is_partial);
+ auto result = parser.parse(ctx);
+ if (result.fail()) {
+ throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end));
+ }
+
+ common_chat_msg msg;
+ msg.role = "assistant";
+
+ if (syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE) {
+ auto mapper = common_chat_peg_native_mapper(msg);
+ mapper.from_ast(ctx.ast, result);
+ } else if (syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
+ auto mapper = common_chat_peg_constructed_mapper(msg);
+ mapper.from_ast(ctx.ast, result);
+ } else {
+ // Generic mapper
+ auto mapper = common_chat_peg_mapper(msg);
+ mapper.from_ast(ctx.ast, result);
+ }
+ if (!is_partial) {
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+ }
+ return msg;
+ }