@fugood/llama.node 1.3.0-rc.5 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +12 -2
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +12 -13
- package/src/llama.cpp/common/arg.cpp +2 -2
- package/src/llama.cpp/common/chat.cpp +199 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +19 -3
- package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +13 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +37 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/src/llama.cpp/src/llama-arch.cpp +35 -0
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-batch.h +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +35 -2
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +10 -4
- package/src/llama.cpp/src/llama-graph.cpp +35 -0
- package/src/llama.cpp/src/llama-hparams.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +23 -20
- package/src/llama.cpp/src/llama-kv-cache.h +2 -4
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +18 -14
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-model.cpp +307 -37
- package/src/llama.cpp/src/llama-model.h +4 -2
- package/src/llama.cpp/src/llama-vocab.cpp +1 -0
package/CMakeLists.txt
CHANGED
@@ -64,13 +64,23 @@ endif()
 # Improve speed
 if(CMAKE_BUILD_TYPE STREQUAL "Release")
   if (MSVC)
-
+    # Enable parallel compilation for all MSVC builds
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MP")
+
+    if (NOT GGML_VULKAN AND NOT GGML_CUDA)
+      # Full optimization with LTCG for default builds
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /Ob2 /Oi /Ot /Oy /GL")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /Ob2 /Oi /Ot /Oy /GL")
     set(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} /LTCG")
-
+    elseif(GGML_VULKAN)
+      # Reduced optimization for Vulkan builds
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O1 /Ob1 /bigobj")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O1 /Ob1 /bigobj")
+    else()
+      # Faster linking for CUDA builds (no LTCG)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /Ob2 /Oi")
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /Ob2 /Oi")
     endif()
   else()
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -funroll-loops -flto=auto")
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.3.0-rc.5",
+  "version": "1.3.0",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.3.0-rc.5",
-    "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.5",
-    "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.5",
-    "@fugood/node-llama-linux-arm64": "1.3.0-rc.5",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.5",
-    "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.5",
-    "@fugood/node-llama-win32-x64": "1.3.0-rc.5",
-    "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.5",
-    "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.5",
-    "@fugood/node-llama-win32-arm64": "1.3.0-rc.5",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.5",
-    "@fugood/node-llama-darwin-x64": "1.3.0-rc.5",
-    "@fugood/node-llama-darwin-arm64": "1.3.0-rc.5"
+    "@fugood/node-llama-linux-x64": "1.3.0",
+    "@fugood/node-llama-linux-x64-vulkan": "1.3.0",
+    "@fugood/node-llama-linux-x64-cuda": "1.3.0",
+    "@fugood/node-llama-linux-arm64": "1.3.0",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0",
+    "@fugood/node-llama-linux-arm64-cuda": "1.3.0",
+    "@fugood/node-llama-win32-x64": "1.3.0",
+    "@fugood/node-llama-win32-x64-vulkan": "1.3.0",
+    "@fugood/node-llama-win32-x64-cuda": "1.3.0",
+    "@fugood/node-llama-win32-arm64": "1.3.0",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0",
+    "@fugood/node-llama-darwin-x64": "1.3.0",
+    "@fugood/node-llama-darwin-arm64": "1.3.0"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED
@@ -21,7 +21,7 @@ index fe290bf8f..d377e29b9 100644
 
 #
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index 63583fb22..f8be20148 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -6,9 +6,6 @@
@@ -31,10 +31,10 @@ index 8587140e1..7931a31a1 100644
 -#include <minja/chat-template.hpp>
 -#include <minja/minja.hpp>
 -
+#include <algorithm>
 #include <cstdio>
-#include <
-
-@@ -123,16 +120,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
+#include <cctype>
+@@ -126,16 +123,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
 return diffs;
 }
 
@@ -51,18 +51,17 @@ index 8587140e1..7931a31a1 100644
 struct templates_params {
     json messages;
     json tools;
-@@ -
-if (additional_context) {
+@@ -812,7 +799,7 @@ static std::string apply(
     tmpl_inputs.extra_context.merge_patch(*additional_context);
 }
-
+// TODO: add flag to control date/time, if only for testing purposes.
 -    // tmpl_inputs.now = std::chrono::system_clock::now();
 +    tmpl_inputs.now = inputs.now;
 
 minja::chat_template_options tmpl_opts;
 // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
 diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
-index
+index 50efb0d4e..f471a84c7 100644
 --- a/src/llama.cpp/common/chat.h
 +++ b/src/llama.cpp/common/chat.h
 @@ -9,7 +9,18 @@
@@ -98,7 +97,7 @@ index b0591e84b..93759f884 100644
 mparams.split_mode = params.split_mode;
 mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index a8cb630ea..0919ec5d3 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
 @@ -274,6 +274,7 @@ struct lr_opt {
@@ -110,7 +109,7 @@ index 040a44ebd..37ad69173 100644
 int32_t n_ctx = 4096; // context size
 int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index
+index 34323afa0..1a6924db0 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -123,10 +122,10 @@ index 42041b717..371752718 100644
 check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
 if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
 diff --git a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
-index
+index de01336cd..29b1a043d 100644
 --- a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
-@@ -
+@@ -121,7 +121,7 @@ if (Vulkan_FOUND)
 endif()
 
 # Set up toolchain for host compilation whether cross-compiling or not
@@ -135,7 +134,7 @@ index 83a83887b..8ae962b29 100644
 if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN)
     set(HOST_CMAKE_TOOLCHAIN_FILE ${GGML_VULKAN_SHADERS_GEN_TOOLCHAIN})
 else()
-@@ -
+@@ -141,7 +141,7 @@ if (Vulkan_FOUND)
 
 include(ExternalProject)
 
package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -3248,7 +3248,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--embd-output-format"}, "FORMAT",
-        "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
+        "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
         [](common_params & params, const std::string & value) {
             params.embd_out = value;
         }
@@ -3435,7 +3435,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
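The first hunk above only extends a help string; as an informal illustration of the new "raw" embedding output it describes (one embedding per line, values separated by whitespace), here is a minimal standalone sketch, not code from llama.cpp:

```cpp
// Hypothetical illustration of the "raw" embedding output format described above:
// one embedding per line, values separated by single spaces.
#include <cstdio>
#include <vector>

static void print_embeddings_raw(const std::vector<std::vector<float>> & embeddings) {
    for (const auto & emb : embeddings) {
        for (size_t i = 0; i < emb.size(); ++i) {
            std::printf(i == 0 ? "%.6f" : " %.6f", emb[i]);
        }
        std::printf("\n");
    }
}

int main() {
    print_embeddings_raw({{0.12f, -0.34f, 0.56f}, {0.78f, 0.90f, -0.11f}});
    return 0;
}
```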
package/src/llama.cpp/common/chat.cpp
CHANGED

@@ -6,8 +6,11 @@
 #include "log.h"
 #include "regex-partial.h"
 
+#include <algorithm>
 #include <cstdio>
+#include <cctype>
 #include <exception>
+#include <functional>
 #include <iostream>
 #include <optional>
 #include <stdexcept>
@@ -627,6 +630,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
         case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
         case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
+        case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS: return "LFM2 with JSON tools";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -794,6 +798,7 @@ static std::string apply(
     if (additional_context) {
         tmpl_inputs.extra_context.merge_patch(*additional_context);
     }
+    // TODO: add flag to control date/time, if only for testing purposes.
     tmpl_inputs.now = inputs.now;
 
     minja::chat_template_options tmpl_opts;
@@ -972,6 +977,126 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
     return data;
 }
 
+
+// Case-insensitive find
+static size_t ifind_string(const std::string & haystack, const std::string & needle, size_t pos = 0) {
+    auto it = std::search(
+        haystack.begin() + pos, haystack.end(),
+        needle.begin(), needle.end(),
+        [](char a, char b) { return std::tolower(a) == std::tolower(b); }
+    );
+    return (it == haystack.end()) ? std::string::npos : std::distance(haystack.begin(), it);
+}
+
+static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+    const auto is_json_schema_provided = !inputs.json_schema.is_null();
+    const auto is_grammar_provided = !inputs.grammar.empty();
+    const auto are_tools_provided = inputs.tools.is_array() && !inputs.tools.empty();
+
+    // the logic requires potentially modifying the messages
+    auto tweaked_messages = inputs.messages;
+
+    auto replace_json_schema_marker = [](json & messages) -> bool {
+        static std::string marker1 = "force json schema.\n";
+        static std::string marker2 = "force json schema.";
+
+        if (messages.empty() || messages.at(0).at("role") != "system") {
+            return false;
+        }
+
+        std::string content = messages.at(0).at("content");
+
+        for (const auto & marker : {marker1, marker2}) {
+            const auto pos = ifind_string(content, marker);
+            if (pos != std::string::npos) {
+                content.replace(pos, marker.length(), "");
+                // inject modified content back into the messages
+                messages.at(0).at("content") = content;
+                return true;
+            }
+        }
+
+        return false;
+    };
+
+    // Lfm2 model does not natively work with json, but can generally understand the tools structure
+    //
+    // Example of the pytorch dialog structure:
+    // <|startoftext|><|im_start|>system
+    // List of tools: <|tool_list_start|>[{"name": "get_candidate_status", "description": "Retrieves the current status of a candidate in the recruitment process", "parameters": {"type": "object", "properties": {"candidate_id": {"type": "string", "description": "Unique identifier for the candidate"}}, "required": ["candidate_id"]}}]<|tool_list_end|><|im_end|>
+    // <|im_start|>user
+    // What is the current status of candidate ID 12345?<|im_end|>
+    // <|im_start|>assistant
+    // <|tool_call_start|>[get_candidate_status(candidate_id="12345")]<|tool_call_end|>Checking the current status of candidate ID 12345.<|im_end|>
+    // <|im_start|>tool
+    // <|tool_response_start|>{"candidate_id": "12345", "status": "Interview Scheduled", "position": "Clinical Research Associate", "date": "2023-11-20"}<|tool_response_end|><|im_end|>
+    // <|im_start|>assistant
+    // The candidate with ID 12345 is currently in the "Interview Scheduled" stage for the position of Clinical Research Associate, with an interview date set for 2023-11-20.<|im_end|>
+    //
+    // For the llama server compatibility with json tools semantic,
+    // the client can add "Follow json schema." line into the system message prompt to force the json output.
+    //
+    if (are_tools_provided && (is_json_schema_provided || is_grammar_provided)) {
+        // server/utils.hpp prohibits that branch for the custom grammar anyways
+        throw std::runtime_error("Tools call must not use \"json_schema\" or \"grammar\", use non-tool invocation if you want to use custom grammar");
+    } else if (are_tools_provided && replace_json_schema_marker(tweaked_messages)) {
+        LOG_INF("%s: Using tools to build a grammar\n", __func__);
+
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    {"type", "object"},
+                    {"properties", {
+                        {"name", {
+                            {"type", "string"},
+                            {"const", function.at("name")},
+                        }},
+                        {"arguments", function.at("parameters")},
+                    }},
+                    {"required", json::array({"name", "arguments", "id"})},
+                });
+            });
+            auto schema = json {
+                {"type", "array"},
+                {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
+                {"minItems", 1},
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+
+            builder.add_rule("root", "\"<|tool_call_start|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tool_call_end|>\"");
+        });
+        // model has no concept of tool selection mode choice,
+        // if the system prompt rendered correctly it will produce a tool call
+        // the grammar goes inside the tool call body
+        data.grammar_lazy = true;
+        data.grammar_triggers = {{COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, "\\s*<\\|tool_call_start\\|>\\s*\\["}};
+        data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
+        data.format = COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS;
+    } else if (are_tools_provided && (!is_json_schema_provided && !is_grammar_provided)) {
+        LOG_INF("%s: Using tools without json schema or grammar\n", __func__);
+        // output those tokens
+        data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
+    } else if (is_json_schema_provided) {
+        LOG_INF("%s: Using provided json schema to build a grammar\n", __func__);
+        data.grammar = json_schema_to_grammar(inputs.json_schema);
+    } else if (is_grammar_provided) {
+        LOG_INF("%s: Using provided grammar\n", __func__);
+        data.grammar = inputs.grammar;
+    } else {
+        LOG_INF("%s: Using content relying on the template\n", __func__);
+    }
+
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
+    LOG_DBG("%s: Prompt: %s\n", __func__, data.prompt.c_str());
+
+    return data;
+}
+
 static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
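The grammar assembled in common_chat_params_init_lfm2 above constrains output to a JSON array of {name, arguments} objects wrapped in <|tool_call_start|>/<|tool_call_end|>. A rough standalone sketch of the array schema handed to the grammar builder for a single tool; the get_candidate_status tool is borrowed from the comments in the hunk, and this is an illustration built directly with nlohmann::json, not a call into llama.cpp:

```cpp
// Sketch of the per-tool schema shape assembled in common_chat_params_init_lfm2 above.
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    // "parameters" as a client would declare them for the hypothetical tool
    json parameters = {
        {"type", "object"},
        {"properties", {{"candidate_id", {{"type", "string"}}}}},
        {"required", {"candidate_id"}},
    };

    // one entry per tool: name pinned with "const", arguments reuse the tool's parameters
    json tool_schema = {
        {"type", "object"},
        {"properties", {
            {"name", {{"type", "string"}, {"const", "get_candidate_status"}}},
            {"arguments", parameters},
        }},
        {"required", {"name", "arguments", "id"}},
    };

    // with a single tool, the array schema uses the tool schema directly as "items"
    json array_schema = {
        {"type", "array"},
        {"items", tool_schema},
        {"minItems", 1},
        {"maxItems", 1},  // when parallel_tool_calls is disabled
    };

    std::cout << array_schema.dump(2) << std::endl;
    return 0;
}
```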
@@ -2485,6 +2610,71 @@ static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
     builder.add_content(builder.consume_rest());
 }
 
+
+static void common_chat_parse_lfm2(common_chat_msg_parser & builder) {
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // LFM2 format: <|tool_call_start|>[{"name": "get_current_time", "arguments": {"location": "Paris"}}]<|tool_call_end|>
+    static const common_regex tool_call_start_regex(regex_escape("<|tool_call_start|>"));
+    static const common_regex tool_call_end_regex(regex_escape("<|tool_call_end|>"));
+
+    // Loop through all tool calls
+    while (auto res = builder.try_find_regex(tool_call_start_regex, std::string::npos, /* add_prelude_to_content= */ true)) {
+        builder.move_to(res->groups[0].end);
+
+        // Parse JSON array format: [{"name": "...", "arguments": {...}}]
+        auto tool_calls_data = builder.consume_json();
+
+        // Consume end marker
+        builder.consume_spaces();
+        if (!builder.try_consume_regex(tool_call_end_regex)) {
+            throw common_chat_msg_partial_exception("Expected <|tool_call_end|>");
+        }
+
+        // Process each tool call in the array
+        if (tool_calls_data.json.is_array()) {
+            for (const auto & tool_call : tool_calls_data.json) {
+                if (!tool_call.is_object()) {
+                    throw common_chat_msg_partial_exception("Tool call must be an object");
+                }
+
+                if (!tool_call.contains("name")) {
+                    throw common_chat_msg_partial_exception("Tool call missing 'name' field");
+                }
+
+                std::string function_name = tool_call.at("name");
+                std::string arguments = "{}";
+
+                if (tool_call.contains("arguments")) {
+                    if (tool_call.at("arguments").is_object()) {
+                        arguments = tool_call.at("arguments").dump();
+                    } else if (tool_call.at("arguments").is_string()) {
+                        arguments = tool_call.at("arguments");
+                    }
+                }
+
+                if (!builder.add_tool_call(function_name, "", arguments)) {
+                    throw common_chat_msg_partial_exception("Incomplete tool call");
+                }
+            }
+        } else {
+            throw common_chat_msg_partial_exception("Expected JSON array for tool calls");
+        }
+
+        // Consume any trailing whitespace after this tool call
+        builder.consume_spaces();
+    }
+
+    // Consume any remaining content after all tool calls
+    auto remaining = builder.consume_rest();
+    if (!string_strip(remaining).empty()) {
+        builder.add_content(remaining);
+    }
+}
+
 static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
     // Parse thinking tags first - this handles the main reasoning content
     builder.try_parse_reasoning("<seed:think>", "</seed:think>");
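For reference, a self-contained sketch of the tool-call wire format that common_chat_parse_lfm2 above consumes, parsed here with plain nlohmann::json instead of common_chat_msg_parser; the sample string and parsing logic are illustrative only:

```cpp
// Minimal illustration of the LFM2 tool-call wire format:
// <|tool_call_start|>[{"name": ..., "arguments": {...}}]<|tool_call_end|>
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    const std::string start_tag = "<|tool_call_start|>";
    const std::string end_tag   = "<|tool_call_end|>";
    const std::string output =
        "<|tool_call_start|>[{\"name\": \"get_current_time\", "
        "\"arguments\": {\"location\": \"Paris\"}}]<|tool_call_end|>";

    const auto start = output.find(start_tag);
    const auto end   = output.find(end_tag);
    if (start == std::string::npos || end == std::string::npos) {
        std::cerr << "no tool call found" << std::endl;
        return 1;
    }

    // Slice out the JSON array between the two markers and parse it.
    const auto payload = output.substr(start + start_tag.size(),
                                       end - (start + start_tag.size()));
    const json calls = json::parse(payload);

    for (const auto & call : calls) {
        std::cout << "tool: " << call.at("name").get<std::string>()
                  << ", arguments: " << call.at("arguments").dump() << std::endl;
    }
    return 0;
}
```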
@@ -2734,6 +2924,12 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_apertus(tmpl, params);
     }
 
+    // LFM2 (w/ tools)
+    if (src.find("List of tools: <|tool_list_start|>[") != std::string::npos &&
+        src.find("]<|tool_list_end|>") != std::string::npos) {
+        return common_chat_params_init_lfm2(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2912,6 +3108,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_APERTUS:
             common_chat_parse_apertus(builder);
             break;
+        case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS:
+            common_chat_parse_lfm2(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
package/src/llama.cpp/common/json-schema-to-grammar.cpp
CHANGED

@@ -601,7 +601,10 @@ private:
     }
 
     std::string _resolve_ref(const std::string & ref) {
-
+        auto it = ref.find('#');
+        std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
+        static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
+        std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
         if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
             _refs_being_resolved.insert(ref);
             json resolved = _refs[ref];
@@ -774,11 +777,24 @@ public:
         std::vector<std::string> tokens = string_split(pointer, "/");
         for (size_t i = 1; i < tokens.size(); ++i) {
             std::string sel = tokens[i];
-            if (target.
+            if (target.is_object() && target.contains(sel)) {
+                target = target[sel];
+            } else if (target.is_array()) {
+                size_t sel_index;
+                try {
+                    sel_index = std::stoul(sel);
+                } catch (const std::invalid_argument & e) {
+                    sel_index = target.size();
+                }
+                if (sel_index >= target.size()) {
+                    _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
+                    return;
+                }
+                target = target[sel_index];
+            } else {
                 _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
                 return;
             }
-            target = target[sel];
         }
         _refs[ref] = target;
     }
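The two hunks above make $ref resolution tolerate a leading "#" fragment and allow numeric path segments to index into arrays. A standalone sketch of the same pointer walk; resolve_pointer is a hypothetical helper written for illustration, not the class method itself:

```cpp
// Sketch of resolving a "#/..."-style JSON pointer the way the updated code does:
// object keys are looked up by name, array elements by numeric segment.
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

static json resolve_pointer(const json & root, const std::string & ref) {
    // keep only the fragment after '#', e.g. "#/definitions/steps/0" -> "/definitions/steps/0"
    const auto hash = ref.find('#');
    const std::string fragment = hash != std::string::npos ? ref.substr(hash + 1) : ref;

    json target = root;
    std::stringstream ss(fragment);
    std::string sel;
    std::getline(ss, sel, '/');                  // skip the leading empty segment
    while (std::getline(ss, sel, '/')) {
        if (target.is_object() && target.contains(sel)) {
            target = target[sel];
        } else if (target.is_array()) {
            const size_t idx = std::stoul(sel);  // numeric segments index arrays
            target = target.at(idx);
        } else {
            throw std::runtime_error("cannot resolve segment: " + sel);
        }
    }
    return target;
}

int main() {
    const json schema = {
        {"definitions", {{"steps", json::array({{{"type", "string"}}, {{"type", "integer"}}})}}}
    };
    // prints {"type":"integer"}
    std::cout << resolve_pointer(schema, "#/definitions/steps/1").dump() << std::endl;
    return 0;
}
```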
package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED

@@ -251,6 +251,8 @@ option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adr
 set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
                                 "gmml: OpenCL API version to target")
 
+option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
+
 # toolchain for vulkan-shaders-gen
 set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
 
package/src/llama.cpp/ggml/include/ggml-hexagon.h
ADDED

@@ -0,0 +1,19 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
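The new header only declares the backend entry points; whether they are compiled in depends on the GGML_HEXAGON option added to ggml/CMakeLists.txt above, and this package may not ship the backend at all. A hedged sketch of probing for it from application code, assuming a build with the backend enabled (ggml_backend_name and ggml_backend_free are existing ggml-backend.h calls):

```cpp
// Sketch: probe for the Hexagon backend declared in ggml-hexagon.h.
// Assumes the library was built with GGML_HEXAGON=ON; otherwise init may
// be unavailable or return nullptr.
#include <cstdio>
#include "ggml-backend.h"
#include "ggml-hexagon.h"

int main() {
    ggml_backend_t backend = ggml_backend_hexagon_init();
    if (backend == nullptr) {
        std::printf("Hexagon backend not available\n");
        return 1;
    }

    std::printf("initialized backend: %s (is_hexagon=%d)\n",
                ggml_backend_name(backend), ggml_backend_is_hexagon(backend) ? 1 : 0);

    ggml_backend_free(backend);
    return 0;
}
```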
package/src/llama.cpp/ggml/src/CMakeLists.txt
CHANGED

@@ -307,6 +307,10 @@ function(ggml_add_cpu_backend_variant tag_name)
         foreach (feat ${ARGN})
             set(GGML_INTERNAL_${feat} ON)
         endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
     endif()
 
     ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -371,6 +375,14 @@ if (GGML_CPU_ALL_VARIANTS)
         else()
             message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
         endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            ggml_add_cpu_backend_variant(s390x_z15 Z15 VXE)
+            # ggml_add_cpu_backend_variant(s390x_z16 Z16 VXE)
+            # ggml_add_cpu_backend_variant(s390x_z17 Z17 VXE)
+        else()
+            message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
     else()
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
     endif()
@@ -390,6 +402,7 @@ ggml_add_backend(Vulkan)
 ggml_add_backend(WebGPU)
 ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
+ggml_add_backend(Hexagon)
 
 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
CHANGED

@@ -466,29 +466,45 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
         elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
             message(STATUS "s390x detected")
-            list(APPEND GGML_CPU_SOURCES
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            list(APPEND GGML_CPU_SOURCES
+                ggml-cpu/arch/s390/quants.c)
+
+            # for native compilation
+            if (GGML_NATIVE)
+                # check machine level to determine target
+                file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
+                string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
+
+                # TODO: Separation to determine activation of VX/VXE/VXE2
+                if (${S390X_M} MATCHES "8561|8562")
+                    message(STATUS "z15 target")
+                    list(APPEND ARCH_FLAGS -march=z15)
+                elseif (${S390X_M} MATCHES "3931")
+                    message(STATUS "z16 target")
+                    list(APPEND ARCH_FLAGS -march=z16)
+                elseif (${S390X_M} MATCHES "9175|9176")
+                    # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
+                    # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
+                    message(STATUS "z17 target")
+                    list(APPEND ARCH_FLAGS -march=arch15)
+                else()
+                    message(STATUS "Unknown target")
+                    message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
+                    list(APPEND ARCH_FLAGS -march=native -mtune=native)
+                endif()
+            # for cross-compilation
+            elseif(GGML_CPU_ALL_VARIANTS)
+                # range through IBM z15 to z17
+                # NOTE: update when a new hardware level is released
+                foreach (ZHW RANGE 15 17)
+                    if(DEFINED GGML_INTERNAL_Z${ZHW})
+                        message(STATUS "z${ZHW} cross-compile target")
+                        list(APPEND ARCH_FLAGS -march=z${ZHW})
+                    endif()
+                endforeach()
             endif()
 
-            if (GGML_VXE)
+            if (GGML_VXE OR GGML_INTERNAL_VXE)
                 message(STATUS "VX/VXE/VXE2 enabled")
                 list(APPEND ARCH_FLAGS -mvx -mzvector)
                 list(APPEND ARCH_DEFINITIONS GGML_VXE)
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
CHANGED

@@ -7519,8 +7519,8 @@ static void ggml_compute_forward_upscale_f32(
     float pixel_offset = 0.5f;
     if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
         pixel_offset = 0.0f;
-        sf0 = (float)(ne0 - 1) / (ne00 - 1);
-        sf1 = (float)(ne1 - 1) / (ne01 - 1);
+        sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
+        sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
     }
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
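The change above guards the align-corners rescale against size-1 dimensions, where (ne00 - 1) is zero and the old expression divided by zero. A small worked illustration of the same arithmetic (hypothetical helper, not ggml code):

```cpp
// Worked illustration of the align-corners scale factor computed above.
// With align_corners, sf = (out - 1) / (in - 1), but only when both sizes are > 1;
// otherwise the default ratio out / in is kept (the old code divided by zero here).
#include <cstdint>
#include <cstdio>

static float upscale_factor(int64_t in, int64_t out, bool align_corners) {
    float sf = (float) out / in;               // default (half-pixel) factor
    if (align_corners && out > 1 && in > 1) {
        sf = (float)(out - 1) / (in - 1);      // align-corners factor
    }
    return sf;
}

int main() {
    std::printf("in=4 out=8, align_corners: sf=%.3f\n", upscale_factor(4, 8, true));   // 7/3 = 2.333
    std::printf("in=4 out=8, half-pixel:    sf=%.3f\n", upscale_factor(4, 8, false));  // 2.000
    std::printf("in=1 out=8, align_corners: sf=%.3f\n", upscale_factor(1, 8, true));   // keeps 8.000
    return 0;
}
```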