@fugood/llama.node 1.3.0-rc.5 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +12 -2
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +12 -13
- package/src/llama.cpp/common/arg.cpp +2 -2
- package/src/llama.cpp/common/chat.cpp +199 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +19 -3
- package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +13 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +37 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/src/llama.cpp/src/llama-arch.cpp +35 -0
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-batch.h +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +35 -2
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +10 -4
- package/src/llama.cpp/src/llama-graph.cpp +35 -0
- package/src/llama.cpp/src/llama-hparams.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +23 -20
- package/src/llama.cpp/src/llama-kv-cache.h +2 -4
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +18 -14
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-model.cpp +307 -37
- package/src/llama.cpp/src/llama-model.h +4 -2
- package/src/llama.cpp/src/llama-vocab.cpp +1 -0
package/CMakeLists.txt
CHANGED
@@ -64,13 +64,23 @@ endif()
 # Improve speed
 if(CMAKE_BUILD_TYPE STREQUAL "Release")
   if (MSVC)
-
+    # Enable parallel compilation for all MSVC builds
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MP")
+
+    if (NOT GGML_VULKAN AND NOT GGML_CUDA)
+      # Full optimization with LTCG for default builds
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /Ob2 /Oi /Ot /Oy /GL")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /Ob2 /Oi /Ot /Oy /GL")
     set(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} /LTCG")
-
+    elseif(GGML_VULKAN)
+      # Reduced optimization for Vulkan builds
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O1 /Ob1 /bigobj")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O1 /Ob1 /bigobj")
+    else()
+      # Faster linking for CUDA builds (no LTCG)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /Ob2 /Oi")
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /Ob2 /Oi")
     endif()
   else()
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -funroll-loops -flto=auto")
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.3.0-rc.5",
+  "version": "1.3.0",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.3.0-rc.5",
-    "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.5",
-    "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.5",
-    "@fugood/node-llama-linux-arm64": "1.3.0-rc.5",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.5",
-    "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.5",
-    "@fugood/node-llama-win32-x64": "1.3.0-rc.5",
-    "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.5",
-    "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.5",
-    "@fugood/node-llama-win32-arm64": "1.3.0-rc.5",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.5",
-    "@fugood/node-llama-darwin-x64": "1.3.0-rc.5",
-    "@fugood/node-llama-darwin-arm64": "1.3.0-rc.5"
+    "@fugood/node-llama-linux-x64": "1.3.0",
+    "@fugood/node-llama-linux-x64-vulkan": "1.3.0",
+    "@fugood/node-llama-linux-x64-cuda": "1.3.0",
+    "@fugood/node-llama-linux-arm64": "1.3.0",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0",
+    "@fugood/node-llama-linux-arm64-cuda": "1.3.0",
+    "@fugood/node-llama-win32-x64": "1.3.0",
+    "@fugood/node-llama-win32-x64-vulkan": "1.3.0",
+    "@fugood/node-llama-win32-x64-cuda": "1.3.0",
+    "@fugood/node-llama-win32-arm64": "1.3.0",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0",
+    "@fugood/node-llama-darwin-x64": "1.3.0",
+    "@fugood/node-llama-darwin-arm64": "1.3.0"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED
@@ -21,7 +21,7 @@ index fe290bf8f..d377e29b9 100644
 
 #
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index 63583fb22..f8be20148 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -6,9 +6,6 @@
@@ -31,10 +31,10 @@ index 8587140e1..7931a31a1 100644
 -#include <minja/chat-template.hpp>
 -#include <minja/minja.hpp>
 -
+#include <algorithm>
 #include <cstdio>
-#include <
-
-@@ -123,16 +120,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
+#include <cctype>
+@@ -126,16 +123,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
 return diffs;
 }
 
@@ -51,18 +51,17 @@ index 8587140e1..7931a31a1 100644
 struct templates_params {
     json messages;
     json tools;
-@@ -
-if (additional_context) {
+@@ -812,7 +799,7 @@ static std::string apply(
     tmpl_inputs.extra_context.merge_patch(*additional_context);
 }
-
+// TODO: add flag to control date/time, if only for testing purposes.
 -    // tmpl_inputs.now = std::chrono::system_clock::now();
 +    tmpl_inputs.now = inputs.now;
 
 minja::chat_template_options tmpl_opts;
 // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
 diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
-index
+index 50efb0d4e..f471a84c7 100644
 --- a/src/llama.cpp/common/chat.h
 +++ b/src/llama.cpp/common/chat.h
 @@ -9,7 +9,18 @@
@@ -98,7 +97,7 @@ index b0591e84b..93759f884 100644
 mparams.split_mode = params.split_mode;
 mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index a8cb630ea..0919ec5d3 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
 @@ -274,6 +274,7 @@ struct lr_opt {
@@ -110,7 +109,7 @@ index 040a44ebd..37ad69173 100644
 int32_t n_ctx = 4096; // context size
 int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index
+index 34323afa0..1a6924db0 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -123,10 +122,10 @@ index 42041b717..371752718 100644
 check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
 if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
 diff --git a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
-index
+index de01336cd..29b1a043d 100644
 --- a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
-@@ -
+@@ -121,7 +121,7 @@ if (Vulkan_FOUND)
 endif()
 
 # Set up toolchain for host compilation whether cross-compiling or not
@@ -135,7 +134,7 @@ index 83a83887b..8ae962b29 100644
 if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN)
     set(HOST_CMAKE_TOOLCHAIN_FILE ${GGML_VULKAN_SHADERS_GEN_TOOLCHAIN})
 else()
-@@ -
+@@ -141,7 +141,7 @@ if (Vulkan_FOUND)
 
 include(ExternalProject)
 
package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -3248,7 +3248,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--embd-output-format"}, "FORMAT",
-        "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
+        "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
         [](common_params & params, const std::string & value) {
             params.embd_out = value;
         }
@@ -3435,7 +3435,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
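The first hunk above only extends a help string; as an informal illustration of the new "raw" embedding output it describes (one embedding per line, values separated by whitespace), here is a minimal standalone sketch, not code from llama.cpp:

```cpp
// Hypothetical illustration of the "raw" embedding output format described above:
// one embedding per line, values separated by single spaces.
#include <cstdio>
#include <vector>

static void print_embeddings_raw(const std::vector<std::vector<float>> & embeddings) {
    for (const auto & emb : embeddings) {
        for (size_t i = 0; i < emb.size(); ++i) {
            std::printf(i == 0 ? "%.6f" : " %.6f", emb[i]);
        }
        std::printf("\n");
    }
}

int main() {
    print_embeddings_raw({{0.12f, -0.34f, 0.56f}, {0.78f, 0.90f, -0.11f}});
    return 0;
}
```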
package/src/llama.cpp/common/chat.cpp
CHANGED

@@ -6,8 +6,11 @@
 #include "log.h"
 #include "regex-partial.h"
 
+#include <algorithm>
 #include <cstdio>
+#include <cctype>
 #include <exception>
+#include <functional>
 #include <iostream>
 #include <optional>
 #include <stdexcept>
@@ -627,6 +630,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
         case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
         case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
+        case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS: return "LFM2 with JSON tools";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -794,6 +798,7 @@ static std::string apply(
     if (additional_context) {
         tmpl_inputs.extra_context.merge_patch(*additional_context);
     }
+    // TODO: add flag to control date/time, if only for testing purposes.
     tmpl_inputs.now = inputs.now;
 
     minja::chat_template_options tmpl_opts;
@@ -972,6 +977,126 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
     return data;
 }
 
+
+// Case-insensitive find
+static size_t ifind_string(const std::string & haystack, const std::string & needle, size_t pos = 0) {
+    auto it = std::search(
+        haystack.begin() + pos, haystack.end(),
+        needle.begin(), needle.end(),
+        [](char a, char b) { return std::tolower(a) == std::tolower(b); }
+    );
+    return (it == haystack.end()) ? std::string::npos : std::distance(haystack.begin(), it);
+}
+
+static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+    const auto is_json_schema_provided = !inputs.json_schema.is_null();
+    const auto is_grammar_provided = !inputs.grammar.empty();
+    const auto are_tools_provided = inputs.tools.is_array() && !inputs.tools.empty();
+
+    // the logic requires potentially modifying the messages
+    auto tweaked_messages = inputs.messages;
+
+    auto replace_json_schema_marker = [](json & messages) -> bool {
+        static std::string marker1 = "force json schema.\n";
+        static std::string marker2 = "force json schema.";
+
+        if (messages.empty() || messages.at(0).at("role") != "system") {
+            return false;
+        }
+
+        std::string content = messages.at(0).at("content");
+
+        for (const auto & marker : {marker1, marker2}) {
+            const auto pos = ifind_string(content, marker);
+            if (pos != std::string::npos) {
+                content.replace(pos, marker.length(), "");
+                // inject modified content back into the messages
+                messages.at(0).at("content") = content;
+                return true;
+            }
+        }
+
+        return false;
+    };
+
+    // Lfm2 model does not natively work with json, but can generally understand the tools structure
+    //
+    // Example of the pytorch dialog structure:
+    // <|startoftext|><|im_start|>system
+    // List of tools: <|tool_list_start|>[{"name": "get_candidate_status", "description": "Retrieves the current status of a candidate in the recruitment process", "parameters": {"type": "object", "properties": {"candidate_id": {"type": "string", "description": "Unique identifier for the candidate"}}, "required": ["candidate_id"]}}]<|tool_list_end|><|im_end|>
+    // <|im_start|>user
+    // What is the current status of candidate ID 12345?<|im_end|>
+    // <|im_start|>assistant
+    // <|tool_call_start|>[get_candidate_status(candidate_id="12345")]<|tool_call_end|>Checking the current status of candidate ID 12345.<|im_end|>
+    // <|im_start|>tool
+    // <|tool_response_start|>{"candidate_id": "12345", "status": "Interview Scheduled", "position": "Clinical Research Associate", "date": "2023-11-20"}<|tool_response_end|><|im_end|>
+    // <|im_start|>assistant
+    // The candidate with ID 12345 is currently in the "Interview Scheduled" stage for the position of Clinical Research Associate, with an interview date set for 2023-11-20.<|im_end|>
+    //
+    // For the llama server compatibility with json tools semantic,
+    // the client can add "Follow json schema." line into the system message prompt to force the json output.
+    //
+    if (are_tools_provided && (is_json_schema_provided || is_grammar_provided)) {
+        // server/utils.hpp prohibits that branch for the custom grammar anyways
+        throw std::runtime_error("Tools call must not use \"json_schema\" or \"grammar\", use non-tool invocation if you want to use custom grammar");
+    } else if (are_tools_provided && replace_json_schema_marker(tweaked_messages)) {
+        LOG_INF("%s: Using tools to build a grammar\n", __func__);
+
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    {"type", "object"},
+                    {"properties", {
+                        {"name", {
+                            {"type", "string"},
+                            {"const", function.at("name")},
+                        }},
+                        {"arguments", function.at("parameters")},
+                    }},
+                    {"required", json::array({"name", "arguments", "id"})},
+                });
+            });
+            auto schema = json {
+                {"type", "array"},
+                {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
+                {"minItems", 1},
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+
+            builder.add_rule("root", "\"<|tool_call_start|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tool_call_end|>\"");
+        });
+        // model has no concept of tool selection mode choice,
+        // if the system prompt rendered correctly it will produce a tool call
+        // the grammar goes inside the tool call body
+        data.grammar_lazy = true;
+        data.grammar_triggers = {{COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, "\\s*<\\|tool_call_start\\|>\\s*\\["}};
+        data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
+        data.format = COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS;
+    } else if (are_tools_provided && (!is_json_schema_provided && !is_grammar_provided)) {
+        LOG_INF("%s: Using tools without json schema or grammar\n", __func__);
+        // output those tokens
+        data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
+    } else if (is_json_schema_provided) {
+        LOG_INF("%s: Using provided json schema to build a grammar\n", __func__);
+        data.grammar = json_schema_to_grammar(inputs.json_schema);
+    } else if (is_grammar_provided) {
+        LOG_INF("%s: Using provided grammar\n", __func__);
+        data.grammar = inputs.grammar;
+    } else {
+        LOG_INF("%s: Using content relying on the template\n", __func__);
+    }
+
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
+    LOG_DBG("%s: Prompt: %s\n", __func__, data.prompt.c_str());
+
+    return data;
+}
+
 static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
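The grammar assembled in common_chat_params_init_lfm2 above constrains output to a JSON array of {name, arguments} objects wrapped in <|tool_call_start|>/<|tool_call_end|>. A rough standalone sketch of the array schema handed to the grammar builder for a single tool; the get_candidate_status tool is borrowed from the comments in the hunk, and this is an illustration built directly with nlohmann::json, not a call into llama.cpp:

```cpp
// Sketch of the per-tool schema shape assembled in common_chat_params_init_lfm2 above.
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    // "parameters" as a client would declare them for the hypothetical tool
    json parameters = {
        {"type", "object"},
        {"properties", {{"candidate_id", {{"type", "string"}}}}},
        {"required", {"candidate_id"}},
    };

    // one entry per tool: name pinned with "const", arguments reuse the tool's parameters
    json tool_schema = {
        {"type", "object"},
        {"properties", {
            {"name", {{"type", "string"}, {"const", "get_candidate_status"}}},
            {"arguments", parameters},
        }},
        {"required", {"name", "arguments", "id"}},
    };

    // with a single tool, the array schema uses the tool schema directly as "items"
    json array_schema = {
        {"type", "array"},
        {"items", tool_schema},
        {"minItems", 1},
        {"maxItems", 1},  // when parallel_tool_calls is disabled
    };

    std::cout << array_schema.dump(2) << std::endl;
    return 0;
}
```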
@@ -2485,6 +2610,71 @@ static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
     builder.add_content(builder.consume_rest());
 }
 
+
+static void common_chat_parse_lfm2(common_chat_msg_parser & builder) {
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // LFM2 format: <|tool_call_start|>[{"name": "get_current_time", "arguments": {"location": "Paris"}}]<|tool_call_end|>
+    static const common_regex tool_call_start_regex(regex_escape("<|tool_call_start|>"));
+    static const common_regex tool_call_end_regex(regex_escape("<|tool_call_end|>"));
+
+    // Loop through all tool calls
+    while (auto res = builder.try_find_regex(tool_call_start_regex, std::string::npos, /* add_prelude_to_content= */ true)) {
+        builder.move_to(res->groups[0].end);
+
+        // Parse JSON array format: [{"name": "...", "arguments": {...}}]
+        auto tool_calls_data = builder.consume_json();
+
+        // Consume end marker
+        builder.consume_spaces();
+        if (!builder.try_consume_regex(tool_call_end_regex)) {
+            throw common_chat_msg_partial_exception("Expected <|tool_call_end|>");
+        }
+
+        // Process each tool call in the array
+        if (tool_calls_data.json.is_array()) {
+            for (const auto & tool_call : tool_calls_data.json) {
+                if (!tool_call.is_object()) {
+                    throw common_chat_msg_partial_exception("Tool call must be an object");
+                }
+
+                if (!tool_call.contains("name")) {
+                    throw common_chat_msg_partial_exception("Tool call missing 'name' field");
+                }
+
+                std::string function_name = tool_call.at("name");
+                std::string arguments = "{}";
+
+                if (tool_call.contains("arguments")) {
+                    if (tool_call.at("arguments").is_object()) {
+                        arguments = tool_call.at("arguments").dump();
+                    } else if (tool_call.at("arguments").is_string()) {
+                        arguments = tool_call.at("arguments");
+                    }
+                }
+
+                if (!builder.add_tool_call(function_name, "", arguments)) {
+                    throw common_chat_msg_partial_exception("Incomplete tool call");
+                }
+            }
+        } else {
+            throw common_chat_msg_partial_exception("Expected JSON array for tool calls");
+        }
+
+        // Consume any trailing whitespace after this tool call
+        builder.consume_spaces();
+    }
+
+    // Consume any remaining content after all tool calls
+    auto remaining = builder.consume_rest();
+    if (!string_strip(remaining).empty()) {
+        builder.add_content(remaining);
+    }
+}
+
 static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
     // Parse thinking tags first - this handles the main reasoning content
     builder.try_parse_reasoning("<seed:think>", "</seed:think>");
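For reference, a self-contained sketch of the tool-call wire format that common_chat_parse_lfm2 above consumes, parsed here with plain nlohmann::json instead of common_chat_msg_parser; the sample string and parsing logic are illustrative only:

```cpp
// Minimal illustration of the LFM2 tool-call wire format:
// <|tool_call_start|>[{"name": ..., "arguments": {...}}]<|tool_call_end|>
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    const std::string start_tag = "<|tool_call_start|>";
    const std::string end_tag   = "<|tool_call_end|>";
    const std::string output =
        "<|tool_call_start|>[{\"name\": \"get_current_time\", "
        "\"arguments\": {\"location\": \"Paris\"}}]<|tool_call_end|>";

    const auto start = output.find(start_tag);
    const auto end   = output.find(end_tag);
    if (start == std::string::npos || end == std::string::npos) {
        std::cerr << "no tool call found" << std::endl;
        return 1;
    }

    // Slice out the JSON array between the two markers and parse it.
    const auto payload = output.substr(start + start_tag.size(),
                                       end - (start + start_tag.size()));
    const json calls = json::parse(payload);

    for (const auto & call : calls) {
        std::cout << "tool: " << call.at("name").get<std::string>()
                  << ", arguments: " << call.at("arguments").dump() << std::endl;
    }
    return 0;
}
```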
@@ -2734,6 +2924,12 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_apertus(tmpl, params);
     }
 
+    // LFM2 (w/ tools)
+    if (src.find("List of tools: <|tool_list_start|>[") != std::string::npos &&
+        src.find("]<|tool_list_end|>") != std::string::npos) {
+        return common_chat_params_init_lfm2(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2912,6 +3108,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_APERTUS:
             common_chat_parse_apertus(builder);
             break;
+        case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS:
+            common_chat_parse_lfm2(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
package/src/llama.cpp/common/json-schema-to-grammar.cpp
CHANGED

@@ -601,7 +601,10 @@ private:
     }
 
     std::string _resolve_ref(const std::string & ref) {
-
+        auto it = ref.find('#');
+        std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
+        static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
+        std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
         if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
             _refs_being_resolved.insert(ref);
             json resolved = _refs[ref];
@@ -774,11 +777,24 @@ public:
         std::vector<std::string> tokens = string_split(pointer, "/");
         for (size_t i = 1; i < tokens.size(); ++i) {
             std::string sel = tokens[i];
-            if (target.
+            if (target.is_object() && target.contains(sel)) {
+                target = target[sel];
+            } else if (target.is_array()) {
+                size_t sel_index;
+                try {
+                    sel_index = std::stoul(sel);
+                } catch (const std::invalid_argument & e) {
+                    sel_index = target.size();
+                }
+                if (sel_index >= target.size()) {
+                    _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
+                    return;
+                }
+                target = target[sel_index];
+            } else {
                 _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
                 return;
             }
-            target = target[sel];
         }
         _refs[ref] = target;
     }
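The two hunks above make $ref resolution tolerate a leading "#" fragment and allow numeric path segments to index into arrays. A standalone sketch of the same pointer walk; resolve_pointer is a hypothetical helper written for illustration, not the class method itself:

```cpp
// Sketch of resolving a "#/..."-style JSON pointer the way the updated code does:
// object keys are looked up by name, array elements by numeric segment.
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

static json resolve_pointer(const json & root, const std::string & ref) {
    // keep only the fragment after '#', e.g. "#/definitions/steps/0" -> "/definitions/steps/0"
    const auto hash = ref.find('#');
    const std::string fragment = hash != std::string::npos ? ref.substr(hash + 1) : ref;

    json target = root;
    std::stringstream ss(fragment);
    std::string sel;
    std::getline(ss, sel, '/');                  // skip the leading empty segment
    while (std::getline(ss, sel, '/')) {
        if (target.is_object() && target.contains(sel)) {
            target = target[sel];
        } else if (target.is_array()) {
            const size_t idx = std::stoul(sel);  // numeric segments index arrays
            target = target.at(idx);
        } else {
            throw std::runtime_error("cannot resolve segment: " + sel);
        }
    }
    return target;
}

int main() {
    const json schema = {
        {"definitions", {{"steps", json::array({{{"type", "string"}}, {{"type", "integer"}}})}}}
    };
    // prints {"type":"integer"}
    std::cout << resolve_pointer(schema, "#/definitions/steps/1").dump() << std::endl;
    return 0;
}
```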
package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED

@@ -251,6 +251,8 @@ option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adr
 set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
                                 "gmml: OpenCL API version to target")
 
+option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
+
 # toolchain for vulkan-shaders-gen
 set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
 
package/src/llama.cpp/ggml/include/ggml-hexagon.h
ADDED

@@ -0,0 +1,19 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
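The new header only declares the backend entry points; whether they are compiled in depends on the GGML_HEXAGON option added to ggml/CMakeLists.txt above, and this package may not ship the backend at all. A hedged sketch of probing for it from application code, assuming a build with the backend enabled (ggml_backend_name and ggml_backend_free are existing ggml-backend.h calls):

```cpp
// Sketch: probe for the Hexagon backend declared in ggml-hexagon.h.
// Assumes the library was built with GGML_HEXAGON=ON; otherwise init may
// be unavailable or return nullptr.
#include <cstdio>
#include "ggml-backend.h"
#include "ggml-hexagon.h"

int main() {
    ggml_backend_t backend = ggml_backend_hexagon_init();
    if (backend == nullptr) {
        std::printf("Hexagon backend not available\n");
        return 1;
    }

    std::printf("initialized backend: %s (is_hexagon=%d)\n",
                ggml_backend_name(backend), ggml_backend_is_hexagon(backend) ? 1 : 0);

    ggml_backend_free(backend);
    return 0;
}
```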
package/src/llama.cpp/ggml/src/CMakeLists.txt
CHANGED

@@ -307,6 +307,10 @@ function(ggml_add_cpu_backend_variant tag_name)
         foreach (feat ${ARGN})
             set(GGML_INTERNAL_${feat} ON)
         endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
     endif()
 
     ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -371,6 +375,14 @@ if (GGML_CPU_ALL_VARIANTS)
         else()
             message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
         endif()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            ggml_add_cpu_backend_variant(s390x_z15 Z15 VXE)
+            # ggml_add_cpu_backend_variant(s390x_z16 Z16 VXE)
+            # ggml_add_cpu_backend_variant(s390x_z17 Z17 VXE)
+        else()
+            message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
     else()
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
     endif()
@@ -390,6 +402,7 @@ ggml_add_backend(Vulkan)
 ggml_add_backend(WebGPU)
 ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
+ggml_add_backend(Hexagon)
 
 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
CHANGED

@@ -466,29 +466,45 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
         elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
             message(STATUS "s390x detected")
-            list(APPEND GGML_CPU_SOURCES
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            list(APPEND GGML_CPU_SOURCES
+                ggml-cpu/arch/s390/quants.c)
+
+            # for native compilation
+            if (GGML_NATIVE)
+                # check machine level to determine target
+                file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
+                string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
+
+                # TODO: Separation to determine activation of VX/VXE/VXE2
+                if (${S390X_M} MATCHES "8561|8562")
+                    message(STATUS "z15 target")
+                    list(APPEND ARCH_FLAGS -march=z15)
+                elseif (${S390X_M} MATCHES "3931")
+                    message(STATUS "z16 target")
+                    list(APPEND ARCH_FLAGS -march=z16)
+                elseif (${S390X_M} MATCHES "9175|9176")
+                    # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
+                    # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
+                    message(STATUS "z17 target")
+                    list(APPEND ARCH_FLAGS -march=arch15)
+                else()
+                    message(STATUS "Unknown target")
+                    message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
+                    list(APPEND ARCH_FLAGS -march=native -mtune=native)
+                endif()
+            # for cross-compilation
+            elseif(GGML_CPU_ALL_VARIANTS)
+                # range through IBM z15 to z17
+                # NOTE: update when a new hardware level is released
+                foreach (ZHW RANGE 15 17)
+                    if(DEFINED GGML_INTERNAL_Z${ZHW})
+                        message(STATUS "z${ZHW} cross-compile target")
+                        list(APPEND ARCH_FLAGS -march=z${ZHW})
+                    endif()
+                endforeach()
             endif()
 
-            if (GGML_VXE)
+            if (GGML_VXE OR GGML_INTERNAL_VXE)
                 message(STATUS "VX/VXE/VXE2 enabled")
                 list(APPEND ARCH_FLAGS -mvx -mzvector)
                 list(APPEND ARCH_DEFINITIONS GGML_VXE)
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
CHANGED

@@ -7519,8 +7519,8 @@ static void ggml_compute_forward_upscale_f32(
     float pixel_offset = 0.5f;
     if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
         pixel_offset = 0.0f;
-        sf0 = (float)(ne0 - 1) / (ne00 - 1);
-        sf1 = (float)(ne1 - 1) / (ne01 - 1);
+        sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
+        sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
     }
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
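The change above guards the align-corners rescale against size-1 dimensions, where (ne00 - 1) is zero and the old expression divided by zero. A small worked illustration of the same arithmetic (hypothetical helper, not ggml code):

```cpp
// Worked illustration of the align-corners scale factor computed above.
// With align_corners, sf = (out - 1) / (in - 1), but only when both sizes are > 1;
// otherwise the default ratio out / in is kept (the old code divided by zero here).
#include <cstdint>
#include <cstdio>

static float upscale_factor(int64_t in, int64_t out, bool align_corners) {
    float sf = (float) out / in;               // default (half-pixel) factor
    if (align_corners && out > 1 && in > 1) {
        sf = (float)(out - 1) / (in - 1);      // align-corners factor
    }
    return sf;
}

int main() {
    std::printf("in=4 out=8, align_corners: sf=%.3f\n", upscale_factor(4, 8, true));   // 7/3 = 2.333
    std::printf("in=4 out=8, half-pixel:    sf=%.3f\n", upscale_factor(4, 8, false));  // 2.000
    std::printf("in=1 out=8, align_corners: sf=%.3f\n", upscale_factor(1, 8, true));   // keeps 8.000
    return 0;
}
```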