@fugood/llama.node 1.3.0-rc.6 → 1.3.0
This diff covers the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- package/CMakeLists.txt +12 -2
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +7 -8
- package/src/llama.cpp/common/arg.cpp +2 -2
- package/src/llama.cpp/common/chat.cpp +199 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +19 -3
- package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/src/llama.cpp/src/llama-context.cpp +8 -3
- package/src/llama.cpp/src/llama-graph.cpp +9 -4
- package/src/llama.cpp/src/llama-kv-cache.cpp +23 -20
- package/src/llama.cpp/src/llama-kv-cache.h +2 -4
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +18 -14
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-model.cpp +39 -32
- package/src/llama.cpp/src/llama-model.h +1 -2
package/CMakeLists.txt
CHANGED

@@ -64,13 +64,23 @@ endif()
 # Improve speed
 if(CMAKE_BUILD_TYPE STREQUAL "Release")
 if (MSVC)
-
+# Enable parallel compilation for all MSVC builds
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MP")
+
+if (NOT GGML_VULKAN AND NOT GGML_CUDA)
+# Full optimization with LTCG for default builds
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /Ob2 /Oi /Ot /Oy /GL")
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /Ob2 /Oi /Ot /Oy /GL")
 set(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} /LTCG")
-
+elseif(GGML_VULKAN)
+# Reduced optimization for Vulkan builds
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O1 /Ob1 /bigobj")
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O1 /Ob1 /bigobj")
+else()
+# Faster linking for CUDA builds (no LTCG)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /Ob2 /Oi")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /Ob2 /Oi")
 endif()
 else()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -funroll-loops -flto=auto")

package/package.json
CHANGED

@@ -1,7 +1,7 @@
 {
 "name": "@fugood/llama.node",
 "access": "public",
-"version": "1.3.0
+"version": "1.3.0",
 "description": "An another Node binding of llama.cpp",
 "main": "lib/index.js",
 "scripts": {
@@ -72,19 +72,19 @@
 "CMakeLists.txt"
 ],
 "optionalDependencies": {
-"@fugood/node-llama-linux-x64": "1.3.0
-"@fugood/node-llama-linux-x64-vulkan": "1.3.0
-"@fugood/node-llama-linux-x64-cuda": "1.3.0
-"@fugood/node-llama-linux-arm64": "1.3.0
-"@fugood/node-llama-linux-arm64-vulkan": "1.3.0
-"@fugood/node-llama-linux-arm64-cuda": "1.3.0
-"@fugood/node-llama-win32-x64": "1.3.0
-"@fugood/node-llama-win32-x64-vulkan": "1.3.0
-"@fugood/node-llama-win32-x64-cuda": "1.3.0
-"@fugood/node-llama-win32-arm64": "1.3.0
-"@fugood/node-llama-win32-arm64-vulkan": "1.3.0
-"@fugood/node-llama-darwin-x64": "1.3.0
-"@fugood/node-llama-darwin-arm64": "1.3.0
+"@fugood/node-llama-linux-x64": "1.3.0",
+"@fugood/node-llama-linux-x64-vulkan": "1.3.0",
+"@fugood/node-llama-linux-x64-cuda": "1.3.0",
+"@fugood/node-llama-linux-arm64": "1.3.0",
+"@fugood/node-llama-linux-arm64-vulkan": "1.3.0",
+"@fugood/node-llama-linux-arm64-cuda": "1.3.0",
+"@fugood/node-llama-win32-x64": "1.3.0",
+"@fugood/node-llama-win32-x64-vulkan": "1.3.0",
+"@fugood/node-llama-win32-x64-cuda": "1.3.0",
+"@fugood/node-llama-win32-arm64": "1.3.0",
+"@fugood/node-llama-win32-arm64-vulkan": "1.3.0",
+"@fugood/node-llama-darwin-x64": "1.3.0",
+"@fugood/node-llama-darwin-arm64": "1.3.0"
 },
 "devDependencies": {
 "@babel/preset-env": "^7.24.4",

package/scripts/llama.cpp.patch
CHANGED

@@ -21,7 +21,7 @@ index fe290bf8f..d377e29b9 100644
 
 #
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index 63583fb22..f8be20148 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -6,9 +6,6 @@
@@ -31,10 +31,10 @@ index 8587140e1..7931a31a1 100644
 -#include <minja/chat-template.hpp>
 -#include <minja/minja.hpp>
 -
+#include <algorithm>
 #include <cstdio>
-#include <
-
-@@ -123,16 +120,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
+#include <cctype>
+@@ -126,16 +123,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
 return diffs;
 }
 
@@ -51,18 +51,17 @@ index 8587140e1..7931a31a1 100644
 struct templates_params {
 json messages;
 json tools;
-@@ -
-if (additional_context) {
+@@ -812,7 +799,7 @@ static std::string apply(
 tmpl_inputs.extra_context.merge_patch(*additional_context);
 }
-
+// TODO: add flag to control date/time, if only for testing purposes.
 - // tmpl_inputs.now = std::chrono::system_clock::now();
 + tmpl_inputs.now = inputs.now;
 
 minja::chat_template_options tmpl_opts;
 // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
 diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
-index
+index 50efb0d4e..f471a84c7 100644
 --- a/src/llama.cpp/common/chat.h
 +++ b/src/llama.cpp/common/chat.h
 @@ -9,7 +9,18 @@

package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -3248,7 +3248,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
 add_opt(common_arg(
 {"--embd-output-format"}, "FORMAT",
-"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
+"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
 [](common_params & params, const std::string & value) {
 params.embd_out = value;
 }
@@ -3435,7 +3435,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 [](common_params & params) {
 params.use_jinja = true;
 }
-).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
 add_opt(common_arg(
 {"--reasoning-format"}, "FORMAT",
 "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"

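
The new "raw" value for --embd-output-format above is described as plain whitespace-delimited output, one embedding per line. A minimal sketch of that layout, produced by hand; the embedding values are made up and this is not code from the package:

    #include <cstdio>
    #include <vector>

    int main() {
        // made-up embeddings; "raw" means one embedding per line, values separated by spaces
        const std::vector<std::vector<float>> embeddings = {
            {0.12f, -0.34f, 0.56f},
            {0.78f,  0.90f, -0.11f},
        };
        for (const auto & emb : embeddings) {
            for (size_t i = 0; i < emb.size(); ++i) {
                std::printf("%s%f", i == 0 ? "" : " ", emb[i]);
            }
            std::printf("\n");
        }
        return 0;
    }
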
package/src/llama.cpp/common/chat.cpp
CHANGED

@@ -6,8 +6,11 @@
 #include "log.h"
 #include "regex-partial.h"
 
+#include <algorithm>
 #include <cstdio>
+#include <cctype>
 #include <exception>
+#include <functional>
 #include <iostream>
 #include <optional>
 #include <stdexcept>
@@ -627,6 +630,7 @@ const char * common_chat_format_name(common_chat_format format) {
 case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
 case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
 case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
+case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS: return "LFM2 with JSON tools";
 default:
 throw std::runtime_error("Unknown chat format");
 }
@@ -794,6 +798,7 @@ static std::string apply(
 if (additional_context) {
 tmpl_inputs.extra_context.merge_patch(*additional_context);
 }
+// TODO: add flag to control date/time, if only for testing purposes.
 tmpl_inputs.now = inputs.now;
 
 minja::chat_template_options tmpl_opts;
@@ -972,6 +977,126 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
 return data;
 }
 
+
+// Case-insensitive find
+static size_t ifind_string(const std::string & haystack, const std::string & needle, size_t pos = 0) {
+auto it = std::search(
+haystack.begin() + pos, haystack.end(),
+needle.begin(), needle.end(),
+[](char a, char b) { return std::tolower(a) == std::tolower(b); }
+);
+return (it == haystack.end()) ? std::string::npos : std::distance(haystack.begin(), it);
+}
+
+static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+common_chat_params data;
+const auto is_json_schema_provided = !inputs.json_schema.is_null();
+const auto is_grammar_provided = !inputs.grammar.empty();
+const auto are_tools_provided = inputs.tools.is_array() && !inputs.tools.empty();
+
+// the logic requires potentially modifying the messages
+auto tweaked_messages = inputs.messages;
+
+auto replace_json_schema_marker = [](json & messages) -> bool {
+static std::string marker1 = "force json schema.\n";
+static std::string marker2 = "force json schema.";
+
+if (messages.empty() || messages.at(0).at("role") != "system") {
+return false;
+}
+
+std::string content = messages.at(0).at("content");
+
+for (const auto & marker : {marker1, marker2}) {
+const auto pos = ifind_string(content, marker);
+if (pos != std::string::npos) {
+content.replace(pos, marker.length(), "");
+// inject modified content back into the messages
+messages.at(0).at("content") = content;
+return true;
+}
+}
+
+return false;
+};
+
+// Lfm2 model does not natively work with json, but can generally understand the tools structure
+//
+// Example of the pytorch dialog structure:
+// <|startoftext|><|im_start|>system
+// List of tools: <|tool_list_start|>[{"name": "get_candidate_status", "description": "Retrieves the current status of a candidate in the recruitment process", "parameters": {"type": "object", "properties": {"candidate_id": {"type": "string", "description": "Unique identifier for the candidate"}}, "required": ["candidate_id"]}}]<|tool_list_end|><|im_end|>
+// <|im_start|>user
+// What is the current status of candidate ID 12345?<|im_end|>
+// <|im_start|>assistant
+// <|tool_call_start|>[get_candidate_status(candidate_id="12345")]<|tool_call_end|>Checking the current status of candidate ID 12345.<|im_end|>
+// <|im_start|>tool
+// <|tool_response_start|>{"candidate_id": "12345", "status": "Interview Scheduled", "position": "Clinical Research Associate", "date": "2023-11-20"}<|tool_response_end|><|im_end|>
+// <|im_start|>assistant
+// The candidate with ID 12345 is currently in the "Interview Scheduled" stage for the position of Clinical Research Associate, with an interview date set for 2023-11-20.<|im_end|>
+//
+// For the llama server compatibility with json tools semantic,
+// the client can add "Follow json schema." line into the system message prompt to force the json output.
+//
+if (are_tools_provided && (is_json_schema_provided || is_grammar_provided)) {
+// server/utils.hpp prohibits that branch for the custom grammar anyways
+throw std::runtime_error("Tools call must not use \"json_schema\" or \"grammar\", use non-tool invocation if you want to use custom grammar");
+} else if (are_tools_provided && replace_json_schema_marker(tweaked_messages)) {
+LOG_INF("%s: Using tools to build a grammar\n", __func__);
+
+data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+auto schemas = json::array();
+foreach_function(inputs.tools, [&](const json & tool) {
+const auto & function = tool.at("function");
+schemas.push_back({
+{"type", "object"},
+{"properties", {
+{"name", {
+{"type", "string"},
+{"const", function.at("name")},
+}},
+{"arguments", function.at("parameters")},
+}},
+{"required", json::array({"name", "arguments", "id"})},
+});
+});
+auto schema = json {
+{"type", "array"},
+{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
+{"minItems", 1},
+};
+if (!inputs.parallel_tool_calls) {
+schema["maxItems"] = 1;
+}
+
+builder.add_rule("root", "\"<|tool_call_start|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tool_call_end|>\"");
+});
+// model has no concept of tool selection mode choice,
+// if the system prompt rendered correctly it will produce a tool call
+// the grammar goes inside the tool call body
+data.grammar_lazy = true;
+data.grammar_triggers = {{COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, "\\s*<\\|tool_call_start\\|>\\s*\\["}};
+data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
+data.format = COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS;
+} else if (are_tools_provided && (!is_json_schema_provided && !is_grammar_provided)) {
+LOG_INF("%s: Using tools without json schema or grammar\n", __func__);
+// output those tokens
+data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
+} else if (is_json_schema_provided) {
+LOG_INF("%s: Using provided json schema to build a grammar\n", __func__);
+data.grammar = json_schema_to_grammar(inputs.json_schema);
+} else if (is_grammar_provided) {
+LOG_INF("%s: Using provided grammar\n", __func__);
+data.grammar = inputs.grammar;
+} else {
+LOG_INF("%s: Using content relying on the template\n", __func__);
+}
+
+data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
+LOG_DBG("%s: Prompt: %s\n", __func__, data.prompt.c_str());
+
+return data;
+}
+
 static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
 common_chat_params data;
 data.prompt = apply(tmpl, inputs);
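
To make the LFM2 hunk above easier to follow, here is a sketch of the per-tool schema that common_chat_params_init_lfm2 assembles and of a completion the resulting grammar accepts. It uses plain nlohmann::json rather than the common_grammar_builder API, and the tool name and arguments are the hypothetical ones from the comment in the patch:

    #include <nlohmann/json.hpp>
    #include <iostream>
    #include <string>

    using json = nlohmann::json;

    int main() {
        // Shape of one entry of the "tool_calls" array schema built per tool: an object
        // whose "name" is pinned to the tool name and whose "arguments" reuse the tool's
        // own parameter schema.
        json tool_schema = {
            {"type", "object"},
            {"properties", {
                {"name", {{"type", "string"}, {"const", "get_candidate_status"}}},
                {"arguments", json::parse(R"({
                    "type": "object",
                    "properties": { "candidate_id": { "type": "string" } },
                    "required": ["candidate_id"]
                })")}
            }},
            {"required", {"name", "arguments", "id"}}
        };

        // The grammar wraps a JSON array of such objects in the LFM2 tool-call markers,
        // so a conforming completion looks like this:
        std::string completion =
            "<|tool_call_start|>"
            "[{\"name\": \"get_candidate_status\", \"arguments\": {\"candidate_id\": \"12345\"}}]"
            "<|tool_call_end|>";

        std::cout << tool_schema.dump(2) << "\n\n" << completion << "\n";
        return 0;
    }
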
@@ -2485,6 +2610,71 @@ static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
 builder.add_content(builder.consume_rest());
 }
 
+
+static void common_chat_parse_lfm2(common_chat_msg_parser & builder) {
+if (!builder.syntax().parse_tool_calls) {
+builder.add_content(builder.consume_rest());
+return;
+}
+
+// LFM2 format: <|tool_call_start|>[{"name": "get_current_time", "arguments": {"location": "Paris"}}]<|tool_call_end|>
+static const common_regex tool_call_start_regex(regex_escape("<|tool_call_start|>"));
+static const common_regex tool_call_end_regex(regex_escape("<|tool_call_end|>"));
+
+// Loop through all tool calls
+while (auto res = builder.try_find_regex(tool_call_start_regex, std::string::npos, /* add_prelude_to_content= */ true)) {
+builder.move_to(res->groups[0].end);
+
+// Parse JSON array format: [{"name": "...", "arguments": {...}}]
+auto tool_calls_data = builder.consume_json();
+
+// Consume end marker
+builder.consume_spaces();
+if (!builder.try_consume_regex(tool_call_end_regex)) {
+throw common_chat_msg_partial_exception("Expected <|tool_call_end|>");
+}
+
+// Process each tool call in the array
+if (tool_calls_data.json.is_array()) {
+for (const auto & tool_call : tool_calls_data.json) {
+if (!tool_call.is_object()) {
+throw common_chat_msg_partial_exception("Tool call must be an object");
+}
+
+if (!tool_call.contains("name")) {
+throw common_chat_msg_partial_exception("Tool call missing 'name' field");
+}
+
+std::string function_name = tool_call.at("name");
+std::string arguments = "{}";
+
+if (tool_call.contains("arguments")) {
+if (tool_call.at("arguments").is_object()) {
+arguments = tool_call.at("arguments").dump();
+} else if (tool_call.at("arguments").is_string()) {
+arguments = tool_call.at("arguments");
+}
+}
+
+if (!builder.add_tool_call(function_name, "", arguments)) {
+throw common_chat_msg_partial_exception("Incomplete tool call");
+}
+}
+} else {
+throw common_chat_msg_partial_exception("Expected JSON array for tool calls");
+}
+
+// Consume any trailing whitespace after this tool call
+builder.consume_spaces();
+}
+
+// Consume any remaining content after all tool calls
+auto remaining = builder.consume_rest();
+if (!string_strip(remaining).empty()) {
+builder.add_content(remaining);
+}
+}
+
 static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
 // Parse thinking tags first - this handles the main reasoning content
 builder.try_parse_reasoning("<seed:think>", "</seed:think>");
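
For illustration, a standalone sketch of the parsing flow implemented by common_chat_parse_lfm2 above: find the <|tool_call_start|>/<|tool_call_end|> markers, parse the JSON array between them, and keep surrounding text as content. It deliberately uses std::string::find and nlohmann::json instead of the common_chat_msg_parser API (so no partial/streaming handling), and the model output string is made up:

    #include <nlohmann/json.hpp>
    #include <iostream>
    #include <string>

    using json = nlohmann::json;

    int main() {
        // hypothetical model output: free text followed by one LFM2 tool-call block
        const std::string out =
            "Checking that for you. "
            "<|tool_call_start|>[{\"name\": \"get_current_time\", \"arguments\": {\"location\": \"Paris\"}}]<|tool_call_end|>";

        const std::string start = "<|tool_call_start|>";
        const std::string end   = "<|tool_call_end|>";

        const auto s = out.find(start);
        if (s == std::string::npos) {
            std::cout << "content: " << out << "\n";
            return 0;
        }
        const auto e = out.find(end, s + start.size());
        if (e == std::string::npos) {
            std::cout << "content: " << out << "\n";   // no end marker: treat as plain content
            return 0;
        }

        std::cout << "content: " << out.substr(0, s) << "\n";

        const json calls = json::parse(out.substr(s + start.size(), e - s - start.size()));
        for (const auto & call : calls) {
            const std::string name = call.at("name").get<std::string>();
            // mirror the patch: object arguments are serialized, string arguments passed through
            const std::string args = call.at("arguments").is_string()
                ? call.at("arguments").get<std::string>()
                : call.at("arguments").dump();
            std::cout << "tool call: " << name << " " << args << "\n";
        }
        return 0;
    }
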
@@ -2734,6 +2924,12 @@ static common_chat_params common_chat_templates_apply_jinja(
 return common_chat_params_init_apertus(tmpl, params);
 }
 
+// LFM2 (w/ tools)
+if (src.find("List of tools: <|tool_list_start|>[") != std::string::npos &&
+src.find("]<|tool_list_end|>") != std::string::npos) {
+return common_chat_params_init_lfm2(tmpl, params);
+}
+
 // Use generic handler when mixing tools + JSON schema.
 // TODO: support that mix in handlers below.
 if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2912,6 +3108,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
 case COMMON_CHAT_FORMAT_APERTUS:
 common_chat_parse_apertus(builder);
 break;
+case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS:
+common_chat_parse_lfm2(builder);
+break;
 default:
 throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
 }

package/src/llama.cpp/common/json-schema-to-grammar.cpp
CHANGED

@@ -601,7 +601,10 @@ private:
 }
 
 std::string _resolve_ref(const std::string & ref) {
-
+auto it = ref.find('#');
+std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
+static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
+std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
 if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
 _refs_being_resolved.insert(ref);
 json resolved = _refs[ref];
@@ -774,11 +777,24 @@ public:
 std::vector<std::string> tokens = string_split(pointer, "/");
 for (size_t i = 1; i < tokens.size(); ++i) {
 std::string sel = tokens[i];
-if (target.
+if (target.is_object() && target.contains(sel)) {
+target = target[sel];
+} else if (target.is_array()) {
+size_t sel_index;
+try {
+sel_index = std::stoul(sel);
+} catch (const std::invalid_argument & e) {
+sel_index = target.size();
+}
+if (sel_index >= target.size()) {
+_errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
+return;
+}
+target = target[sel_index];
+} else {
 _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
 return;
 }
-target = target[sel];
 }
 _refs[ref] = target;
 }

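
The second hunk above teaches the JSON-pointer walk used for local $refs to step into arrays by parsing the selector with std::stoul. A small self-contained sketch of that behavior on a hypothetical schema (plain nlohmann::json, not the converter class itself):

    #include <nlohmann/json.hpp>
    #include <iostream>
    #include <string>
    #include <vector>

    using json = nlohmann::json;

    int main() {
        // hypothetical schema: "#/definitions/pair/items/1" points at the second tuple item
        const json schema = json::parse(R"({
            "definitions": { "pair": { "items": [ { "type": "string" }, { "type": "integer" } ] } }
        })");

        const std::string pointer = "/definitions/pair/items/1";   // fragment after '#'

        // split on '/', skipping the leading empty token
        std::vector<std::string> tokens;
        for (size_t i = 1; i <= pointer.size();) {
            size_t j = pointer.find('/', i);
            if (j == std::string::npos) j = pointer.size();
            tokens.push_back(pointer.substr(i, j - i));
            i = j + 1;
        }

        json target = schema;
        for (const auto & sel : tokens) {
            if (target.is_object() && target.contains(sel)) {
                target = target[sel];
            } else if (target.is_array()) {
                target = target.at(std::stoul(sel));   // array selector, the case the patch adds
            } else {
                std::cerr << "cannot resolve " << sel << "\n";
                return 1;
            }
        }
        std::cout << target.dump() << "\n";   // {"type":"integer"}
        return 0;
    }
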
package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED

@@ -251,6 +251,8 @@ option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adr
 set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
 "gmml: OpenCL API version to target")
 
+option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
+
 # toolchain for vulkan-shaders-gen
 set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
 

package/src/llama.cpp/ggml/include/ggml-hexagon.h
ADDED

@@ -0,0 +1,19 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void);
+
+#ifdef __cplusplus
+}
+#endif

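
A minimal usage sketch for the entry points declared in the new header. It assumes the library was built with GGML_HEXAGON=ON and that a Hexagon device is actually available; the null check on init is an assumption, since the header does not document the failure behavior:

    #include "ggml-hexagon.h"
    #include <cstdio>

    int main() {
        // assumed: init returns a null backend when the Hexagon backend is unavailable
        ggml_backend_t backend = ggml_backend_hexagon_init();
        if (backend == nullptr) {
            std::printf("Hexagon backend not available\n");
            return 0;
        }

        std::printf("is_hexagon: %d\n", ggml_backend_is_hexagon(backend) ? 1 : 0);

        // ggml_backend_free() comes from ggml-backend.h, which the new header includes
        ggml_backend_free(backend);
        return 0;
    }
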
package/src/llama.cpp/ggml/src/CMakeLists.txt
CHANGED

@@ -402,6 +402,7 @@ ggml_add_backend(Vulkan)
 ggml_add_backend(WebGPU)
 ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
+ggml_add_backend(Hexagon)
 
 foreach (target ggml-base ggml)
 target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)

package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
CHANGED

@@ -7519,8 +7519,8 @@ static void ggml_compute_forward_upscale_f32(
 float pixel_offset = 0.5f;
 if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
 pixel_offset = 0.0f;
-sf0 = (float)(ne0 - 1) / (
-sf1 = (float)(ne1 - 1) / (
+sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
+sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
 }
 
 for (int64_t i3 = 0; i3 < ne3; i3++) {

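
The guarded align-corners scale factors above avoid a division by zero when the source or destination extent is 1. A small worked example with hypothetical tensor sizes (the default sf0/sf1 initialization mirrors the usual destination-over-source ratio used by the upscale op):

    #include <cstdio>

    int main() {
        // hypothetical sizes: single-column source upscaled to 8x8
        const long ne00 = 1, ne01 = 4;   // source width/height
        const long ne0  = 8, ne1  = 8;   // destination width/height

        // default scale factors (destination extent over source extent)
        float sf0 = (float) ne0 / ne00;
        float sf1 = (float) ne1 / ne01;

        // GGML_SCALE_FLAG_ALIGN_CORNERS path after the patch: only recompute when both
        // extents are > 1, otherwise (ne00 - 1) would be zero and sf0 would be inf/NaN
        sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
        sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;

        std::printf("sf0 = %f (kept), sf1 = %f (recomputed as 7/3)\n", sf0, sf1);
        return 0;
    }
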
package/src/llama.cpp/src/llama-context.cpp
CHANGED

@@ -268,9 +268,7 @@ llama_context::llama_context(
 if (pipeline_parallel) {
 LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
 }
-}
 
-if (!hparams.vocab_only) {
 llama_memory_context_ptr mctx;
 if (memory) {
 LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
@@ -343,7 +341,14 @@ llama_context::llama_context(
 {
 auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
 if (!gf) {
-
+if (pipeline_parallel) {
+LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
+sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
+gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+}
+if (!gf) {
+throw std::runtime_error("failed to allocate compute pp buffers");
+}
 }
 
 n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());

package/src/llama.cpp/src/llama-graph.cpp
CHANGED

@@ -810,6 +810,9 @@ ggml_tensor * llm_graph_context::build_ffn(
 GGML_ABORT("fatal error");
 }
 
+//expand here so that we can fuse ffn gate
+ggml_build_forward_expand(gf, cur);
+
 if (gate && type_gate == LLM_FFN_PAR) {
 cur = ggml_mul(ctx0, cur, tmp);
 cb(cur, "ffn_gate_par", il);
@@ -1006,10 +1009,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
 cb(weights_sum, "ffn_moe_weights_sum", il);
 
-
-
-
-}
+// Avoid division by zero, clamp to smallest number representable by F16
+weights_sum = ggml_clamp(ctx0, weights_sum, 6.103515625e-5, INFINITY);
+cb(weights_sum, "ffn_moe_weights_sum_clamped", il);
 
 weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
 cb(weights, "ffn_moe_weights_norm", il);
@@ -1091,6 +1093,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 GGML_ABORT("fatal error");
 }
 
+//expand here so that we can fuse ffn gate
+ggml_build_forward_expand(gf, cur);
+
 experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
 cb(experts, "ffn_moe_down", il);
 

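
Regarding the clamp added in build_moe_ffn above: 6.103515625e-5 is 2^-14, the smallest positive normal f16 value, so the row sum of expert weights can never reach zero before the normalization divide. A scalar worked example of the same idea (std::max stands in for ggml_clamp, and the weight values are made up):

    #include <algorithm>
    #include <cstdio>

    int main() {
        const float kF16MinNormal = 6.103515625e-5f;   // 2^-14, smallest positive normal f16

        // hypothetical degenerate case: all selected expert weights underflowed to zero
        float weights[3]  = {0.0f, 0.0f, 0.0f};
        float weights_sum = weights[0] + weights[1] + weights[2];

        // stands in for ggml_clamp(ctx0, weights_sum, 6.103515625e-5, INFINITY)
        weights_sum = std::max(weights_sum, kF16MinNormal);

        for (float & w : weights) {
            w /= weights_sum;   // finite (0.0) instead of NaN
        }
        std::printf("clamped sum = %g, normalized weight = %g\n", weights_sum, weights[0]);
        return 0;
    }
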
package/src/llama.cpp/src/llama-kv-cache.cpp
CHANGED

@@ -8,6 +8,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cmath>
+#include <cstring>
 #include <limits>
 #include <map>
 #include <stdexcept>
@@ -37,8 +38,15 @@ llama_kv_cache::llama_kv_cache(
 
 const uint32_t n_layer_kv = hparams.n_layer_kv();
 
+// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+struct ggml_backend_buft_comparator {
+bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+}
+};
+std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
 // create a context for each buffer type
-std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
 auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
 auto it = ctx_map.find(buft);
 if (it == ctx_map.end()) {
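
The comparator introduced above (and reused in llama-memory-recurrent and llama-model) orders the buft -> context map by backend buffer-type name instead of by raw handle value, so iteration order no longer depends on pointer addresses. A small sketch of the idea with hypothetical stand-in types (not the real ggml_backend_buffer_type_t):

    #include <cstdio>
    #include <cstring>
    #include <map>

    // stand-in for ggml_backend_buffer_type_t: an opaque handle that only exposes a name
    struct fake_buft { const char * name; };
    using fake_buft_t = const fake_buft *;

    struct buft_name_comparator {
        bool operator()(const fake_buft_t & lhs, const fake_buft_t & rhs) const {
            return std::strcmp(lhs->name, rhs->name) < 0;
        }
    };

    int main() {
        const fake_buft cpu{"CPU"}, gpu{"CUDA0"}, host{"CUDA_Host"};

        // keyed by name, so iteration order is the same on every run,
        // regardless of the addresses the handles happen to have
        std::map<fake_buft_t, int, buft_name_comparator> ctx_map;
        ctx_map.emplace(&gpu,  1);
        ctx_map.emplace(&host, 2);
        ctx_map.emplace(&cpu,  3);

        for (const auto & [buft, id] : ctx_map) {   // prints CPU, CUDA0, CUDA_Host
            std::printf("%s -> %d\n", buft->name, id);
        }
        return 0;
    }
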
@@ -53,13 +61,12 @@ llama_kv_cache::llama_kv_cache(
 return nullptr;
 }
 
-ctx_map
-ctxs.emplace_back(ctx);
+ctx_map.emplace(buft, ctx);
 
 return ctx;
 }
 
-return it->second;
+return it->second.get();
 };
 
 GGML_ASSERT(n_stream == 1 || n_stream == n_seq_max);
@@ -167,11 +174,8 @@ llama_kv_cache::llama_kv_cache(
 }
 
 // allocate tensors and initialize the buffers to avoid NaNs in the padding
-for (auto
-
-auto * ctx = it.second;
-
-ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+for (auto & [buft, ctx] : ctx_map) {
+ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
 if (!buf) {
 throw std::runtime_error("failed to allocate buffer for kv cache");
 }
@@ -179,7 +183,7 @@ llama_kv_cache::llama_kv_cache(
 LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
 
 ggml_backend_buffer_clear(buf, 0);
-
+ctxs_bufs.emplace_back(std::move(ctx), buf);
 }
 
 {
@@ -203,7 +207,7 @@ void llama_kv_cache::clear(bool data) {
 }
 
 if (data) {
-for (auto & buf :
+for (auto & [_, buf] : ctxs_bufs) {
 ggml_backend_buffer_clear(buf.get(), 0);
 }
 }
@@ -472,8 +476,8 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
 
 std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
 std::map<ggml_backend_buffer_type_t, size_t> ret;
-for (const
-ret[ggml_backend_buffer_get_type(
+for (const auto & [_, buf] : ctxs_bufs) {
+ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
 }
 return ret;
 }
@@ -957,10 +961,14 @@ bool llama_kv_cache::get_has_shift() const {
 uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
 uint32_t result = 0;
 
+// pad the n_kv value so that the graph remains constant across batches and can be reused
+// note: this also helps some backends with performance (f.ex https://github.com/ggml-org/llama.cpp/pull/16812#issuecomment-3455112220)
+const uint32_t n_pad_cur = std::max(n_pad, 256u);
+
 for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
 const auto & cells = v_cells[sinfo.strm[s]];
 
-result = std::max(std::min(cells.size(), std::max(
+result = std::max(std::min(cells.size(), std::max(n_pad_cur, GGML_PAD(cells.used_max_p1(), n_pad_cur))), result);
 }
 
 return result;
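
A worked example of the padding now applied in get_n_kv above, with hypothetical numbers; pad_to reimplements the round-up that GGML_PAD performs for the power-of-two paddings used here:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // round x up to a multiple of n
    static uint32_t pad_to(uint32_t x, uint32_t n) {
        return ((x + n - 1) / n) * n;
    }

    int main() {
        const uint32_t cache_size  = 4096;   // cells.size()
        const uint32_t used_max_p1 = 301;    // highest used cell index + 1
        const uint32_t n_pad       = 32;     // padding configured for the cache

        const uint32_t n_pad_cur = std::max(n_pad, 256u);
        const uint32_t n_kv = std::min(cache_size,
                                       std::max(n_pad_cur, pad_to(used_max_p1, n_pad_cur)));

        std::printf("n_kv = %u (301 used cells rounded up to a multiple of %u)\n", n_kv, n_pad_cur);
        return 0;
    }
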
@@ -1298,7 +1306,7 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch
 size_t llama_kv_cache::total_size() const {
 size_t size = 0;
 
-for (const auto & buf :
+for (const auto & [_, buf] : ctxs_bufs) {
 size += ggml_backend_buffer_get_size(buf.get());
 }
 
@@ -2010,8 +2018,3 @@ void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ub
 void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
 kv->set_input_pos_bucket(dst, ubatch);
 }
-
-uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) {
-// the FA kernels require padding to avoid extra runtime boundary checks
-return cparams.flash_attn ? 256u : 32u;
-}

package/src/llama.cpp/src/llama-kv-cache.h
CHANGED

@@ -19,8 +19,6 @@ struct llama_context;
 
 class llama_kv_cache : public llama_memory_i {
 public:
-static uint32_t get_padding(const llama_cparams & cparams);
-
 struct stream_copy_info {
 bool empty() const {
 assert(ssrc.size() == sdst.size());
@@ -217,8 +215,8 @@ private:
 // this is the SWA type of the cache - not to be confused with the model SWA type
 const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
 
-
-std::vector<ggml_backend_buffer_ptr
+// ggml contexts for the KV cache along with the allocated backend buffers:
+std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
 
 // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
 // note: this is not part of the KV state and it's only used to speed-up the find_slot() method

package/src/llama.cpp/src/llama-memory-recurrent.cpp
CHANGED

@@ -7,6 +7,7 @@
 
 #include <algorithm>
 #include <cassert>
+#include <cstring>
 #include <limits>
 #include <map>
 #include <stdexcept>
@@ -32,8 +33,15 @@ llama_memory_recurrent::llama_memory_recurrent(
 cells.clear();
 cells.resize(mem_size);
 
+// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+struct ggml_backend_buft_comparator {
+bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+}
+};
+std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
 // create a context for each buffer type
-std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
 auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
 auto it = ctx_map.find(buft);
 if (it == ctx_map.end()) {
@@ -48,13 +56,12 @@ llama_memory_recurrent::llama_memory_recurrent(
 return nullptr;
 }
 
-ctx_map
-ctxs.emplace_back(ctx);
+ctx_map.emplace(buft, ctx);
 
 return ctx;
 }
 
-return it->second;
+return it->second.get();
 };
 
 r_l.resize(n_layer);
@@ -93,17 +100,14 @@ llama_memory_recurrent::llama_memory_recurrent(
 }
 
 // allocate tensors and initialize the buffers to avoid NaNs in the padding
-for (auto
-
-auto * ctx = it.second;
-
-ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+for (auto & [buft, ctx] : ctx_map) {
+ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
 if (!buf) {
 throw std::runtime_error("failed to allocate buffer for rs cache");
 }
 ggml_backend_buffer_clear(buf, 0);
 LLAMA_LOG_INFO("%s: %10s RS buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
-
+ctxs_bufs.emplace_back(std::move(ctx), buf);
 }
 
 {
@@ -129,7 +133,7 @@ void llama_memory_recurrent::clear(bool data) {
 used = 0;
 
 if (data) {
-for (auto & buf :
+for (auto & [_, buf] : ctxs_bufs) {
 ggml_backend_buffer_clear(buf.get(), 0);
 }
 }
@@ -364,8 +368,8 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
 
 std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
 std::map<ggml_backend_buffer_type_t, size_t> ret;
-for (const
-ret[ggml_backend_buffer_get_type(
+for (const auto & [_, buf] : ctxs_bufs) {
+ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
 }
 return ret;
 }
@@ -662,7 +666,7 @@ bool llama_memory_recurrent::get_can_shift() const {
 
 size_t llama_memory_recurrent::total_size() const {
 size_t size = 0;
-for (const auto & buf :
+for (const auto & [_, buf] : ctxs_bufs) {
 size += ggml_backend_buffer_get_size(buf.get());
 }
 

package/src/llama.cpp/src/llama-memory-recurrent.h
CHANGED

@@ -109,8 +109,8 @@ private:
 
 const uint32_t n_seq_max = 1;
 
-
-std::vector<ggml_backend_buffer_ptr
+// ggml contexts for the KV cache along with the allocated backend buffers:
+std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
 
 size_t total_size() const;
 

package/src/llama.cpp/src/llama-model.cpp
CHANGED

@@ -15,7 +15,6 @@
 
 #include <algorithm>
 #include <cassert>
-#include <cmath>
 #include <cfloat>
 #include <cstring>
 #include <cmath>
@@ -404,6 +403,19 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
 // add the device default buffer type
 buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
 
+// add the device extra buffer type (if any)
+ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
+
+if (ggml_backend_dev_get_extra_bufts_fn) {
+ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
+while (extra_bufts && *extra_bufts) {
+buft_list.emplace_back(dev, *extra_bufts);
+++extra_bufts;
+}
+}
+
 return buft_list;
 }
 
@@ -425,7 +437,7 @@ struct llama_model::impl {
 llama_mlocks mlock_mmaps;
 
 // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
-std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr
+std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
 
 buft_list_t cpu_buft_list;
 std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
@@ -2219,7 +2231,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
 struct ggml_backend_buft_comparator {
 bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
-return ggml_backend_buft_name(lhs)
+return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
 }
 };
 std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
@@ -6173,7 +6185,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
 bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
-
+std::vector<ggml_backend_buffer_ptr> bufs;
 if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
 for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
 // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6186,15 +6198,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 continue;
 }
 const size_t max_size = ggml_get_max_tensor_size(ctx);
-buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
 if (buf == nullptr) {
 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
 }
+bufs.emplace_back(buf);
 buf_map.emplace(idx, buf);
 }
 }
 else {
-buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
 if (buf == nullptr) {
 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
 }
@@ -6204,11 +6217,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 mlock_buf->init (ggml_backend_buffer_get_base(buf));
 mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
 }
+bufs.emplace_back(buf);
 for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
 buf_map.emplace(idx, buf);
 }
 }
-pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr),
+pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
 
 for (auto & buf : buf_map) {
 // indicate that this buffer contains weights
@@ -6234,8 +6248,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 }
 
 // print memory requirements per buffer type
-for (auto & [_,
-
+for (auto & [_, bufs] : pimpl->ctxs_bufs) {
+for (auto & buf: bufs) {
+LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
+__func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+}
 }
 
 // populate tensors_by_name
@@ -6287,8 +6304,10 @@ size_t llama_model::n_devices() const {
 
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
 std::map<ggml_backend_buffer_type_t, size_t> ret;
-for (const auto & [_,
-
+for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
+for (const auto & buf : bufs) {
+ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+}
 }
 return ret;
 }
@@ -6356,6 +6375,8 @@ void llama_model::print_info() const {
 LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
 LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
 LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
+LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
+LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
 LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
 LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
 LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
@@ -6456,8 +6477,6 @@ void llama_model::print_info() const {
 LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
 LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
 LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
-LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
-LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
 LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
 LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
 LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
@@ -17952,6 +17971,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
 cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
 cb(cur, "result_norm", -1);
 
+res->t_embd = cur;
+
 // lm_head
 cur = build_lora_mm(model.output, cur);
 cb(cur, "result_output", -1);
@@ -19324,6 +19345,7 @@ struct llm_build_smallthinker : public llm_graph_context{
 
 cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
 cb(cur, "result_norm", -1);
+res->t_embd = cur;
 
 // lm_head
 cur = build_lora_mm(model.output, cur);
@@ -19619,7 +19641,7 @@ struct llm_build_apertus : public llm_graph_context {
 }
 };
 
-llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
+llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
 llama_memory_i * res;
 
 switch (arch) {
@@ -19670,17 +19692,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 };
 }
 
-const auto padding = llama_kv_cache::get_padding(cparams);
-
-cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
-
 res = new llama_memory_hybrid(
 /* model */ *this,
 /* attn_type_k */ params.type_k,
 /* attn_type_v */ params.type_v,
 /* attn_v_trans */ !cparams.flash_attn,
 /* attn_kv_size */ cparams.n_ctx,
-/* attn_n_pad */
+/* attn_n_pad */ 1,
 /* attn_n_swa */ hparams.n_swa,
 /* attn_swa_type */ hparams.swa_type,
 /* recurrent_type_k */ GGML_TYPE_F32,
@@ -19692,23 +19710,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 /* filter_attn */ std::move(filter_attn),
 /* filter_recr */ std::move(filter_recr));
 } else {
-const auto padding = llama_kv_cache::get_padding(cparams);
-
 uint32_t n_ctx_per_stream = cparams.n_ctx;
 
 if (!cparams.kv_unified) {
 n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
-n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
-
-cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max;
-} else {
-n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
-
-cparams.n_ctx = n_ctx_per_stream;
 }
 
-LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
-
 llama_memory_i::layer_reuse_cb reuse = nullptr;
 
 if (arch == LLM_ARCH_GEMMA3N) {
@@ -19735,7 +19742,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 n_ctx_per_stream,
 cparams.n_seq_max,
 cparams.n_ubatch,
-
+1,
 nullptr,
 reuse);
 } else {
@@ -19750,7 +19757,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 cparams.kv_unified,
 n_ctx_per_stream,
 cparams.n_seq_max,
-
+1,
 hparams.n_swa,
 hparams.swa_type,
 nullptr,

package/src/llama.cpp/src/llama-model.h
CHANGED

@@ -500,9 +500,8 @@ struct llama_model {
 
 ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
 
-// note: can mutate `cparams`
 // TODO: move this to new llm_arch_model_i interface
-llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
+llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;
 
 // TODO: move this to new llm_arch_model_i interface
 ggml_cgraph * build_graph(const llm_graph_params & params) const;