@fugood/llama.node 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +12 -12
- package/src/llama.cpp/CMakeLists.txt +0 -1
- package/src/llama.cpp/common/arg.cpp +17 -0
- package/src/llama.cpp/common/chat.cpp +37 -20
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.h +4 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +7 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +181 -10
- package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +38 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1297 -211
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -9
- package/src/llama.cpp/include/llama.h +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +108 -2
- package/src/llama.cpp/src/llama-arch.h +7 -0
- package/src/llama.cpp/src/llama-batch.cpp +27 -1
- package/src/llama.cpp/src/llama-batch.h +8 -1
- package/src/llama.cpp/src/llama-chat.cpp +15 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +95 -81
- package/src/llama.cpp/src/llama-graph.h +43 -16
- package/src/llama.cpp/src/llama-hparams.cpp +2 -1
- package/src/llama.cpp/src/llama-hparams.h +1 -0
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
- package/src/llama.cpp/src/llama-kv-cache-unified.h +62 -24
- package/src/llama.cpp/src/llama-kv-cells.h +62 -10
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
- package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +34 -16
- package/src/llama.cpp/src/llama-memory.cpp +17 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model.cpp +1374 -210
- package/src/llama.cpp/src/llama-model.h +3 -0
- package/src/llama.cpp/src/llama-vocab.cpp +8 -1
- package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.0.1",
+  "version": "1.0.3",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -70,19 +70,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.0.1",
-    "@fugood/node-llama-linux-x64-vulkan": "1.0.1",
-    "@fugood/node-llama-linux-x64-cuda": "1.0.1",
-    "@fugood/node-llama-linux-arm64": "1.0.1",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.0.1",
-    "@fugood/node-llama-linux-arm64-cuda": "1.0.1",
-    "@fugood/node-llama-win32-x64": "1.0.1",
-    "@fugood/node-llama-win32-x64-vulkan": "1.0.1",
-    "@fugood/node-llama-win32-x64-cuda": "1.0.1",
-    "@fugood/node-llama-win32-arm64": "1.0.1",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.0.1",
-    "@fugood/node-llama-darwin-x64": "1.0.1",
-    "@fugood/node-llama-darwin-arm64": "1.0.1"
+    "@fugood/node-llama-linux-x64": "1.0.3",
+    "@fugood/node-llama-linux-x64-vulkan": "1.0.3",
+    "@fugood/node-llama-linux-x64-cuda": "1.0.3",
+    "@fugood/node-llama-linux-arm64": "1.0.3",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.0.3",
+    "@fugood/node-llama-linux-arm64-cuda": "1.0.3",
+    "@fugood/node-llama-win32-x64": "1.0.3",
+    "@fugood/node-llama-win32-x64-vulkan": "1.0.3",
+    "@fugood/node-llama-win32-x64-cuda": "1.0.3",
+    "@fugood/node-llama-win32-arm64": "1.0.3",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.0.3",
+    "@fugood/node-llama-darwin-x64": "1.0.3",
+    "@fugood/node-llama-darwin-arm64": "1.0.3"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED
@@ -1,5 +1,5 @@
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index 114dbfcc..6771bd43 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -6,9 +6,6 @@
@@ -12,7 +12,7 @@ index 7d9aaeb1..a7b68d4a 100644
 #include <cstdio>
 #include <exception>
 #include <iostream>
-@@ -
+@@ -123,14 +120,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
 return diffs;
 }
 
@@ -27,13 +27,13 @@ index 7d9aaeb1..a7b68d4a 100644
 struct templates_params {
 json messages;
 json tools;
-diff --git a/
-index
+diff --git a/common/chat.h b/common/chat.h
+index ca807c14..56649863 100644
 --- a/src/llama.cpp/common/chat.h
 +++ b/src/llama.cpp/common/chat.h
-@@ -
-#include <string>
+@@ -9,7 +9,16 @@
 #include <vector>
+#include <map>
 
 -struct common_chat_templates;
 +#include <minja/chat-template.hpp>
@@ -62,10 +62,10 @@ index e4e71ad1..091ddda4 100644
 mparams.split_mode = params.split_mode;
 mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index 8922090e..3c2d1a6a 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -
+@@ -224,6 +224,7 @@ enum common_reasoning_format {
 };
 
 struct common_params {
@@ -74,7 +74,7 @@ index e08a59ea..d120b67d 100644
 int32_t n_ctx = 4096; // context size
 int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index
+index 671fad4d..93fc3cd7 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -104,7 +104,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -87,10 +87,10 @@ index 71b1d67b..093cd6f9 100644
 check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
 if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
 diff --git a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
-index
+index b97e7bf9..c3eb9519 100644
 --- a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
-@@ -
+@@ -111,7 +111,7 @@ if (Vulkan_FOUND)
 endif()
 
 # Set up toolchain for host compilation whether cross-compiling or not
@@ -99,7 +99,7 @@ index 39f022f3..7ae9047e 100644
 if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN)
 set(HOST_CMAKE_TOOLCHAIN_FILE ${GGML_VULKAN_SHADERS_GEN_TOOLCHAIN})
 else()
-@@ -
+@@ -131,7 +131,7 @@ if (Vulkan_FOUND)
 
 include(ExternalProject)
 
package/src/llama.cpp/CMakeLists.txt
CHANGED
@@ -120,7 +120,6 @@ endfunction()
 
 llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
 llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
-llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
 llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
 llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
 llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
package/src/llama.cpp/common/arg.cpp
CHANGED
@@ -2734,6 +2734,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 params.public_path = value;
 }
 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+add_opt(common_arg(
+{"--api-prefix"}, "PREFIX",
+string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
+[](common_params & params, const std::string & value) {
+params.api_prefix = value;
+}
+).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
 add_opt(common_arg(
 {"--no-webui"},
 string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
@@ -2794,6 +2801,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 params.ssl_file_cert = value;
 }
 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
+add_opt(common_arg(
+{"--chat-template-kwargs"}, "STRING",
+string_format("sets additional params for the json template parser"),
+[](common_params & params, const std::string & value) {
+auto parsed = json::parse(value);
+for (const auto & item : parsed.items()) {
+params.default_template_kwargs[item.key()] = item.value().dump();
+}
+}
+).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
 add_opt(common_arg(
 {"-to", "--timeout"}, "N",
 string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
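
For context (not part of the published diff): the new --chat-template-kwargs option shown above parses its JSON argument and stores each top-level entry, re-serialized with dump(), into params.default_template_kwargs. A minimal sketch of that parsing step, assuming nlohmann/json is available as it is in llama.cpp's common code; the helper name parse_template_kwargs is illustrative only:

    #include <map>
    #include <string>
    #include <nlohmann/json.hpp>

    using json = nlohmann::ordered_json;

    // Sketch: mirrors how the --chat-template-kwargs value is consumed above.
    static void parse_template_kwargs(const std::string & value,
                                      std::map<std::string, std::string> & out) {
        auto parsed = json::parse(value);          // e.g. {"enable_thinking": false}
        for (const auto & item : parsed.items()) {
            out[item.key()] = item.value().dump(); // kept as JSON text, re-parsed when the template is applied
        }
    }

A server invocation would pass the same JSON on the command line, e.g. --chat-template-kwargs '{"enable_thinking": false}', or via the LLAMA_CHAT_TEMPLATE_KWARGS environment variable registered above.
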
package/src/llama.cpp/common/chat.cpp
CHANGED
@@ -14,6 +14,8 @@
 #include <string>
 #include <vector>
 
+using json = nlohmann::ordered_json;
+
 static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
 auto time = std::chrono::system_clock::to_time_t(now);
 auto local_time = *std::localtime(&time);
@@ -129,6 +131,7 @@ struct templates_params {
 bool add_generation_prompt = true;
 bool enable_thinking = true;
 std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+json extra_context;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -709,16 +712,23 @@ static void foreach_function(const json & tools, const std::function<void(const
 
 static std::string apply(
 const common_chat_template & tmpl,
-const
-const
-
-const
+const struct templates_params & inputs,
+const std::optional<json> & messages_override = std::nullopt,
+const std::optional<json> & tools_override = std::nullopt,
+const std::optional<json> & additional_context = std::nullopt)
 {
 minja::chat_template_inputs tmpl_inputs;
-tmpl_inputs.messages = messages;
-
-
-
+tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
+if (tools_override) {
+tmpl_inputs.tools = *tools_override;
+} else {
+tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
+}
+tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
+tmpl_inputs.extra_context = inputs.extra_context;
+if (additional_context) {
+tmpl_inputs.extra_context.merge_patch(*additional_context);
+}
 // TODO: add flag to control date/time, if only for testing purposes.
 // tmpl_inputs.now = std::chrono::system_clock::now();
 
@@ -817,7 +827,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
 inputs.messages,
 "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
 
-data.prompt = apply(tmpl,
+data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
 data.format = COMMON_CHAT_FORMAT_GENERIC;
 return data;
 }
@@ -893,7 +903,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
 data.preserved_tokens = {
 "[TOOL_CALLS]",
 };
-data.prompt = apply(tmpl, inputs
+data.prompt = apply(tmpl, inputs);
 data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
 return data;
 }
@@ -923,7 +933,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
 adjusted_messages.push_back(msg);
 }
 }
-data.prompt = apply(tmpl,
+data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
 data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
 if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
 if (!inputs.enable_thinking) {
@@ -1111,7 +1121,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
 } else {
 data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
 }
-data.prompt = apply(tmpl, inputs
+data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
 {"date_string", format_time(inputs.now, "%d %b %Y")},
 {"tools_in_user_message", false},
 {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
@@ -1176,7 +1186,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w
 
 static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
 common_chat_params data;
-auto prompt = apply(tmpl, inputs
+auto prompt = apply(tmpl, inputs);
 
 // Hacks to fix the official (broken) prompt.
 // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
@@ -1271,7 +1281,7 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
 LOG_DBG("%s\n", __func__);
 common_chat_params data;
-data.prompt = apply(tmpl, inputs
+data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
 {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
 {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
 });
@@ -1327,7 +1337,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
 // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
 // If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
 common_chat_params data;
-data.prompt = apply(tmpl, inputs
+data.prompt = apply(tmpl, inputs);
 data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
 if (inputs.tools.is_array() && !inputs.tools.empty()) {
 data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1454,7 +1464,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
 data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
 }
 
-data.prompt = apply(tmpl, inputs
+data.prompt = apply(tmpl, inputs);
 // TODO: if (has_raw_python)
 return data;
 }
@@ -1487,14 +1497,15 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
 static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
 common_chat_params data;
 
-json
+json extra_context = json {
 {"enable_thinking", inputs.enable_thinking},
 };
+extra_context.update(inputs.extra_context);
 
-data.prompt = apply(tmpl, inputs
+data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
 data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
 if (string_ends_with(data.prompt, "<think>\n")) {
-if (!
+if (!extra_context["enable_thinking"]) {
 data.prompt += "</think>";
 } else {
 data.thinking_forced_open = true;
@@ -1680,7 +1691,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
 
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
 common_chat_params data;
-data.prompt = apply(tmpl, inputs
+data.prompt = apply(tmpl, inputs);
 data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
 data.grammar_lazy = false;
 if (!inputs.json_schema.is_null()) {
@@ -1711,6 +1722,12 @@ static common_chat_params common_chat_templates_apply_jinja(
 params.enable_thinking = inputs.enable_thinking;
 params.grammar = inputs.grammar;
 params.now = inputs.now;
+
+params.extra_context = json::object();
+for (auto el : inputs.chat_template_kwargs) {
+params.extra_context[el.first] = json::parse(el.second);
+}
+
 if (!inputs.json_schema.empty()) {
 params.json_schema = json::parse(inputs.json_schema);
 }
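
For context (not part of the published diff): the reworked apply() above now receives the whole templates_params and merges any per-call additional_context into inputs.extra_context with nlohmann's merge_patch. A small self-contained sketch of that layering, with illustrative values:

    #include <iostream>
    #include <nlohmann/json.hpp>

    using json = nlohmann::ordered_json;

    int main() {
        // Template-wide context, e.g. built from chat_template_kwargs.
        json extra_context = { {"enable_thinking", true} };
        // Per-call context supplied by a specific format handler.
        json additional_context = { {"enable_thinking", false}, {"date_string", "01 Jan 2025"} };
        // Same call as in apply(): RFC 7386 merge, the patch values win.
        extra_context.merge_patch(additional_context);
        std::cout << extra_context.dump() << "\n"; // {"enable_thinking":false,"date_string":"01 Jan 2025"}
    }
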
package/src/llama.cpp/common/chat.h
CHANGED
@@ -7,6 +7,7 @@
 #include <chrono>
 #include <string>
 #include <vector>
+#include <map>
 
 #include <minja/chat-template.hpp>
 #include <minja/minja.hpp>
@@ -134,6 +135,7 @@ struct common_chat_templates_inputs {
 common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
 bool enable_thinking = true;
 std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+std::map<std::string, std::string> chat_template_kwargs;
 };
 
 struct common_chat_params {
package/src/llama.cpp/common/common.h
CHANGED
@@ -8,6 +8,7 @@
 #include <string>
 #include <string_view>
 #include <vector>
+#include <map>
 #include <sstream>
 
 #ifdef _WIN32
@@ -370,6 +371,7 @@ struct common_params {
 
 std::string hostname = "127.0.0.1";
 std::string public_path = ""; // NOLINT
+std::string api_prefix = ""; // NOLINT
 std::string chat_template = ""; // NOLINT
 bool use_jinja = false; // NOLINT
 bool enable_chat_template = true;
@@ -382,6 +384,8 @@ struct common_params {
 std::string ssl_file_key = ""; // NOLINT
 std::string ssl_file_cert = ""; // NOLINT
 
+std::map<std::string, std::string> default_template_kwargs;
+
 // "advanced" endpoints are disabled by default for better security
 bool webui = true;
 bool endpoint_slots = false;
package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED
@@ -181,7 +181,6 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
 option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
 option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
 option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
-option(GGML_KOMPUTE "ggml: use Kompute" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
 option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -266,7 +265,6 @@ set(GGML_PUBLIC_HEADERS
 include/ggml-cann.h
 include/ggml-cpp.h
 include/ggml-cuda.h
-include/ggml-kompute.h
 include/ggml-opt.h
 include/ggml-metal.h
 include/ggml-rpc.h
@@ -360,6 +358,13 @@ write_basic_package_version_file(
 VERSION ${GGML_INSTALL_VERSION}
 COMPATIBILITY SameMajorVersion)
 
+target_compile_definitions(ggml-base PRIVATE
+GGML_VERSION="${GGML_INSTALL_VERSION}"
+GGML_COMMIT="${GGML_BUILD_COMMIT}"
+)
+message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
+message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}")
+
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
 ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
 DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
package/src/llama.cpp/ggml/include/ggml-backend.h
CHANGED
@@ -339,7 +339,7 @@ extern "C" {
 typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
 
 // Compare the output of two backends
-GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
 
 // Tensor initialization
 GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
package/src/llama.cpp/ggml/include/ggml-cpu.h
CHANGED
@@ -134,6 +134,7 @@ extern "C" {
 
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
+GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
 GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
 GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
 GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
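
For context (not part of the published diff): the new ggml_cpu_fp32_to_fp32 declaration above follows the same (src, dst, count) shape as the existing fp16/bf16 conversion helpers, which suggests it is the plain float-to-float case of that family; that reading is an assumption, since this diff only shows the header. A hedged usage sketch, assuming a ggml build where ggml-cpu.h is on the include path; the wrapper name copy_f32 is illustrative only:

    #include <stdint.h>
    #include "ggml-cpu.h"

    // Sketch: route n float values through the uniform CPU conversion entry point.
    static void copy_f32(const float * src, float * dst, int64_t n) {
        ggml_cpu_fp32_to_fp32(src, dst, n);
    }
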