@fugood/llama.node 1.1.5 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +17 -13
- package/src/LlamaCompletionWorker.cpp +2 -0
- package/src/llama.cpp/common/arg.cpp +28 -11
- package/src/llama.cpp/common/chat.cpp +46 -2
- package/src/llama.cpp/common/chat.h +7 -2
- package/src/llama.cpp/common/common.h +3 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml.h +37 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
- package/src/llama.cpp/include/llama.h +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +65 -0
- package/src/llama.cpp/src/llama-arch.h +10 -0
- package/src/llama.cpp/src/llama-chat.cpp +13 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +8 -8
- package/src/llama.cpp/src/llama-graph.cpp +118 -9
- package/src/llama.cpp/src/llama-graph.h +38 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -3
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +4 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +499 -4
- package/src/llama.cpp/src/llama-model.h +24 -4
- package/src/llama.cpp/src/llama-quant.cpp +37 -1
- package/src/llama.cpp/src/llama-vocab.cpp +42 -0
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.1.5",
+  "version": "1.1.6",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -71,19 +71,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.1.5",
-    "@fugood/node-llama-linux-x64-vulkan": "1.1.5",
-    "@fugood/node-llama-linux-x64-cuda": "1.1.5",
-    "@fugood/node-llama-linux-arm64": "1.1.5",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.1.5",
-    "@fugood/node-llama-linux-arm64-cuda": "1.1.5",
-    "@fugood/node-llama-win32-x64": "1.1.5",
-    "@fugood/node-llama-win32-x64-vulkan": "1.1.5",
-    "@fugood/node-llama-win32-x64-cuda": "1.1.5",
-    "@fugood/node-llama-win32-arm64": "1.1.5",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.1.5",
-    "@fugood/node-llama-darwin-x64": "1.1.5",
-    "@fugood/node-llama-darwin-arm64": "1.1.5"
+    "@fugood/node-llama-linux-x64": "1.1.6",
+    "@fugood/node-llama-linux-x64-vulkan": "1.1.6",
+    "@fugood/node-llama-linux-x64-cuda": "1.1.6",
+    "@fugood/node-llama-linux-arm64": "1.1.6",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.1.6",
+    "@fugood/node-llama-linux-arm64-cuda": "1.1.6",
+    "@fugood/node-llama-win32-x64": "1.1.6",
+    "@fugood/node-llama-win32-x64-vulkan": "1.1.6",
+    "@fugood/node-llama-win32-x64-cuda": "1.1.6",
+    "@fugood/node-llama-win32-arm64": "1.1.6",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.1.6",
+    "@fugood/node-llama-darwin-x64": "1.1.6",
+    "@fugood/node-llama-darwin-arm64": "1.1.6"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED
@@ -1,5 +1,5 @@
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index 60805ab3..71b4236a 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -6,9 +6,6 @@
@@ -12,13 +12,15 @@ index 114dbfcc..6771bd43 100644
 #include <cstdio>
 #include <exception>
 #include <iostream>
-@@ -123,
+@@ -123,16 +120,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
 return diffs;
 }
 
 -typedef minja::chat_template common_chat_template;
 -
 -struct common_chat_templates {
+- bool add_bos;
+- bool add_eos;
 - bool has_explicit_template; // Model had builtin template or template overridde was specified.
 - std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
 - std::unique_ptr<common_chat_template> template_tool_use;
@@ -27,21 +29,23 @@ index 114dbfcc..6771bd43 100644
 struct templates_params {
 json messages;
 json tools;
-diff --git a/common/chat.h b/common/chat.h
-index
+diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
+index b014f9f0..3a868797 100644
 --- a/src/llama.cpp/common/chat.h
 +++ b/src/llama.cpp/common/chat.h
-@@ -9,7 +9,
+@@ -9,7 +9,18 @@
 #include <vector>
 #include <map>
 
 -struct common_chat_templates;
-+#include
-+#include
++#include "minja/chat-template.hpp"
++#include "minja/minja.hpp"
 +
 +typedef minja::chat_template common_chat_template;
 +
 +struct common_chat_templates {
++ bool add_bos;
++ bool add_eos;
 + bool has_explicit_template; // Model had builtin template or template overridde was specified.
 + std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
 + std::unique_ptr<common_chat_template> template_tool_use;
@@ -50,10 +54,10 @@ index ca807c14..56649863 100644
 struct common_chat_tool_call {
 std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index c6962d1d..ba5a4786 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -
+@@ -1116,6 +1116,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
 mparams.n_gpu_layers = params.n_gpu_layers;
 }
 
@@ -62,10 +66,10 @@ index e4e71ad1..091ddda4 100644
 mparams.split_mode = params.split_mode;
 mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index 6c1c7ee2..c3eb0552 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -
+@@ -242,6 +242,7 @@ enum common_reasoning_format {
 };
 
 struct common_params {
@@ -74,10 +78,10 @@ index 8922090e..3c2d1a6a 100644
 int32_t n_ctx = 4096; // context size
 int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index
+index f188d163..0c33acad 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-@@ -
+@@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 )
 
 if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
package/src/LlamaCompletionWorker.cpp
CHANGED
@@ -320,6 +320,8 @@ void LlamaCompletionWorker::OnOK() {
     chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
   } else if (_reasoning_format == "deepseek-legacy") {
     chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+  } else if (_reasoning_format == "auto") {
+    chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
  } else {
    chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_NONE;
  }
package/src/llama.cpp/common/arg.cpp
CHANGED
@@ -24,6 +24,7 @@
 #include <cstdarg>
 #include <filesystem>
 #include <fstream>
+#include <list>
 #include <regex>
 #include <set>
 #include <string>
@@ -2375,20 +2376,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 }
                 throw std::invalid_argument("unknown buffer type");
             }
-            //
-
+            // keep strings alive and avoid leaking memory by storing them in a static vector
+            static std::list<std::string> buft_overrides;
+            buft_overrides.push_back(tensor_name);
+            params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
         }
     }
 ));
 add_opt(common_arg(
-    {"--cpu-moe"},
-    "
+    {"--cpu-moe", "-cmoe"},
+    "keep all Mixture of Experts (MoE) weights in the CPU",
     [](common_params & params) {
-        params.tensor_buft_overrides.push_back({"\\.
-        params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-        params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
     }
 ).set_env("LLAMA_ARG_CPU_MOE"));
+add_opt(common_arg(
+    {"--n-cpu-moe", "-ncmoe"}, "N",
+    "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+    [](common_params & params, int value) {
+        if (value < 0) {
+            throw std::invalid_argument("invalid value");
+        }
+        for (int i = 0; i < value; ++i) {
+            // keep strings alive and avoid leaking memory by storing them in a static vector
+            static std::list<std::string> buft_overrides;
+            buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+            params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+        }
+    }
+).set_env("LLAMA_ARG_N_CPU_MOE"));
 add_opt(common_arg(
     {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
     "number of layers to store in VRAM",
@@ -2649,10 +2665,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
 add_opt(common_arg(
     {"--output-format"}, "{gguf,dat}",
-    string_format("output format for imatrix file (default: %s)", params.imat_dat ? "dat" : "gguf"),
+    string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
     [](common_params & params, const std::string & value) {
-        /**/ if (value == "gguf") { params.imat_dat =
-        else if (value == "dat") { params.imat_dat =
+        /**/ if (value == "gguf") { params.imat_dat = -1; }
+        else if (value == "dat") { params.imat_dat = 1; }
         else { throw std::invalid_argument("invalid output format"); }
     }
 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
@@ -2931,11 +2947,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
     "- none: leaves thoughts unparsed in `message.content`\n"
     "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
-    "(default:
+    "(default: auto)",
     [](common_params & params, const std::string & value) {
         /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
         else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
         else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+        else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
         else { throw std::invalid_argument("invalid value"); }
     }
 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
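Both options added in the hunks above work the same way: they append `tensor_buft_overrides` entries that pin the expert FFN tensors (`ffn_(up|down|gate)_exps`) to the CPU buffer type. A minimal standalone sketch of that expansion, reusing only the calls that appear in the hunks and assuming llama.cpp's common headers; the helper name and its integer argument are illustrative, not part of the diff:

    // Sketch only: the effect of --cpu-moe / --n-cpu-moe, assuming common_params,
    // string_format() (common.h) and ggml_backend_cpu_buffer_type() are available.
    #include <list>
    #include <string>

    static void keep_moe_on_cpu(common_params & params, int n_layers /* < 0 keeps all layers */) {
        // pattern strings must outlive the parser, hence the static list (as in the hunk)
        static std::list<std::string> patterns;
        if (n_layers < 0) {
            // --cpu-moe / -cmoe: one pattern matching every expert FFN tensor
            params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
            return;
        }
        // --n-cpu-moe N / -ncmoe N: one pattern per layer, first N layers only
        for (int i = 0; i < n_layers; ++i) {
            patterns.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
            params.tensor_buft_overrides.push_back({patterns.back().c_str(), ggml_backend_cpu_buffer_type()});
        }
    }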
package/src/llama.cpp/common/chat.cpp
CHANGED
@@ -132,6 +132,8 @@ struct templates_params {
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
     json extra_context;
+    bool add_bos;
+    bool add_eos;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -434,6 +436,8 @@ std::string common_chat_format_single(
 
     common_chat_templates_inputs inputs;
     inputs.use_jinja = use_jinja;
+    inputs.add_bos = tmpls->add_bos;
+    inputs.add_eos = tmpls->add_eos;
 
     std::string fmt_past_msg;
     if (!past_msg.empty()) {
@@ -458,6 +462,8 @@ std::string common_chat_format_single(
 std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
     common_chat_templates_inputs inputs;
     inputs.use_jinja = use_jinja;
+    inputs.add_bos = tmpls->add_bos;
+    inputs.add_eos = tmpls->add_eos;
     auto add_simple_msg = [&](auto role, auto content) {
         common_chat_msg msg;
         msg.role = role;
@@ -535,6 +541,8 @@ common_chat_templates_ptr common_chat_templates_init(
     }
     std::string token_bos = bos_token_override;
     std::string token_eos = eos_token_override;
+    bool add_bos = false;
+    bool add_eos = false;
     if (model) {
         const auto * vocab = llama_model_get_vocab(model);
         const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
@@ -549,9 +557,13 @@ common_chat_templates_ptr common_chat_templates_init(
         };
         token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
         token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+        add_bos = llama_vocab_get_add_bos(vocab);
+        add_eos = llama_vocab_get_add_eos(vocab);
     }
     common_chat_templates_ptr tmpls(new common_chat_templates());
     tmpls->has_explicit_template = has_explicit_template;
+    tmpls->add_bos = add_bos;
+    tmpls->add_eos = add_eos;
     try {
         tmpls->template_default = std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos);
     } catch (const std::exception & e) {
@@ -581,6 +593,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+        case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -589,6 +602,7 @@ const char * common_chat_format_name(common_chat_format format) {
 const char * common_reasoning_format_name(common_reasoning_format format) {
     switch (format) {
         case COMMON_REASONING_FORMAT_NONE: return "none";
+        case COMMON_REASONING_FORMAT_AUTO: return "auto";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
         case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
         default:
@@ -737,10 +751,10 @@ static std::string apply(
     // instead of using `chat_template_options.use_bos_token = false`, since these tokens
     // may be needed inside the template / between messages too.
     auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
-    if (string_starts_with(result, tmpl.bos_token())) {
+    if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
         result = result.substr(tmpl.bos_token().size());
     }
-    if (string_ends_with(result, tmpl.eos_token())) {
+    if (inputs.add_eos && string_ends_with(result, tmpl.eos_token())) {
         result = result.substr(0, result.size() - tmpl.eos_token().size());
     }
     return result;
@@ -1278,6 +1292,26 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
         tool_calls_end);
 }
 
+static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+    auto prompt = apply(tmpl, inputs);
+
+    data.prompt = prompt;
+    data.format = COMMON_CHAT_FORMAT_GPT_OSS;
+
+    // TODO: support tool calls in GPT-OSS?
+
+    return data;
+}
+static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
+    // TODO @ngxson : this won't work with --special enabled, we should fix that
+    builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+}
+
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
@@ -1720,6 +1754,8 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;
+    params.add_bos = inputs.add_bos;
+    params.add_eos = inputs.add_eos;
 
     params.extra_context = json::object();
     for (auto el : inputs.chat_template_kwargs) {
@@ -1761,6 +1797,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_hermes_2_pro(tmpl, params);
     }
 
+    // GPT-OSS
+    if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
+        return common_chat_params_init_gpt_oss(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -1912,6 +1953,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_COMMAND_R7B:
             common_chat_parse_command_r7b(builder);
             break;
+        case COMMON_CHAT_FORMAT_GPT_OSS:
+            common_chat_parse_gpt_oss(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
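For reference, the two marker strings passed to `try_parse_reasoning` in the GPT-OSS parser above delimit the analysis (reasoning) channel and the final answer. A standalone sketch of that split follows; it is not the library's parser, only an illustration of what the markers bound:

    // Illustrative only: split a GPT-OSS style output on the markers used above.
    #include <string>
    #include <utility>

    static std::pair<std::string, std::string> split_gpt_oss(const std::string & out) {
        const std::string reasoning_start = "<|channel|>analysis<|message|>";
        const std::string content_start   = "<|start|>assistant<|channel|>final<|message|>";
        std::string reasoning;
        std::string content = out;
        const size_t r = out.find(reasoning_start);
        const size_t c = out.find(content_start);
        if (r != std::string::npos && c != std::string::npos && c > r) {
            const size_t r_begin = r + reasoning_start.size();
            reasoning = out.substr(r_begin, c - r_begin);      // would land in message.reasoning_content
            content   = out.substr(c + content_start.size());  // would land in message.content
        }
        return {reasoning, content};
    }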
package/src/llama.cpp/common/chat.h
CHANGED
@@ -9,12 +9,14 @@
 #include <vector>
 #include <map>
 
-#include
-#include
+#include "minja/chat-template.hpp"
+#include "minja/minja.hpp"
 
 typedef minja::chat_template common_chat_template;
 
 struct common_chat_templates {
+    bool add_bos;
+    bool add_eos;
     bool has_explicit_template; // Model had builtin template or template overridde was specified.
     std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
     std::unique_ptr<common_chat_template> template_tool_use;
@@ -118,6 +120,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_GPT_OSS,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -136,6 +139,8 @@ struct common_chat_templates_inputs {
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
     std::map<std::string, std::string> chat_template_kwargs;
+    bool add_bos = false;
+    bool add_eos = false;
 };
 
 struct common_chat_params {
package/src/llama.cpp/common/common.h
CHANGED
@@ -236,6 +236,7 @@ struct common_params_diffusion {
 
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
+    COMMON_REASONING_FORMAT_AUTO,
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
@@ -395,7 +396,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format =
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
@@ -440,7 +441,7 @@ struct common_params {
     int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
     int32_t i_chunk = 0; // start processing from this chunk
-
+    int8_t imat_dat = 0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)
 
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity
package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED
@@ -39,8 +39,9 @@ if (WIN32)
     set(CMAKE_SHARED_MODULE_PREFIX "")
 endif()
 
-option(BUILD_SHARED_LIBS
-option(GGML_BACKEND_DL
+option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
+set(GGML_BACKEND_DIR "" CACHE PATH "ggml: directory to load dynamic backends from (requires GGML_BACKEND_DL")
 
 #
 # option list
package/src/llama.cpp/ggml/include/ggml.h
CHANGED
@@ -304,6 +304,16 @@
     GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
     GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
 
+#define GGML_TENSOR_TERNARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb2, src2, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
+
 #define GGML_TENSOR_BINARY_OP_LOCALS01 \
     GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
     GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
@@ -395,7 +405,8 @@ extern "C" {
         // GGML_TYPE_IQ4_NL_4_4 = 36,
         // GGML_TYPE_IQ4_NL_4_8 = 37,
         // GGML_TYPE_IQ4_NL_8_8 = 38,
-
+        GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
+        GGML_TYPE_COUNT = 40,
     };
 
     // precision
@@ -430,6 +441,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
+        GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
     };
 
     // available tensor operations:
@@ -438,6 +450,7 @@ extern "C" {
 
         GGML_OP_DUP,
         GGML_OP_ADD,
+        GGML_OP_ADD_ID,
         GGML_OP_ADD1,
         GGML_OP_ACC,
         GGML_OP_SUB,
@@ -557,6 +570,7 @@ extern "C" {
         GGML_GLU_OP_REGLU,
         GGML_GLU_OP_GEGLU,
         GGML_GLU_OP_SWIGLU,
+        GGML_GLU_OP_SWIGLU_OAI,
         GGML_GLU_OP_GEGLU_ERF,
         GGML_GLU_OP_GEGLU_QUICK,
 
@@ -831,6 +845,13 @@ extern "C" {
             struct ggml_tensor * b,
             enum ggml_type type);
 
+    // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
+    GGML_API struct ggml_tensor * ggml_add_id(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * ids);
+
     GGML_API struct ggml_tensor * ggml_add1(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -1198,6 +1219,13 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_swiglu_oai(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            float alpha,
+            float limit);
+
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
@@ -1570,6 +1598,10 @@ extern "C" {
             float scale,
             float max_bias);
 
+    GGML_API void ggml_soft_max_add_sinks(
+            struct ggml_tensor * a,
+            struct ggml_tensor * sinks);
+
     GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -2052,6 +2084,10 @@ extern "C" {
     GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
             const struct ggml_tensor * a);
 
+    GGML_API void ggml_flash_attn_ext_add_sinks(
+            struct ggml_tensor * a,
+            struct ggml_tensor * sinks);
+
     // TODO: needs to be adapted to ggml_flash_attn_ext
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
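The header comment above fully specifies the new op: `dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]`, i.e. each (i1, i2) slot of `a` picks one column of `b` through an I32 index tensor. A rough usage sketch under assumed MoE-style shapes follows; the shape names are illustrative and not taken from this diff:

    // Sketch: add a per-expert bias after an indirect (MoE) matmul using ggml_add_id().
    // Assumed shapes: cur  is F32 [n_embd, n_expert_used, n_tokens],
    //                 bias is F32 [n_embd, n_expert],
    //                 ids  is I32 [n_expert_used, n_tokens] selecting a column of bias.
    #include "ggml.h"

    static struct ggml_tensor * add_expert_bias(
            struct ggml_context * ctx,
            struct ggml_tensor  * cur,
            struct ggml_tensor  * bias,
            struct ggml_tensor  * ids) {
        // dst[i0, i1, i2] = cur[i0, i1, i2] + bias[i0, ids[i1, i2]]
        return ggml_add_id(ctx, cur, bias, ids);
    }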
package/src/llama.cpp/ggml/src/CMakeLists.txt
CHANGED
@@ -214,6 +214,13 @@ add_library(ggml
             ggml-backend-reg.cpp)
 add_library(ggml::ggml ALIAS ggml)
 
+if (GGML_BACKEND_DIR)
+    if (NOT GGML_BACKEND_DL)
+        message(FATAL_ERROR "GGML_BACKEND_DIR requires GGML_BACKEND_DL")
+    endif()
+    target_compile_definitions(ggml PUBLIC GGML_BACKEND_DIR="${GGML_BACKEND_DIR}")
+endif()
+
 target_link_libraries(ggml PUBLIC ggml-base)
 
 if (CMAKE_SYSTEM_NAME MATCHES "Linux")
@@ -227,7 +234,11 @@ function(ggml_add_backend_library backend)
         set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
         target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
         add_dependencies(ggml ${backend})
-
+        if (GGML_BACKEND_DIR)
+            install(TARGETS ${backend} LIBRARY DESTINATION ${GGML_BACKEND_DIR})
+        else()
+            install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
+        endif()
     else()
         add_library(${backend} ${ARGN})
         target_link_libraries(ggml PUBLIC ${backend})
package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c
CHANGED
@@ -589,6 +589,67 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 }
 
+void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_MXFP4 == 0);
+    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+
+    const block_mxfp4 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_MXFP4;
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined __ARM_NEON
+    const int8x16_t values = vld1q_s8(kvalues_mxfp4);
+    const uint8x16_t m4b = vdupq_n_u8(0x0f);
+    uint8x16x2_t q4bits;
+    int8x16x4_t q4b;
+    int8x16x4_t q8b;
+    int32x4_t prod_1;
+    int32x4_t prod_2;
+
+    for (; ib + 1 < nb; ib += 2) {
+        q4bits.val[0] = vld1q_u8(x[ib + 0].qs);
+        q4bits.val[1] = vld1q_u8(x[ib + 1].qs);
+        q8b.val[0] = vld1q_s8(y[ib + 0].qs);
+        q8b.val[1] = vld1q_s8(y[ib + 0].qs + 16);
+        q8b.val[2] = vld1q_s8(y[ib + 1].qs);
+        q8b.val[3] = vld1q_s8(y[ib + 1].qs + 16);
+
+        q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
+        q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+        q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
+        q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+
+        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
+        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
+
+        sumf +=
+            GGML_E8M0_TO_FP32_HALF(x[ib + 0].e) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
+            GGML_E8M0_TO_FP32_HALF(x[ib + 1].e) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
+    }
+
+#endif
+    for (; ib < nb; ++ib) {
+        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
+        int sumi1 = 0;
+        int sumi2 = 0;
+        for (int j = 0; j < QK_MXFP4/2; ++j) {
+            sumi1 += y[ib].qs[j + 0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
+            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
+        }
+        sumf += d * (sumi1 + sumi2);
+    }
+    *s = sumf;
+}
+
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;