@fugood/llama.node 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +17 -13
- package/src/LlamaCompletionWorker.cpp +2 -0
- package/src/LlamaContext.cpp +3 -0
- package/src/llama.cpp/common/arg.cpp +80 -10
- package/src/llama.cpp/common/chat.cpp +52 -8
- package/src/llama.cpp/common/chat.h +7 -2
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/common.h +16 -6
- package/src/llama.cpp/common/speculative.cpp +135 -54
- package/src/llama.cpp/common/speculative.h +8 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/include/ggml.h +37 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
- package/src/llama.cpp/include/llama.h +9 -4
- package/src/llama.cpp/src/llama-arch.cpp +105 -0
- package/src/llama.cpp/src/llama-arch.h +12 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +33 -1
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +19 -10
- package/src/llama.cpp/src/llama-context.h +4 -1
- package/src/llama.cpp/src/llama-graph.cpp +175 -148
- package/src/llama.cpp/src/llama-graph.h +60 -23
- package/src/llama.cpp/src/llama-hparams.h +5 -3
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +6 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
- package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +949 -75
- package/src/llama.cpp/src/llama-model.h +24 -4
- package/src/llama.cpp/src/llama-quant.cpp +40 -4
- package/src/llama.cpp/src/llama-vocab.cpp +49 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
package/lib/binding.ts
CHANGED
@@ -65,6 +65,14 @@ export type LlamaModelOptions = {
 lora?: string
 lora_scaled?: number
 lora_list?: { path: string; scaled: number }[]
+/**
+ * RoPE base frequency, use 0 to use model default (recommended)
+ */
+rope_freq_base?: number
+/**
+ * RoPE frequency scaling factor, use 0 to use model default (recommended)
+ */
+rope_freq_scale?: number
 }

 export type CompletionResponseFormat = {

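The two new optional fields expose llama.cpp's RoPE overrides through the JS model options. A minimal TypeScript sketch of how they might be set; the import path assumes the type is re-exported from the package root, and leaving both values at 0 (or omitting them) keeps the model's own RoPE configuration:

    import type { LlamaModelOptions } from '@fugood/llama.node'

    // Hedged example: only the two new RoPE fields are shown.
    // 0 means "use the model default", as the doc comments recommend.
    const ropeOverrides: Partial<LlamaModelOptions> = {
      rope_freq_base: 0,
      rope_freq_scale: 0,
    }
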
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
 "name": "@fugood/llama.node",
 "access": "public",
-"version": "1.1.4",
+"version": "1.1.6",
 "description": "An another Node binding of llama.cpp",
 "main": "lib/index.js",
 "scripts": {
@@ -71,19 +71,19 @@
 "CMakeLists.txt"
 ],
 "optionalDependencies": {
-"@fugood/node-llama-linux-x64": "1.1.4",
-"@fugood/node-llama-linux-x64-vulkan": "1.1.4",
-"@fugood/node-llama-linux-x64-cuda": "1.1.4",
-"@fugood/node-llama-linux-arm64": "1.1.4",
-"@fugood/node-llama-linux-arm64-vulkan": "1.1.4",
-"@fugood/node-llama-linux-arm64-cuda": "1.1.4",
-"@fugood/node-llama-win32-x64": "1.1.4",
-"@fugood/node-llama-win32-x64-vulkan": "1.1.4",
-"@fugood/node-llama-win32-x64-cuda": "1.1.4",
-"@fugood/node-llama-win32-arm64": "1.1.4",
-"@fugood/node-llama-win32-arm64-vulkan": "1.1.4",
-"@fugood/node-llama-darwin-x64": "1.1.4",
-"@fugood/node-llama-darwin-arm64": "1.1.4"
+"@fugood/node-llama-linux-x64": "1.1.6",
+"@fugood/node-llama-linux-x64-vulkan": "1.1.6",
+"@fugood/node-llama-linux-x64-cuda": "1.1.6",
+"@fugood/node-llama-linux-arm64": "1.1.6",
+"@fugood/node-llama-linux-arm64-vulkan": "1.1.6",
+"@fugood/node-llama-linux-arm64-cuda": "1.1.6",
+"@fugood/node-llama-win32-x64": "1.1.6",
+"@fugood/node-llama-win32-x64-vulkan": "1.1.6",
+"@fugood/node-llama-win32-x64-cuda": "1.1.6",
+"@fugood/node-llama-win32-arm64": "1.1.6",
+"@fugood/node-llama-win32-arm64-vulkan": "1.1.6",
+"@fugood/node-llama-darwin-x64": "1.1.6",
+"@fugood/node-llama-darwin-arm64": "1.1.6"
 },
 "devDependencies": {
 "@babel/preset-env": "^7.24.4",

package/scripts/llama.cpp.patch
CHANGED
@@ -1,5 +1,5 @@
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index 60805ab3..71b4236a 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -6,9 +6,6 @@
@@ -12,13 +12,15 @@ index 114dbfcc..6771bd43 100644
 #include <cstdio>
 #include <exception>
 #include <iostream>
-@@ -123,
+@@ -123,16 +120,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
 return diffs;
 }

 -typedef minja::chat_template common_chat_template;
 -
 -struct common_chat_templates {
+- bool add_bos;
+- bool add_eos;
 - bool has_explicit_template; // Model had builtin template or template overridde was specified.
 - std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
 - std::unique_ptr<common_chat_template> template_tool_use;
@@ -27,21 +29,23 @@ index 114dbfcc..6771bd43 100644
 struct templates_params {
 json messages;
 json tools;
-diff --git a/common/chat.h b/common/chat.h
-index
+diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
+index b014f9f0..3a868797 100644
 --- a/src/llama.cpp/common/chat.h
 +++ b/src/llama.cpp/common/chat.h
-@@ -9,7 +9,
+@@ -9,7 +9,18 @@
 #include <vector>
 #include <map>

 -struct common_chat_templates;
-+#include
-+#include
++#include "minja/chat-template.hpp"
++#include "minja/minja.hpp"
 +
 +typedef minja::chat_template common_chat_template;
 +
 +struct common_chat_templates {
++ bool add_bos;
++ bool add_eos;
 + bool has_explicit_template; // Model had builtin template or template overridde was specified.
 + std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
 + std::unique_ptr<common_chat_template> template_tool_use;
@@ -50,10 +54,10 @@ index ca807c14..56649863 100644
 struct common_chat_tool_call {
 std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index c6962d1d..ba5a4786 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -
+@@ -1116,6 +1116,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
 mparams.n_gpu_layers = params.n_gpu_layers;
 }

@@ -62,10 +66,10 @@ index e4e71ad1..091ddda4 100644
 mparams.split_mode = params.split_mode;
 mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index 6c1c7ee2..c3eb0552 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -
+@@ -242,6 +242,7 @@ enum common_reasoning_format {
 };

 struct common_params {
@@ -74,10 +78,10 @@ index 8922090e..3c2d1a6a 100644
 int32_t n_ctx = 4096; // context size
 int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index
+index f188d163..0c33acad 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-@@ -
+@@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 )

 if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")

package/src/LlamaCompletionWorker.cpp
CHANGED
@@ -320,6 +320,8 @@ void LlamaCompletionWorker::OnOK() {
 chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
 } else if (_reasoning_format == "deepseek-legacy") {
 chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+} else if (_reasoning_format == "auto") {
+chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
 } else {
 chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_NONE;
 }

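With this change the worker recognizes "auto" alongside "deepseek", "deepseek-legacy", and "none", mapping it to COMMON_REASONING_FORMAT_AUTO instead of falling through to "none". A hedged TypeScript sketch of passing the value from the JS side; the exact name of the completion option that feeds _reasoning_format is an assumption here:

    // Hypothetical completion options object; `reasoning_format` is assumed to be
    // the JS-side field the worker reads into `_reasoning_format`.
    const completionOptions = {
      prompt: 'Why is the sky blue?',
      reasoning_format: 'auto', // newly accepted; previously unrecognized values fell back to "none"
    }
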
package/src/LlamaContext.cpp
CHANGED
@@ -250,6 +250,9 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
 params.kv_unified = get_option<bool>(options, "kv_unified", false);
 params.swa_full = get_option<bool>(options, "swa_full", false);

+params.rope_freq_base = get_option<float>(options, "rope_freq_base", 0.0f);
+params.rope_freq_scale = get_option<float>(options, "rope_freq_scale", 0.0f);
+
 params.use_mlock = get_option<bool>(options, "use_mlock", false);
 params.use_mmap = get_option<bool>(options, "use_mmap", true);
 params.numa =

package/src/llama.cpp/common/arg.cpp
CHANGED
@@ -24,6 +24,7 @@
 #include <cstdarg>
 #include <filesystem>
 #include <fstream>
+#include <list>
 #include <regex>
 #include <set>
 #include <string>
@@ -977,6 +978,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
 string_process_escapes(seq_breaker);
 }
+for (auto & pair : params.speculative.replacements) {
+string_process_escapes(pair.first);
+string_process_escapes(pair.second);
+}
 }

 if (!params.kv_overrides.empty()) {
@@ -2091,6 +2096,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 params.no_kv_offload = true;
 }
 ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+add_opt(common_arg(
+{"-nr", "--no-repack"},
+"disable weight repacking",
+[](common_params & params) {
+params.no_extra_bufts = true;
+}
+).set_env("LLAMA_ARG_NO_REPACK"));
 add_opt(common_arg(
 {"-ctk", "--cache-type-k"}, "TYPE",
 string_format(
@@ -2364,11 +2376,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 }
 throw std::invalid_argument("unknown buffer type");
 }
-//
-
+// keep strings alive and avoid leaking memory by storing them in a static vector
+static std::list<std::string> buft_overrides;
+buft_overrides.push_back(tensor_name);
+params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
 }
 }
 ));
+add_opt(common_arg(
+{"--cpu-moe", "-cmoe"},
+"keep all Mixture of Experts (MoE) weights in the CPU",
+[](common_params & params) {
+params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+}
+).set_env("LLAMA_ARG_CPU_MOE"));
+add_opt(common_arg(
+{"--n-cpu-moe", "-ncmoe"}, "N",
+"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+[](common_params & params, int value) {
+if (value < 0) {
+throw std::invalid_argument("invalid value");
+}
+for (int i = 0; i < value; ++i) {
+// keep strings alive and avoid leaking memory by storing them in a static vector
+static std::list<std::string> buft_overrides;
+buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+}
+}
+).set_env("LLAMA_ARG_N_CPU_MOE"));
 add_opt(common_arg(
 {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
 "number of layers to store in VRAM",
@@ -2627,6 +2663,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 params.n_out_freq = value;
 }
 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+add_opt(common_arg(
+{"--output-format"}, "{gguf,dat}",
+string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
+[](common_params & params, const std::string & value) {
+/**/ if (value == "gguf") { params.imat_dat = -1; }
+else if (value == "dat") { params.imat_dat = 1; }
+else { throw std::invalid_argument("invalid output format"); }
+}
+).set_examples({LLAMA_EXAMPLE_IMATRIX}));
 add_opt(common_arg(
 {"--save-frequency"}, "N",
 string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
@@ -2902,11 +2947,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
 "- none: leaves thoughts unparsed in `message.content`\n"
 "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
-"(default:
+"(default: auto)",
 [](common_params & params, const std::string & value) {
 /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
 else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
 else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
 else { throw std::invalid_argument("invalid value"); }
 }
 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
@@ -3249,6 +3295,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 params.speculative.model.path = value;
 }
 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+add_opt(common_arg(
+{"--spec-replace"}, "TARGET", "DRAFT",
+"translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
+[](common_params & params, const std::string & tgt, const std::string & dft) {
+params.speculative.replacements.push_back({ tgt, dft });
+}
+).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
 add_opt(common_arg(
 {"-ctkd", "--cache-type-k-draft"}, "TYPE",
 string_format(
@@ -3438,12 +3491,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 }
 ).set_examples({LLAMA_EXAMPLE_SERVER}));

-// diffusion parameters
 add_opt(common_arg(
 { "--diffusion-steps" }, "N",
 string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
 [](common_params & params, int value) { params.diffusion.steps = value; }
 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+add_opt(common_arg(
+{ "--diffusion-visual" },
+string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+params.diffusion.visual_mode ? "true" : "false"),
+[](common_params & params) { params.diffusion.visual_mode = true; }
+).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
 add_opt(common_arg(
 { "--diffusion-eps" }, "F",
 string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
@@ -3451,21 +3510,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
 add_opt(common_arg(
 { "--diffusion-algorithm" }, "N",
-string_format("diffusion algorithm: 0=ORIGIN, 1=
+string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
 params.diffusion.algorithm),
 [](common_params & params, int value) { params.diffusion.algorithm = value; }
 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
 add_opt(common_arg(
 { "--diffusion-alg-temp" }, "F",
-string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
 [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
+add_opt(common_arg(
+{ "--diffusion-block-length" }, "N",
+string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+[](common_params & params, int value) { params.diffusion.block_length = value; }
+).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
 add_opt(common_arg(
-{ "--diffusion-
-string_format("
-
-
+{ "--diffusion-cfg-scale" }, "F",
+string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+[](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
+).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+add_opt(common_arg(
+{ "--diffusion-add-gumbel-noise" }, "F",
+string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+[](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));

+
 return ctx_arg;
 }

package/src/llama.cpp/common/chat.cpp
CHANGED
@@ -132,6 +132,8 @@ struct templates_params {
 bool enable_thinking = true;
 std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 json extra_context;
+bool add_bos;
+bool add_eos;
 };

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -434,6 +436,8 @@ std::string common_chat_format_single(

 common_chat_templates_inputs inputs;
 inputs.use_jinja = use_jinja;
+inputs.add_bos = tmpls->add_bos;
+inputs.add_eos = tmpls->add_eos;

 std::string fmt_past_msg;
 if (!past_msg.empty()) {
@@ -458,6 +462,8 @@ std::string common_chat_format_single(
 std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
 common_chat_templates_inputs inputs;
 inputs.use_jinja = use_jinja;
+inputs.add_bos = tmpls->add_bos;
+inputs.add_eos = tmpls->add_eos;
 auto add_simple_msg = [&](auto role, auto content) {
 common_chat_msg msg;
 msg.role = role;
@@ -535,6 +541,8 @@ common_chat_templates_ptr common_chat_templates_init(
 }
 std::string token_bos = bos_token_override;
 std::string token_eos = eos_token_override;
+bool add_bos = false;
+bool add_eos = false;
 if (model) {
 const auto * vocab = llama_model_get_vocab(model);
 const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
@@ -549,9 +557,13 @@ common_chat_templates_ptr common_chat_templates_init(
 };
 token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
 token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+add_bos = llama_vocab_get_add_bos(vocab);
+add_eos = llama_vocab_get_add_eos(vocab);
 }
 common_chat_templates_ptr tmpls(new common_chat_templates());
 tmpls->has_explicit_template = has_explicit_template;
+tmpls->add_bos = add_bos;
+tmpls->add_eos = add_eos;
 try {
 tmpls->template_default = std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos);
 } catch (const std::exception & e) {
@@ -581,6 +593,7 @@ const char * common_chat_format_name(common_chat_format format) {
 case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
 case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
 case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
 default:
 throw std::runtime_error("Unknown chat format");
 }
@@ -589,6 +602,7 @@ const char * common_chat_format_name(common_chat_format format) {
 const char * common_reasoning_format_name(common_reasoning_format format) {
 switch (format) {
 case COMMON_REASONING_FORMAT_NONE: return "none";
+case COMMON_REASONING_FORMAT_AUTO: return "auto";
 case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
 case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
 default:
@@ -737,10 +751,10 @@ static std::string apply(
 // instead of using `chat_template_options.use_bos_token = false`, since these tokens
 // may be needed inside the template / between messages too.
 auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
-if (string_starts_with(result, tmpl.bos_token())) {
+if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
 result = result.substr(tmpl.bos_token().size());
 }
-if (string_ends_with(result, tmpl.eos_token())) {
+if (inputs.add_eos && string_ends_with(result, tmpl.eos_token())) {
 result = result.substr(0, result.size() - tmpl.eos_token().size());
 }
 return result;
@@ -1278,6 +1292,26 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
 tool_calls_end);
 }

+static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
+common_chat_params data;
+auto prompt = apply(tmpl, inputs);
+
+data.prompt = prompt;
+data.format = COMMON_CHAT_FORMAT_GPT_OSS;
+
+// TODO: support tool calls in GPT-OSS?
+
+return data;
+}
+static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
+// TODO @ngxson : this won't work with --special enabled, we should fix that
+builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
+if (!builder.syntax().parse_tool_calls) {
+builder.add_content(builder.consume_rest());
+return;
+}
+}
+
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
 LOG_DBG("%s\n", __func__);
 common_chat_params data;
@@ -1635,7 +1669,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
 "|<function name=\"([^\"]+)\">" // match 5 (function name again)
 );

-
+while (auto res = builder.try_find_regex(open_regex)) {
 const auto & block_start = res->groups[1];
 std::string block_end = block_start.empty() ? "" : "```";

@@ -1657,7 +1691,6 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
 builder.consume_literal(block_end);
 builder.consume_spaces();
 }
-builder.add_content(builder.consume_rest());
 } else {
 throw common_chat_msg_partial_exception("failed to parse tool call");
 }
@@ -1682,11 +1715,10 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
 builder.consume_spaces();
 }
 }
-builder.add_content(builder.consume_rest());
 }
-} else {
-builder.add_content(builder.consume_rest());
 }
+
+builder.add_content(builder.consume_rest());
 }

 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
@@ -1722,6 +1754,8 @@ static common_chat_params common_chat_templates_apply_jinja(
 params.enable_thinking = inputs.enable_thinking;
 params.grammar = inputs.grammar;
 params.now = inputs.now;
+params.add_bos = inputs.add_bos;
+params.add_eos = inputs.add_eos;

 params.extra_context = json::object();
 for (auto el : inputs.chat_template_kwargs) {
@@ -1763,6 +1797,11 @@ static common_chat_params common_chat_templates_apply_jinja(
 return common_chat_params_init_hermes_2_pro(tmpl, params);
 }

+// GPT-OSS
+if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
+return common_chat_params_init_gpt_oss(tmpl, params);
+}
+
 // Use generic handler when mixing tools + JSON schema.
 // TODO: support that mix in handlers below.
 if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -1914,6 +1953,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
 case COMMON_CHAT_FORMAT_COMMAND_R7B:
 common_chat_parse_command_r7b(builder);
 break;
+case COMMON_CHAT_FORMAT_GPT_OSS:
+common_chat_parse_gpt_oss(builder);
+break;
 default:
 throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
 }
@@ -1933,6 +1975,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
 }
 }
 auto msg = builder.result();
-
+if (!is_partial) {
+LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+}
 return msg;
 }

package/src/llama.cpp/common/chat.h
CHANGED
@@ -9,12 +9,14 @@
 #include <vector>
 #include <map>

-#include
-#include
+#include "minja/chat-template.hpp"
+#include "minja/minja.hpp"

 typedef minja::chat_template common_chat_template;

 struct common_chat_templates {
+bool add_bos;
+bool add_eos;
 bool has_explicit_template; // Model had builtin template or template overridde was specified.
 std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
 std::unique_ptr<common_chat_template> template_tool_use;
@@ -118,6 +120,7 @@ enum common_chat_format {
 COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
 COMMON_CHAT_FORMAT_HERMES_2_PRO,
 COMMON_CHAT_FORMAT_COMMAND_R7B,
+COMMON_CHAT_FORMAT_GPT_OSS,

 COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -136,6 +139,8 @@ struct common_chat_templates_inputs {
 bool enable_thinking = true;
 std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 std::map<std::string, std::string> chat_template_kwargs;
+bool add_bos = false;
+bool add_eos = false;
 };

 struct common_chat_params {

package/src/llama.cpp/common/common.cpp
CHANGED
@@ -1123,6 +1123,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
 mparams.use_mmap = params.use_mmap;
 mparams.use_mlock = params.use_mlock;
 mparams.check_tensors = params.check_tensors;
+mparams.use_extra_bufts = !params.no_extra_bufts;

 if (params.kv_overrides.empty()) {
 mparams.kv_overrides = NULL;

package/src/llama.cpp/common/common.h
CHANGED
@@ -201,6 +201,7 @@ struct common_params_speculative {
 int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
 float p_split = 0.1f; // speculative decoding split probability
 float p_min = 0.75f; // minimum speculative decoding probability (greedy)
+std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements

 ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
 ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -220,15 +221,22 @@ struct common_params_vocoder {
 };

 struct common_params_diffusion {
-int32_t steps
-
-
-float
-
+int32_t steps = 128;
+bool visual_mode = false;
+
+float eps = 0; // epsilon for timesteps
+int32_t block_length = 0; // block length for generation
+
+int32_t algorithm = 4; // default algorithm: low-confidence
+float alg_temp = 0.0f; // algorithm temperature
+
+float cfg_scale = 0; // classifier-free guidance scale
+bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
 };

 enum common_reasoning_format {
 COMMON_REASONING_FORMAT_NONE,
+COMMON_REASONING_FORMAT_AUTO,
 COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
 COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
@@ -353,6 +361,7 @@ struct common_params {
 bool warmup = true; // warmup run
 bool check_tensors = false; // validate tensor data
 bool no_op_offload = false; // globally disable offload host tensor operations to device
+bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)

 bool single_turn = false; // single turn chat conversation

@@ -387,7 +396,7 @@ struct common_params {
 std::string chat_template = ""; // NOLINT
 bool use_jinja = false; // NOLINT
 bool enable_chat_template = true;
-common_reasoning_format reasoning_format =
+common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
 int reasoning_budget = -1;
 bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

@@ -432,6 +441,7 @@ struct common_params {
 int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
 int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
 int32_t i_chunk = 0; // start processing from this chunk
+int8_t imat_dat = 0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)

 bool process_output = false; // collect data for the output tensor
 bool compute_ppl = true; // whether to compute perplexity