@fugood/llama.node 1.4.12 → 1.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +9 -9
- package/src/llama.cpp/common/arg.cpp +99 -45
- package/src/llama.cpp/common/chat.cpp +4 -4
- package/src/llama.cpp/common/common.cpp +19 -0
- package/src/llama.cpp/common/common.h +10 -0
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/include/llama.h +87 -8
- package/src/llama.cpp/src/llama-arch.cpp +2 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +615 -28
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +8 -2
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +51 -11
- package/src/llama.cpp/src/llama-sampling.cpp +1232 -170
- package/src/llama.cpp/src/llama-sampling.h +16 -7
- package/src/llama.cpp/src/llama.cpp +38 -30
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.12",
+  "version": "1.4.13",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.12",
-    "@fugood/node-llama-darwin-x64": "1.4.12",
-    "@fugood/node-llama-linux-arm64": "1.4.12",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.12",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.12",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.12",
-    "@fugood/node-llama-linux-x64": "1.4.12",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.12",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.12",
-    "@fugood/node-llama-win32-arm64": "1.4.12",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.12",
-    "@fugood/node-llama-win32-x64": "1.4.12",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.12",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.12"
+    "@fugood/node-llama-darwin-arm64": "1.4.13",
+    "@fugood/node-llama-darwin-x64": "1.4.13",
+    "@fugood/node-llama-linux-arm64": "1.4.13",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.13",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.13",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.13",
+    "@fugood/node-llama-linux-x64": "1.4.13",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.13",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.13",
+    "@fugood/node-llama-win32-arm64": "1.4.13",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.13",
+    "@fugood/node-llama-win32-x64": "1.4.13",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.13",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.13"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED
@@ -32,7 +32,7 @@ index 1bcba9cd8..b7cd68734 100644
  static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
      int count = 0;
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index 22e527bab..c3d0affca 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -7,9 +7,6 @@
@@ -96,10 +96,10 @@ index 8bd4a325f..333b3301f 100644
  struct common_chat_tool_call {
      std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index 41b2b6833..fe9ba05aa 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -
+@@ -1361,6 +1361,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
      mparams.devices = params.devices.data();
  }

@@ -108,10 +108,10 @@ index 79c475612..cf189f8bc 100644
      mparams.main_gpu = params.main_gpu;
      mparams.split_mode = params.split_mode;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index d6fd0d37a..477209ce5 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -
+@@ -310,6 +310,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

  struct common_params {
@@ -133,10 +133,10 @@ index 7622d0bf4..d2edcfddb 100644
  check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
  if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
 diff --git a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-index
+index 365a24b49..83bf4ee62 100644
 --- a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
 +++ b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-@@ -
+@@ -2798,9 +2798,24 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
      GGML_UNUSED(dev);
  }

@@ -163,7 +163,7 @@ index 13b96d61f..5fa163442 100644
      *total = *free;

      GGML_UNUSED(dev);
-@@ -
+@@ -3010,10 +3025,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
  }
  }

@@ -185,7 +185,7 @@ index 13b96d61f..5fa163442 100644

  GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);

-@@ -
+@@ -3026,6 +3048,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
  } catch (const std::exception & exc) {
      GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
      devices[i].context = nullptr;
package/src/llama.cpp/common/arg.cpp
CHANGED
@@ -679,7 +679,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-quantize",
         "llama-qwen2vl-cli",
         "llama-retrieval",
-        "llama-run",
         "llama-save-load-state",
         "llama-server",
         "llama-simple",
@@ -854,6 +853,54 @@ bool common_arg_utils::is_autoy(const std::string & value) {
     return value == "auto" || value == "-1";
 }

+// Simple CSV parser that handles quoted fields and escaped quotes
+// example:
+// input: value1,"value, with, commas","value with ""escaped"" quotes",value4
+// output: [value1] [value, with, commas] [value with "escaped" quotes] [value4]
+static std::vector<std::string> parse_csv_row(const std::string& input) {
+    std::vector<std::string> fields;
+    std::string field;
+    bool in_quotes = false;
+
+    for (size_t i = 0; i < input.length(); ++i) {
+        char ch = input[i];
+
+        if (ch == '"') {
+            if (!in_quotes) {
+                // start of quoted field (only valid if at beginning of field)
+                if (!field.empty()) {
+                    // quote appeared in middle of unquoted field, treat as literal
+                    field += '"';
+                } else {
+                    in_quotes = true; // start
+                }
+            } else {
+                if (i + 1 < input.length() && input[i + 1] == '"') {
+                    // escaped quote: ""
+                    field += '"';
+                    ++i; // skip the next quote
+                } else {
+                    in_quotes = false; // end
+                }
+            }
+        } else if (ch == ',') {
+            if (in_quotes) {
+                field += ',';
+            } else {
+                fields.push_back(std::move(field));
+                field.clear();
+            }
+        } else {
+            field += ch;
+        }
+    }
+
+    // Add the last field
+    fields.push_back(std::move(field));
+
+    return fields;
+}
+
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     // per-example default params
     // we define here to make sure it's included in llama-gen-docs
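Every comma-separated option below now routes through this parser. A quick self-contained check of its semantics (a sketch only, assuming parse_csv_row above is in scope; in arg.cpp it is file-static, so such a harness would have to live in the same translation unit):

    #include <cassert>
    #include <string>
    #include <vector>

    int main() {
        const auto fields = parse_csv_row("a,\"b,c\",\"d\"\"e\",");
        assert(fields.size() == 4);
        assert(fields[0] == "a");
        assert(fields[1] == "b,c");  // commas inside quotes stay literal
        assert(fields[2] == "d\"e"); // a doubled quote collapses to one
        assert(fields[3] == "");     // a trailing comma yields an empty trailing field
        return 0;
    }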
@@ -1250,7 +1297,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--in-file"}, "FNAME",
         "an input file (use comma-separated values to specify multiple files)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item :
+            for (const auto & item : parse_csv_row(value)) {
                 std::ifstream file(item);
                 if (!file) {
                     throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
@@ -1397,7 +1444,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, bool value) {
             params.warmup = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_DEBUG}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -1695,6 +1742,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"-bs", "--backend-sampling"},
+        "enable backend sampling (experimental) (default: disabled)",
+        [](common_params & params) {
+            params.sampling.backend_sampling = true;
+        }
+    ).set_sparam().set_env("LLAMA_ARG_BACKEND_SAMPLING"));
     add_opt(common_arg(
         {"--pooling"}, "{none,mean,cls,last,rank}",
         "pooling type for embeddings, use model default if unspecified",
@@ -1706,7 +1760,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
     add_opt(common_arg(
         {"--attention"}, "{causal,non-causal}",
         "attention type for embeddings, use model default if unspecified",
@@ -1995,7 +2049,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--image", "--audio"}, "FILE",
         "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
         [](common_params & params, const std::string & value) {
-            for (const auto & item :
+            for (const auto & item : parse_csv_row(value)) {
                 params.image.emplace_back(item);
             }
         }
@@ -2252,37 +2306,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--override-kv"}, "KEY=TYPE:VALUE,...",
-        "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated
+        "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.\n"
         "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
         [](common_params & params, const std::string & value) {
-
-
-
-            bool escaping = false;
-
-            for (const char c : value) {
-                if (escaping) {
-                    current.push_back(c);
-                    escaping = false;
-                } else if (c == '\\') {
-                    escaping = true;
-                } else if (c == ',') {
-                    kv_overrides.push_back(current);
-                    current.clear();
-                } else {
-                    current.push_back(c);
-                }
-            }
-
-            if (escaping) {
-                current.push_back('\\');
-            }
-
-            kv_overrides.push_back(current);
-
-            for (const auto & kv_override : kv_overrides) {
-                if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
-                    throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
+            for (const auto & item : parse_csv_row(value)) {
+                if (!string_parse_kv_override(item.c_str(), params.kv_overrides)) {
+                    throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", item.c_str()));
                 }
             }
         }
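Note the escaping change this hunk implies: the removed parser honored backslash escapes inside override values, while parse_csv_row has no backslash handling and uses CSV-style quoting around a whole field instead. A small sketch of the implied behavior (assuming parse_csv_row as defined above; the "foo" key is made up for illustration):

    // The inner comma is protected by quoting the whole KEY=TYPE:VALUE field:
    const auto items = parse_csv_row("\"tokenizer.chat_template=str:a,b\",foo=int:1");
    // items == { "tokenizer.chat_template=str:a,b", "foo=int:1" }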
@@ -2299,7 +2328,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora"}, "FNAME",
         "path to LoRA adapter (use comma-separated values to load multiple adapters)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item :
+            for (const auto & item : parse_csv_row(value)) {
                 params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
             }
         }
@@ -2310,7 +2339,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
         "note: use comma-separated values",
         [](common_params & params, const std::string & value) {
-            for (const auto & item :
+            for (const auto & item : parse_csv_row(value)) {
                 auto parts = string_split<std::string>(item, ':');
                 if (parts.size() != 2) {
                     throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
@@ -2324,7 +2353,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--control-vector"}, "FNAME",
         "add a control vector\nnote: use comma-separated values to add multiple control vectors",
         [](common_params & params, const std::string & value) {
-            for (const auto & item :
+            for (const auto & item : parse_csv_row(value)) {
                 params.control_vectors.push_back({ 1.0f, item, });
             }
         }
@@ -2334,7 +2363,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "add a control vector with user defined scaling SCALE\n"
         "note: use comma-separated values (format: FNAME:SCALE,...)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item :
+            for (const auto & item : parse_csv_row(value)) {
                 auto parts = string_split<std::string>(item, ':');
                 if (parts.size() != 2) {
                     throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
@@ -2432,7 +2461,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--context-file"}, "FNAME",
         "file to load context from (use comma-separated values to specify multiple files)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item :
+            for (const auto & item : parse_csv_row(value)) {
                 std::ifstream file(item, std::ios::binary);
                 if (!file) {
                     throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
@@ -2579,7 +2608,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.embd_normalize = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
     add_opt(common_arg(
         {"--embd-output-format"}, "FORMAT",
         "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
@@ -2657,7 +2686,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.embedding = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
         {"--rerank", "--reranking"},
         string_format("enable reranking endpoint on server (default: %s)", "disabled"),
@@ -2668,9 +2697,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
     add_opt(common_arg(
         {"--api-key"}, "KEY",
-        "API key to use for authentication (default: none)",
+        "API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)",
         [](common_params & params, const std::string & value) {
-
+            for (const auto & key : parse_csv_row(value)) {
+                if (!key.empty()) {
+                    params.api_keys.push_back(key);
+                }
+            }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
     add_opt(common_arg(
@@ -2684,7 +2717,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             std::string key;
             while (std::getline(key_file, key)) {
                 if (!key.empty()) {
-
+                    params.api_keys.push_back(key);
                 }
             }
             key_file.close();
@@ -2706,7 +2739,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
     add_opt(common_arg(
         {"--chat-template-kwargs"}, "STRING",
-
+        "sets additional params for the json template parser, must be a valid json object string, e.g. '{\"key1\":\"value1\",\"key2\":\"value2\"}'",
         [](common_params & params, const std::string & value) {
             auto parsed = json::parse(value);
             for (const auto & item : parsed.items()) {
@@ -3344,6 +3377,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     }
     ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"--save-logits"},
+        string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
+        [](common_params & params) {
+            params.save_logits = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_DEBUG}));
+    add_opt(common_arg(
+        {"--logits-output-dir"}, "PATH",
+        string_format("directory for saving logits output files (default: %s)", params.logits_output_dir.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.logits_output_dir = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_DEBUG}));
+    add_opt(common_arg(
+        {"--tensor-filter"}, "REGEX",
+        "filter tensor names for debug output (regex pattern, can be specified multiple times)",
+        [](common_params & params, const std::string & value) {
+            params.tensor_filter.push_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_DEBUG}));

     // presets
     add_opt(common_arg(
package/src/llama.cpp/common/chat.cpp
CHANGED
@@ -2052,7 +2052,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
     // Trigger on tool calls that appear in the commentary channel
     data.grammar_triggers.push_back({
         COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
-        "<\\|channel\\|>(commentary|analysis) to"
+        "<\\|channel\\|>(?:commentary|analysis) to"
     });

     // Trigger tool calls that appear in the role section, either at the
@@ -2385,17 +2385,17 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
         (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
     // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
     data.grammar_triggers.push_back({
-
+        COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
         // If thinking_forced_open, then we capture the </think> tag in the grammar,
         // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
-        std::string(data.thinking_forced_open ? "
+        std::string(data.thinking_forced_open ? "(</think>\\s*)" : "") + (
             "\\s*("
             "(?:<tool_call>"
             "|<function"
             "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
             "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
             ")"
-            ")
+            ")"
         ),
     });
     data.preserved_tokens = {
package/src/llama.cpp/common/common.cpp
CHANGED
@@ -1086,6 +1086,7 @@ struct common_init_result::impl {
     std::vector<llama_adapter_lora_ptr> lora;

     std::vector<common_sampler_ptr> samplers;
+    std::vector<llama_sampler_seq_config> samplers_seq_config;
 };

 common_init_result::common_init_result(common_params & params) :
@@ -1162,10 +1163,19 @@ common_init_result::common_init_result(common_params & params) :
     // params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
     //}

+    // init the backend samplers as part of the context creation
     pimpl->samplers.resize(cparams.n_seq_max);
+    pimpl->samplers_seq_config.resize(cparams.n_seq_max);

     for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
         pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
+        pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
+    }
+
+    // TODO: temporarily gated behind a flag
+    if (params.sampling.backend_sampling) {
+        cparams.samplers = pimpl->samplers_seq_config.data();
+        cparams.n_samplers = pimpl->samplers_seq_config.size();
     }

     llama_context * lctx = llama_init_from_model(model, cparams);
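This hunk is the entire opt-in surface for backend sampling: each sequence gets a llama_sampler_seq_config pairing its seq_id with the sampler chain built by common_sampler_init, and the array is handed to llama_init_from_model only when the experimental flag is set. A minimal sketch of the same wiring against the C API (a sketch only; field names are taken from this diff, and the surrounding variables are assumed to already exist):

    // Assumes: llama_model * model, uint32_t n_seq_max, and one
    // llama_sampler * per sequence in `chains` are already set up.
    std::vector<llama_sampler_seq_config> cfgs(n_seq_max);
    for (uint32_t i = 0; i < n_seq_max; ++i) {
        cfgs[i] = { (llama_seq_id) i, chains[i] }; // seq id + its sampler chain
    }
    llama_context_params cparams = llama_context_default_params();
    cparams.samplers   = cfgs.data(); // consumed during context creation
    cparams.n_samplers = cfgs.size();
    llama_context * ctx = llama_init_from_model(model, cparams);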
@@ -1189,6 +1199,12 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
     return pimpl->samplers[seq_id].get();
 }

+void common_init_result::reset_samplers() {
+    for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
+        llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
+    }
+}
+
 std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
     return pimpl->lora;
 }
@@ -1304,6 +1320,9 @@ common_init_result_ptr common_init_from_params(common_params & params) {
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
         llama_set_warmup(lctx, false);
+
+        // reset samplers to reset RNG state after warmup to the seeded state
+        res->reset_samplers();
     }

     return res;
package/src/llama.cpp/common/common.h
CHANGED
@@ -80,6 +80,7 @@ int32_t cpu_get_num_math();
 //

 enum llama_example {
+    LLAMA_EXAMPLE_DEBUG,
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
     LLAMA_EXAMPLE_COMPLETION,
@@ -216,6 +217,8 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

+    bool backend_sampling = false;
+
     bool has_logit_bias() const {
         return !logit_bias.empty();
     }
@@ -371,6 +374,11 @@ struct common_params {
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file          = ""; // file for saving *all* logits // NOLINT

+    // llama-debug specific options
+    std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
+    bool save_logits = false;               // whether to save logits to files // NOLINT
+    std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT
+
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
@@ -690,7 +698,9 @@ struct common_init_result {

     llama_model   * model();
     llama_context * context();
+
     common_sampler * sampler(llama_seq_id seq_id);
+    void reset_samplers();

     std::vector<llama_adapter_lora_ptr> & lora();

package/src/llama.cpp/common/llguidance.cpp
CHANGED
@@ -106,12 +106,16 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
 }

 static llama_sampler_i llama_sampler_llg_i = {
-    /* .name
-    /* .accept
-    /* .apply
-    /* .reset
-    /* .clone
-    /* .free
+    /* .name              = */ llama_sampler_llg_name,
+    /* .accept            = */ llama_sampler_llg_accept_impl,
+    /* .apply             = */ llama_sampler_llg_apply,
+    /* .reset             = */ llama_sampler_llg_reset,
+    /* .clone             = */ llama_sampler_llg_clone,
+    /* .free              = */ llama_sampler_llg_free,
+    /* .backend_init      = */ NULL,
+    /* .backend_accept    = */ NULL,
+    /* .backend_apply     = */ NULL,
+    /* .backend_set_input = */ NULL,
 };

 static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
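The pattern above generalizes to third-party samplers: llama_sampler_i gained four trailing backend hooks in this release, so an existing CPU-only vtable stays valid by zero-filling them. A hedged sketch (the my_* callbacks are hypothetical; only the new member names are taken from this diff):

    static llama_sampler_i my_sampler_i = {
        /* .name              = */ my_sampler_name,
        /* .accept            = */ my_sampler_accept,
        /* .apply             = */ my_sampler_apply,
        /* .reset             = */ my_sampler_reset,
        /* .clone             = */ my_sampler_clone,
        /* .free              = */ my_sampler_free,
        /* .backend_init      = */ NULL, // no device-side graph: stay on the CPU path
        /* .backend_accept    = */ NULL,
        /* .backend_apply     = */ NULL,
        /* .backend_set_input = */ NULL,
    };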
package/src/llama.cpp/common/regex-partial.cpp
CHANGED
@@ -27,7 +27,7 @@ common_regex_match common_regex::search(const std::string & input, size_t pos, b
         return res;
     }
     std::match_results<std::string::const_reverse_iterator> srmatch;
-    if (std::
+    if (std::regex_search(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial, std::regex_constants::match_continuous)) {
         auto group = srmatch[1].str();
         if (group.length() != 0) {
             auto it = srmatch[1].second.base();
@@ -55,18 +55,18 @@ common_regex_match common_regex::search(const std::string & input, size_t pos, b
     to see if a string ends with a partial regex match, but it's not in std::regex yet.
     Instead, we'll transform the regex into a partial match regex operating as a full match on the reverse iterators of the input.

-    - /abcd/ -> (dcba|cba|ba|a)
-    - /a|b/ -> (a|b)
+    - /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:(?:d)?c)?b)?a)
+    - /a|b/ -> ^(a|b)
     - /a*?/ -> error, could match ""
-    - /a*b/ -> ((?:b)?a*+)
-    - /.*?ab/ -> ((?:b)?a)
-    - /a.*?b/ -> ((?:b)?.*?a)
-    - /a(bc)d/ -> ((?:(?:d)?(?:(?:c)?b))?a)
-    - /a(bc|de)/ -> ((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a)
-    - /ab{2,4}c/ ->
+    - /a*b/ -> ^((?:b)?a*+) (final repetitions become eager)
+    - /.*?ab/ -> ^((?:b)?a) (omit .*)
+    - /a.*?b/ -> ^((?:b)?.*?a) (keep reluctant matches)
+    - /a(bc)d/ -> ^((?:(?:d)?(?:(?:c)?b))?a)
+    - /a(bc|de)/ -> ^((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a)
+    - /ab{2,4}c/ -> ^cbbb?b?a -> ^((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a)

-    The regex will match a reversed string fully, and the end of the first (And only) capturing group will indicate the reversed start of the original partial pattern
-
+    The regex will match a reversed string fully, and the end of the first (And only) capturing group will indicate the reversed start of the original partial pattern.
+    All other groups are turned into non-capturing groups, and reluctant quantifiers are ignored.
     */
     std::string regex_to_reversed_partial_regex(const std::string & pattern) {
         auto it = pattern.begin();
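A compact, self-contained illustration of the technique these hunks document (the reversed pattern for /abcd/ is taken from the comment above; the harness itself is not part of the library):

    #include <cassert>
    #include <regex>
    #include <string>

    int main() {
        const std::string input = "xyzab"; // ends with "ab", a prefix of /abcd/
        // Hand-derived reversed-partial pattern for /abcd/, per the comment above.
        const std::regex rx_reversed_partial("^((?:(?:(?:d)?c)?b)?a)");
        std::match_results<std::string::const_reverse_iterator> m;
        // Anchored match over the reversed input: a hit means some suffix of
        // `input` is a prefix of the original pattern.
        const bool hit = std::regex_search(input.rbegin(), input.rend(), m,
                                           rx_reversed_partial,
                                           std::regex_constants::match_continuous);
        assert(hit && m[1].str() == "ba");               // reversed partial match
        assert(m[1].second.base() == input.begin() + 3); // partial match starts at 'a'
        return 0;
    }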
@@ -177,7 +177,7 @@ std::string regex_to_reversed_partial_regex(const std::string & pattern) {
         }
     }

-    // /abcd/ -> (dcba|cba|ba|a)
+    // /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:d)?c)?b)?a)
     // if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
     // We'll do the outermost capturing group and final .* in the enclosing function.
     std::vector<std::string> res_alts;
@@ -200,5 +200,5 @@ std::string regex_to_reversed_partial_regex(const std::string & pattern) {
         throw std::runtime_error("Unmatched '(' in pattern");
     }

-    return "(" + res + ")
+    return "^(" + res + ")";
 }