@fugood/llama.node 1.4.8 → 1.4.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +7 -7
- package/src/LlamaContext.cpp +2 -0
- package/src/llama.cpp/common/arg.cpp +107 -31
- package/src/llama.cpp/common/common.cpp +1 -1
- package/src/llama.cpp/common/common.h +4 -1
- package/src/llama.cpp/common/sampling.cpp +51 -37
- package/src/llama.cpp/common/sampling.h +6 -3
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +283 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +51 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +286 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
- package/src/llama.cpp/src/llama-arch.cpp +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +123 -28
- package/src/llama.cpp/src/llama-mmap.h +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +56 -13
- package/src/llama.cpp/src/llama-model.cpp +7 -5
- package/src/llama.cpp/src/llama-sampling.cpp +16 -0
- package/src/llama.cpp/src/llama.cpp +22 -32
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.8",
+  "version": "1.4.9",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.8",
-    "@fugood/node-llama-darwin-x64": "1.4.8",
-    "@fugood/node-llama-linux-arm64": "1.4.8",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.8",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.8",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.8",
-    "@fugood/node-llama-linux-x64": "1.4.8",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.8",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.8",
-    "@fugood/node-llama-win32-arm64": "1.4.8",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.8",
-    "@fugood/node-llama-win32-x64": "1.4.8",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.8",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.8"
+    "@fugood/node-llama-darwin-arm64": "1.4.9",
+    "@fugood/node-llama-darwin-x64": "1.4.9",
+    "@fugood/node-llama-linux-arm64": "1.4.9",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.9",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.9",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.9",
+    "@fugood/node-llama-linux-x64": "1.4.9",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.9",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.9",
+    "@fugood/node-llama-win32-arm64": "1.4.9",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.9",
+    "@fugood/node-llama-win32-x64": "1.4.9",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.9",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.9"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED
@@ -98,7 +98,7 @@ index 6085510a4..263076ce2 100644
  struct common_chat_tool_call {
      std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index d4e8c7405..af3dec813 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
 @@ -1343,6 +1343,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
@@ -110,7 +110,7 @@ index 5a8cf5248..8010a990e 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index 3e314f4c8..5750a4057 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
 @@ -307,6 +307,7 @@ struct lr_opt {
@@ -122,7 +122,7 @@ index d70744840..dea8c4546 100644
  int32_t n_ctx = 0; // context size, 0 == context the model was trained with
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index
+index 28fb7612e..63f7e1ca1 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -135,10 +135,10 @@ index fc31089f3..aa9befe4c 100644
  check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
  if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
 diff --git a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-index
+index 6a00abacc..9e12459b6 100644
 --- a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
 +++ b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-@@ -
+@@ -3226,11 +3226,26 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
  GGML_UNUSED(dev);
  }

@@ -168,7 +168,7 @@ index 514f086f6..792abaa58 100644
  GGML_UNUSED(dev);
  }

-@@ -
+@@ -3413,10 +3428,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
  }
  }

@@ -187,7 +187,7 @@ index 514f086f6..792abaa58 100644

  GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);

-@@ -
+@@ -3429,6 +3451,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
  } catch (std::exception const &exc) {
  GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
  devices[i].context = nullptr;
package/src/LlamaContext.cpp
CHANGED
@@ -250,6 +250,8 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   }

   common_params params;
+  params.fit_params = false;
+
   params.model.path = get_option<std::string>(options, "model", "");
   if (params.model.path.empty()) {
     Napi::TypeError::New(env, "Model is required").ThrowAsJavaScriptException();
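The only change in the binding itself is that LlamaContext now opts out of llama.cpp's automatic parameter fitting before reading the JS options. A minimal sketch of what that flag controls, assuming the fit_params / llama_params_fit wiring shown in the common/common.cpp hunk further below; the helper name here is hypothetical and not part of the binding:

    #include "common.h"

    // Hypothetical helper: with fit_params disabled, the sizes the caller passed in
    // are used as-is instead of being rewritten by llama_params_fit() to fit the
    // available device memory.
    static common_params make_params(const std::string & model_path, int32_t n_ctx) {
        common_params params;
        params.fit_params = false; // same opt-out LlamaContext.cpp now applies
        params.model.path = model_path;
        params.n_ctx      = n_ctx;
        return params;
    }
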
package/src/llama.cpp/common/arg.cpp
CHANGED
@@ -420,6 +420,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         }
     };

+    std::set<std::string> seen_args;
+
     for (int i = 1; i < argc; i++) {
         const std::string arg_prefix = "--";

@@ -430,6 +432,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         if (arg_to_options.find(arg) == arg_to_options.end()) {
             throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
         }
+        if (!seen_args.insert(arg).second) {
+            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+        }
         auto & tmp = arg_to_options[arg];
         auto opt = *tmp.first;
         bool is_positive = tmp.second;
@@ -750,6 +755,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
         }
     };

+    std::set<std::string> seen_args;
+
     for (int i = 1; i < argc; i++) {
         const std::string arg_prefix = "--";

@@ -760,6 +767,9 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
         if (arg_to_options.find(arg) == arg_to_options.end()) {
             throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
         }
+        if (!seen_args.insert(arg).second) {
+            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+        }
         auto opt = *arg_to_options[arg];
         std::string val;
         if (opt.value_hint != nullptr) {
@@ -863,7 +873,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         sampler_type_chars += common_sampler_type_to_chr(sampler);
         sampler_type_names += common_sampler_type_to_str(sampler) + ";";
     }
-    sampler_type_names.pop_back();
+    if (!sampler_type_names.empty()) {
+        sampler_type_names.pop_back(); // remove last semicolon
+    }


    /**
@@ -1184,7 +1196,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.system_prompt = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
    add_opt(common_arg(
        {"--perf"},
        {"--no-perf"},
@@ -1226,13 +1238,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
    add_opt(common_arg(
        {"--in-file"}, "FNAME",
-        "an input file (repeat to specify multiple files)",
+        "an input file (use comma-separated values to specify multiple files)",
        [](common_params & params, const std::string & value) {
-            std::ifstream file(value);
-            if (!file) {
-                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                std::ifstream file(item);
+                if (!file) {
+                    throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+                }
+                params.in_files.push_back(item);
            }
-            params.in_files.push_back(value);
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
@@ -1969,9 +1983,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
    add_opt(common_arg(
        {"--image", "--audio"}, "FILE",
-        "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
+        "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
        [](common_params & params, const std::string & value) {
-            params.image.emplace_back(value);
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                params.image.emplace_back(item);
+            }
        }
    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
@@ -2218,12 +2234,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ));
    add_opt(common_arg(
-        {"--override-kv"}, "KEY=TYPE:VALUE",
-        "advanced option to override model metadata by key. may be specified multiple times.\n"
-        "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
+        {"--override-kv"}, "KEY=TYPE:VALUE,...",
+        "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
+        "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
        [](common_params & params, const std::string & value) {
-            if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
-                throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
+            std::vector<std::string> kv_overrides;
+
+            std::string current;
+            bool escaping = false;
+
+            for (const char c : value) {
+                if (escaping) {
+                    current.push_back(c);
+                    escaping = false;
+                } else if (c == '\\') {
+                    escaping = true;
+                } else if (c == ',') {
+                    kv_overrides.push_back(current);
+                    current.clear();
+                } else {
+                    current.push_back(c);
+                }
+            }
+
+            if (escaping) {
+                current.push_back('\\');
+            }
+
+            kv_overrides.push_back(current);
+
+            for (const auto & kv_override : kv_overrides) {
+                if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
+                    throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
+                }
            }
        }
    ));
@@ -2237,33 +2280,50 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ));
    add_opt(common_arg(
        {"--lora"}, "FNAME",
-        "path to LoRA adapter (can be repeated to use multiple adapters)",
+        "path to LoRA adapter (use comma-separated values to load multiple adapters)",
        [](common_params & params, const std::string & value) {
-            params.lora_adapters.push_back({ value, 1.0, "", "", nullptr });
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
+            }
        }
        // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
    add_opt(common_arg(
-        {"--lora-scaled"}, "FNAME", "SCALE",
-        "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
-        [](common_params & params, const std::string & fname, const std::string & scale) {
-            params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
+        {"--lora-scaled"}, "FNAME:SCALE,...",
+        "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
+        "note: use comma-separated values",
+        [](common_params & params, const std::string & value) {
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                auto parts = string_split<std::string>(item, ':');
+                if (parts.size() != 2) {
+                    throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
+                }
+                params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr });
+            }
        }
        // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
    add_opt(common_arg(
        {"--control-vector"}, "FNAME",
-        "add a control vector\nnote: this argument can be repeated to add multiple control vectors",
+        "add a control vector\nnote: use comma-separated values to add multiple control vectors",
        [](common_params & params, const std::string & value) {
-            params.control_vectors.push_back({ 1.0f, value, });
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                params.control_vectors.push_back({ 1.0f, item, });
+            }
        }
    ));
    add_opt(common_arg(
-        {"--control-vector-scaled"}, "FNAME", "SCALE",
+        {"--control-vector-scaled"}, "FNAME:SCALE,...",
        "add a control vector with user defined scaling SCALE\n"
-        "note: this argument can be repeated to add multiple scaled control vectors",
-        [](common_params & params, const std::string & fname, const std::string & scale) {
-            params.control_vectors.push_back({ std::stof(scale), fname });
+        "note: use comma-separated values (format: FNAME:SCALE,...)",
+        [](common_params & params, const std::string & value) {
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                auto parts = string_split<std::string>(item, ':');
+                if (parts.size() != 2) {
+                    throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
+                }
+                params.control_vectors.push_back({ std::stof(parts[1]), parts[0] });
+            }
        }
    ));
    add_opt(common_arg(
@@ -2353,13 +2413,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_env("HF_TOKEN"));
    add_opt(common_arg(
        {"--context-file"}, "FNAME",
-        "file to load context from (repeat to specify multiple files)",
+        "file to load context from (use comma-separated values to specify multiple files)",
        [](common_params & params, const std::string & value) {
-            std::ifstream file(value, std::ios::binary);
-            if (!file) {
-                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                std::ifstream file(item, std::ios::binary);
+                if (!file) {
+                    throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+                }
+                params.context_files.push_back(item);
            }
-            params.context_files.push_back(value);
        }
    ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
    add_opt(common_arg(
@@ -2550,6 +2612,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.api_prefix = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
+    add_opt(common_arg(
+        {"--webui-config"}, "JSON",
+        "JSON that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.webui_config_json = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+    add_opt(common_arg(
+        {"--webui-config-file"}, "PATH",
+        "JSON file that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.webui_config_json = read_file(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
    add_opt(common_arg(
        {"--webui"},
        {"--no-webui"},
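The --override-kv handler above introduces a small comma splitter that honors backslash escapes, so a literal comma can still appear inside a single override. A standalone sketch of that splitting logic, extracted here for illustration only (the function name is not part of llama.cpp):

    #include <iostream>
    #include <string>
    #include <vector>

    // Split on ',' while treating "\," as a literal comma, mirroring the
    // --override-kv parsing added in common/arg.cpp above.
    static std::vector<std::string> split_escaped_csv(const std::string & value) {
        std::vector<std::string> out;
        std::string current;
        bool escaping = false;
        for (const char c : value) {
            if (escaping) {
                current.push_back(c);
                escaping = false;
            } else if (c == '\\') {
                escaping = true;
            } else if (c == ',') {
                out.push_back(current);
                current.clear();
            } else {
                current.push_back(c);
            }
        }
        if (escaping) {
            current.push_back('\\'); // a trailing backslash is kept verbatim
        }
        out.push_back(current);
        return out;
    }

    int main() {
        // yields two overrides; the escaped comma stays inside the first value
        for (const auto & kv : split_escaped_csv("tokenizer.chat_template=str:a\\,b,tokenizer.ggml.add_bos_token=bool:false")) {
            std::cout << kv << "\n";
        }
    }
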
package/src/llama.cpp/common/common.cpp
CHANGED
@@ -1092,7 +1092,7 @@ common_init_result::common_init_result(common_params & params) :
     auto cparams = common_context_params_to_llama(params);

     if (params.fit_params) {
-        LOG_INF("%s: fitting params to device memory,
+        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
         llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
             params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
             params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
package/src/llama.cpp/common/common.h
CHANGED
@@ -485,8 +485,11 @@ struct common_params {

     std::map<std::string, std::string> default_template_kwargs;

+    // webui configs
+    bool webui = true;
+    std::string webui_config_json;
+
     // "advanced" endpoints are disabled by default for better security
-    bool webui = true;
     bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
package/src/llama.cpp/common/sampling.cpp
CHANGED
@@ -104,10 +104,9 @@ struct ring_buffer {
 struct common_sampler {
     common_params_sampling params;

+    struct llama_sampler * grmr;
     struct llama_sampler * chain;

-    bool grammar;
-
     ring_buffer<llama_token> prev;

     std::vector<llama_token_data> cur;
@@ -167,15 +166,14 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

     lparams.no_perf = params.no_perf;

+    llama_sampler * grmr = nullptr;
     llama_sampler * chain = llama_sampler_chain_init(lparams);

-    bool grammar = false;
     std::vector<llama_sampler *> samplers;

     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
-
-        grammar = true;
+        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
 #else
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
@@ -224,15 +222,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

         if (!params.grammar.empty()) {
             if (params.grammar_lazy) {
-
-
-
-                trigger_tokens.data(), trigger_tokens.size()));
+                grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                    trigger_patterns_c.data(), trigger_patterns_c.size(),
+                    trigger_tokens.data(), trigger_tokens.size());
             } else {
-
+                grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
             }
-
-            grammar = true;
         }
     }

@@ -303,8 +298,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

     auto * result = new common_sampler {
         /* .params = */ params,
+        /* .grmr = */ grmr,
         /* .chain = */ chain,
-        /* .grammar = */ grammar,
         /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
         /* .cur = */ {},
         /* .cur_p = */ {},
@@ -315,6 +310,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

 void common_sampler_free(struct common_sampler * gsmpl) {
     if (gsmpl) {
+        llama_sampler_free(gsmpl->grmr);
         llama_sampler_free(gsmpl->chain);

         delete gsmpl;
@@ -324,25 +320,12 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
     const auto tm = gsmpl->tm();

-    if (gsmpl->grammar) {
-
-
-        for (int i = 0; i < n_smpl; i++) {
-            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-
-            // the grammar sampler is always the first one
-            if (i == 0) {
-                if (accept_grammar) {
-                    llama_sampler_accept(smpl, token);
-                }
-            } else {
-                llama_sampler_accept(smpl, token);
-            }
-        }
-    } else {
-        llama_sampler_accept(gsmpl->chain, token);
+    if (gsmpl->grmr && accept_grammar) {
+        llama_sampler_accept(gsmpl->grmr, token);
     }

+    llama_sampler_accept(gsmpl->chain, token);
+
     gsmpl->prev.push_back(token);
 }

@@ -353,8 +336,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
     return new common_sampler {
         /* .params = */ gsmpl->params,
+        /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
         /* .chain = */ llama_sampler_clone(gsmpl->chain),
-        /* .grammar = */ gsmpl->grammar,
         /* .prev = */ gsmpl->prev,
         /* .cur = */ gsmpl->cur,
         /* .cur_p = */ gsmpl->cur_p,
@@ -410,7 +393,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
     return gsmpl->chain;
 }

-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
     llama_synchronize(ctx);

     // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
@@ -418,11 +401,42 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co

     llama_token id = LLAMA_TOKEN_NULL;

+    auto & grmr = gsmpl->grmr;
     auto & chain = gsmpl->chain;
     auto & cur_p = gsmpl->cur_p; // initialized by set_logits

     gsmpl->set_logits(ctx, idx);

+    if (grammar_first) {
+        llama_sampler_apply(grmr, &cur_p);
+    }
+
+    llama_sampler_apply(chain, &cur_p);
+
+    id = cur_p.data[cur_p.selected].id;
+
+    if (grammar_first) {
+        return id;
+    }
+
+    // check if it the sampled token fits the grammar (grammar-based rejection sampling)
+    {
+        llama_token_data single_token_data = { id, 1.0f, 0.0f };
+        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
+
+        llama_sampler_apply(grmr, &single_token_data_array);
+
+        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
+        if (is_valid) {
+            return id;
+        }
+    }
+
+    // resampling:
+    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
+    gsmpl->set_logits(ctx, idx);
+
+    llama_sampler_apply(grmr, &cur_p);
     llama_sampler_apply(chain, &cur_p);

     GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
@@ -432,7 +446,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     return id;
 }

-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
     GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");

     std::vector<llama_token> result;
@@ -440,7 +454,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample

     size_t i = 0;
     for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);

         common_sampler_accept(gsmpl, id, true);

@@ -452,7 +466,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     }

     if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);

         common_sampler_accept(gsmpl, id, true);

@@ -462,13 +476,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
     return result;
 }

-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
     std::vector<int> idxs(draft.size() + 1);
     for (size_t i = 0; i < idxs.size(); ++i) {
         idxs[i] = i;
     }

-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
 }

 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
package/src/llama.cpp/common/sampling.h
CHANGED
@@ -57,7 +57,10 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
 // - check if the token fits the grammar (if any)
 // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
+// if grammar_first is true, the grammar is applied before the samplers (slower)
+// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
+//
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

 // generalized version of common_sampler_sample
 //
@@ -75,10 +78,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 //
 // returns at least 1 token, up to idxs.size()
 //
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);

 // assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);

 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

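For callers, the practical difference is the new optional grammar_first flag (default false, so existing call sites keep the fast rejection-sampling path). A small usage sketch under that assumption; gsmpl and ctx stand in for an existing common_sampler and llama_context:

    #include "sampling.h"

    // Sketch: sample one token at batch position idx.
    // grammar_first = false (default): sample from the full chain, then verify the token
    //   against the grammar and resample only if it is rejected (fast path).
    // grammar_first = true: constrain the candidate list with the grammar before the chain
    //   runs, so every candidate is grammar-valid (the mode speculative decoding now uses
    //   when drafting, as shown in the common/speculative.cpp hunk that follows).
    static llama_token sample_one(common_sampler * gsmpl, llama_context * ctx, int idx, bool strict) {
        return common_sampler_sample(gsmpl, ctx, idx, /*grammar_first=*/strict);
    }
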
package/src/llama.cpp/common/speculative.cpp
CHANGED
@@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft(
     for (int i = 0; i < params.n_draft; ++i) {
         common_batch_clear(batch);

-        common_sampler_sample(smpl, ctx_dft, 0);
+        common_sampler_sample(smpl, ctx_dft, 0, true);

         const auto * cur_p = common_sampler_get_candidates(smpl, true);

package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
CHANGED
@@ -458,6 +458,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         if (GGML_RV_ZFH)
             string(APPEND MARCH_STR "_zfh")
         endif()
+
         if (GGML_XTHEADVECTOR)
             string(APPEND MARCH_STR "_xtheadvector")
         elseif (GGML_RVV)
@@ -465,6 +466,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             if (GGML_RV_ZVFH)
                 string(APPEND MARCH_STR "_zvfh")
             endif()
+            if (GGML_RV_ZVFBFWMA)
+                string(APPEND MARCH_STR "_zvfbfwma")
+            endif()
         endif()
         if (GGML_RV_ZICBOP)
             string(APPEND MARCH_STR "_zicbop")