@fugood/llama.node 1.4.11 → 1.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +31 -31
- package/src/llama.cpp/common/arg.cpp +128 -59
- package/src/llama.cpp/common/arg.h +1 -0
- package/src/llama.cpp/common/chat-parser.cpp +11 -0
- package/src/llama.cpp/common/chat.cpp +36 -7
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +42 -23
- package/src/llama.cpp/common/common.h +11 -1
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- package/src/llama.cpp/include/llama.h +100 -12
- package/src/llama.cpp/src/CMakeLists.txt +4 -0
- package/src/llama.cpp/src/llama-adapter.cpp +12 -3
- package/src/llama.cpp/src/llama-adapter.h +7 -1
- package/src/llama.cpp/src/llama-arch.cpp +78 -0
- package/src/llama.cpp/src/llama-arch.h +8 -0
- package/src/llama.cpp/src/llama-chat.cpp +11 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +637 -49
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +12 -5
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +337 -26
- package/src/llama.cpp/src/llama-model.h +13 -2
- package/src/llama.cpp/src/llama-sampling.cpp +1259 -186
- package/src/llama.cpp/src/llama-sampling.h +19 -7
- package/src/llama.cpp/src/llama-vocab.cpp +101 -33
- package/src/llama.cpp/src/llama-vocab.h +2 -0
- package/src/llama.cpp/src/llama.cpp +87 -64
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/bert.cpp +4 -2
- package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
- package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/gemma3.cpp +3 -4
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/llama.cpp +19 -6
- package/src/llama.cpp/src/models/maincoder.cpp +117 -0
- package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/models.h +18 -0
- package/src/llama.cpp/src/models/modern-bert.cpp +116 -0
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/plamo3.cpp +128 -0
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
- package/src/llama.cpp/src/unicode.cpp +23 -14
package/package.json
CHANGED

@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.11",
+  "version": "1.4.13",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.11",
-    "@fugood/node-llama-darwin-x64": "1.4.11",
-    "@fugood/node-llama-linux-arm64": "1.4.11",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.11",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.11",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.11",
-    "@fugood/node-llama-linux-x64": "1.4.11",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.11",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.11",
-    "@fugood/node-llama-win32-arm64": "1.4.11",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.11",
-    "@fugood/node-llama-win32-x64": "1.4.11",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.11",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.11"
+    "@fugood/node-llama-darwin-arm64": "1.4.13",
+    "@fugood/node-llama-darwin-x64": "1.4.13",
+    "@fugood/node-llama-linux-arm64": "1.4.13",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.13",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.13",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.13",
+    "@fugood/node-llama-linux-x64": "1.4.13",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.13",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.13",
+    "@fugood/node-llama-win32-arm64": "1.4.13",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.13",
+    "@fugood/node-llama-win32-x64": "1.4.13",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.13",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.13"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED

@@ -32,7 +32,7 @@ index 1bcba9cd8..b7cd68734 100644
  static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
      int count = 0;
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index 
+index 22e527bab..c3d0affca 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -7,9 +7,6 @@
@@ -62,7 +62,7 @@ index 0a426f447..ab02be247 100644
  struct templates_params {
      json messages;
      json tools;
-@@ -
+@@ -752,7 +739,7 @@ static std::string apply(
      tmpl_inputs.extra_context.merge_patch(*additional_context);
  }
  // TODO: add flag to control date/time, if only for testing purposes.
@@ -72,7 +72,7 @@ index 0a426f447..ab02be247 100644
  minja::chat_template_options tmpl_opts;
  // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
 diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
-index 
+index 8bd4a325f..333b3301f 100644
 --- a/src/llama.cpp/common/chat.h
 +++ b/src/llama.cpp/common/chat.h
 @@ -10,7 +10,18 @@
@@ -96,22 +96,22 @@ index 6085510a4..263076ce2 100644
  struct common_chat_tool_call {
      std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index 
+index 41b2b6833..fe9ba05aa 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -
-     mparams.
+@@ -1361,6 +1361,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+         mparams.devices = params.devices.data();
      }
 
 +    mparams.vocab_only = params.vocab_only;
+     mparams.n_gpu_layers = params.n_gpu_layers;
      mparams.main_gpu = params.main_gpu;
      mparams.split_mode = params.split_mode;
-     mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index 
+index d6fd0d37a..477209ce5 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -
+@@ -310,6 +310,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
 
  struct common_params {
@@ -120,7 +120,7 @@ index 334372073..e912b593a 100644
      int32_t n_ctx = 0; // context size, 0 == context the model was trained with
      int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index 
+index 7622d0bf4..d2edcfddb 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -133,14 +133,13 @@ index 28fb7612e..63f7e1ca1 100644
      check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
      if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
 diff --git a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-index 
+index 365a24b49..83bf4ee62 100644
 --- a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
 +++ b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-@@ -
+@@ -2798,9 +2798,24 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
      GGML_UNUSED(dev);
  }
 
-+
 +// ~2GB per session for now
 +#define GGML_HEXAGON_SESSION_MEMORY_DEFAULT (2ULL * 1024 * 1024 * 1024)
 +// Max to 3.5GB
@@ -149,7 +148,6 @@ index 6a00abacc..9e12459b6 100644
  static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
 -    // ~2GB per session for now
 -    *free = 2ULL * 1024 * 1024 * 1024;
--    *total = *free;
 +    const char * str_mem = getenv("GGML_HEXAGON_SESSION_MEMORY");
 +    if (str_mem) {
 +        *free = std::stoull(str_mem);
@@ -161,32 +159,34 @@ index 6a00abacc..9e12459b6 100644
 +    } else {
 +        *free = GGML_HEXAGON_SESSION_MEMORY_DEFAULT;
 +    }
++
+     *total = *free;
 
-+    *total = *free;
      GGML_UNUSED(dev);
-
-
-@@ -3413,10 +3428,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
+@@ -3010,10 +3025,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
  }
  }
 
-
-
-        opt_ndev = 1;
+-    if (opt_arch < 75) {
+-        opt_ndev = 1;
 -        GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
-
-+
-
-+
-+
-+
-
-
+-    }
++    #if defined(__ANDROID__)
++    if(opt_arch < 75) {
++        opt_ndev = 1;
++        GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75 for Android.\n");
++    }
++    #else
++    if(opt_arch < 73) {
++        opt_ndev = 1;
++        GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v73 for Linux and Windows.\n");
++    }
++    #endif
 
      GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
 
-@@ -
-    } catch (std::exception
+@@ -3026,6 +3048,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
+    } catch (const std::exception & exc) {
      GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
      devices[i].context = nullptr;
 +    opt_ndev = i;
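The ggml-hexagon hunks above replace the hard-coded ~2 GB per-session figure with a GGML_HEXAGON_SESSION_MEMORY environment override. A minimal sketch of the reported-memory logic visible in the patch (the clamping against the 3.5 GB maximum hinted at by the new "Max to 3.5GB" comment is truncated in this view and is therefore left out here):

    #include <cstdlib>
    #include <string>

    // ~2GB per session by default; override (in bytes) via GGML_HEXAGON_SESSION_MEMORY
    static const unsigned long long k_hex_session_mem_default = 2ULL * 1024 * 1024 * 1024;

    static void hexagon_device_get_memory_sketch(size_t * free, size_t * total) {
        const char * str_mem = std::getenv("GGML_HEXAGON_SESSION_MEMORY");
        *free  = str_mem ? (size_t) std::stoull(str_mem) : (size_t) k_hex_session_mem_default;
        *total = *free;
    }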
package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -679,7 +679,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-quantize",
         "llama-qwen2vl-cli",
         "llama-retrieval",
-        "llama-run",
         "llama-save-load-state",
         "llama-server",
         "llama-simple",
@@ -854,6 +853,54 @@ bool common_arg_utils::is_autoy(const std::string & value) {
     return value == "auto" || value == "-1";
 }
 
+// Simple CSV parser that handles quoted fields and escaped quotes
+// example:
+// input: value1,"value, with, commas","value with ""escaped"" quotes",value4
+// output: [value1] [value, with, commas] [value with "escaped" quotes] [value4]
+static std::vector<std::string> parse_csv_row(const std::string& input) {
+    std::vector<std::string> fields;
+    std::string field;
+    bool in_quotes = false;
+
+    for (size_t i = 0; i < input.length(); ++i) {
+        char ch = input[i];
+
+        if (ch == '"') {
+            if (!in_quotes) {
+                // start of quoted field (only valid if at beginning of field)
+                if (!field.empty()) {
+                    // quote appeared in middle of unquoted field, treat as literal
+                    field += '"';
+                } else {
+                    in_quotes = true; // start
+                }
+            } else {
+                if (i + 1 < input.length() && input[i + 1] == '"') {
+                    // escaped quote: ""
+                    field += '"';
+                    ++i; // skip the next quote
+                } else {
+                    in_quotes = false; // end
+                }
+            }
+        } else if (ch == ',') {
+            if (in_quotes) {
+                field += ',';
+            } else {
+                fields.push_back(std::move(field));
+                field.clear();
+            }
+        } else {
+            field += ch;
+        }
+    }
+
+    // Add the last field
+    fields.push_back(std::move(field));
+
+    return fields;
+}
+
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     // per-example default params
     // we define here to make sure it's included in llama-gen-docs
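The parse_csv_row helper added above is file-static in common/arg.cpp and backs the comma-separated options further down (--in-file, --image, --override-kv, --lora, --control-vector, --context-file, --api-key, among others). A small hypothetical check of the behaviour documented in its comment block (not part of the diff; it would have to live in the same translation unit to see the static function):

    static void check_parse_csv_row() {
        // the example from the helper's own comment
        const auto fields = parse_csv_row(R"(value1,"value, with, commas","value with ""escaped"" quotes",value4)");
        GGML_ASSERT(fields.size() == 4);
        GGML_ASSERT(fields[0] == "value1");
        GGML_ASSERT(fields[1] == "value, with, commas");
        GGML_ASSERT(fields[2] == "value with \"escaped\" quotes");
        GGML_ASSERT(fields[3] == "value4");
    }

Quoting a field is what lets a single value carry commas of its own, e.g. a file path that contains a comma.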
@@ -1250,7 +1297,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--in-file"}, "FNAME",
         "an input file (use comma-separated values to specify multiple files)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item :
+            for (const auto & item : parse_csv_row(value)) {
                 std::ifstream file(item);
                 if (!file) {
                     throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
@@ -1397,7 +1444,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, bool value) {
             params.warmup = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_DEBUG}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -1695,6 +1742,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"-bs", "--backend-sampling"},
+        "enable backend sampling (experimental) (default: disabled)",
+        [](common_params & params) {
+            params.sampling.backend_sampling = true;
+        }
+    ).set_sparam().set_env("LLAMA_ARG_BACKEND_SAMPLING"));
     add_opt(common_arg(
         {"--pooling"}, "{none,mean,cls,last,rank}",
         "pooling type for embeddings, use model default if unspecified",
@@ -1706,7 +1760,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
     add_opt(common_arg(
         {"--attention"}, "{causal,non-causal}",
         "attention type for embeddings, use model default if unspecified",
@@ -1995,7 +2049,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--image", "--audio"}, "FILE",
         "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
         [](common_params & params, const std::string & value) {
-            for (const auto & item :
+            for (const auto & item : parse_csv_row(value)) {
                 params.image.emplace_back(item);
             }
         }
@@ -2017,7 +2071,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
-            "comma separated list of RPC servers",
+            "comma separated list of RPC servers (host:port)",
             [](common_params & params, const std::string & value) {
                 add_rpc_devices(value);
                 GGML_UNUSED(params);
@@ -2087,7 +2141,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "override tensor buffer type", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
         }
-    ));
+    ).set_env("LLAMA_ARG_OVERRIDE_TENSOR"));
     add_opt(common_arg(
         {"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
@@ -2137,11 +2191,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
+    GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
-        string_format("max. number of layers to store in VRAM (default: %
-        [](common_params & params,
-
+        string_format("max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", params.n_gpu_layers == -1 ? "auto" : "all"),
+        [](common_params & params, const std::string & value) {
+            if (value == "auto") {
+                params.n_gpu_layers = -1;
+            } else if (value == "all") {
+                params.n_gpu_layers = -2;
+            } else {
+                params.n_gpu_layers = std::stoi(value);
+            }
             if (!llama_supports_gpu_offload()) {
                 fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
                 fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
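-ngl / --gpu-layers now takes a string rather than a plain integer, and the same treatment is applied to -ngld / --gpu-layers-draft in a later hunk. The mapping implemented above, as a standalone sketch:

    // "auto" maps to -1 (which the help text prints as "auto" when it is the default),
    // "all" maps to the new -2 sentinel, anything else is parsed as an explicit layer count.
    static int32_t parse_n_gpu_layers(const std::string & value) {
        if (value == "auto") { return -1; }
        if (value == "all")  { return -2; }
        return std::stoi(value); // e.g. "-ngl 32"
    }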
@@ -2245,37 +2306,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--override-kv"}, "KEY=TYPE:VALUE,...",
-        "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated
+        "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.\n"
         "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
         [](common_params & params, const std::string & value) {
-
-
-
-            bool escaping = false;
-
-            for (const char c : value) {
-                if (escaping) {
-                    current.push_back(c);
-                    escaping = false;
-                } else if (c == '\\') {
-                    escaping = true;
-                } else if (c == ',') {
-                    kv_overrides.push_back(current);
-                    current.clear();
-                } else {
-                    current.push_back(c);
-                }
-            }
-
-            if (escaping) {
-                current.push_back('\\');
-            }
-
-            kv_overrides.push_back(current);
-
-            for (const auto & kv_override : kv_overrides) {
-                if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
-                    throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
+            for (const auto & item : parse_csv_row(value)) {
+                if (!string_parse_kv_override(item.c_str(), params.kv_overrides)) {
+                    throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", item.c_str()));
                 }
             }
         }
@@ -2292,7 +2328,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora"}, "FNAME",
         "path to LoRA adapter (use comma-separated values to load multiple adapters)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item :
+            for (const auto & item : parse_csv_row(value)) {
                 params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
             }
         }
@@ -2303,7 +2339,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
         "note: use comma-separated values",
         [](common_params & params, const std::string & value) {
-            for (const auto & item :
+            for (const auto & item : parse_csv_row(value)) {
                 auto parts = string_split<std::string>(item, ':');
                 if (parts.size() != 2) {
                     throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
@@ -2317,7 +2353,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--control-vector"}, "FNAME",
         "add a control vector\nnote: use comma-separated values to add multiple control vectors",
         [](common_params & params, const std::string & value) {
-            for (const auto & item :
+            for (const auto & item : parse_csv_row(value)) {
                 params.control_vectors.push_back({ 1.0f, item, });
             }
         }
@@ -2327,7 +2363,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "add a control vector with user defined scaling SCALE\n"
         "note: use comma-separated values (format: FNAME:SCALE,...)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item :
+            for (const auto & item : parse_csv_row(value)) {
                 auto parts = string_split<std::string>(item, ':');
                 if (parts.size() != 2) {
                     throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
@@ -2425,7 +2461,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--context-file"}, "FNAME",
         "file to load context from (use comma-separated values to specify multiple files)",
         [](common_params & params, const std::string & value) {
-            for (const auto & item :
+            for (const auto & item : parse_csv_row(value)) {
                 std::ifstream file(item, std::ios::binary);
                 if (!file) {
                     throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
@@ -2572,7 +2608,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.embd_normalize = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
     add_opt(common_arg(
         {"--embd-output-format"}, "FORMAT",
         "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
@@ -2650,7 +2686,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.embedding = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
         {"--rerank", "--reranking"},
         string_format("enable reranking endpoint on server (default: %s)", "disabled"),
@@ -2661,9 +2697,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
     add_opt(common_arg(
         {"--api-key"}, "KEY",
-        "API key to use for authentication (default: none)",
+        "API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)",
         [](common_params & params, const std::string & value) {
-
+            for (const auto & key : parse_csv_row(value)) {
+                if (!key.empty()) {
+                    params.api_keys.push_back(key);
+                }
+            }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
     add_opt(common_arg(
@@ -2677,7 +2717,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             std::string key;
             while (std::getline(key_file, key)) {
                 if (!key.empty()) {
-
+                    params.api_keys.push_back(key);
                 }
             }
             key_file.close();
@@ -2699,7 +2739,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
     add_opt(common_arg(
         {"--chat-template-kwargs"}, "STRING",
-
+        "sets additional params for the json template parser, must be a valid json object string, e.g. '{\"key1\":\"value1\",\"key2\":\"value2\"}'",
         [](common_params & params, const std::string & value) {
             auto parsed = json::parse(value);
             for (const auto & item : parsed.items()) {
@@ -3175,11 +3215,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.devices = parse_device_list(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
-        "number of layers to store in VRAM
-
-
+        string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)",
+            params.speculative.n_gpu_layers == -1 ? "auto" : "all"),
+        [](common_params & params, const std::string & value) {
+            if (value == "auto") {
+                params.speculative.n_gpu_layers = -1;
+            } else if (value == "all") {
+                params.speculative.n_gpu_layers = -2;
+            } else {
+                params.speculative.n_gpu_layers = std::stoi(value);
+            }
             if (!llama_supports_gpu_offload()) {
                 fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
                 fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
@@ -3329,6 +3377,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     }
     ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        {"--save-logits"},
+        string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
+        [](common_params & params) {
+            params.save_logits = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_DEBUG}));
+    add_opt(common_arg(
+        {"--logits-output-dir"}, "PATH",
+        string_format("directory for saving logits output files (default: %s)", params.logits_output_dir.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.logits_output_dir = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_DEBUG}));
+    add_opt(common_arg(
+        {"--tensor-filter"}, "REGEX",
+        "filter tensor names for debug output (regex pattern, can be specified multiple times)",
+        [](common_params & params, const std::string & value) {
+            params.tensor_filter.push_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_DEBUG}));
 
     // presets
     add_opt(common_arg(
@@ -3518,15 +3587,15 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
         [](common_params &, const std::string &) { /* unused */ }
     ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
 
+    args.push_back(common_arg(
+        {"stop-timeout"}, "SECONDS",
+        "in server router mode, force-kill model instance after this many seconds of graceful shutdown",
+        [](common_params &, int) { /* unused */ }
+    ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
+
     // args.push_back(common_arg(
     //     {"pin"},
     //     "in server router mode, do not unload this model if models_max is exceeded",
     //     [](common_params &) { /* unused */ }
     // ).set_preset_only());
-
-    // args.push_back(common_arg(
-    //     {"unload-idle-seconds"}, "SECONDS",
-    //     "in server router mode, unload models idle for more than this many seconds",
-    //     [](common_params &, int) { /* unused */ }
-    // ).set_preset_only());
 }
@@ -1395,6 +1395,14 @@ static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
     builder.consume_reasoning_with_xml_tool_calls(form, "<seed:think>", "</seed:think>");
 }
 
+static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("<|think|>", "<|end|><|begin|>assistant<|content|>");
+
+    // TODO: Tool calling
+
+    builder.add_content(builder.consume_rest());
+}
+
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
     builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());
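The new common_chat_parse_solar_open() treats everything between <|think|> and <|end|><|begin|>assistant<|content|> as reasoning and hands the remainder to add_content(); tool calling is still a TODO. A rough standalone illustration of that layout (not the real parser, which goes through common_chat_msg_parser; the marker strings are taken verbatim from the hunk above):

    #include <string>
    #include <utility>

    // returns {reasoning, content} for a raw solar-open style completion
    static std::pair<std::string, std::string> split_solar_open_sketch(const std::string & raw) {
        const std::string think_open  = "<|think|>";
        const std::string think_close = "<|end|><|begin|>assistant<|content|>";
        if (raw.rfind(think_open, 0) != 0) {
            return { "", raw }; // no leading reasoning block: everything is content
        }
        const size_t e = raw.find(think_close, think_open.size());
        if (e == std::string::npos) {
            return { raw.substr(think_open.size()), "" }; // unterminated reasoning block
        }
        return { raw.substr(think_open.size(), e - think_open.size()),
                 raw.substr(e + think_close.size()) };
    }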
@@ -1479,6 +1487,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_XIAOMI_MIMO:
             common_chat_parse_xiaomi_mimo(builder);
             break;
+        case COMMON_CHAT_FORMAT_SOLAR_OPEN:
+            common_chat_parse_solar_open(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }