@fugood/llama.node 0.3.14 → 0.3.15
This diff shows the content of publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/llama.cpp/.github/workflows/build.yml +30 -1
- package/src/llama.cpp/CMakeLists.txt +9 -1
- package/src/llama.cpp/cmake/common.cmake +2 -0
- package/src/llama.cpp/common/arg.cpp +20 -2
- package/src/llama.cpp/common/common.cpp +6 -3
- package/src/llama.cpp/common/speculative.cpp +4 -4
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +2 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +4 -4
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -6
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +6 -6
- package/src/llama.cpp/examples/parallel/parallel.cpp +5 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
- package/src/llama.cpp/examples/run/run.cpp +91 -46
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
- package/src/llama.cpp/examples/server/server.cpp +32 -15
- package/src/llama.cpp/examples/server/utils.hpp +3 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/tts/tts.cpp +12 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
- package/src/llama.cpp/ggml/include/ggml.h +24 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +5 -27
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +150 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +253 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +95 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +66 -26
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +12 -13
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +103 -34
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +19 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +352 -146
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml.c +85 -2
- package/src/llama.cpp/include/llama.h +86 -22
- package/src/llama.cpp/src/CMakeLists.txt +5 -2
- package/src/llama.cpp/src/llama-adapter.cpp +19 -20
- package/src/llama.cpp/src/llama-adapter.h +11 -9
- package/src/llama.cpp/src/llama-arch.cpp +102 -16
- package/src/llama.cpp/src/llama-arch.h +18 -0
- package/src/llama.cpp/src/llama-batch.h +2 -2
- package/src/llama.cpp/src/llama-context.cpp +2253 -1222
- package/src/llama.cpp/src/llama-context.h +214 -77
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +1662 -0
- package/src/llama.cpp/src/llama-graph.h +574 -0
- package/src/llama.cpp/src/llama-hparams.cpp +8 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -0
- package/src/llama.cpp/src/llama-io.cpp +15 -0
- package/src/llama.cpp/src/llama-io.h +35 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
- package/src/llama.cpp/src/llama-kv-cache.h +178 -110
- package/src/llama.cpp/src/llama-memory.cpp +1 -0
- package/src/llama.cpp/src/llama-memory.h +21 -0
- package/src/llama.cpp/src/llama-model.cpp +8207 -163
- package/src/llama.cpp/src/llama-model.h +34 -1
- package/src/llama.cpp/src/llama-quant.cpp +10 -1
- package/src/llama.cpp/src/llama.cpp +51 -9984
- package/src/llama.cpp/tests/test-backend-ops.cpp +88 -9
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
package/src/llama.cpp/examples/run/run.cpp

@@ -79,6 +79,7 @@ class Opt {
         ctx_params = llama_context_default_params();
         model_params = llama_model_default_params();
         context_size_default = ctx_params.n_batch;
+        n_threads_default = ctx_params.n_threads;
         ngl_default = model_params.n_gpu_layers;
         common_params_sampling sampling;
         temperature_default = sampling.temp;
@@ -104,6 +105,7 @@ class Opt {

         ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
         ctx_params.n_ctx = ctx_params.n_batch;
+        ctx_params.n_threads = ctx_params.n_threads_batch = n_threads >= 0 ? n_threads : n_threads_default;
         model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
         temperature = temperature >= 0 ? temperature : temperature_default;

@@ -116,12 +118,12 @@ class Opt {
     std::string chat_template_file;
     std::string user;
     bool use_jinja = false;
-    int context_size = -1, ngl = -1;
+    int context_size = -1, ngl = -1, n_threads = -1;
     float temperature = -1;
     bool verbose = false;

   private:
-    int context_size_default = -1, ngl_default = -1;
+    int context_size_default = -1, ngl_default = -1, n_threads_default = -1;
     float temperature_default = -1;
     bool help = false;

@@ -159,53 +161,94 @@ class Opt {
         return 0;
     }

+    int parse_options_with_value(int argc, const char ** argv, int & i, bool & options_parsing) {
+        if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
+            if (handle_option_with_value(argc, argv, i, context_size) == 1) {
+                return 1;
+            }
+        } else if (options_parsing &&
+                   (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
+            if (handle_option_with_value(argc, argv, i, ngl) == 1) {
+                return 1;
+            }
+        } else if (options_parsing && (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "--threads") == 0)) {
+            if (handle_option_with_value(argc, argv, i, n_threads) == 1) {
+                return 1;
+            }
+        } else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
+            if (handle_option_with_value(argc, argv, i, temperature) == 1) {
+                return 1;
+            }
+        } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0) {
+            if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
+                return 1;
+            }
+            use_jinja = true;
+        } else {
+            return 2;
+        }
+
+        return 0;
+    }
+
+    int parse_options(const char ** argv, int & i, bool & options_parsing) {
+        if (options_parsing && (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
+            verbose = true;
+        } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
+            use_jinja = true;
+        } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
+            help = true;
+            return 0;
+        } else if (options_parsing && strcmp(argv[i], "--") == 0) {
+            options_parsing = false;
+        } else {
+            return 2;
+        }
+
+        return 0;
+    }
+
+    int parse_positional_args(const char ** argv, int & i, int & positional_args_i) {
+        if (positional_args_i == 0) {
+            if (!argv[i][0] || argv[i][0] == '-') {
+                return 1;
+            }
+
+            ++positional_args_i;
+            model_ = argv[i];
+        } else if (positional_args_i == 1) {
+            ++positional_args_i;
+            user = argv[i];
+        } else {
+            user += " " + std::string(argv[i]);
+        }
+
+        return 0;
+    }
+
     int parse(int argc, const char ** argv) {
         bool options_parsing = true;
         for (int i = 1, positional_args_i = 0; i < argc; ++i) {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
-                use_jinja = true;
-            } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0){
-                if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
-                    return 1;
-                }
-                use_jinja = true;
-            } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
-                help = true;
-                return 0;
-            } else if (options_parsing && strcmp(argv[i], "--") == 0) {
-                options_parsing = false;
-            } else if (positional_args_i == 0) {
-                if (!argv[i][0] || argv[i][0] == '-') {
-                    return 1;
-                }
-
-                ++positional_args_i;
-                model_ = argv[i];
-            } else if (positional_args_i == 1) {
-                ++positional_args_i;
-                user = argv[i];
-            } else {
-                user += " " + std::string(argv[i]);
+            int ret = parse_options_with_value(argc, argv, i, options_parsing);
+            if (ret == 0) {
+                continue;
+            } else if (ret == 1) {
+                return ret;
+            }
+
+            ret = parse_options(argv, i, options_parsing);
+            if (ret == 0) {
+                continue;
+            } else if (ret == 1) {
+                return ret;
+            }
+
+            if (parse_positional_args(argv, i, positional_args_i)) {
+                return 1;
             }
         }

-        if (model_.empty()){
+        if (model_.empty()) {
             return 1;
         }

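Note on the refactor above: the body of the old parse loop is split into three helpers that share a small return-code convention (0 = argument consumed, 1 = hard error, 2 = not handled here, fall through to the next stage). A minimal standalone sketch of that convention, with illustrative names rather than the actual run.cpp code:

    #include <cstring>

    // 0 = consumed, 1 = hard error (e.g. missing value), 2 = not mine, fall through
    static int parse_stage_demo(int argc, const char ** argv, int & i) {
        if (std::strcmp(argv[i], "--temp") == 0) {
            if (i + 1 >= argc) {
                return 1;   // hard error: option without a value
            }
            ++i;            // the option's value is consumed too
            return 0;
        }
        return 2;           // let the next stage (flags, then positionals) look at argv[i]
    }

    static int parse_demo(int argc, const char ** argv) {
        for (int i = 1; i < argc; ++i) {
            const int ret = parse_stage_demo(argc, argv, i);
            if (ret == 0) { continue; }
            if (ret == 1) { return 1; }
            // ret == 2: argv[i] would be handled as a positional argument here
        }
        return 0;
    }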
package/src/llama.cpp/examples/run/run.cpp

@@ -232,6 +275,8 @@ class Opt {
            " Number of GPU layers (default: %d)\n"
            " --temp <value>\n"
            " Temperature (default: %.1f)\n"
+           " -t, --threads <value>\n"
+           " Number of threads to use during generation (default: %d)\n"
            " -v, --verbose, --log-verbose\n"
            " Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
            " -h, --help\n"
@@ -260,7 +305,7 @@ class Opt {
            " llama-run file://some-file3.gguf\n"
            " llama-run --ngl 999 some-file4.gguf\n"
            " llama-run --ngl 999 some-file5.gguf Hello World\n",
-           context_size_default, ngl_default, temperature_default);
+           context_size_default, ngl_default, temperature_default, n_threads_default);
     }
 };

@@ -891,7 +936,7 @@ static int apply_chat_template(const struct common_chat_templates * tmpls, Llama
 // Function to tokenize the prompt
 static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
                            std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
-    const bool is_first =
+    const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0;

     const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
     prompt_tokens.resize(n_prompt_tokens);
@@ -907,7 +952,7 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt
 // Check if we have enough space in the context to evaluate this batch
 static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
     const int n_ctx = llama_n_ctx(ctx.get());
-    const int n_ctx_used =
+    const int n_ctx_used = llama_kv_self_used_cells(ctx.get());
     if (n_ctx_used + batch.n_tokens > n_ctx) {
         printf(LOG_COL_DEFAULT "\n");
         printe("context size exceeded\n");
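The two one-line changes above are part of a wider API move in this release: the examples now query the context's own KV cache through the new llama_kv_self_* functions (see the include/llama.h changes, +86 -22, in the file list). The pattern both hunks rely on, as a sketch against that API:

    #include "llama.h"

    // add BOS/special handling only on the first decode, i.e. when the KV cache is empty
    static bool kv_is_empty(llama_context * ctx) {
        return llama_kv_self_used_cells(ctx) == 0;
    }

    // refuse to decode a batch that would overflow the context window
    static bool have_room(llama_context * ctx, const llama_batch & batch) {
        const int n_ctx      = llama_n_ctx(ctx);
        const int n_ctx_used = llama_kv_self_used_cells(ctx);
        return n_ctx_used + batch.n_tokens <= n_ctx;
    }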
package/src/llama.cpp/examples/save-load-state/save-load-state.cpp

@@ -15,7 +15,7 @@ int main(int argc, char ** argv) {
         return 1;
     }

-
+    common_init();

     if (params.n_predict < 0) {
         params.n_predict = 16;
@@ -196,7 +196,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);

     // erase whole kv
-
+    llama_kv_self_clear(ctx3);
     fprintf(stderr, "%s : kv cache cleared\n", __func__);

     // restore kv into seq 1
package/src/llama.cpp/examples/server/server.cpp

@@ -1872,6 +1872,10 @@ struct server_context {
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel = 1;

+            // force F16 KV cache for the draft model for extra performance
+            params_dft.cache_type_k = GGML_TYPE_F16;
+            params_dft.cache_type_v = GGML_TYPE_F16;
+
             llama_init_dft = common_init_from_params(params_dft);

             model_dft = llama_init_dft.model.get();
@@ -1892,10 +1896,6 @@ struct server_context {
             cparams_dft = common_context_params_to_llama(params_dft);
             cparams_dft.n_batch = n_ctx_dft;

-            // force F16 KV cache for the draft model for extra performance
-            cparams_dft.type_k = GGML_TYPE_F16;
-            cparams_dft.type_v = GGML_TYPE_F16;
-
             // the context is not needed - we will create one for each slot
             llama_init_dft.context.reset();
         }
@@ -2040,6 +2040,18 @@ struct server_context {
         return ret;
     }

+    bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+        for (const auto & token : tokens) {
+            if (token < 0 || token >= n_vocab) {
+                return false;
+            }
+        }
+        return true;
+    }
+
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
         slot.reset();
         slot.id_task = task.id;
@@ -2054,6 +2066,11 @@ struct server_context {
             slot.lora = task.params.lora;
         }

+        bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
+        if (!can_detokenize) {
+            send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
+            return false;
+        }
         SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());

         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
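The new can_be_detokenized guard rejects prompts that arrive as raw token-id arrays containing ids outside the model's vocabulary, presumably turning what could otherwise become an out-of-range access during detokenization into a clean ERROR_TYPE_INVALID_REQUEST. A standalone version of the same check (identical logic to the hunk, as a free function):

    #include <cstdint>
    #include <vector>
    #include "llama.h"

    static bool tokens_in_vocab(const llama_context * ctx, const std::vector<llama_token> & tokens) {
        const llama_model * model = llama_get_model(ctx);
        const llama_vocab * vocab = llama_model_get_vocab(model);
        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
        for (const llama_token token : tokens) {
            if (token < 0 || token >= n_vocab) {
                return false;   // an id outside [0, n_vocab) cannot be detokenized
            }
        }
        return true;
    }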
package/src/llama.cpp/examples/server/server.cpp

@@ -2096,7 +2113,7 @@ struct server_context {
         SRV_DBG("%s", "clearing KV cache\n");

         // clear the entire KV cache
-
+        llama_kv_self_clear(ctx);
         clean_kv_cache = false;
     }

@@ -2638,8 +2655,8 @@ struct server_context {
                     res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
                     res->t_start = metrics.t_start;

-                    res->kv_cache_tokens_count =
-                    res->kv_cache_used_cells =
+                    res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
+                    res->kv_cache_used_cells = llama_kv_self_used_cells(ctx);

                     res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
                     res->t_prompt_processing_total = metrics.t_prompt_processing_total;
@@ -2755,7 +2772,7 @@ struct server_context {

                     // Erase token cache
                     const size_t n_erased = slot->cache_tokens.size();
-
+                    llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
                     slot->cache_tokens.clear();

                     auto res = std::make_unique<server_task_result_slot_erase>();
@@ -2823,8 +2840,8 @@ struct server_context {

                 SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);

-
-
+                llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
+                llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);

                 if (slot.params.cache_prompt) {
                     for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -3015,8 +3032,8 @@ struct server_context {

                             const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;

-
-
+                            llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
+                            llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);

                             for (size_t i = 0; i < n_match; i++) {
                                 slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -3054,9 +3071,9 @@ struct server_context {
                     }

                     // keep only the common part
-                    if (!
+                    if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                         // could not partially delete (likely using a non-Transformer model)
-
+                        llama_kv_self_seq_rm(ctx, slot.id, -1, -1);

                         // there is no common part left
                         slot.n_past = 0;
@@ -3296,7 +3313,7 @@ struct server_context {
                     slot.cache_tokens.push_back(id);
                     slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);

-
+                    llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);

                     for (size_t i = 0; i < ids.size(); ++i) {
                         completion_token_output result;
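Several hunks above replace lost calls with the new KV-cache sequence operations. The context-shift pair is the least obvious: seq_rm deletes the position range [n_keep, n_keep + n_discard) and seq_add then shifts every position behind it left by n_discard, leaving a contiguous sequence again. A toy model of that position arithmetic (plain integers, no llama state):

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_keep = 2, n_discard = 3, n_past = 8;

        std::vector<int> pos;   // positions currently stored for the sequence
        for (int p = 0; p < n_past; ++p) {
            pos.push_back(p);
        }

        // llama_kv_self_seq_rm (ctx, id, n_keep, n_keep + n_discard)
        pos.erase(pos.begin() + n_keep, pos.begin() + n_keep + n_discard);

        // llama_kv_self_seq_add(ctx, id, n_keep + n_discard, n_past, -n_discard)
        for (int & p : pos) {
            if (p >= n_keep + n_discard) {
                p -= n_discard;
            }
        }

        for (int p : pos) {
            printf("%d ", p);   // prints: 0 1 2 3 4
        }
        printf("\n");
        return 0;
    }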
package/src/llama.cpp/examples/server/utils.hpp

@@ -621,7 +621,9 @@ static json oaicompat_completion_params_parse(

     llama_params["chat_format"] = static_cast<int>(chat_params.format);
     llama_params["prompt"] = chat_params.prompt;
-
+    if (!chat_params.grammar.empty()) {
+        llama_params["grammar"] = chat_params.grammar;
+    }
     llama_params["grammar_lazy"] = chat_params.grammar_lazy;
     auto grammar_triggers = json::array();
     for (const auto & trigger : chat_params.grammar_triggers) {
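Small but deliberate: the "grammar" key is now set only when the chat template actually produced a grammar, so downstream consumers of the JSON see the field as absent rather than as an empty string. A sketch of the shape, using nlohmann::json as utils.hpp already does:

    #include <string>
    #include <nlohmann/json.hpp>
    using json = nlohmann::json;

    static json make_params(const std::string & grammar) {
        json llama_params;
        if (!grammar.empty()) {
            llama_params["grammar"] = grammar;  // present only when non-empty
        }
        return llama_params;                    // otherwise the key is absent
    }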
package/src/llama.cpp/examples/simple-chat/simple-chat.cpp

@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
     auto generate = [&](const std::string & prompt) {
         std::string response;

-        const bool is_first =
+        const bool is_first = llama_kv_self_used_cells(ctx) == 0;

         // tokenize the prompt
         const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
         while (true) {
             // check if we have enough space in the context to evaluate this batch
             int n_ctx = llama_n_ctx(ctx);
-            int n_ctx_used =
+            int n_ctx_used = llama_kv_self_used_cells(ctx);
             if (n_ctx_used + batch.n_tokens > n_ctx) {
                 printf("\033[0m\n");
                 fprintf(stderr, "context size exceeded\n");
package/src/llama.cpp/examples/speculative/speculative.cpp

@@ -331,11 +331,11 @@ int main(int argc, char ** argv) {
                     }

                     active_seqs.erase(s);
-                    for(int i = 0; i < n_seq_dft; i++) {
+                    for (int i = 0; i < n_seq_dft; i++) {
                         if (i == s) {
                             continue;
                         }
-                        if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
+                        if (drafts[i].active && drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
                             // synchronize active status for sequences with the same drafted token
                             drafts[i].active = drafts[i].active && accept;
                             if (!drafts[i].active) {
@@ -420,14 +420,14 @@ int main(int argc, char ** argv) {
         {
             LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);

-
-
-
+            llama_kv_self_seq_keep(ctx_dft, s_keep);
+            llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1);
+            llama_kv_self_seq_keep(ctx_dft, 0);

-
-
-
-
+            llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
+            llama_kv_self_seq_keep(ctx_tgt, s_keep);
+            llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
+            llama_kv_self_seq_keep(ctx_tgt, 0);
         }

         for (int s = 0; s < n_seq_dft; ++s) {
@@ -444,7 +444,7 @@ int main(int argc, char ** argv) {
         common_batch_clear(batch_dft);
         common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);

-
+        llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1);
         // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
         llama_decode(ctx_dft, batch_dft);

@@ -503,8 +503,8 @@ int main(int argc, char ** argv) {
                 if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
                     LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);

-
-
+                    llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1);
+                    llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);

                     // all previous tokens from this branch are now also part of the new branch
                     for (int t = 0; t < batch_tgt.n_tokens; ++t) {
@@ -585,9 +585,9 @@ int main(int argc, char ** argv) {

         // evaluate the target model on the drafted tokens
         {
-
+            llama_kv_self_seq_keep(ctx_tgt, 0);
             for (int s = 1; s < n_seq_dft; ++s) {
-
+                llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1);
             }

             // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
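The first hunk in this file is a bug fix, not a rename: the added drafts[i].active check means an already-inactive draft branch is never compared (or re-synchronized) on a token at position i_dft, which it may never have drafted. A constructed illustration of the guarded comparison, with llama_token replaced by int32_t to keep it standalone:

    #include <cstdint>
    #include <cstddef>
    #include <vector>

    struct draft_seq {
        bool active = false;
        std::vector<int32_t> tokens;   // tokens drafted so far by this branch
    };

    static void sync_active(std::vector<draft_seq> & drafts, size_t s, size_t i_dft, bool accept) {
        for (size_t i = 0; i < drafts.size(); ++i) {
            if (i == s) {
                continue;
            }
            // without the active check, drafts[i].tokens[i_dft] could index into a
            // branch that stopped drafting before position i_dft
            if (drafts[i].active && drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
                drafts[i].active = drafts[i].active && accept;
            }
        }
    }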
package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp

@@ -217,7 +217,7 @@ int main(int argc, char ** argv) {
         {
             LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);

-
+            llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1);
         }

         if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
package/src/llama.cpp/examples/tts/tts.cpp

@@ -87,11 +87,11 @@ struct wav_header {
     uint32_t data_size;
 };

-static
+static bool save_wav16(const std::string & fname, const std::vector<float> & data, int sample_rate) {
     std::ofstream file(fname, std::ios::binary);
     if (!file) {
-        LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str());
-        return;
+        LOG_ERR("%s: Failed to open file '%s' for writing.\n", __func__, fname.c_str());
+        return false;
     }

     wav_header header;
@@ -108,7 +108,7 @@ static void save_wav16(const std::string & fname, const std::vector<float> & dat
         file.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(pcm_sample));
     }

-    file.
+    return file.good();
 }

 static void fill_hann_window(int length, bool periodic, float * output) {
@@ -536,6 +536,7 @@ static std::string audio_data_from_speaker(json speaker, const outetts_version t
 int main(int argc, char ** argv) {
     common_params params;

+    params.out_file = "output.wav";
     params.prompt = "";

     params.n_predict = 4096;
@@ -1060,8 +1061,6 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
     }
 #endif

-    const std::string fname = "output.wav";
-
     const int n_sr = 24000; // sampling rate

     // zero out first 0.25 seconds
@@ -1072,11 +1071,15 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
     LOG_INF("%s: time for spectral ops: %.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f);
     LOG_INF("%s: total time: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);

-
+    int retval = 0;

-
+    if (save_wav16(params.out_file, audio, n_sr)) {
+        LOG_INF("%s: audio written to file '%s'\n", __func__, params.out_file.c_str());
+    } else {
+        retval = ENOENT;
+    }

     llama_backend_free();

-    return
+    return retval;
 }
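save_wav16 now reports success instead of returning void, and main propagates a failure as a nonzero exit code (ENOENT, per the hunk). For reference, a self-contained 16-bit PCM mono writer with the same error-reporting shape; this is an independent sketch, not the tts.cpp implementation (which builds its header via the wav_header struct above), and it assumes a little-endian host:

    #include <cstdint>
    #include <fstream>
    #include <string>
    #include <vector>

    static bool write_wav16(const std::string & fname, const std::vector<float> & data, uint32_t sample_rate) {
        std::ofstream f(fname, std::ios::binary);
        if (!f) {
            return false;                                // caller decides how to report this
        }

        const uint32_t data_size   = (uint32_t) (data.size() * sizeof(int16_t));
        const uint16_t channels    = 1;
        const uint16_t bits        = 16;
        const uint32_t byte_rate   = sample_rate * channels * bits / 8;
        const uint16_t block_align = channels * bits / 8;
        const uint32_t fmt_size    = 16;
        const uint16_t pcm         = 1;
        const uint32_t riff_size   = 36 + data_size;     // 44-byte header minus the first 8 bytes

        f.write("RIFF", 4); f.write((const char *) &riff_size, 4); f.write("WAVE", 4);
        f.write("fmt ", 4); f.write((const char *) &fmt_size,  4);
        f.write((const char *) &pcm,         2); f.write((const char *) &channels,  2);
        f.write((const char *) &sample_rate, 4); f.write((const char *) &byte_rate, 4);
        f.write((const char *) &block_align, 2); f.write((const char *) &bits,      2);
        f.write("data", 4); f.write((const char *) &data_size, 4);

        for (const float s : data) {
            const float clamped = s < -1.0f ? -1.0f : (s > 1.0f ? 1.0f : s);
            const int16_t pcm_sample = (int16_t) (clamped * 32767.0f);
            f.write((const char *) &pcm_sample, sizeof(pcm_sample));
        }

        return f.good();                                 // mirrors the patched save_wav16
    }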
package/src/llama.cpp/ggml/CMakeLists.txt

@@ -186,6 +186,7 @@ option(GGML_OPENMP "ggml: use OpenMP"
 option(GGML_RPC "ggml: use RPC" OFF)
 option(GGML_SYCL "ggml: use SYCL" OFF)
 option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
+option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
 set (GGML_SYCL_TARGET "INTEL" CACHE STRING
     "ggml: sycl target device")
 set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
package/src/llama.cpp/ggml/cmake/common.cmake

@@ -0,0 +1,26 @@
+function(ggml_get_flags CCID CCVER)
+    set(C_FLAGS "")
+    set(CXX_FLAGS "")
+
+    if (CCID MATCHES "Clang")
+        set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
+        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
+
+        if (
+            (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
+            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
+        )
+            list(APPEND C_FLAGS -Wdouble-promotion)
+        endif()
+    elseif (CCID STREQUAL "GNU")
+        set(C_FLAGS -Wdouble-promotion)
+        set(CXX_FLAGS -Wno-array-bounds)
+
+        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
+            list(APPEND CXX_FLAGS -Wextra-semi)
+        endif()
+    endif()
+
+    set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
+    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
+endfunction()
package/src/llama.cpp/ggml/include/ggml.h

@@ -454,6 +454,7 @@ extern "C" {
         GGML_OP_RMS_NORM,
         GGML_OP_RMS_NORM_BACK,
         GGML_OP_GROUP_NORM,
+        GGML_OP_L2_NORM,

         GGML_OP_MUL_MAT,
         GGML_OP_MUL_MAT_ID,
@@ -502,6 +503,7 @@ extern "C" {
         GGML_OP_ADD_REL_POS,
         GGML_OP_RWKV_WKV6,
         GGML_OP_GATED_LINEAR_ATTN,
+        GGML_OP_RWKV_WKV7,

         GGML_OP_UNARY,

@@ -1095,6 +1097,18 @@ extern "C" {
             int n_groups,
             float eps);

+    // l2 normalize along rows
+    // used in rwkv v7
+    GGML_API struct ggml_tensor * ggml_l2_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            float eps);
+
+    GGML_API struct ggml_tensor * ggml_l2_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            float eps);
+
     // a - x
     // b - dy
     GGML_API struct ggml_tensor * ggml_rms_norm_back(
@@ -1890,6 +1904,16 @@ extern "C" {
             struct ggml_tensor * state,
             float scale);

+    GGML_API struct ggml_tensor * ggml_rwkv_wkv7(
+            struct ggml_context * ctx,
+            struct ggml_tensor * r,
+            struct ggml_tensor * w,
+            struct ggml_tensor * k,
+            struct ggml_tensor * v,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * state);
+
     // custom operators

     typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
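The header gains GGML_OP_L2_NORM / ggml_l2_norm (per its comment, L2 normalization along rows, used by RWKV v7) plus the fused ggml_rwkv_wkv7 op that the new wkv.cpp SYCL sources and the CPU/Vulkan changes in this release implement. As a reference for what an L2 row normalization computes, a scalar sketch; the epsilon placement here is the conventional one and is an assumption, not copied from ggml's kernels:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // normalize one row to unit Euclidean length: x <- x / max(||x||_2, eps)
    static void l2_norm_row(std::vector<float> & row, float eps) {
        float sumsq = 0.0f;
        for (const float v : row) {
            sumsq += v * v;
        }
        const float scale = 1.0f / std::max(std::sqrt(sumsq), eps);
        for (float & v : row) {
            v *= scale;
        }
    }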
package/src/llama.cpp/ggml/src/CMakeLists.txt

@@ -1,4 +1,5 @@
 include(CheckCXXCompilerFlag)
+include("../cmake/common.cmake")

 add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})

@@ -24,33 +25,6 @@ if (NOT MSVC)
     endif()
 endif()

-function(ggml_get_flags CCID CCVER)
-    set(C_FLAGS "")
-    set(CXX_FLAGS "")
-
-    if (CCID MATCHES "Clang")
-        set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
-        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
-
-        if (
-            (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
-            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
-        )
-            list(APPEND C_FLAGS -Wdouble-promotion)
-        endif()
-    elseif (CCID STREQUAL "GNU")
-        set(C_FLAGS -Wdouble-promotion)
-        set(CXX_FLAGS -Wno-array-bounds)
-
-        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
-            list(APPEND CXX_FLAGS -Wextra-semi)
-        endif()
-    endif()
-
-    set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
-    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
-endfunction()
-
 if (GGML_FATAL_WARNINGS)
     if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
         list(APPEND C_FLAGS -Werror)
@@ -351,6 +325,10 @@ if (CMAKE_SYSTEM_NAME MATCHES "Android")
     target_link_libraries(ggml-base PRIVATE dl)
 endif()

+if(CMAKE_SYSTEM_NAME MATCHES "visionOS")
+    target_compile_definitions(ggml-base PUBLIC _DARWIN_C_SOURCE)
+endif()
+
 if (BUILD_SHARED_LIBS)
     foreach (target ggml-base ggml)
         set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON)