@fugood/llama.node 0.3.17 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +39 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +366 -19
- package/src/LlamaCompletionWorker.h +30 -10
- package/src/LlamaContext.cpp +213 -5
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
- package/src/llama.cpp/.github/workflows/build.yml +41 -762
- package/src/llama.cpp/.github/workflows/docker.yml +5 -2
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +12 -12
- package/src/llama.cpp/CMakeLists.txt +5 -17
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +31 -3
- package/src/llama.cpp/common/arg.cpp +48 -29
- package/src/llama.cpp/common/chat.cpp +128 -106
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +37 -1
- package/src/llama.cpp/common/common.h +18 -9
- package/src/llama.cpp/common/llguidance.cpp +1 -0
- package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/src/llama.cpp/common/minja/minja.hpp +69 -36
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +57 -50
- package/src/llama.cpp/examples/CMakeLists.txt +2 -23
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml.h +10 -7
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
- package/src/llama.cpp/ggml/src/ggml.c +29 -20
- package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/src/llama.cpp/include/llama.h +52 -11
- package/src/llama.cpp/requirements/requirements-all.txt +3 -3
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +3 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +17 -7
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +389 -501
- package/src/llama.cpp/src/llama-context.h +44 -32
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +20 -38
- package/src/llama.cpp/src/llama-graph.h +12 -8
- package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
- package/src/llama.cpp/src/llama-kv-cache.h +271 -85
- package/src/llama.cpp/src/llama-memory.h +11 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +316 -69
- package/src/llama.cpp/src/llama-model.h +8 -1
- package/src/llama.cpp/src/llama-quant.cpp +15 -13
- package/src/llama.cpp/src/llama-sampling.cpp +18 -6
- package/src/llama.cpp/src/llama-vocab.cpp +42 -4
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +14 -0
- package/src/llama.cpp/tests/CMakeLists.txt +10 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
- package/src/llama.cpp/tests/test-chat.cpp +3 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
- package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
- package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
- package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
- package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.h +0 -135
- package/src/llama.cpp/examples/llava/llava.cpp +0 -586
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/mtmd.h +0 -168
- package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
|
@@ -195,6 +195,46 @@ static std::string pair_str(const std::pair<int, int> & p) {
|
|
|
195
195
|
return buf;
|
|
196
196
|
}
|
|
197
197
|
|
|
198
|
+
// Parses a comma-separated list of non-negative integers and integer ranges.
//
// Each item has the form  first[-last[(+|*)step]] :
//   "8"       -> 8
//   "1-4"     -> 1, 2, 3, 4        (default operator '+', default step 1)
//   "0-10+5"  -> 0, 5, 10
//   "1-16*2"  -> 1, 2, 4, 8, 16
// Items are expanded left to right and appended to the result in that order.
// An item whose first value is greater than its last value contributes nothing.
// An empty string yields an empty vector.
//
// Throws std::invalid_argument("invalid range format") when some part of the
// input does not match the item grammar, and std::invalid_argument("invalid range")
// when a step does not make progress (e.g. "+0", "*1", or "0-...*n").
// std::stoi may additionally throw std::out_of_range for numbers that do not fit in int.
static std::vector<int> parse_int_range(const std::string & s) {
    // first[-last[(+|*)step]]
    // The operator class is exactly '+' or '*'; a literal '|' must not be accepted.
    const std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([+*])(\d+))?)?(?:,|$))");

    std::smatch                 match;
    std::string::const_iterator search_start(s.cbegin());
    std::vector<int>            result;

    while (std::regex_search(search_start, s.cend(), match, range_regex)) {
        const int  first = std::stoi(match[1]);
        const int  last  = match[2].matched ? std::stoi(match[2]) : first;
        const char op    = match[3].matched ? match[3].str()[0] : '+';
        const int  step  = match[4].matched ? std::stoi(match[4]) : 1;

        for (int i = first; i <= last;) {
            result.push_back(i);

            // Compute the successor in 64 bits: i and step are non-negative ints,
            // so neither i + step nor i * step can overflow a long long.
            long long next = i;
            if (op == '+') {
                next += step;
            } else if (op == '*') {
                next *= step;
            } else {
                throw std::invalid_argument("invalid range format");
            }

            // The sequence must be strictly increasing, otherwise it would never end.
            if (next <= i) {
                throw std::invalid_argument("invalid range");
            }

            // Past the end of the range: stop before narrowing back to int.
            if (next > last) {
                break;
            }
            i = static_cast<int>(next);
        }

        // Continue right after the consumed item (and its trailing comma, if any).
        search_start = match.suffix().first;
    }

    // Anything left over did not match the item grammar.
    if (search_start != s.cend()) {
        throw std::invalid_argument("invalid range format");
    }

    return result;
}
|
|
237
|
+
|
|
198
238
|
struct cmd_params {
|
|
199
239
|
std::vector<std::string> model;
|
|
200
240
|
std::vector<int> n_prompt;
|
|
@@ -205,6 +245,7 @@ struct cmd_params {
|
|
|
205
245
|
std::vector<int> n_ubatch;
|
|
206
246
|
std::vector<ggml_type> type_k;
|
|
207
247
|
std::vector<ggml_type> type_v;
|
|
248
|
+
std::vector<float> defrag_thold;
|
|
208
249
|
std::vector<int> n_threads;
|
|
209
250
|
std::vector<std::string> cpu_mask;
|
|
210
251
|
std::vector<bool> cpu_strict;
|
|
@@ -219,6 +260,7 @@ struct cmd_params {
|
|
|
219
260
|
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
|
|
220
261
|
std::vector<bool> use_mmap;
|
|
221
262
|
std::vector<bool> embeddings;
|
|
263
|
+
std::vector<bool> no_op_offload;
|
|
222
264
|
ggml_numa_strategy numa;
|
|
223
265
|
int reps;
|
|
224
266
|
ggml_sched_priority prio;
|
|
@@ -239,6 +281,7 @@ static const cmd_params cmd_params_defaults = {
|
|
|
239
281
|
/* n_ubatch */ { 512 },
|
|
240
282
|
/* type_k */ { GGML_TYPE_F16 },
|
|
241
283
|
/* type_v */ { GGML_TYPE_F16 },
|
|
284
|
+
/* defrag_thold */ { -1.0f },
|
|
242
285
|
/* n_threads */ { cpu_get_num_math() },
|
|
243
286
|
/* cpu_mask */ { "0x0" },
|
|
244
287
|
/* cpu_strict */ { false },
|
|
@@ -250,9 +293,10 @@ static const cmd_params cmd_params_defaults = {
|
|
|
250
293
|
/* no_kv_offload */ { false },
|
|
251
294
|
/* flash_attn */ { false },
|
|
252
295
|
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
|
|
253
|
-
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{{nullptr,nullptr}} },
|
|
296
|
+
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
|
|
254
297
|
/* use_mmap */ { true },
|
|
255
298
|
/* embeddings */ { false },
|
|
299
|
+
/* no_op_offload */ { false },
|
|
256
300
|
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
|
257
301
|
/* reps */ 5,
|
|
258
302
|
/* prio */ GGML_SCHED_PRIO_NORMAL,
|
|
@@ -268,13 +312,29 @@ static void print_usage(int /* argc */, char ** argv) {
|
|
|
268
312
|
printf("\n");
|
|
269
313
|
printf("options:\n");
|
|
270
314
|
printf(" -h, --help\n");
|
|
315
|
+
printf(" --numa <distribute|isolate|numactl> numa mode (default: disabled)\n");
|
|
316
|
+
printf(" -r, --repetitions <n> number of times to repeat each test (default: %d)\n",
|
|
317
|
+
cmd_params_defaults.reps);
|
|
318
|
+
printf(" --prio <0|1|2|3> process/thread priority (default: %d)\n",
|
|
319
|
+
cmd_params_defaults.prio);
|
|
320
|
+
printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n",
|
|
321
|
+
cmd_params_defaults.delay);
|
|
322
|
+
printf(" -o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: %s)\n",
|
|
323
|
+
output_format_str(cmd_params_defaults.output_format));
|
|
324
|
+
printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
|
|
325
|
+
output_format_str(cmd_params_defaults.output_format_stderr));
|
|
326
|
+
printf(" -v, --verbose verbose output\n");
|
|
327
|
+
printf(" --progress print test progress indicators\n");
|
|
328
|
+
printf("\n");
|
|
329
|
+
printf("test parameters:\n");
|
|
271
330
|
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
|
272
331
|
printf(" -p, --n-prompt <n> (default: %s)\n",
|
|
273
332
|
join(cmd_params_defaults.n_prompt, ",").c_str());
|
|
274
333
|
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
|
275
334
|
printf(" -pg <pp,tg> (default: %s)\n",
|
|
276
335
|
join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
|
|
277
|
-
printf(" -d, --n-depth <n> (default: %s)\n",
|
|
336
|
+
printf(" -d, --n-depth <n> (default: %s)\n",
|
|
337
|
+
join(cmd_params_defaults.n_depth, ",").c_str());
|
|
278
338
|
printf(" -b, --batch-size <n> (default: %s)\n",
|
|
279
339
|
join(cmd_params_defaults.n_batch, ",").c_str());
|
|
280
340
|
printf(" -ub, --ubatch-size <n> (default: %s)\n",
|
|
@@ -283,6 +343,8 @@ static void print_usage(int /* argc */, char ** argv) {
|
|
|
283
343
|
join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
|
284
344
|
printf(" -ctv, --cache-type-v <t> (default: %s)\n",
|
|
285
345
|
join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
|
346
|
+
printf(" -dt, --defrag-thold <f> (default: %s)\n",
|
|
347
|
+
join(cmd_params_defaults.defrag_thold, ",").c_str());
|
|
286
348
|
printf(" -t, --threads <n> (default: %s)\n",
|
|
287
349
|
join(cmd_params_defaults.n_threads, ",").c_str());
|
|
288
350
|
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
|
|
@@ -306,24 +368,17 @@ static void print_usage(int /* argc */, char ** argv) {
|
|
|
306
368
|
join(cmd_params_defaults.flash_attn, ",").c_str());
|
|
307
369
|
printf(" -mmp, --mmap <0|1> (default: %s)\n",
|
|
308
370
|
join(cmd_params_defaults.use_mmap, ",").c_str());
|
|
309
|
-
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
|
310
371
|
printf(" -embd, --embeddings <0|1> (default: %s)\n",
|
|
311
372
|
join(cmd_params_defaults.embeddings, ",").c_str());
|
|
312
373
|
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
|
313
|
-
printf(" -ot --override-tensors <tensor name pattern>=<buffer type
|
|
314
|
-
printf("
|
|
315
|
-
printf(" --
|
|
316
|
-
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
|
|
317
|
-
printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n",
|
|
318
|
-
output_format_str(cmd_params_defaults.output_format));
|
|
319
|
-
printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n",
|
|
320
|
-
output_format_str(cmd_params_defaults.output_format_stderr));
|
|
321
|
-
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
|
322
|
-
printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
|
|
374
|
+
printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;...\n");
|
|
375
|
+
printf(" (default: disabled)\n");
|
|
376
|
+
printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
|
|
323
377
|
printf("\n");
|
|
324
378
|
printf(
|
|
325
|
-
"Multiple values can be given for each parameter by separating them with ','
|
|
326
|
-
"multiple times
|
|
379
|
+
"Multiple values can be given for each parameter by separating them with ','\n"
|
|
380
|
+
"or by specifying the parameter multiple times. Ranges can be given as\n"
|
|
381
|
+
"'first-last' or 'first-last+step' or 'first-last*mult'.\n");
|
|
327
382
|
}
|
|
328
383
|
|
|
329
384
|
static ggml_type ggml_type_from_name(const std::string & s) {
|
|
@@ -377,186 +432,197 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
377
432
|
std::replace(arg.begin(), arg.end(), '_', '-');
|
|
378
433
|
}
|
|
379
434
|
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
if (
|
|
385
|
-
|
|
386
|
-
break;
|
|
387
|
-
}
|
|
388
|
-
auto p = string_split<std::string>(argv[i], split_delim);
|
|
389
|
-
params.model.insert(params.model.end(), p.begin(), p.end());
|
|
390
|
-
} else if (arg == "-p" || arg == "--n-prompt") {
|
|
391
|
-
if (++i >= argc) {
|
|
392
|
-
invalid_param = true;
|
|
393
|
-
break;
|
|
394
|
-
}
|
|
395
|
-
auto p = string_split<int>(argv[i], split_delim);
|
|
396
|
-
params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
|
|
397
|
-
} else if (arg == "-n" || arg == "--n-gen") {
|
|
398
|
-
if (++i >= argc) {
|
|
399
|
-
invalid_param = true;
|
|
400
|
-
break;
|
|
401
|
-
}
|
|
402
|
-
auto p = string_split<int>(argv[i], split_delim);
|
|
403
|
-
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
|
|
404
|
-
} else if (arg == "-pg") {
|
|
405
|
-
if (++i >= argc) {
|
|
406
|
-
invalid_param = true;
|
|
407
|
-
break;
|
|
408
|
-
}
|
|
409
|
-
auto p = string_split<std::string>(argv[i], ',');
|
|
410
|
-
if (p.size() != 2) {
|
|
411
|
-
invalid_param = true;
|
|
412
|
-
break;
|
|
413
|
-
}
|
|
414
|
-
params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
|
|
415
|
-
} else if (arg == "-d" || arg == "--n-depth") {
|
|
416
|
-
if (++i >= argc) {
|
|
417
|
-
invalid_param = true;
|
|
418
|
-
break;
|
|
419
|
-
}
|
|
420
|
-
auto p = string_split<int>(argv[i], split_delim);
|
|
421
|
-
params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
|
|
422
|
-
} else if (arg == "-b" || arg == "--batch-size") {
|
|
423
|
-
if (++i >= argc) {
|
|
424
|
-
invalid_param = true;
|
|
425
|
-
break;
|
|
426
|
-
}
|
|
427
|
-
auto p = string_split<int>(argv[i], split_delim);
|
|
428
|
-
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
|
|
429
|
-
} else if (arg == "-ub" || arg == "--ubatch-size") {
|
|
430
|
-
if (++i >= argc) {
|
|
431
|
-
invalid_param = true;
|
|
432
|
-
break;
|
|
433
|
-
}
|
|
434
|
-
auto p = string_split<int>(argv[i], split_delim);
|
|
435
|
-
params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
|
|
436
|
-
} else if (arg == "-ctk" || arg == "--cache-type-k") {
|
|
437
|
-
if (++i >= argc) {
|
|
438
|
-
invalid_param = true;
|
|
439
|
-
break;
|
|
440
|
-
}
|
|
441
|
-
auto p = string_split<std::string>(argv[i], split_delim);
|
|
442
|
-
std::vector<ggml_type> types;
|
|
443
|
-
for (const auto & t : p) {
|
|
444
|
-
ggml_type gt = ggml_type_from_name(t);
|
|
445
|
-
if (gt == GGML_TYPE_COUNT) {
|
|
435
|
+
try {
|
|
436
|
+
if (arg == "-h" || arg == "--help") {
|
|
437
|
+
print_usage(argc, argv);
|
|
438
|
+
exit(0);
|
|
439
|
+
} else if (arg == "-m" || arg == "--model") {
|
|
440
|
+
if (++i >= argc) {
|
|
446
441
|
invalid_param = true;
|
|
447
442
|
break;
|
|
448
443
|
}
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
if (
|
|
452
|
-
|
|
453
|
-
}
|
|
454
|
-
params.type_k.insert(params.type_k.end(), types.begin(), types.end());
|
|
455
|
-
} else if (arg == "-ctv" || arg == "--cache-type-v") {
|
|
456
|
-
if (++i >= argc) {
|
|
457
|
-
invalid_param = true;
|
|
458
|
-
break;
|
|
459
|
-
}
|
|
460
|
-
auto p = string_split<std::string>(argv[i], split_delim);
|
|
461
|
-
std::vector<ggml_type> types;
|
|
462
|
-
for (const auto & t : p) {
|
|
463
|
-
ggml_type gt = ggml_type_from_name(t);
|
|
464
|
-
if (gt == GGML_TYPE_COUNT) {
|
|
444
|
+
auto p = string_split<std::string>(argv[i], split_delim);
|
|
445
|
+
params.model.insert(params.model.end(), p.begin(), p.end());
|
|
446
|
+
} else if (arg == "-p" || arg == "--n-prompt") {
|
|
447
|
+
if (++i >= argc) {
|
|
465
448
|
invalid_param = true;
|
|
466
449
|
break;
|
|
467
450
|
}
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
if (
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
451
|
+
auto p = parse_int_range(argv[i]);
|
|
452
|
+
params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
|
|
453
|
+
} else if (arg == "-n" || arg == "--n-gen") {
|
|
454
|
+
if (++i >= argc) {
|
|
455
|
+
invalid_param = true;
|
|
456
|
+
break;
|
|
457
|
+
}
|
|
458
|
+
auto p = parse_int_range(argv[i]);
|
|
459
|
+
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
|
|
460
|
+
} else if (arg == "-pg") {
|
|
461
|
+
if (++i >= argc) {
|
|
462
|
+
invalid_param = true;
|
|
463
|
+
break;
|
|
464
|
+
}
|
|
465
|
+
auto p = string_split<std::string>(argv[i], ',');
|
|
466
|
+
if (p.size() != 2) {
|
|
467
|
+
invalid_param = true;
|
|
468
|
+
break;
|
|
469
|
+
}
|
|
470
|
+
params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
|
|
471
|
+
} else if (arg == "-d" || arg == "--n-depth") {
|
|
472
|
+
if (++i >= argc) {
|
|
473
|
+
invalid_param = true;
|
|
474
|
+
break;
|
|
475
|
+
}
|
|
476
|
+
auto p = parse_int_range(argv[i]);
|
|
477
|
+
params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
|
|
478
|
+
} else if (arg == "-b" || arg == "--batch-size") {
|
|
479
|
+
if (++i >= argc) {
|
|
480
|
+
invalid_param = true;
|
|
481
|
+
break;
|
|
482
|
+
}
|
|
483
|
+
auto p = parse_int_range(argv[i]);
|
|
484
|
+
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
|
|
485
|
+
} else if (arg == "-ub" || arg == "--ubatch-size") {
|
|
486
|
+
if (++i >= argc) {
|
|
487
|
+
invalid_param = true;
|
|
488
|
+
break;
|
|
489
|
+
}
|
|
490
|
+
auto p = parse_int_range(argv[i]);
|
|
491
|
+
params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
|
|
492
|
+
} else if (arg == "-ctk" || arg == "--cache-type-k") {
|
|
493
|
+
if (++i >= argc) {
|
|
494
|
+
invalid_param = true;
|
|
495
|
+
break;
|
|
496
|
+
}
|
|
497
|
+
auto p = string_split<std::string>(argv[i], split_delim);
|
|
498
|
+
|
|
499
|
+
std::vector<ggml_type> types;
|
|
500
|
+
for (const auto & t : p) {
|
|
501
|
+
ggml_type gt = ggml_type_from_name(t);
|
|
502
|
+
if (gt == GGML_TYPE_COUNT) {
|
|
503
|
+
invalid_param = true;
|
|
504
|
+
break;
|
|
505
|
+
}
|
|
506
|
+
types.push_back(gt);
|
|
507
|
+
}
|
|
508
|
+
if (invalid_param) {
|
|
509
|
+
break;
|
|
510
|
+
}
|
|
511
|
+
params.type_k.insert(params.type_k.end(), types.begin(), types.end());
|
|
512
|
+
} else if (arg == "-ctv" || arg == "--cache-type-v") {
|
|
513
|
+
if (++i >= argc) {
|
|
514
|
+
invalid_param = true;
|
|
515
|
+
break;
|
|
516
|
+
}
|
|
517
|
+
auto p = string_split<std::string>(argv[i], split_delim);
|
|
518
|
+
|
|
519
|
+
std::vector<ggml_type> types;
|
|
520
|
+
for (const auto & t : p) {
|
|
521
|
+
ggml_type gt = ggml_type_from_name(t);
|
|
522
|
+
if (gt == GGML_TYPE_COUNT) {
|
|
523
|
+
invalid_param = true;
|
|
524
|
+
break;
|
|
525
|
+
}
|
|
526
|
+
types.push_back(gt);
|
|
527
|
+
}
|
|
528
|
+
if (invalid_param) {
|
|
529
|
+
break;
|
|
530
|
+
}
|
|
531
|
+
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
|
|
532
|
+
} else if (arg == "-dt" || arg == "--defrag-thold") {
|
|
533
|
+
if (++i >= argc) {
|
|
534
|
+
invalid_param = true;
|
|
535
|
+
break;
|
|
536
|
+
}
|
|
537
|
+
auto p = string_split<float>(argv[i], split_delim);
|
|
538
|
+
params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end());
|
|
539
|
+
} else if (arg == "-t" || arg == "--threads") {
|
|
540
|
+
if (++i >= argc) {
|
|
541
|
+
invalid_param = true;
|
|
542
|
+
break;
|
|
543
|
+
}
|
|
544
|
+
auto p = parse_int_range(argv[i]);
|
|
545
|
+
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
|
|
546
|
+
} else if (arg == "-C" || arg == "--cpu-mask") {
|
|
547
|
+
if (++i >= argc) {
|
|
548
|
+
invalid_param = true;
|
|
549
|
+
break;
|
|
550
|
+
}
|
|
551
|
+
auto p = string_split<std::string>(argv[i], split_delim);
|
|
552
|
+
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
|
|
553
|
+
} else if (arg == "--cpu-strict") {
|
|
554
|
+
if (++i >= argc) {
|
|
555
|
+
invalid_param = true;
|
|
556
|
+
break;
|
|
557
|
+
}
|
|
558
|
+
auto p = string_split<bool>(argv[i], split_delim);
|
|
559
|
+
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
|
|
560
|
+
} else if (arg == "--poll") {
|
|
561
|
+
if (++i >= argc) {
|
|
562
|
+
invalid_param = true;
|
|
563
|
+
break;
|
|
564
|
+
}
|
|
565
|
+
auto p = parse_int_range(argv[i]);
|
|
566
|
+
params.poll.insert(params.poll.end(), p.begin(), p.end());
|
|
567
|
+
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
|
|
568
|
+
if (++i >= argc) {
|
|
569
|
+
invalid_param = true;
|
|
570
|
+
break;
|
|
571
|
+
}
|
|
572
|
+
auto p = parse_int_range(argv[i]);
|
|
573
|
+
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
|
|
574
|
+
} else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
|
|
575
|
+
if (++i >= argc) {
|
|
576
|
+
invalid_param = true;
|
|
577
|
+
break;
|
|
578
|
+
}
|
|
579
|
+
params.rpc_servers.push_back(argv[i]);
|
|
580
|
+
} else if (arg == "-sm" || arg == "--split-mode") {
|
|
581
|
+
if (++i >= argc) {
|
|
582
|
+
invalid_param = true;
|
|
583
|
+
break;
|
|
584
|
+
}
|
|
585
|
+
auto p = string_split<std::string>(argv[i], split_delim);
|
|
586
|
+
|
|
587
|
+
std::vector<llama_split_mode> modes;
|
|
588
|
+
for (const auto & m : p) {
|
|
589
|
+
llama_split_mode mode;
|
|
590
|
+
if (m == "none") {
|
|
591
|
+
mode = LLAMA_SPLIT_MODE_NONE;
|
|
592
|
+
} else if (m == "layer") {
|
|
593
|
+
mode = LLAMA_SPLIT_MODE_LAYER;
|
|
594
|
+
} else if (m == "row") {
|
|
595
|
+
mode = LLAMA_SPLIT_MODE_ROW;
|
|
596
|
+
} else {
|
|
597
|
+
invalid_param = true;
|
|
598
|
+
break;
|
|
599
|
+
}
|
|
600
|
+
modes.push_back(mode);
|
|
601
|
+
}
|
|
602
|
+
if (invalid_param) {
|
|
603
|
+
break;
|
|
604
|
+
}
|
|
605
|
+
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
|
|
606
|
+
} else if (arg == "-mg" || arg == "--main-gpu") {
|
|
607
|
+
if (++i >= argc) {
|
|
608
|
+
invalid_param = true;
|
|
609
|
+
break;
|
|
610
|
+
}
|
|
611
|
+
params.main_gpu = parse_int_range(argv[i]);
|
|
612
|
+
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
|
613
|
+
if (++i >= argc) {
|
|
614
|
+
invalid_param = true;
|
|
615
|
+
break;
|
|
616
|
+
}
|
|
617
|
+
auto p = string_split<bool>(argv[i], split_delim);
|
|
618
|
+
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
|
|
619
|
+
} else if (arg == "--numa") {
|
|
620
|
+
if (++i >= argc) {
|
|
531
621
|
invalid_param = true;
|
|
532
622
|
break;
|
|
533
623
|
}
|
|
534
|
-
modes.push_back(mode);
|
|
535
|
-
}
|
|
536
|
-
if (invalid_param) {
|
|
537
|
-
break;
|
|
538
|
-
}
|
|
539
|
-
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
|
|
540
|
-
} else if (arg == "-mg" || arg == "--main-gpu") {
|
|
541
|
-
if (++i >= argc) {
|
|
542
|
-
invalid_param = true;
|
|
543
|
-
break;
|
|
544
|
-
}
|
|
545
|
-
params.main_gpu = string_split<int>(argv[i], split_delim);
|
|
546
|
-
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
|
547
|
-
if (++i >= argc) {
|
|
548
|
-
invalid_param = true;
|
|
549
|
-
break;
|
|
550
|
-
}
|
|
551
|
-
auto p = string_split<bool>(argv[i], split_delim);
|
|
552
|
-
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
|
|
553
|
-
} else if (arg == "--numa") {
|
|
554
|
-
if (++i >= argc) {
|
|
555
|
-
invalid_param = true;
|
|
556
|
-
break;
|
|
557
|
-
} else {
|
|
558
624
|
std::string value(argv[i]);
|
|
559
|
-
|
|
625
|
+
if (value == "distribute" || value == "") {
|
|
560
626
|
params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
|
|
561
627
|
} else if (value == "isolate") {
|
|
562
628
|
params.numa = GGML_NUMA_STRATEGY_ISOLATE;
|
|
@@ -566,170 +632,183 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
566
632
|
invalid_param = true;
|
|
567
633
|
break;
|
|
568
634
|
}
|
|
569
|
-
}
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
635
|
+
} else if (arg == "-fa" || arg == "--flash-attn") {
|
|
636
|
+
if (++i >= argc) {
|
|
637
|
+
invalid_param = true;
|
|
638
|
+
break;
|
|
639
|
+
}
|
|
640
|
+
auto p = string_split<bool>(argv[i], split_delim);
|
|
641
|
+
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
|
|
642
|
+
} else if (arg == "-mmp" || arg == "--mmap") {
|
|
643
|
+
if (++i >= argc) {
|
|
644
|
+
invalid_param = true;
|
|
645
|
+
break;
|
|
646
|
+
}
|
|
647
|
+
auto p = string_split<bool>(argv[i], split_delim);
|
|
648
|
+
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
|
|
649
|
+
} else if (arg == "-embd" || arg == "--embeddings") {
|
|
650
|
+
if (++i >= argc) {
|
|
651
|
+
invalid_param = true;
|
|
652
|
+
break;
|
|
653
|
+
}
|
|
654
|
+
auto p = string_split<bool>(argv[i], split_delim);
|
|
655
|
+
params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
|
|
656
|
+
} else if (arg == "-nopo" || arg == "--no-op-offload") {
|
|
657
|
+
if (++i >= argc) {
|
|
658
|
+
invalid_param = true;
|
|
659
|
+
break;
|
|
660
|
+
}
|
|
661
|
+
auto p = string_split<bool>(argv[i], split_delim);
|
|
662
|
+
params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
|
|
663
|
+
} else if (arg == "-ts" || arg == "--tensor-split") {
|
|
664
|
+
if (++i >= argc) {
|
|
665
|
+
invalid_param = true;
|
|
666
|
+
break;
|
|
667
|
+
}
|
|
668
|
+
for (auto ts : string_split<std::string>(argv[i], split_delim)) {
|
|
669
|
+
// split string by ; and /
|
|
670
|
+
const std::regex regex{ R"([;/]+)" };
|
|
671
|
+
std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
|
|
672
|
+
std::vector<std::string> split_arg{ it, {} };
|
|
673
|
+
GGML_ASSERT(split_arg.size() <= llama_max_devices());
|
|
674
|
+
|
|
675
|
+
std::vector<float> tensor_split(llama_max_devices());
|
|
676
|
+
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
|
677
|
+
if (i < split_arg.size()) {
|
|
678
|
+
tensor_split[i] = std::stof(split_arg[i]);
|
|
679
|
+
} else {
|
|
680
|
+
tensor_split[i] = 0.0f;
|
|
681
|
+
}
|
|
609
682
|
}
|
|
683
|
+
params.tensor_split.push_back(tensor_split);
|
|
610
684
|
}
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
buft_list[ggml_backend_buft_name(buft)] = buft;
|
|
685
|
+
} else if (arg == "-ot" || arg == "--override-tensor") {
|
|
686
|
+
if (++i >= argc) {
|
|
687
|
+
invalid_param = true;
|
|
688
|
+
break;
|
|
689
|
+
}
|
|
690
|
+
auto * value = argv[i];
|
|
691
|
+
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
|
|
692
|
+
if (buft_list.empty()) {
|
|
693
|
+
// enumerate all the devices and add their buffer types to the list
|
|
694
|
+
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
|
695
|
+
auto * dev = ggml_backend_dev_get(i);
|
|
696
|
+
auto * buft = ggml_backend_dev_buffer_type(dev);
|
|
697
|
+
if (buft) {
|
|
698
|
+
buft_list[ggml_backend_buft_name(buft)] = buft;
|
|
699
|
+
}
|
|
627
700
|
}
|
|
628
701
|
}
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
702
|
+
auto override_group_span_len = std::strcspn(value, ",");
|
|
703
|
+
bool last_group = false;
|
|
704
|
+
do {
|
|
705
|
+
if (override_group_span_len == 0) {
|
|
706
|
+
// Adds an empty override-tensors for an empty span
|
|
707
|
+
params.tensor_buft_overrides.push_back({{}});
|
|
708
|
+
if (value[override_group_span_len] == '\0') {
|
|
709
|
+
value = &value[override_group_span_len];
|
|
710
|
+
last_group = true;
|
|
711
|
+
} else {
|
|
712
|
+
value = &value[override_group_span_len + 1];
|
|
713
|
+
override_group_span_len = std::strcspn(value, ",");
|
|
714
|
+
}
|
|
715
|
+
continue;
|
|
716
|
+
}
|
|
717
|
+
// Stamps null terminators into the argv
|
|
718
|
+
// value for this option to avoid the
|
|
719
|
+
// memory leak present in the implementation
|
|
720
|
+
// over in arg.cpp. Acceptable because we
|
|
721
|
+
// only parse these args once in this program.
|
|
722
|
+
auto * override_group = value;
|
|
636
723
|
if (value[override_group_span_len] == '\0') {
|
|
637
724
|
value = &value[override_group_span_len];
|
|
638
725
|
last_group = true;
|
|
639
726
|
} else {
|
|
727
|
+
value[override_group_span_len] = '\0';
|
|
640
728
|
value = &value[override_group_span_len + 1];
|
|
641
|
-
override_group_span_len = std::strcspn(value, ",");
|
|
642
729
|
}
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
value = &value[override_group_span_len];
|
|
653
|
-
last_group = true;
|
|
654
|
-
} else {
|
|
655
|
-
value[override_group_span_len] = '\0';
|
|
656
|
-
value = &value[override_group_span_len + 1];
|
|
657
|
-
}
|
|
658
|
-
std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
|
|
659
|
-
auto override_span_len = std::strcspn(override_group, ";");
|
|
660
|
-
while (override_span_len > 0) {
|
|
661
|
-
auto override = override_group;
|
|
662
|
-
if (override_group[override_span_len] != '\0') {
|
|
663
|
-
override_group[override_span_len] = '\0';
|
|
664
|
-
override_group = &override_group[override_span_len + 1];
|
|
665
|
-
} else {
|
|
666
|
-
override_group = &override_group[override_span_len];
|
|
667
|
-
}
|
|
668
|
-
auto tensor_name_span_len = std::strcspn(override, "=");
|
|
669
|
-
if (tensor_name_span_len >= override_span_len) {
|
|
670
|
-
invalid_param = true;
|
|
671
|
-
break;
|
|
672
|
-
}
|
|
673
|
-
override[tensor_name_span_len] = '\0';
|
|
674
|
-
auto tensor_name = override;
|
|
675
|
-
auto buffer_type = &override[tensor_name_span_len + 1];
|
|
676
|
-
if (buft_list.find(buffer_type) == buft_list.end()) {
|
|
677
|
-
printf("Available buffer types:\n");
|
|
678
|
-
for (const auto & it : buft_list) {
|
|
679
|
-
printf(" %s\n", ggml_backend_buft_name(it.second));
|
|
730
|
+
std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
|
|
731
|
+
auto override_span_len = std::strcspn(override_group, ";");
|
|
732
|
+
while (override_span_len > 0) {
|
|
733
|
+
auto * override = override_group;
|
|
734
|
+
if (override_group[override_span_len] != '\0') {
|
|
735
|
+
override_group[override_span_len] = '\0';
|
|
736
|
+
override_group = &override_group[override_span_len + 1];
|
|
737
|
+
} else {
|
|
738
|
+
override_group = &override_group[override_span_len];
|
|
680
739
|
}
|
|
681
|
-
|
|
740
|
+
auto tensor_name_span_len = std::strcspn(override, "=");
|
|
741
|
+
if (tensor_name_span_len >= override_span_len) {
|
|
742
|
+
invalid_param = true;
|
|
743
|
+
break;
|
|
744
|
+
}
|
|
745
|
+
override[tensor_name_span_len] = '\0';
|
|
746
|
+
auto * tensor_name = override;
|
|
747
|
+
auto * buffer_type = &override[tensor_name_span_len + 1];
|
|
748
|
+
if (buft_list.find(buffer_type) == buft_list.end()) {
|
|
749
|
+
printf("error: unrecognized buffer type '%s'\n", buffer_type);
|
|
750
|
+
printf("Available buffer types:\n");
|
|
751
|
+
for (const auto & it : buft_list) {
|
|
752
|
+
printf(" %s\n", ggml_backend_buft_name(it.second));
|
|
753
|
+
}
|
|
754
|
+
invalid_param = true;
|
|
755
|
+
break;
|
|
756
|
+
}
|
|
757
|
+
group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)});
|
|
758
|
+
override_span_len = std::strcspn(override_group, ";");
|
|
759
|
+
}
|
|
760
|
+
if (invalid_param) {
|
|
682
761
|
break;
|
|
683
762
|
}
|
|
684
|
-
group_tensor_buft_overrides.push_back({
|
|
685
|
-
|
|
763
|
+
group_tensor_buft_overrides.push_back({nullptr,nullptr});
|
|
764
|
+
params.tensor_buft_overrides.push_back(group_tensor_buft_overrides);
|
|
765
|
+
override_group_span_len = std::strcspn(value, ",");
|
|
766
|
+
} while (!last_group);
|
|
767
|
+
} else if (arg == "-r" || arg == "--repetitions") {
|
|
768
|
+
if (++i >= argc) {
|
|
769
|
+
invalid_param = true;
|
|
770
|
+
break;
|
|
686
771
|
}
|
|
687
|
-
|
|
772
|
+
params.reps = std::stoi(argv[i]);
|
|
773
|
+
} else if (arg == "--prio") {
|
|
774
|
+
if (++i >= argc) {
|
|
775
|
+
invalid_param = true;
|
|
688
776
|
break;
|
|
689
777
|
}
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
invalid_param =
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
invalid_param =
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
invalid_param = true;
|
|
715
|
-
break;
|
|
716
|
-
}
|
|
717
|
-
invalid_param = !output_format_from_str(argv[i], params.output_format);
|
|
718
|
-
} else if (arg == "-oe" || arg == "--output-err") {
|
|
719
|
-
if (++i >= argc) {
|
|
778
|
+
params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
|
|
779
|
+
} else if (arg == "--delay") {
|
|
780
|
+
if (++i >= argc) {
|
|
781
|
+
invalid_param = true;
|
|
782
|
+
break;
|
|
783
|
+
}
|
|
784
|
+
params.delay = std::stoi(argv[i]);
|
|
785
|
+
} else if (arg == "-o" || arg == "--output") {
|
|
786
|
+
if (++i >= argc) {
|
|
787
|
+
invalid_param = true;
|
|
788
|
+
break;
|
|
789
|
+
}
|
|
790
|
+
invalid_param = !output_format_from_str(argv[i], params.output_format);
|
|
791
|
+
} else if (arg == "-oe" || arg == "--output-err") {
|
|
792
|
+
if (++i >= argc) {
|
|
793
|
+
invalid_param = true;
|
|
794
|
+
break;
|
|
795
|
+
}
|
|
796
|
+
invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
|
|
797
|
+
} else if (arg == "-v" || arg == "--verbose") {
|
|
798
|
+
params.verbose = true;
|
|
799
|
+
} else if (arg == "--progress") {
|
|
800
|
+
params.progress = true;
|
|
801
|
+
} else {
|
|
720
802
|
invalid_param = true;
|
|
721
803
|
break;
|
|
722
804
|
}
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
params.verbose = true;
|
|
726
|
-
} else if (arg == "--progress") {
|
|
727
|
-
params.progress = true;
|
|
728
|
-
} else {
|
|
805
|
+
} catch (const std::exception & e) {
|
|
806
|
+
fprintf(stderr, "error: %s\n", e.what());
|
|
729
807
|
invalid_param = true;
|
|
730
808
|
break;
|
|
731
809
|
}
|
|
732
810
|
}
|
|
811
|
+
|
|
733
812
|
if (invalid_param) {
|
|
734
813
|
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
|
735
814
|
print_usage(argc, argv);
|
|
@@ -764,6 +843,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
764
843
|
if (params.type_v.empty()) {
|
|
765
844
|
params.type_v = cmd_params_defaults.type_v;
|
|
766
845
|
}
|
|
846
|
+
if (params.defrag_thold.empty()) {
|
|
847
|
+
params.defrag_thold = cmd_params_defaults.defrag_thold;
|
|
848
|
+
}
|
|
767
849
|
if (params.n_gpu_layers.empty()) {
|
|
768
850
|
params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
|
|
769
851
|
}
|
|
@@ -794,6 +876,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
794
876
|
if (params.embeddings.empty()) {
|
|
795
877
|
params.embeddings = cmd_params_defaults.embeddings;
|
|
796
878
|
}
|
|
879
|
+
if (params.no_op_offload.empty()) {
|
|
880
|
+
params.no_op_offload = cmd_params_defaults.no_op_offload;
|
|
881
|
+
}
|
|
797
882
|
if (params.n_threads.empty()) {
|
|
798
883
|
params.n_threads = cmd_params_defaults.n_threads;
|
|
799
884
|
}
|
|
@@ -819,6 +904,7 @@ struct cmd_params_instance {
|
|
|
819
904
|
int n_ubatch;
|
|
820
905
|
ggml_type type_k;
|
|
821
906
|
ggml_type type_v;
|
|
907
|
+
float defrag_thold;
|
|
822
908
|
int n_threads;
|
|
823
909
|
std::string cpu_mask;
|
|
824
910
|
bool cpu_strict;
|
|
@@ -833,6 +919,7 @@ struct cmd_params_instance {
|
|
|
833
919
|
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
|
834
920
|
bool use_mmap;
|
|
835
921
|
bool embeddings;
|
|
922
|
+
bool no_op_offload;
|
|
836
923
|
|
|
837
924
|
llama_model_params to_llama_mparams() const {
|
|
838
925
|
llama_model_params mparams = llama_model_default_params();
|
|
@@ -894,14 +981,16 @@ struct cmd_params_instance {
|
|
|
894
981
|
llama_context_params to_llama_cparams() const {
|
|
895
982
|
llama_context_params cparams = llama_context_default_params();
|
|
896
983
|
|
|
897
|
-
cparams.n_ctx
|
|
898
|
-
cparams.n_batch
|
|
899
|
-
cparams.n_ubatch
|
|
900
|
-
cparams.type_k
|
|
901
|
-
cparams.type_v
|
|
902
|
-
cparams.
|
|
903
|
-
cparams.
|
|
904
|
-
cparams.
|
|
984
|
+
cparams.n_ctx = n_prompt + n_gen + n_depth;
|
|
985
|
+
cparams.n_batch = n_batch;
|
|
986
|
+
cparams.n_ubatch = n_ubatch;
|
|
987
|
+
cparams.type_k = type_k;
|
|
988
|
+
cparams.type_v = type_v;
|
|
989
|
+
cparams.defrag_thold = defrag_thold;
|
|
990
|
+
cparams.offload_kqv = !no_kv_offload;
|
|
991
|
+
cparams.flash_attn = flash_attn;
|
|
992
|
+
cparams.embeddings = embeddings;
|
|
993
|
+
cparams.op_offload = !no_op_offload;
|
|
905
994
|
|
|
906
995
|
return cparams;
|
|
907
996
|
}
|
|
@@ -921,10 +1010,12 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
921
1010
|
for (const auto & ot : params.tensor_buft_overrides)
|
|
922
1011
|
for (const auto & mmp : params.use_mmap)
|
|
923
1012
|
for (const auto & embd : params.embeddings)
|
|
1013
|
+
for (const auto & nopo : params.no_op_offload)
|
|
924
1014
|
for (const auto & nb : params.n_batch)
|
|
925
1015
|
for (const auto & nub : params.n_ubatch)
|
|
926
1016
|
for (const auto & tk : params.type_k)
|
|
927
1017
|
for (const auto & tv : params.type_v)
|
|
1018
|
+
for (const auto & defrag_thold : params.defrag_thold)
|
|
928
1019
|
for (const auto & nkvo : params.no_kv_offload)
|
|
929
1020
|
for (const auto & fa : params.flash_attn)
|
|
930
1021
|
for (const auto & nt : params.n_threads)
|
|
@@ -945,6 +1036,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
945
1036
|
/* .n_ubatch = */ nub,
|
|
946
1037
|
/* .type_k = */ tk,
|
|
947
1038
|
/* .type_v = */ tv,
|
|
1039
|
+
/* .defrag_thold = */ defrag_thold,
|
|
948
1040
|
/* .n_threads = */ nt,
|
|
949
1041
|
/* .cpu_mask = */ cm,
|
|
950
1042
|
/* .cpu_strict = */ cs,
|
|
@@ -959,6 +1051,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
959
1051
|
/* .tensor_buft_overrides = */ ot,
|
|
960
1052
|
/* .use_mmap = */ mmp,
|
|
961
1053
|
/* .embeddings = */ embd,
|
|
1054
|
+
/* .no_op_offload= */ nopo,
|
|
962
1055
|
};
|
|
963
1056
|
instances.push_back(instance);
|
|
964
1057
|
}
|
|
@@ -976,6 +1069,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
976
1069
|
/* .n_ubatch = */ nub,
|
|
977
1070
|
/* .type_k = */ tk,
|
|
978
1071
|
/* .type_v = */ tv,
|
|
1072
|
+
/* .defrag_thold = */ defrag_thold,
|
|
979
1073
|
/* .n_threads = */ nt,
|
|
980
1074
|
/* .cpu_mask = */ cm,
|
|
981
1075
|
/* .cpu_strict = */ cs,
|
|
@@ -990,6 +1084,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
990
1084
|
/* .tensor_buft_overrides = */ ot,
|
|
991
1085
|
/* .use_mmap = */ mmp,
|
|
992
1086
|
/* .embeddings = */ embd,
|
|
1087
|
+
/* .no_op_offload= */ nopo,
|
|
993
1088
|
};
|
|
994
1089
|
instances.push_back(instance);
|
|
995
1090
|
}
|
|
@@ -1007,6 +1102,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
1007
1102
|
/* .n_ubatch = */ nub,
|
|
1008
1103
|
/* .type_k = */ tk,
|
|
1009
1104
|
/* .type_v = */ tv,
|
|
1105
|
+
/* .defrag_thold = */ defrag_thold,
|
|
1010
1106
|
/* .n_threads = */ nt,
|
|
1011
1107
|
/* .cpu_mask = */ cm,
|
|
1012
1108
|
/* .cpu_strict = */ cs,
|
|
@@ -1021,6 +1117,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
1021
1117
|
/* .tensor_buft_overrides = */ ot,
|
|
1022
1118
|
/* .use_mmap = */ mmp,
|
|
1023
1119
|
/* .embeddings = */ embd,
|
|
1120
|
+
/* .no_op_offload= */ nopo,
|
|
1024
1121
|
};
|
|
1025
1122
|
instances.push_back(instance);
|
|
1026
1123
|
}
|
|
@@ -1047,6 +1144,7 @@ struct test {
|
|
|
1047
1144
|
int poll;
|
|
1048
1145
|
ggml_type type_k;
|
|
1049
1146
|
ggml_type type_v;
|
|
1147
|
+
float defrag_thold;
|
|
1050
1148
|
int n_gpu_layers;
|
|
1051
1149
|
llama_split_mode split_mode;
|
|
1052
1150
|
int main_gpu;
|
|
@@ -1056,6 +1154,7 @@ struct test {
|
|
|
1056
1154
|
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
|
1057
1155
|
bool use_mmap;
|
|
1058
1156
|
bool embeddings;
|
|
1157
|
+
bool no_op_offload;
|
|
1059
1158
|
int n_prompt;
|
|
1060
1159
|
int n_gen;
|
|
1061
1160
|
int n_depth;
|
|
@@ -1080,6 +1179,7 @@ struct test {
|
|
|
1080
1179
|
poll = inst.poll;
|
|
1081
1180
|
type_k = inst.type_k;
|
|
1082
1181
|
type_v = inst.type_v;
|
|
1182
|
+
defrag_thold = inst.defrag_thold;
|
|
1083
1183
|
n_gpu_layers = inst.n_gpu_layers;
|
|
1084
1184
|
split_mode = inst.split_mode;
|
|
1085
1185
|
main_gpu = inst.main_gpu;
|
|
@@ -1089,6 +1189,7 @@ struct test {
|
|
|
1089
1189
|
tensor_buft_overrides = inst.tensor_buft_overrides;
|
|
1090
1190
|
use_mmap = inst.use_mmap;
|
|
1091
1191
|
embeddings = inst.embeddings;
|
|
1192
|
+
no_op_offload = inst.no_op_offload;
|
|
1092
1193
|
n_prompt = inst.n_prompt;
|
|
1093
1194
|
n_gen = inst.n_gen;
|
|
1094
1195
|
n_depth = inst.n_depth;
|
|
@@ -1134,7 +1235,8 @@ struct test {
|
|
|
1134
1235
|
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
|
|
1135
1236
|
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
|
|
1136
1237
|
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
|
|
1137
|
-
"
|
|
1238
|
+
"defrag_thold",
|
|
1239
|
+
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
|
|
1138
1240
|
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
|
|
1139
1241
|
};
|
|
1140
1242
|
return fields;
|
|
@@ -1146,14 +1248,14 @@ struct test {
|
|
|
1146
1248
|
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
|
|
1147
1249
|
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
|
|
1148
1250
|
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
|
|
1149
|
-
field == "avg_ns" || field == "stddev_ns") {
|
|
1251
|
+
field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") {
|
|
1150
1252
|
return INT;
|
|
1151
1253
|
}
|
|
1152
1254
|
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
|
|
1153
1255
|
field == "use_mmap" || field == "embeddings") {
|
|
1154
1256
|
return BOOL;
|
|
1155
1257
|
}
|
|
1156
|
-
if (field == "avg_ts" || field == "stddev_ts") {
|
|
1258
|
+
if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
|
|
1157
1259
|
return FLOAT;
|
|
1158
1260
|
}
|
|
1159
1261
|
return STRING;
|
|
@@ -1220,8 +1322,10 @@ struct test {
|
|
|
1220
1322
|
std::to_string(flash_attn),
|
|
1221
1323
|
tensor_split_str,
|
|
1222
1324
|
tensor_buft_overrides_str,
|
|
1325
|
+
std::to_string(defrag_thold),
|
|
1223
1326
|
std::to_string(use_mmap),
|
|
1224
1327
|
std::to_string(embeddings),
|
|
1328
|
+
std::to_string(no_op_offload),
|
|
1225
1329
|
std::to_string(n_prompt),
|
|
1226
1330
|
std::to_string(n_gen),
|
|
1227
1331
|
std::to_string(n_depth),
|
|
@@ -1404,6 +1508,9 @@ struct markdown_printer : public printer {
|
|
|
1404
1508
|
if (field == "test") {
|
|
1405
1509
|
return 15;
|
|
1406
1510
|
}
|
|
1511
|
+
if (field == "no_op_offload") {
|
|
1512
|
+
return 4;
|
|
1513
|
+
}
|
|
1407
1514
|
|
|
1408
1515
|
int width = std::max((int) field.length(), 10);
|
|
1409
1516
|
|
|
@@ -1435,6 +1542,9 @@ struct markdown_printer : public printer {
|
|
|
1435
1542
|
if (field == "embeddings") {
|
|
1436
1543
|
return "embd";
|
|
1437
1544
|
}
|
|
1545
|
+
if (field == "no_op_offload") {
|
|
1546
|
+
return "nopo";
|
|
1547
|
+
}
|
|
1438
1548
|
if (field == "tensor_split") {
|
|
1439
1549
|
return "ts";
|
|
1440
1550
|
}
|
|
@@ -1479,6 +1589,9 @@ struct markdown_printer : public printer {
|
|
|
1479
1589
|
if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
|
|
1480
1590
|
fields.emplace_back("type_v");
|
|
1481
1591
|
}
|
|
1592
|
+
if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) {
|
|
1593
|
+
fields.emplace_back("defrag_thold");
|
|
1594
|
+
}
|
|
1482
1595
|
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
|
|
1483
1596
|
fields.emplace_back("main_gpu");
|
|
1484
1597
|
}
|
|
@@ -1503,6 +1616,9 @@ struct markdown_printer : public printer {
|
|
|
1503
1616
|
if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
|
|
1504
1617
|
fields.emplace_back("embeddings");
|
|
1505
1618
|
}
|
|
1619
|
+
if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
|
|
1620
|
+
fields.emplace_back("no_op_offload");
|
|
1621
|
+
}
|
|
1506
1622
|
fields.emplace_back("test");
|
|
1507
1623
|
fields.emplace_back("t/s");
|
|
1508
1624
|
|
|
@@ -1621,7 +1737,7 @@ struct sql_printer : public printer {
|
|
|
1621
1737
|
}
|
|
1622
1738
|
};
|
|
1623
1739
|
|
|
1624
|
-
static
|
|
1740
|
+
static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
|
|
1625
1741
|
llama_set_n_threads(ctx, n_threads, n_threads);
|
|
1626
1742
|
|
|
1627
1743
|
const llama_model * model = llama_get_model(ctx);
|
|
@@ -1638,14 +1754,19 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
|
|
|
1638
1754
|
for (int i = 1; i < n_tokens; i++) {
|
|
1639
1755
|
tokens[i] = std::rand() % n_vocab;
|
|
1640
1756
|
}
|
|
1641
|
-
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
|
|
1757
|
+
int res = llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
|
|
1758
|
+
if (res != 0) {
|
|
1759
|
+
fprintf(stderr, "%s: failed to decode prompt batch, res = %d\n", __func__, res);
|
|
1760
|
+
return false;
|
|
1761
|
+
}
|
|
1642
1762
|
n_processed += n_tokens;
|
|
1643
1763
|
}
|
|
1644
1764
|
|
|
1645
1765
|
llama_synchronize(ctx);
|
|
1766
|
+
return true;
|
|
1646
1767
|
}
|
|
1647
1768
|
|
|
1648
|
-
static
|
|
1769
|
+
static bool test_gen(llama_context * ctx, int n_gen, int n_threads) {
|
|
1649
1770
|
llama_set_n_threads(ctx, n_threads, n_threads);
|
|
1650
1771
|
|
|
1651
1772
|
const llama_model * model = llama_get_model(ctx);
|
|
@@ -1655,10 +1776,15 @@ static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
|
|
|
1655
1776
|
llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
|
|
1656
1777
|
|
|
1657
1778
|
for (int i = 0; i < n_gen; i++) {
|
|
1658
|
-
llama_decode(ctx, llama_batch_get_one(&token, 1));
|
|
1779
|
+
int res = llama_decode(ctx, llama_batch_get_one(&token, 1));
|
|
1780
|
+
if (res != 0) {
|
|
1781
|
+
fprintf(stderr, "%s: failed to decode generation batch, res = %d\n", __func__, res);
|
|
1782
|
+
return false;
|
|
1783
|
+
}
|
|
1659
1784
|
llama_synchronize(ctx);
|
|
1660
1785
|
token = std::rand() % n_vocab;
|
|
1661
1786
|
}
|
|
1787
|
+
return true;
|
|
1662
1788
|
}
|
|
1663
1789
|
|
|
1664
1790
|
static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
|
|
@@ -1701,10 +1827,11 @@ int main(int argc, char ** argv) {
|
|
|
1701
1827
|
fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
|
|
1702
1828
|
#endif
|
|
1703
1829
|
|
|
1704
|
-
cmd_params params = parse_cmd_params(argc, argv);
|
|
1705
|
-
|
|
1706
1830
|
// initialize backends
|
|
1707
1831
|
ggml_backend_load_all();
|
|
1832
|
+
|
|
1833
|
+
cmd_params params = parse_cmd_params(argc, argv);
|
|
1834
|
+
|
|
1708
1835
|
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
1709
1836
|
if (!cpu_dev) {
|
|
1710
1837
|
fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
|
|
@@ -1802,13 +1929,21 @@ int main(int argc, char ** argv) {
|
|
|
1802
1929
|
fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
|
|
1803
1930
|
}
|
|
1804
1931
|
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
|
|
1805
|
-
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
|
1932
|
+
bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
|
1933
|
+
if (!res) {
|
|
1934
|
+
fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
|
|
1935
|
+
exit(1);
|
|
1936
|
+
}
|
|
1806
1937
|
}
|
|
1807
1938
|
if (t.n_gen > 0) {
|
|
1808
1939
|
if (params.progress) {
|
|
1809
1940
|
fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
|
|
1810
1941
|
}
|
|
1811
|
-
test_gen(ctx, 1, t.n_threads);
|
|
1942
|
+
bool res = test_gen(ctx, 1, t.n_threads);
|
|
1943
|
+
if (!res) {
|
|
1944
|
+
fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
|
|
1945
|
+
exit(1);
|
|
1946
|
+
}
|
|
1812
1947
|
}
|
|
1813
1948
|
|
|
1814
1949
|
for (int i = 0; i < params.reps; i++) {
|
|
@@ -1819,7 +1954,11 @@ int main(int argc, char ** argv) {
|
|
|
1819
1954
|
fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
|
|
1820
1955
|
i + 1, params.reps);
|
|
1821
1956
|
}
|
|
1822
|
-
test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
|
|
1957
|
+
bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
|
|
1958
|
+
if (!res) {
|
|
1959
|
+
fprintf(stderr, "%s: error: failed to run depth\n", __func__);
|
|
1960
|
+
exit(1);
|
|
1961
|
+
}
|
|
1823
1962
|
}
|
|
1824
1963
|
|
|
1825
1964
|
uint64_t t_start = get_time_ns();
|
|
@@ -1829,14 +1968,22 @@ int main(int argc, char ** argv) {
|
|
|
1829
1968
|
fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
|
|
1830
1969
|
i + 1, params.reps);
|
|
1831
1970
|
}
|
|
1832
|
-
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
|
1971
|
+
bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
|
1972
|
+
if (!res) {
|
|
1973
|
+
fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
|
|
1974
|
+
exit(1);
|
|
1975
|
+
}
|
|
1833
1976
|
}
|
|
1834
1977
|
if (t.n_gen > 0) {
|
|
1835
1978
|
if (params.progress) {
|
|
1836
1979
|
fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
|
|
1837
1980
|
i + 1, params.reps);
|
|
1838
1981
|
}
|
|
1839
|
-
test_gen(ctx, t.n_gen, t.n_threads);
|
|
1982
|
+
bool res = test_gen(ctx, t.n_gen, t.n_threads);
|
|
1983
|
+
if (!res) {
|
|
1984
|
+
fprintf(stderr, "%s: error: failed to run gen\n", __func__);
|
|
1985
|
+
exit(1);
|
|
1986
|
+
}
|
|
1840
1987
|
}
|
|
1841
1988
|
|
|
1842
1989
|
uint64_t t_ns = get_time_ns() - t_start;
|