@fugood/llama.node 0.3.17 → 0.4.0

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (193)
  1. package/CMakeLists.txt +3 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +39 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +366 -19
  24. package/src/LlamaCompletionWorker.h +30 -10
  25. package/src/LlamaContext.cpp +213 -5
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
  29. package/src/llama.cpp/.github/workflows/build.yml +41 -762
  30. package/src/llama.cpp/.github/workflows/docker.yml +5 -2
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +12 -12
  33. package/src/llama.cpp/CMakeLists.txt +5 -17
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +31 -3
  37. package/src/llama.cpp/common/arg.cpp +48 -29
  38. package/src/llama.cpp/common/chat.cpp +128 -106
  39. package/src/llama.cpp/common/chat.h +2 -0
  40. package/src/llama.cpp/common/common.cpp +37 -1
  41. package/src/llama.cpp/common/common.h +18 -9
  42. package/src/llama.cpp/common/llguidance.cpp +1 -0
  43. package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
  44. package/src/llama.cpp/common/minja/minja.hpp +69 -36
  45. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  46. package/src/llama.cpp/common/regex-partial.h +56 -0
  47. package/src/llama.cpp/common/sampling.cpp +57 -50
  48. package/src/llama.cpp/examples/CMakeLists.txt +2 -23
  49. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
  50. package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
  51. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  52. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  53. package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
  54. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  55. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  56. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  57. package/src/llama.cpp/ggml/include/ggml.h +10 -7
  58. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  60. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  61. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
  62. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  63. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
  64. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
  65. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
  66. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  67. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  68. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  69. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
  71. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  73. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
  74. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
  75. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  76. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  77. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
  78. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
  80. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
  81. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
  82. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  83. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  84. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  85. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  86. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
  87. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  88. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
  89. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  90. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  91. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  92. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
  93. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
  94. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
  95. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
  96. package/src/llama.cpp/ggml/src/ggml.c +29 -20
  97. package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
  98. package/src/llama.cpp/include/llama.h +52 -11
  99. package/src/llama.cpp/requirements/requirements-all.txt +3 -3
  100. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  101. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  102. package/src/llama.cpp/src/llama-adapter.cpp +6 -0
  103. package/src/llama.cpp/src/llama-arch.cpp +3 -0
  104. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  105. package/src/llama.cpp/src/llama-batch.h +2 -1
  106. package/src/llama.cpp/src/llama-chat.cpp +17 -7
  107. package/src/llama.cpp/src/llama-chat.h +1 -0
  108. package/src/llama.cpp/src/llama-context.cpp +389 -501
  109. package/src/llama.cpp/src/llama-context.h +44 -32
  110. package/src/llama.cpp/src/llama-cparams.h +1 -0
  111. package/src/llama.cpp/src/llama-graph.cpp +20 -38
  112. package/src/llama.cpp/src/llama-graph.h +12 -8
  113. package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
  114. package/src/llama.cpp/src/llama-kv-cache.h +271 -85
  115. package/src/llama.cpp/src/llama-memory.h +11 -1
  116. package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
  117. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  118. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  119. package/src/llama.cpp/src/llama-model.cpp +316 -69
  120. package/src/llama.cpp/src/llama-model.h +8 -1
  121. package/src/llama.cpp/src/llama-quant.cpp +15 -13
  122. package/src/llama.cpp/src/llama-sampling.cpp +18 -6
  123. package/src/llama.cpp/src/llama-vocab.cpp +42 -4
  124. package/src/llama.cpp/src/llama-vocab.h +6 -0
  125. package/src/llama.cpp/src/llama.cpp +14 -0
  126. package/src/llama.cpp/tests/CMakeLists.txt +10 -2
  127. package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
  128. package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
  129. package/src/llama.cpp/tests/test-chat.cpp +3 -1
  130. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  131. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  132. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  133. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  134. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  135. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
  136. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  137. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
  138. package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
  139. package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
  140. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
  141. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
  142. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  143. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
  144. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  145. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
  146. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  147. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
  148. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
  149. package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
  150. package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
  151. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  152. package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
  153. package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
  154. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  155. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  156. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  157. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  158. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  159. package/src/llama.cpp/examples/llava/clip.h +0 -135
  160. package/src/llama.cpp/examples/llava/llava.cpp +0 -586
  161. package/src/llama.cpp/examples/llava/llava.h +0 -49
  162. package/src/llama.cpp/examples/llava/mtmd.h +0 -168
  163. package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
  164. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  165. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  166. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  167. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  168. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  169. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  170. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  171. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  172. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  173. /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
  174. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  175. /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
  176. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  177. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  178. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  179. /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
  180. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  181. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  182. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  183. /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
  184. /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
  185. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  186. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  187. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  188. /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
  189. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  190. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  191. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  192. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
  193. /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
@@ -195,6 +195,46 @@ static std::string pair_str(const std::pair<int, int> & p) {
     return buf;
 }
 
+static std::vector<int> parse_int_range(const std::string & s) {
+    // first[-last[(+|*)step]]
+    std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))");
+
+    std::smatch match;
+    std::string::const_iterator search_start(s.cbegin());
+    std::vector<int> result;
+    while (std::regex_search(search_start, s.cend(), match, range_regex)) {
+        int first = std::stoi(match[1]);
+        int last  = match[2].matched ? std::stoi(match[2]) : first;
+        char op   = match[3].matched ? match[3].str()[0] : '+';
+        int step  = match[4].matched ? std::stoi(match[4]) : 1;
+
+        for (int i = first; i <= last;) {
+            result.push_back(i);
+
+            int prev_i = i;
+
+            if (op == '+') {
+                i += step;
+            } else if (op == '*') {
+                i *= step;
+            } else {
+                throw std::invalid_argument("invalid range format");
+            }
+
+            if (i <= prev_i) {
+                throw std::invalid_argument("invalid range");
+            }
+        }
+        search_start = match.suffix().first;
+    }
+
+    if (search_start != s.cend()) {
+        throw std::invalid_argument("invalid range format");
+    }
+
+    return result;
+}
+
 struct cmd_params {
     std::vector<std::string> model;
     std::vector<int> n_prompt;
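Note: the new parse_int_range helper lets integer-valued benchmark parameters accept comma-separated range expressions of the form first[-last[(+|*)step]] — an additive range like 1-8+2 expands to 1, 3, 5, 7 and a multiplicative one like 128-512*2 expands to 128, 256, 512. A minimal standalone sketch of how such strings expand (the expand function and main below are hypothetical, written for illustration; the patch's own parse_int_range additionally rejects non-increasing steps):

```cpp
// Standalone sketch of the range grammar above: first[-last[(+|*)step]],
// comma-separated. Hypothetical helper and main, for illustration only.
#include <cstdio>
#include <regex>
#include <stdexcept>
#include <string>
#include <vector>

static std::vector<int> expand(const std::string & s) {
    std::regex re(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))");
    std::smatch m;
    std::string::const_iterator start(s.cbegin());
    std::vector<int> out;
    while (std::regex_search(start, s.cend(), m, re)) {
        int  first = std::stoi(m[1]);
        int  last  = m[2].matched ? std::stoi(m[2]) : first;
        char op    = m[3].matched ? m[3].str()[0] : '+';
        int  step  = m[4].matched ? std::stoi(m[4]) : 1;
        // additive ranges advance by +step, multiplicative ranges scale by *step
        for (int i = first; i <= last; i = (op == '+') ? i + step : i * step) {
            out.push_back(i);
        }
        start = m.suffix().first;
    }
    if (start != s.cend()) {
        throw std::invalid_argument("invalid range format");
    }
    return out;
}

int main() {
    for (int v : expand("128-512*2")) printf("%d ", v);  // prints: 128 256 512
    printf("\n");
    for (int v : expand("1-8+2,16")) printf("%d ", v);   // prints: 1 3 5 7 16
    printf("\n");
}
```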
@@ -205,6 +245,7 @@ struct cmd_params {
     std::vector<int> n_ubatch;
     std::vector<ggml_type> type_k;
     std::vector<ggml_type> type_v;
+    std::vector<float> defrag_thold;
     std::vector<int> n_threads;
     std::vector<std::string> cpu_mask;
     std::vector<bool> cpu_strict;
@@ -219,6 +260,7 @@ struct cmd_params {
     std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
     std::vector<bool> use_mmap;
     std::vector<bool> embeddings;
+    std::vector<bool> no_op_offload;
     ggml_numa_strategy numa;
     int reps;
     ggml_sched_priority prio;
@@ -239,6 +281,7 @@ static const cmd_params cmd_params_defaults = {
    /* n_ubatch     */ { 512 },
    /* type_k       */ { GGML_TYPE_F16 },
    /* type_v       */ { GGML_TYPE_F16 },
+   /* defrag_thold */ { -1.0f },
    /* n_threads    */ { cpu_get_num_math() },
    /* cpu_mask     */ { "0x0" },
    /* cpu_strict   */ { false },
@@ -250,9 +293,10 @@ static const cmd_params cmd_params_defaults = {
    /* no_kv_offload */ { false },
    /* flash_attn    */ { false },
    /* tensor_split  */ { std::vector<float>(llama_max_devices(), 0.0f) },
-   /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{{nullptr,nullptr}} },
+   /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
    /* use_mmap      */ { true },
    /* embeddings    */ { false },
+   /* no_op_offload */ { false },
    /* numa          */ GGML_NUMA_STRATEGY_DISABLED,
    /* reps          */ 5,
    /* prio          */ GGML_SCHED_PRIO_NORMAL,
@@ -268,13 +312,29 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("\n");
     printf("options:\n");
     printf("  -h, --help\n");
+    printf("  --numa <distribute|isolate|numactl>       numa mode (default: disabled)\n");
+    printf("  -r, --repetitions <n>                     number of times to repeat each test (default: %d)\n",
+           cmd_params_defaults.reps);
+    printf("  --prio <0|1|2|3>                          process/thread priority (default: %d)\n",
+           cmd_params_defaults.prio);
+    printf("  --delay <0...N> (seconds)                 delay between each test (default: %d)\n",
+           cmd_params_defaults.delay);
+    printf("  -o, --output <csv|json|jsonl|md|sql>      output format printed to stdout (default: %s)\n",
+           output_format_str(cmd_params_defaults.output_format));
+    printf("  -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
+           output_format_str(cmd_params_defaults.output_format_stderr));
+    printf("  -v, --verbose                             verbose output\n");
+    printf("  --progress                                print test progress indicators\n");
+    printf("\n");
+    printf("test parameters:\n");
     printf("  -m, --model <filename>                    (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
     printf("  -p, --n-prompt <n>                        (default: %s)\n",
            join(cmd_params_defaults.n_prompt, ",").c_str());
     printf("  -n, --n-gen <n>                           (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
     printf("  -pg <pp,tg>                               (default: %s)\n",
            join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
-    printf("  -d, --n-depth <n>                         (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str());
+    printf("  -d, --n-depth <n>                         (default: %s)\n",
+           join(cmd_params_defaults.n_depth, ",").c_str());
     printf("  -b, --batch-size <n>                      (default: %s)\n",
            join(cmd_params_defaults.n_batch, ",").c_str());
     printf("  -ub, --ubatch-size <n>                    (default: %s)\n",
@@ -283,6 +343,8 @@ static void print_usage(int /* argc */, char ** argv) {
            join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
     printf("  -ctv, --cache-type-v <t>                  (default: %s)\n",
            join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf("  -dt, --defrag-thold <f>                   (default: %s)\n",
+           join(cmd_params_defaults.defrag_thold, ",").c_str());
     printf("  -t, --threads <n>                         (default: %s)\n",
            join(cmd_params_defaults.n_threads, ",").c_str());
     printf("  -C, --cpu-mask <hex,hex>                  (default: %s)\n",
@@ -306,24 +368,17 @@ static void print_usage(int /* argc */, char ** argv) {
            join(cmd_params_defaults.flash_attn, ",").c_str());
     printf("  -mmp, --mmap <0|1>                        (default: %s)\n",
            join(cmd_params_defaults.use_mmap, ",").c_str());
-    printf("  --numa <distribute|isolate|numactl>       (default: disabled)\n");
     printf("  -embd, --embeddings <0|1>                 (default: %s)\n",
            join(cmd_params_defaults.embeddings, ",").c_str());
     printf("  -ts, --tensor-split <ts0/ts1/..>          (default: 0)\n");
-    printf("  -ot --override-tensors <tensor name pattern>=<buffer type>;... (default: disabled)\n");
-    printf("  -r, --repetitions <n>                     (default: %d)\n", cmd_params_defaults.reps);
-    printf("  --prio <0|1|2|3>                          (default: %d)\n", cmd_params_defaults.prio);
-    printf("  --delay <0...N> (seconds)                 (default: %d)\n", cmd_params_defaults.delay);
-    printf("  -o, --output <csv|json|jsonl|md|sql>      (default: %s)\n",
-           output_format_str(cmd_params_defaults.output_format));
-    printf("  -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n",
-           output_format_str(cmd_params_defaults.output_format_stderr));
-    printf("  -v, --verbose                             (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
-    printf("  --progress                                (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
+    printf("  -ot --override-tensors <tensor name pattern>=<buffer type>;...\n");
+    printf("                                            (default: disabled)\n");
+    printf("  -nopo, --no-op-offload <0|1>              (default: 0)\n");
     printf("\n");
     printf(
-        "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter "
-        "multiple times.\n");
+        "Multiple values can be given for each parameter by separating them with ','\n"
+        "or by specifying the parameter multiple times. Ranges can be given as\n"
+        "'first-last' or 'first-last+step' or 'first-last*mult'.\n");
 }
 
 static ggml_type ggml_type_from_name(const std::string & s) {
@@ -377,186 +432,197 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
 
-        if (arg == "-h" || arg == "--help") {
-            print_usage(argc, argv);
-            exit(0);
-        } else if (arg == "-m" || arg == "--model") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<std::string>(argv[i], split_delim);
-            params.model.insert(params.model.end(), p.begin(), p.end());
-        } else if (arg == "-p" || arg == "--n-prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<int>(argv[i], split_delim);
-            params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
-        } else if (arg == "-n" || arg == "--n-gen") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<int>(argv[i], split_delim);
-            params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
-        } else if (arg == "-pg") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<std::string>(argv[i], ',');
-            if (p.size() != 2) {
-                invalid_param = true;
-                break;
-            }
-            params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
-        } else if (arg == "-d" || arg == "--n-depth") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<int>(argv[i], split_delim);
-            params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
-        } else if (arg == "-b" || arg == "--batch-size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<int>(argv[i], split_delim);
-            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-        } else if (arg == "-ub" || arg == "--ubatch-size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<int>(argv[i], split_delim);
-            params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
-        } else if (arg == "-ctk" || arg == "--cache-type-k") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<std::string>(argv[i], split_delim);
-            std::vector<ggml_type> types;
-            for (const auto & t : p) {
-                ggml_type gt = ggml_type_from_name(t);
-                if (gt == GGML_TYPE_COUNT) {
+        try {
+            if (arg == "-h" || arg == "--help") {
+                print_usage(argc, argv);
+                exit(0);
+            } else if (arg == "-m" || arg == "--model") {
+                if (++i >= argc) {
                     invalid_param = true;
                     break;
                 }
-                types.push_back(gt);
-            }
-            if (invalid_param) {
-                break;
-            }
-            params.type_k.insert(params.type_k.end(), types.begin(), types.end());
-        } else if (arg == "-ctv" || arg == "--cache-type-v") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<std::string>(argv[i], split_delim);
-            std::vector<ggml_type> types;
-            for (const auto & t : p) {
-                ggml_type gt = ggml_type_from_name(t);
-                if (gt == GGML_TYPE_COUNT) {
+                auto p = string_split<std::string>(argv[i], split_delim);
+                params.model.insert(params.model.end(), p.begin(), p.end());
+            } else if (arg == "-p" || arg == "--n-prompt") {
+                if (++i >= argc) {
                     invalid_param = true;
                     break;
                 }
-                types.push_back(gt);
-            }
-            if (invalid_param) {
-                break;
-            }
-            params.type_v.insert(params.type_v.end(), types.begin(), types.end());
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<int>(argv[i], split_delim);
-            params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
-        } else if (arg == "-C" || arg == "--cpu-mask") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<std::string>(argv[i], split_delim);
-            params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
-        } else if (arg == "--cpu-strict") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<bool>(argv[i], split_delim);
-            params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
-        } else if (arg == "--poll") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<int>(argv[i], split_delim);
-            params.poll.insert(params.poll.end(), p.begin(), p.end());
-        } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<int>(argv[i], split_delim);
-            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
-        } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rpc_servers.push_back(argv[i]);
-        } else if (arg == "-sm" || arg == "--split-mode") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<std::string>(argv[i], split_delim);
-            std::vector<llama_split_mode> modes;
-            for (const auto & m : p) {
-                llama_split_mode mode;
-                if (m == "none") {
-                    mode = LLAMA_SPLIT_MODE_NONE;
-                } else if (m == "layer") {
-                    mode = LLAMA_SPLIT_MODE_LAYER;
-                } else if (m == "row") {
-                    mode = LLAMA_SPLIT_MODE_ROW;
-                } else {
+                auto p = parse_int_range(argv[i]);
+                params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
+            } else if (arg == "-n" || arg == "--n-gen") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = parse_int_range(argv[i]);
+                params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
+            } else if (arg == "-pg") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = string_split<std::string>(argv[i], ',');
+                if (p.size() != 2) {
+                    invalid_param = true;
+                    break;
+                }
+                params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
+            } else if (arg == "-d" || arg == "--n-depth") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = parse_int_range(argv[i]);
+                params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
+            } else if (arg == "-b" || arg == "--batch-size") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = parse_int_range(argv[i]);
+                params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
+            } else if (arg == "-ub" || arg == "--ubatch-size") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = parse_int_range(argv[i]);
+                params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
+            } else if (arg == "-ctk" || arg == "--cache-type-k") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = string_split<std::string>(argv[i], split_delim);
+
+                std::vector<ggml_type> types;
+                for (const auto & t : p) {
+                    ggml_type gt = ggml_type_from_name(t);
+                    if (gt == GGML_TYPE_COUNT) {
+                        invalid_param = true;
+                        break;
+                    }
+                    types.push_back(gt);
+                }
+                if (invalid_param) {
+                    break;
+                }
+                params.type_k.insert(params.type_k.end(), types.begin(), types.end());
+            } else if (arg == "-ctv" || arg == "--cache-type-v") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = string_split<std::string>(argv[i], split_delim);
+
+                std::vector<ggml_type> types;
+                for (const auto & t : p) {
+                    ggml_type gt = ggml_type_from_name(t);
+                    if (gt == GGML_TYPE_COUNT) {
+                        invalid_param = true;
+                        break;
+                    }
+                    types.push_back(gt);
+                }
+                if (invalid_param) {
+                    break;
+                }
+                params.type_v.insert(params.type_v.end(), types.begin(), types.end());
+            } else if (arg == "-dt" || arg == "--defrag-thold") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = string_split<float>(argv[i], split_delim);
+                params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end());
+            } else if (arg == "-t" || arg == "--threads") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = parse_int_range(argv[i]);
+                params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
+            } else if (arg == "-C" || arg == "--cpu-mask") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = string_split<std::string>(argv[i], split_delim);
+                params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
+            } else if (arg == "--cpu-strict") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = string_split<bool>(argv[i], split_delim);
+                params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
+            } else if (arg == "--poll") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = parse_int_range(argv[i]);
+                params.poll.insert(params.poll.end(), p.begin(), p.end());
+            } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = parse_int_range(argv[i]);
+                params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+            } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                params.rpc_servers.push_back(argv[i]);
+            } else if (arg == "-sm" || arg == "--split-mode") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = string_split<std::string>(argv[i], split_delim);
+
+                std::vector<llama_split_mode> modes;
+                for (const auto & m : p) {
+                    llama_split_mode mode;
+                    if (m == "none") {
+                        mode = LLAMA_SPLIT_MODE_NONE;
+                    } else if (m == "layer") {
+                        mode = LLAMA_SPLIT_MODE_LAYER;
+                    } else if (m == "row") {
+                        mode = LLAMA_SPLIT_MODE_ROW;
+                    } else {
+                        invalid_param = true;
+                        break;
+                    }
+                    modes.push_back(mode);
+                }
+                if (invalid_param) {
+                    break;
+                }
+                params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
+            } else if (arg == "-mg" || arg == "--main-gpu") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                params.main_gpu = parse_int_range(argv[i]);
+            } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = string_split<bool>(argv[i], split_delim);
+                params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
+            } else if (arg == "--numa") {
+                if (++i >= argc) {
                     invalid_param = true;
                     break;
                 }
-                modes.push_back(mode);
-            }
-            if (invalid_param) {
-                break;
-            }
-            params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
-        } else if (arg == "-mg" || arg == "--main-gpu") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.main_gpu = string_split<int>(argv[i], split_delim);
-        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<bool>(argv[i], split_delim);
-            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
-        } else if (arg == "--numa") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            } else {
                 std::string value(argv[i]);
-                /**/ if (value == "distribute" || value == "") {
+                if (value == "distribute" || value == "") {
                     params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
                 } else if (value == "isolate") {
                     params.numa = GGML_NUMA_STRATEGY_ISOLATE;
@@ -566,170 +632,183 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                     invalid_param = true;
                    break;
                 }
-            }
-        } else if (arg == "-fa" || arg == "--flash-attn") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<bool>(argv[i], split_delim);
-            params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
-        } else if (arg == "-mmp" || arg == "--mmap") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<bool>(argv[i], split_delim);
-            params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
-        } else if (arg == "-embd" || arg == "--embeddings") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<bool>(argv[i], split_delim);
-            params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
-        } else if (arg == "-ts" || arg == "--tensor-split") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            for (auto ts : string_split<std::string>(argv[i], split_delim)) {
-                // split string by ; and /
-                const std::regex regex{ R"([;/]+)" };
-                std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
-                std::vector<std::string> split_arg{ it, {} };
-                GGML_ASSERT(split_arg.size() <= llama_max_devices());
-
-                std::vector<float> tensor_split(llama_max_devices());
-                for (size_t i = 0; i < llama_max_devices(); ++i) {
-                    if (i < split_arg.size()) {
-                        tensor_split[i] = std::stof(split_arg[i]);
-                    } else {
-                        tensor_split[i] = 0.0f;
+            } else if (arg == "-fa" || arg == "--flash-attn") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = string_split<bool>(argv[i], split_delim);
+                params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
+            } else if (arg == "-mmp" || arg == "--mmap") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = string_split<bool>(argv[i], split_delim);
+                params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
+            } else if (arg == "-embd" || arg == "--embeddings") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = string_split<bool>(argv[i], split_delim);
+                params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
+            } else if (arg == "-nopo" || arg == "--no-op-offload") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = string_split<bool>(argv[i], split_delim);
+                params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
+            } else if (arg == "-ts" || arg == "--tensor-split") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                for (auto ts : string_split<std::string>(argv[i], split_delim)) {
+                    // split string by ; and /
+                    const std::regex regex{ R"([;/]+)" };
+                    std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
+                    std::vector<std::string> split_arg{ it, {} };
+                    GGML_ASSERT(split_arg.size() <= llama_max_devices());
+
+                    std::vector<float> tensor_split(llama_max_devices());
+                    for (size_t i = 0; i < llama_max_devices(); ++i) {
+                        if (i < split_arg.size()) {
+                            tensor_split[i] = std::stof(split_arg[i]);
+                        } else {
+                            tensor_split[i] = 0.0f;
+                        }
                     }
+                    params.tensor_split.push_back(tensor_split);
                 }
-                params.tensor_split.push_back(tensor_split);
-            }
-        } else if (arg == "-ot" || arg == "--override-tensor") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto value = argv[i];
-            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
-            if (buft_list.empty()) {
-                // enumerate all the devices and add their buffer types to the list
-                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-                    auto * dev = ggml_backend_dev_get(i);
-                    auto * buft = ggml_backend_dev_buffer_type(dev);
-                    if (buft) {
-                        buft_list[ggml_backend_buft_name(buft)] = buft;
+            } else if (arg == "-ot" || arg == "--override-tensor") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto * value = argv[i];
+                /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+                if (buft_list.empty()) {
+                    // enumerate all the devices and add their buffer types to the list
+                    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                        auto * dev = ggml_backend_dev_get(i);
+                        auto * buft = ggml_backend_dev_buffer_type(dev);
+                        if (buft) {
+                            buft_list[ggml_backend_buft_name(buft)] = buft;
+                        }
                     }
                 }
-            }
-            auto override_group_span_len = std::strcspn(value, ",");
-            bool last_group = false;
-            do {
-                if (override_group_span_len == 0) {
-                    // Adds an empty override-tensors for an empty span
-                    params.tensor_buft_overrides.push_back({{}});
+                auto override_group_span_len = std::strcspn(value, ",");
+                bool last_group = false;
+                do {
+                    if (override_group_span_len == 0) {
+                        // Adds an empty override-tensors for an empty span
+                        params.tensor_buft_overrides.push_back({{}});
+                        if (value[override_group_span_len] == '\0') {
+                            value = &value[override_group_span_len];
+                            last_group = true;
+                        } else {
+                            value = &value[override_group_span_len + 1];
+                            override_group_span_len = std::strcspn(value, ",");
+                        }
+                        continue;
+                    }
+                    // Stamps null terminators into the argv
+                    // value for this option to avoid the
+                    // memory leak present in the implementation
+                    // over in arg.cpp. Acceptable because we
+                    // only parse these args once in this program.
+                    auto * override_group = value;
                     if (value[override_group_span_len] == '\0') {
                         value = &value[override_group_span_len];
                         last_group = true;
                     } else {
+                        value[override_group_span_len] = '\0';
                         value = &value[override_group_span_len + 1];
-                        override_group_span_len = std::strcspn(value, ",");
                     }
-                    continue;
-                }
-                // Stamps null terminators into the argv
-                // value for this option to avoid the
-                // memory leak present in the implementation
-                // over in arg.cpp. Acceptable because we
-                // only parse these args once in this program.
-                auto override_group = value;
-                if (value[override_group_span_len] == '\0') {
-                    value = &value[override_group_span_len];
-                    last_group = true;
-                } else {
-                    value[override_group_span_len] = '\0';
-                    value = &value[override_group_span_len + 1];
-                }
-                std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
-                auto override_span_len = std::strcspn(override_group, ";");
-                while (override_span_len > 0) {
-                    auto override = override_group;
-                    if (override_group[override_span_len] != '\0') {
-                        override_group[override_span_len] = '\0';
-                        override_group = &override_group[override_span_len + 1];
-                    } else {
-                        override_group = &override_group[override_span_len];
-                    }
-                    auto tensor_name_span_len = std::strcspn(override, "=");
-                    if (tensor_name_span_len >= override_span_len) {
-                        invalid_param = true;
-                        break;
-                    }
-                    override[tensor_name_span_len] = '\0';
-                    auto tensor_name = override;
-                    auto buffer_type = &override[tensor_name_span_len + 1];
-                    if (buft_list.find(buffer_type) == buft_list.end()) {
-                        printf("Available buffer types:\n");
-                        for (const auto & it : buft_list) {
-                            printf("  %s\n", ggml_backend_buft_name(it.second));
+                    std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
+                    auto override_span_len = std::strcspn(override_group, ";");
+                    while (override_span_len > 0) {
+                        auto * override = override_group;
+                        if (override_group[override_span_len] != '\0') {
+                            override_group[override_span_len] = '\0';
+                            override_group = &override_group[override_span_len + 1];
+                        } else {
+                            override_group = &override_group[override_span_len];
                         }
-                        invalid_param = true;
+                        auto tensor_name_span_len = std::strcspn(override, "=");
+                        if (tensor_name_span_len >= override_span_len) {
+                            invalid_param = true;
+                            break;
+                        }
+                        override[tensor_name_span_len] = '\0';
+                        auto * tensor_name = override;
+                        auto * buffer_type = &override[tensor_name_span_len + 1];
+                        if (buft_list.find(buffer_type) == buft_list.end()) {
+                            printf("error: unrecognized buffer type '%s'\n", buffer_type);
+                            printf("Available buffer types:\n");
+                            for (const auto & it : buft_list) {
+                                printf("  %s\n", ggml_backend_buft_name(it.second));
+                            }
+                            invalid_param = true;
+                            break;
+                        }
+                        group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)});
+                        override_span_len = std::strcspn(override_group, ";");
+                    }
+                    if (invalid_param) {
                         break;
                     }
-                    group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)});
-                    override_span_len = std::strcspn(override_group, ";");
+                    group_tensor_buft_overrides.push_back({nullptr,nullptr});
+                    params.tensor_buft_overrides.push_back(group_tensor_buft_overrides);
+                    override_group_span_len = std::strcspn(value, ",");
+                } while (!last_group);
+            } else if (arg == "-r" || arg == "--repetitions") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
                 }
-                if (invalid_param) {
+                params.reps = std::stoi(argv[i]);
+            } else if (arg == "--prio") {
+                if (++i >= argc) {
+                    invalid_param = true;
                     break;
                 }
-                group_tensor_buft_overrides.push_back({nullptr,nullptr});
-                params.tensor_buft_overrides.push_back(group_tensor_buft_overrides);
-                override_group_span_len = std::strcspn(value, ",");
-            } while (!last_group);
-        } else if (arg == "-r" || arg == "--repetitions") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.reps = std::stoi(argv[i]);
-        } else if (arg == "--prio") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
-        } else if (arg == "--delay") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.delay = std::stoi(argv[i]);
-        } else if (arg == "-o" || arg == "--output") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            invalid_param = !output_format_from_str(argv[i], params.output_format);
-        } else if (arg == "-oe" || arg == "--output-err") {
-            if (++i >= argc) {
+                params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
+            } else if (arg == "--delay") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                params.delay = std::stoi(argv[i]);
+            } else if (arg == "-o" || arg == "--output") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                invalid_param = !output_format_from_str(argv[i], params.output_format);
+            } else if (arg == "-oe" || arg == "--output-err") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
+            } else if (arg == "-v" || arg == "--verbose") {
+                params.verbose = true;
+            } else if (arg == "--progress") {
+                params.progress = true;
+            } else {
                 invalid_param = true;
                 break;
            }
-            invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
-        } else if (arg == "-v" || arg == "--verbose") {
-            params.verbose = true;
-        } else if (arg == "--progress") {
-            params.progress = true;
-        } else {
+        } catch (const std::exception & e) {
+            fprintf(stderr, "error: %s\n", e.what());
             invalid_param = true;
             break;
         }
     }
+
     if (invalid_param) {
         fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
         print_usage(argc, argv);
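Note: two things stand out in the rewritten parser above. First, the whole option chain now sits in a try/catch, so exceptions thrown by std::stoi or parse_int_range surface as a single "error: ..." message instead of terminating the program. Second, the -ot/--override-tensor handler splits its argument in place by stamping '\0' over delimiters located with strcspn — as its comment explains, this avoids the leak in arg.cpp's implementation. A self-contained sketch of that splitting technique (hypothetical buffer contents, for illustration; the real code also validates that each piece contains '=' and looks the buffer type up in buft_list):

```cpp
// Minimal sketch of the in-place splitting used by -ot above: strcspn finds
// the next delimiter and a '\0' is stamped over it, so each piece can be used
// as a C string without copying. Hypothetical example, not part of the patch.
#include <cstdio>
#include <cstring>

int main() {
    char buf[] = "blk\\.0\\.=CUDA0;blk\\.1\\.=CPU";  // mutable, like argv[i]
    char * cur = buf;
    size_t span = strcspn(cur, ";");
    while (span > 0) {
        char * item = cur;
        if (cur[span] != '\0') {
            cur[span] = '\0';       // terminate this piece in place
            cur = &cur[span + 1];   // continue after the delimiter
        } else {
            cur = &cur[span];       // last piece: stay on the terminator
        }
        size_t eq = strcspn(item, "=");
        item[eq] = '\0';            // split "pattern=buffer type"
        printf("pattern: %-10s buffer type: %s\n", item, &item[eq + 1]);
        span = strcspn(cur, ";");
    }
}
```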
@@ -764,6 +843,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.type_v.empty()) {
         params.type_v = cmd_params_defaults.type_v;
     }
+    if (params.defrag_thold.empty()) {
+        params.defrag_thold = cmd_params_defaults.defrag_thold;
+    }
     if (params.n_gpu_layers.empty()) {
         params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
     }
@@ -794,6 +876,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.embeddings.empty()) {
         params.embeddings = cmd_params_defaults.embeddings;
     }
+    if (params.no_op_offload.empty()) {
+        params.no_op_offload = cmd_params_defaults.no_op_offload;
+    }
     if (params.n_threads.empty()) {
         params.n_threads = cmd_params_defaults.n_threads;
     }
@@ -819,6 +904,7 @@ struct cmd_params_instance {
     int n_ubatch;
     ggml_type type_k;
     ggml_type type_v;
+    float defrag_thold;
     int n_threads;
     std::string cpu_mask;
     bool cpu_strict;
@@ -833,6 +919,7 @@ struct cmd_params_instance {
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool use_mmap;
     bool embeddings;
+    bool no_op_offload;
 
     llama_model_params to_llama_mparams() const {
         llama_model_params mparams = llama_model_default_params();
@@ -894,14 +981,16 @@ struct cmd_params_instance {
     llama_context_params to_llama_cparams() const {
         llama_context_params cparams = llama_context_default_params();
 
-        cparams.n_ctx = n_prompt + n_gen + n_depth;
-        cparams.n_batch = n_batch;
-        cparams.n_ubatch = n_ubatch;
-        cparams.type_k = type_k;
-        cparams.type_v = type_v;
-        cparams.offload_kqv = !no_kv_offload;
-        cparams.flash_attn = flash_attn;
-        cparams.embeddings = embeddings;
+        cparams.n_ctx        = n_prompt + n_gen + n_depth;
+        cparams.n_batch      = n_batch;
+        cparams.n_ubatch     = n_ubatch;
+        cparams.type_k       = type_k;
+        cparams.type_v       = type_v;
+        cparams.defrag_thold = defrag_thold;
+        cparams.offload_kqv  = !no_kv_offload;
+        cparams.flash_attn   = flash_attn;
+        cparams.embeddings   = embeddings;
+        cparams.op_offload   = !no_op_offload;
 
         return cparams;
     }
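Note: the two new knobs flow into llama_context_params here — defrag_thold maps onto cparams.defrag_thold (the KV-cache defragmentation threshold; the default of -1.0f above leaves it disabled), and -nopo is inverted into cparams.op_offload. A hypothetical sketch of the mapping, assuming the llama.h fields used by to_llama_cparams() above:

```cpp
#include "llama.h"

// Hypothetical helper mirroring to_llama_cparams() above; a sketch,
// not part of the patch.
llama_context_params bench_cparams(float defrag_thold, bool no_op_offload) {
    llama_context_params cparams = llama_context_default_params();
    cparams.defrag_thold = defrag_thold;    // < 0.0f (the default) disables KV-cache defrag
    cparams.op_offload   = !no_op_offload;  // benchmarking with -nopo 1 turns this off
    return cparams;
}
```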
@@ -921,10 +1010,12 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & ot : params.tensor_buft_overrides)
     for (const auto & mmp : params.use_mmap)
     for (const auto & embd : params.embeddings)
+    for (const auto & nopo : params.no_op_offload)
     for (const auto & nb : params.n_batch)
     for (const auto & nub : params.n_ubatch)
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
+    for (const auto & defrag_thold : params.defrag_thold)
     for (const auto & nkvo : params.no_kv_offload)
     for (const auto & fa : params.flash_attn)
     for (const auto & nt : params.n_threads)
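Note: each std::vector-valued parameter contributes one range-based for to this pyramid, so get_cmd_params_instances enumerates the full cross-product of all parameter lists; the two added loops simply extend that product with no_op_offload and defrag_thold. A small illustration (hypothetical values) of how the combinations multiply:

```cpp
// Hypothetical illustration: each parameter vector multiplies the number of
// benchmark instances, mirroring the nested loops above.
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> n_batch   = { 128, 256 };  // e.g. -b 128,256
    std::vector<int> n_threads = { 4, 8, 16 };  // e.g. -t 4-16*2
    int instance = 0;
    for (int nb : n_batch)
    for (int nt : n_threads) {
        printf("instance %d: n_batch=%d n_threads=%d\n", ++instance, nb, nt);
    }
    // 2 batch sizes x 3 thread counts = 6 instances, each repeated -r times
}
```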
@@ -945,6 +1036,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
            /* .n_ubatch     = */ nub,
            /* .type_k       = */ tk,
            /* .type_v       = */ tv,
+           /* .defrag_thold = */ defrag_thold,
            /* .n_threads    = */ nt,
            /* .cpu_mask     = */ cm,
            /* .cpu_strict   = */ cs,
@@ -959,6 +1051,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
            /* .tensor_buft_overrides = */ ot,
            /* .use_mmap     = */ mmp,
            /* .embeddings   = */ embd,
+           /* .no_op_offload= */ nopo,
        };
        instances.push_back(instance);
    }
@@ -976,6 +1069,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
            /* .n_ubatch     = */ nub,
            /* .type_k       = */ tk,
            /* .type_v       = */ tv,
+           /* .defrag_thold = */ defrag_thold,
            /* .n_threads    = */ nt,
            /* .cpu_mask     = */ cm,
            /* .cpu_strict   = */ cs,
@@ -990,6 +1084,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
            /* .tensor_buft_overrides = */ ot,
            /* .use_mmap     = */ mmp,
            /* .embeddings   = */ embd,
+           /* .no_op_offload= */ nopo,
        };
        instances.push_back(instance);
    }
@@ -1007,6 +1102,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
            /* .n_ubatch     = */ nub,
            /* .type_k       = */ tk,
            /* .type_v       = */ tv,
+           /* .defrag_thold = */ defrag_thold,
            /* .n_threads    = */ nt,
            /* .cpu_mask     = */ cm,
            /* .cpu_strict   = */ cs,
@@ -1021,6 +1117,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
            /* .tensor_buft_overrides = */ ot,
            /* .use_mmap     = */ mmp,
            /* .embeddings   = */ embd,
+           /* .no_op_offload= */ nopo,
        };
        instances.push_back(instance);
    }
@@ -1047,6 +1144,7 @@ struct test {
     int poll;
     ggml_type type_k;
     ggml_type type_v;
+    float defrag_thold;
     int n_gpu_layers;
     llama_split_mode split_mode;
     int main_gpu;
@@ -1056,6 +1154,7 @@ struct test {
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool use_mmap;
     bool embeddings;
+    bool no_op_offload;
     int n_prompt;
     int n_gen;
     int n_depth;
@@ -1080,6 +1179,7 @@ struct test {
         poll = inst.poll;
         type_k = inst.type_k;
         type_v = inst.type_v;
+        defrag_thold = inst.defrag_thold;
         n_gpu_layers = inst.n_gpu_layers;
         split_mode = inst.split_mode;
         main_gpu = inst.main_gpu;
@@ -1089,6 +1189,7 @@ struct test {
         tensor_buft_overrides = inst.tensor_buft_overrides;
         use_mmap = inst.use_mmap;
         embeddings = inst.embeddings;
+        no_op_offload = inst.no_op_offload;
         n_prompt = inst.n_prompt;
         n_gen = inst.n_gen;
         n_depth = inst.n_depth;
@@ -1134,7 +1235,8 @@ struct test {
            "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
            "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
            "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
-           "use_mmap", "embeddings", "n_prompt", "n_gen", "n_depth", "test_time",
+           "defrag_thold",
+           "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
            "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
        };
        return fields;
@@ -1146,14 +1248,14 @@ struct test {
        if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
            field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
            field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
-           field == "avg_ns" || field == "stddev_ns") {
+           field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") {
            return INT;
        }
        if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
            field == "use_mmap" || field == "embeddings") {
            return BOOL;
        }
-       if (field == "avg_ts" || field == "stddev_ts") {
+       if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
            return FLOAT;
        }
        return STRING;
@@ -1220,8 +1322,10 @@ struct test {
            std::to_string(flash_attn),
            tensor_split_str,
            tensor_buft_overrides_str,
+           std::to_string(defrag_thold),
            std::to_string(use_mmap),
            std::to_string(embeddings),
+           std::to_string(no_op_offload),
            std::to_string(n_prompt),
            std::to_string(n_gen),
            std::to_string(n_depth),
@@ -1404,6 +1508,9 @@ struct markdown_printer : public printer {
        if (field == "test") {
            return 15;
        }
+       if (field == "no_op_offload") {
+           return 4;
+       }
 
        int width = std::max((int) field.length(), 10);
 
@@ -1435,6 +1542,9 @@ struct markdown_printer : public printer {
        if (field == "embeddings") {
            return "embd";
        }
+       if (field == "no_op_offload") {
+           return "nopo";
+       }
        if (field == "tensor_split") {
            return "ts";
        }
@@ -1479,6 +1589,9 @@ struct markdown_printer : public printer {
        if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
            fields.emplace_back("type_v");
        }
+       if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) {
+           fields.emplace_back("defrag_thold");
+       }
        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
            fields.emplace_back("main_gpu");
        }
@@ -1503,6 +1616,9 @@ struct markdown_printer : public printer {
        if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
            fields.emplace_back("embeddings");
        }
+       if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
+           fields.emplace_back("no_op_offload");
+       }
        fields.emplace_back("test");
        fields.emplace_back("t/s");
 
@@ -1621,7 +1737,7 @@ struct sql_printer : public printer {
    }
 };
 
-static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
+static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
    llama_set_n_threads(ctx, n_threads, n_threads);
 
    const llama_model * model = llama_get_model(ctx);
@@ -1638,14 +1754,19 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
        for (int i = 1; i < n_tokens; i++) {
            tokens[i] = std::rand() % n_vocab;
        }
-       llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
+       int res = llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
+       if (res != 0) {
+           fprintf(stderr, "%s: failed to decode prompt batch, res = %d\n", __func__, res);
+           return false;
+       }
        n_processed += n_tokens;
    }
 
    llama_synchronize(ctx);
+   return true;
 }
 
-static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
+static bool test_gen(llama_context * ctx, int n_gen, int n_threads) {
    llama_set_n_threads(ctx, n_threads, n_threads);
 
    const llama_model * model = llama_get_model(ctx);
@@ -1655,10 +1776,15 @@ static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
    llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
 
    for (int i = 0; i < n_gen; i++) {
-       llama_decode(ctx, llama_batch_get_one(&token, 1));
+       int res = llama_decode(ctx, llama_batch_get_one(&token, 1));
+       if (res != 0) {
+           fprintf(stderr, "%s: failed to decode generation batch, res = %d\n", __func__, res);
+           return false;
+       }
        llama_synchronize(ctx);
        token = std::rand() % n_vocab;
    }
+   return true;
 }
 
 static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
@@ -1701,10 +1827,11 @@ int main(int argc, char ** argv) {
    fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
 #endif
 
-   cmd_params params = parse_cmd_params(argc, argv);
-
    // initialize backends
    ggml_backend_load_all();
+
+   cmd_params params = parse_cmd_params(argc, argv);
+
    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (!cpu_dev) {
        fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
@@ -1802,13 +1929,21 @@ int main(int argc, char ** argv) {
                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
            }
            //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
-           test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
+           bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
+           if (!res) {
+               fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
+               exit(1);
+           }
        }
        if (t.n_gen > 0) {
            if (params.progress) {
                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
            }
-           test_gen(ctx, 1, t.n_threads);
+           bool res = test_gen(ctx, 1, t.n_threads);
+           if (!res) {
+               fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
+               exit(1);
+           }
        }
 
        for (int i = 0; i < params.reps; i++) {
@@ -1819,7 +1954,11 @@ int main(int argc, char ** argv) {
                    fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
                            i + 1, params.reps);
                }
-               test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
+               bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
+               if (!res) {
+                   fprintf(stderr, "%s: error: failed to run depth\n", __func__);
+                   exit(1);
+               }
            }
 
            uint64_t t_start = get_time_ns();
@@ -1829,14 +1968,22 @@ int main(int argc, char ** argv) {
                    fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
                            i + 1, params.reps);
                }
-               test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
+               bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
+               if (!res) {
+                   fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
+                   exit(1);
+               }
            }
            if (t.n_gen > 0) {
                if (params.progress) {
                    fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
                            i + 1, params.reps);
                }
-               test_gen(ctx, t.n_gen, t.n_threads);
+               bool res = test_gen(ctx, t.n_gen, t.n_threads);
+               if (!res) {
+                   fprintf(stderr, "%s: error: failed to run gen\n", __func__);
+                   exit(1);
+               }
            }
 
            uint64_t t_ns = get_time_ns() - t_start;