@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
@@ -16,6 +16,7 @@
16
16
  #include <sstream>
17
17
  #include <string>
18
18
  #include <vector>
19
+ #include <thread>
19
20
 
20
21
  #include "ggml.h"
21
22
  #include "llama.h"
@@ -27,6 +28,14 @@
27
28
  #include "ggml-cann.h"
28
29
  #endif
29
30
 
31
+ #ifdef _WIN32
32
+ #define WIN32_LEAN_AND_MEAN
33
+ #ifndef NOMINMAX
34
+ # define NOMINMAX
35
+ #endif
36
+ #include <windows.h>
37
+ #endif
38
+
30
39
  // utils
31
40
  static uint64_t get_time_ns() {
32
41
  using clock = std::chrono::high_resolution_clock;
@@ -96,6 +105,30 @@ static std::string get_cpu_info() {
96
105
  }
97
106
  fclose(f);
98
107
  }
108
+ #elif defined(_WIN32)
109
+ HKEY hKey;
110
+ if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
111
+ TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
112
+ 0,
113
+ KEY_READ,
114
+ &hKey) != ERROR_SUCCESS) {
115
+ // fail to open registry key
116
+ return "";
117
+ }
118
+ char cpu_brand[256];
119
+ DWORD cpu_brand_size = sizeof(cpu_brand);
120
+ if (RegQueryValueExA(hKey,
121
+ TEXT("ProcessorNameString"),
122
+ NULL,
123
+ NULL,
124
+ (LPBYTE)cpu_brand,
125
+ &cpu_brand_size) == ERROR_SUCCESS) {
126
+ id.assign(cpu_brand, cpu_brand_size);
127
+ if (id.find('\0') != std::string::npos) {
128
+ id.resize(id.find('\0'));
129
+ }
130
+ }
131
+ RegCloseKey(hKey);
99
132
  #endif
100
133
  // TODO: other platforms
101
134
  return id;
@@ -141,13 +174,14 @@ static std::string get_gpu_info() {
141
174
  }
142
175
 
143
176
  // command line params
144
- enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
177
+ enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};
145
178
 
146
179
  static const char * output_format_str(output_formats format) {
147
180
  switch (format) {
148
181
  case NONE: return "none";
149
182
  case CSV: return "csv";
150
183
  case JSON: return "json";
184
+ case JSONL: return "jsonl";
151
185
  case MARKDOWN: return "md";
152
186
  case SQL: return "sql";
153
187
  default: GGML_ABORT("invalid output format");
@@ -161,6 +195,8 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
161
195
  format = CSV;
162
196
  } else if (s == "json") {
163
197
  format = JSON;
198
+ } else if (s == "jsonl") {
199
+ format = JSONL;
164
200
  } else if (s == "md") {
165
201
  format = MARKDOWN;
166
202
  } else if (s == "sql") {
@@ -196,6 +232,9 @@ struct cmd_params {
196
232
  std::vector<ggml_type> type_k;
197
233
  std::vector<ggml_type> type_v;
198
234
  std::vector<int> n_threads;
235
+ std::vector<std::string> cpu_mask;
236
+ std::vector<bool> cpu_strict;
237
+ std::vector<int> poll;
199
238
  std::vector<int> n_gpu_layers;
200
239
  std::vector<std::string> rpc_servers;
201
240
  std::vector<llama_split_mode> split_mode;
@@ -207,7 +246,10 @@ struct cmd_params {
207
246
  std::vector<bool> embeddings;
208
247
  ggml_numa_strategy numa;
209
248
  int reps;
249
+ ggml_sched_priority prio;
250
+ int delay;
210
251
  bool verbose;
252
+ bool progress;
211
253
  output_formats output_format;
212
254
  output_formats output_format_stderr;
213
255
  };
@@ -222,6 +264,9 @@ static const cmd_params cmd_params_defaults = {
222
264
  /* type_k */ {GGML_TYPE_F16},
223
265
  /* type_v */ {GGML_TYPE_F16},
224
266
  /* n_threads */ {cpu_get_num_math()},
267
+ /* cpu_mask */ {"0x0"},
268
+ /* cpu_strict */ {false},
269
+ /* poll */ {50},
225
270
  /* n_gpu_layers */ {99},
226
271
  /* rpc_servers */ {""},
227
272
  /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
@@ -233,7 +278,10 @@ static const cmd_params cmd_params_defaults = {
233
278
  /* embeddings */ {false},
234
279
  /* numa */ GGML_NUMA_STRATEGY_DISABLED,
235
280
  /* reps */ 5,
281
+ /* prio */ GGML_SCHED_PRIO_NORMAL,
282
+ /* delay */ 0,
236
283
  /* verbose */ false,
284
+ /* progress */ false,
237
285
  /* output_format */ MARKDOWN,
238
286
  /* output_format_stderr */ NONE,
239
287
  };
@@ -243,29 +291,37 @@ static void print_usage(int /* argc */, char ** argv) {
243
291
  printf("\n");
244
292
  printf("options:\n");
245
293
  printf(" -h, --help\n");
246
- printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
247
- printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
248
- printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
249
- printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
250
- printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
251
- printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
252
- printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
253
- printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
254
- printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
255
- printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
256
- printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
257
- printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
258
- printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
259
- printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
260
- printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
261
- printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
262
- printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
263
- printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
264
- printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
265
- printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
266
- printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
267
- printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
268
- printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
294
+ printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
295
+ printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
296
+ printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
297
+ printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
298
+ printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
299
+ printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
300
+ printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
301
+ printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
302
+ printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
303
+ printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
304
+ printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
305
+ printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
306
+ printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
307
+ #ifdef GGML_USE_RPC
308
+ printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
309
+ #endif
310
+ printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
311
+ printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
312
+ printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
313
+ printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
314
+ printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
315
+ printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
316
+ printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
317
+ printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
318
+ printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
319
+ printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
320
+ printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
321
+ printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
322
+ printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
323
+ printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
324
+ printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
269
325
  printf("\n");
270
326
  printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
271
327
  }
@@ -309,6 +365,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
309
365
  params.output_format_stderr = cmd_params_defaults.output_format_stderr;
310
366
  params.reps = cmd_params_defaults.reps;
311
367
  params.numa = cmd_params_defaults.numa;
368
+ params.prio = cmd_params_defaults.prio;
369
+ params.delay = cmd_params_defaults.delay;
370
+ params.progress = cmd_params_defaults.progress;
312
371
 
313
372
  for (int i = 1; i < argc; i++) {
314
373
  arg = argv[i];
@@ -380,6 +439,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
380
439
  }
381
440
  types.push_back(gt);
382
441
  }
442
+ if (invalid_param) {
443
+ break;
444
+ }
383
445
  params.type_k.insert(params.type_k.end(), types.begin(), types.end());
384
446
  } else if (arg == "-ctv" || arg == "--cache-type-v") {
385
447
  if (++i >= argc) {
@@ -396,6 +458,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
396
458
  }
397
459
  types.push_back(gt);
398
460
  }
461
+ if (invalid_param) {
462
+ break;
463
+ }
399
464
  params.type_v.insert(params.type_v.end(), types.begin(), types.end());
400
465
  } else if (arg == "-t" || arg == "--threads") {
401
466
  if (++i >= argc) {
@@ -404,6 +469,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
404
469
  }
405
470
  auto p = string_split<int>(argv[i], split_delim);
406
471
  params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
472
+ } else if (arg == "-C" || arg == "--cpu-mask") {
473
+ if (++i >= argc) {
474
+ invalid_param = true;
475
+ break;
476
+ }
477
+ auto p = string_split<std::string>(argv[i], split_delim);
478
+ params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
479
+ } else if (arg == "--cpu-strict") {
480
+ if (++i >= argc) {
481
+ invalid_param = true;
482
+ break;
483
+ }
484
+ auto p = string_split<bool>(argv[i], split_delim);
485
+ params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
486
+ } else if (arg == "--poll") {
487
+ if (++i >= argc) {
488
+ invalid_param = true;
489
+ break;
490
+ }
491
+ auto p = string_split<int>(argv[i], split_delim);
492
+ params.poll.insert(params.poll.end(), p.begin(), p.end());
407
493
  } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
408
494
  if (++i >= argc) {
409
495
  invalid_param = true;
@@ -411,12 +497,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
411
497
  }
412
498
  auto p = string_split<int>(argv[i], split_delim);
413
499
  params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
500
+ #ifdef GGML_USE_RPC
414
501
  } else if (arg == "-rpc" || arg == "--rpc") {
415
502
  if (++i >= argc) {
416
503
  invalid_param = true;
417
504
  break;
418
505
  }
419
506
  params.rpc_servers.push_back(argv[i]);
507
+ #endif
420
508
  } else if (arg == "-sm" || arg == "--split-mode") {
421
509
  if (++i >= argc) {
422
510
  invalid_param = true;
@@ -438,6 +526,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
438
526
  }
439
527
  modes.push_back(mode);
440
528
  }
529
+ if (invalid_param) {
530
+ break;
531
+ }
441
532
  params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
442
533
  } else if (arg == "-mg" || arg == "--main-gpu") {
443
534
  if (++i >= argc) {
@@ -512,6 +603,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
512
603
  break;
513
604
  }
514
605
  params.reps = std::stoi(argv[i]);
606
+ } else if (arg == "--prio") {
607
+ if (++i >= argc) {
608
+ invalid_param = true;
609
+ break;
610
+ }
611
+ params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
612
+ } else if (arg == "--delay") {
613
+ if (++i >= argc) {
614
+ invalid_param = true;
615
+ break;
616
+ }
617
+ params.delay = std::stoi(argv[i]);
515
618
  } else if (arg == "-o" || arg == "--output") {
516
619
  if (++i >= argc) {
517
620
  invalid_param = true;
@@ -526,6 +629,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
526
629
  invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
527
630
  } else if (arg == "-v" || arg == "--verbose") {
528
631
  params.verbose = true;
632
+ } else if (arg == "--progress") {
633
+ params.progress = true;
529
634
  } else {
530
635
  invalid_param = true;
531
636
  break;
@@ -556,6 +661,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
556
661
  if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
557
662
  if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
558
663
  if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
664
+ if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
665
+ if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
666
+ if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
559
667
 
560
668
  return params;
561
669
  }
@@ -569,6 +677,9 @@ struct cmd_params_instance {
569
677
  ggml_type type_k;
570
678
  ggml_type type_v;
571
679
  int n_threads;
680
+ std::string cpu_mask;
681
+ bool cpu_strict;
682
+ int poll;
572
683
  int n_gpu_layers;
573
684
  std::string rpc_servers;
574
685
  llama_split_mode split_mode;
@@ -638,7 +749,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
638
749
  for (const auto & tv : params.type_v)
639
750
  for (const auto & nkvo : params.no_kv_offload)
640
751
  for (const auto & fa : params.flash_attn)
641
- for (const auto & nt : params.n_threads) {
752
+ for (const auto & nt : params.n_threads)
753
+ for (const auto & cm : params.cpu_mask)
754
+ for (const auto & cs : params.cpu_strict)
755
+ for (const auto & pl : params.poll) {
642
756
  for (const auto & n_prompt : params.n_prompt) {
643
757
  if (n_prompt == 0) {
644
758
  continue;
@@ -652,6 +766,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
652
766
  /* .type_k = */ tk,
653
767
  /* .type_v = */ tv,
654
768
  /* .n_threads = */ nt,
769
+ /* .cpu_mask = */ cm,
770
+ /* .cpu_strict = */ cs,
771
+ /* .poll = */ pl,
655
772
  /* .n_gpu_layers = */ nl,
656
773
  /* .rpc_servers = */ rpc,
657
774
  /* .split_mode = */ sm,
@@ -678,6 +795,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
678
795
  /* .type_k = */ tk,
679
796
  /* .type_v = */ tv,
680
797
  /* .n_threads = */ nt,
798
+ /* .cpu_mask = */ cm,
799
+ /* .cpu_strict = */ cs,
800
+ /* .poll = */ pl,
681
801
  /* .n_gpu_layers = */ nl,
682
802
  /* .rpc_servers = */ rpc,
683
803
  /* .split_mode = */ sm,
@@ -704,6 +824,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
704
824
  /* .type_k = */ tk,
705
825
  /* .type_v = */ tv,
706
826
  /* .n_threads = */ nt,
827
+ /* .cpu_mask = */ cm,
828
+ /* .cpu_strict = */ cs,
829
+ /* .poll = */ pl,
707
830
  /* .n_gpu_layers = */ nl,
708
831
  /* .rpc_servers = */ rpc,
709
832
  /* .split_mode = */ sm,
@@ -740,6 +863,9 @@ struct test {
740
863
  int n_batch;
741
864
  int n_ubatch;
742
865
  int n_threads;
866
+ std::string cpu_mask;
867
+ bool cpu_strict;
868
+ int poll;
743
869
  bool has_rpc;
744
870
  ggml_type type_k;
745
871
  ggml_type type_v;
@@ -766,6 +892,9 @@ struct test {
766
892
  n_batch = inst.n_batch;
767
893
  n_ubatch = inst.n_ubatch;
768
894
  n_threads = inst.n_threads;
895
+ cpu_mask = inst.cpu_mask;
896
+ cpu_strict = inst.cpu_strict;
897
+ poll = inst.poll;
769
898
  has_rpc = !inst.rpc_servers.empty();
770
899
  type_k = inst.type_k;
771
900
  type_v = inst.type_v;
@@ -843,13 +972,14 @@ struct test {
843
972
  "cpu_info", "gpu_info",
844
973
  "model_filename", "model_type", "model_size", "model_n_params",
845
974
  "n_batch", "n_ubatch",
846
- "n_threads", "type_k", "type_v",
975
+ "n_threads", "cpu_mask", "cpu_strict", "poll",
976
+ "type_k", "type_v",
847
977
  "n_gpu_layers", "split_mode",
848
978
  "main_gpu", "no_kv_offload", "flash_attn",
849
979
  "tensor_split", "use_mmap", "embeddings",
850
980
  "n_prompt", "n_gen", "test_time",
851
981
  "avg_ns", "stddev_ns",
852
- "avg_ts", "stddev_ts"
982
+ "avg_ts", "stddev_ts",
853
983
  };
854
984
  return fields;
855
985
  }
@@ -858,7 +988,7 @@ struct test {
858
988
 
859
989
  static field_type get_field_type(const std::string & field) {
860
990
  if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
861
- field == "n_threads" ||
991
+ field == "n_threads" || field == "poll" ||
862
992
  field == "model_size" || field == "model_n_params" ||
863
993
  field == "n_gpu_layers" || field == "main_gpu" ||
864
994
  field == "n_prompt" || field == "n_gen" ||
@@ -867,6 +997,7 @@ struct test {
867
997
  }
868
998
  if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
869
999
  field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
1000
+ field == "cpu_strict" ||
870
1001
  field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
871
1002
  return BOOL;
872
1003
  }
@@ -899,7 +1030,8 @@ struct test {
899
1030
  cpu_info, gpu_info,
900
1031
  model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
901
1032
  std::to_string(n_batch), std::to_string(n_ubatch),
902
- std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
1033
+ std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
1034
+ ggml_type_name(type_k), ggml_type_name(type_v),
903
1035
  std::to_string(n_gpu_layers), split_mode_str(split_mode),
904
1036
  std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
905
1037
  tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
@@ -967,37 +1099,38 @@ struct csv_printer : public printer {
967
1099
  }
968
1100
  };
969
1101
 
970
- struct json_printer : public printer {
971
- bool first = true;
972
1102
 
973
- static std::string escape_json(const std::string & value) {
974
- std::string escaped;
975
- for (auto c : value) {
976
- if (c == '"') {
977
- escaped += "\\\"";
978
- } else if (c == '\\') {
979
- escaped += "\\\\";
980
- } else if (c <= 0x1f) {
981
- char buf[8];
982
- snprintf(buf, sizeof(buf), "\\u%04x", c);
983
- escaped += buf;
984
- } else {
985
- escaped += c;
986
- }
1103
+ static std::string escape_json(const std::string & value) {
1104
+ std::string escaped;
1105
+ for (auto c : value) {
1106
+ if (c == '"') {
1107
+ escaped += "\\\"";
1108
+ } else if (c == '\\') {
1109
+ escaped += "\\\\";
1110
+ } else if (c <= 0x1f) {
1111
+ char buf[8];
1112
+ snprintf(buf, sizeof(buf), "\\u%04x", c);
1113
+ escaped += buf;
1114
+ } else {
1115
+ escaped += c;
987
1116
  }
988
- return escaped;
989
1117
  }
1118
+ return escaped;
1119
+ }
990
1120
 
991
- static std::string format_value(const std::string & field, const std::string & value) {
992
- switch (test::get_field_type(field)) {
993
- case test::STRING:
994
- return "\"" + escape_json(value) + "\"";
995
- case test::BOOL:
996
- return value == "0" ? "false" : "true";
997
- default:
998
- return value;
999
- }
1121
+ static std::string format_json_value(const std::string & field, const std::string & value) {
1122
+ switch (test::get_field_type(field)) {
1123
+ case test::STRING:
1124
+ return "\"" + escape_json(value) + "\"";
1125
+ case test::BOOL:
1126
+ return value == "0" ? "false" : "true";
1127
+ default:
1128
+ return value;
1000
1129
  }
1130
+ }
1131
+
1132
+ struct json_printer : public printer {
1133
+ bool first = true;
1001
1134
 
1002
1135
  void print_header(const cmd_params & params) override {
1003
1136
  fprintf(fout, "[\n");
@@ -1007,7 +1140,7 @@ struct json_printer : public printer {
1007
1140
  void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
1008
1141
  assert(fields.size() == values.size());
1009
1142
  for (size_t i = 0; i < fields.size(); i++) {
1010
- fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
1143
+ fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
1011
1144
  }
1012
1145
  }
1013
1146
 
@@ -1030,6 +1163,25 @@ struct json_printer : public printer {
1030
1163
  }
1031
1164
  };
1032
1165
 
1166
+
1167
+ struct jsonl_printer : public printer {
1168
+ void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
1169
+ assert(fields.size() == values.size());
1170
+ for (size_t i = 0; i < fields.size(); i++) {
1171
+ fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
1172
+ }
1173
+ }
1174
+
1175
+ void print_test(const test & t) override {
1176
+ fprintf(fout, "{");
1177
+ print_fields(test::get_fields(), t.get_values());
1178
+ fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
1179
+ fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
1180
+ fprintf(fout, "}\n");
1181
+ fflush(fout);
1182
+ }
1183
+ };
1184
+
1033
1185
  struct markdown_printer : public printer {
1034
1186
  std::vector<std::string> fields;
1035
1187
 
@@ -1038,7 +1190,7 @@ struct markdown_printer : public printer {
1038
1190
  return -30;
1039
1191
  }
1040
1192
  if (field == "t/s") {
1041
- return 16;
1193
+ return 20;
1042
1194
  }
1043
1195
  if (field == "size" || field == "params") {
1044
1196
  return 10;
@@ -1120,6 +1272,15 @@ struct markdown_printer : public printer {
1120
1272
  if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
1121
1273
  fields.emplace_back("n_threads");
1122
1274
  }
1275
+ if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
1276
+ fields.emplace_back("cpu_mask");
1277
+ }
1278
+ if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
1279
+ fields.emplace_back("cpu_strict");
1280
+ }
1281
+ if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
1282
+ fields.emplace_back("poll");
1283
+ }
1123
1284
  if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
1124
1285
  fields.emplace_back("n_batch");
1125
1286
  }
@@ -1321,6 +1482,8 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
1321
1482
  return std::unique_ptr<printer>(new csv_printer());
1322
1483
  case JSON:
1323
1484
  return std::unique_ptr<printer>(new json_printer());
1485
+ case JSONL:
1486
+ return std::unique_ptr<printer>(new jsonl_printer());
1324
1487
  case MARKDOWN:
1325
1488
  return std::unique_ptr<printer>(new markdown_printer());
1326
1489
  case SQL:
@@ -1354,6 +1517,8 @@ int main(int argc, char ** argv) {
1354
1517
  llama_backend_init();
1355
1518
  llama_numa_init(params.numa);
1356
1519
 
1520
+ set_process_priority(params.prio);
1521
+
1357
1522
  // initialize printer
1358
1523
  std::unique_ptr<printer> p = create_printer(params.output_format);
1359
1524
  std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
@@ -1373,7 +1538,13 @@ int main(int argc, char ** argv) {
1373
1538
  llama_model * lmodel = nullptr;
1374
1539
  const cmd_params_instance * prev_inst = nullptr;
1375
1540
 
1541
+ int params_idx = 0;
1542
+ auto params_count = params_instances.size();
1376
1543
  for (const auto & inst : params_instances) {
1544
+ params_idx ++;
1545
+ if (params.progress) {
1546
+ fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
1547
+ }
1377
1548
  // keep the same model between tests when possible
1378
1549
  if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
1379
1550
  if (lmodel) {
@@ -1399,12 +1570,40 @@ int main(int argc, char ** argv) {
1399
1570
 
1400
1571
  llama_kv_cache_clear(ctx);
1401
1572
 
1573
+ // cool off before the test
1574
+ if (params.delay) {
1575
+ std::this_thread::sleep_for(std::chrono::seconds(params.delay));
1576
+ }
1577
+
1578
+ struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
1579
+ if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
1580
+ fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
1581
+ exit(1);
1582
+ }
1583
+ tpp.strict_cpu = t.cpu_strict;
1584
+ tpp.poll = t.poll;
1585
+ tpp.prio = params.prio;
1586
+
1587
+ struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
1588
+ if (!threadpool) {
1589
+ fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
1590
+ exit(1);
1591
+ }
1592
+
1593
+ llama_attach_threadpool(ctx, threadpool, NULL);
1594
+
1402
1595
  // warmup run
1403
1596
  if (t.n_prompt > 0) {
1597
+ if (params.progress) {
1598
+ fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
1599
+ }
1404
1600
  //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
1405
1601
  test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
1406
1602
  }
1407
1603
  if (t.n_gen > 0) {
1604
+ if (params.progress) {
1605
+ fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
1606
+ }
1408
1607
  test_gen(ctx, 1, 0, t.n_threads);
1409
1608
  }
1410
1609
 
@@ -1414,9 +1613,15 @@ int main(int argc, char ** argv) {
1414
1613
  uint64_t t_start = get_time_ns();
1415
1614
 
1416
1615
  if (t.n_prompt > 0) {
1616
+ if (params.progress) {
1617
+ fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
1618
+ }
1417
1619
  test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
1418
1620
  }
1419
1621
  if (t.n_gen > 0) {
1622
+ if (params.progress) {
1623
+ fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
1624
+ }
1420
1625
  test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
1421
1626
  }
1422
1627
 
@@ -1434,9 +1639,11 @@ int main(int argc, char ** argv) {
1434
1639
  fflush(p_err->fout);
1435
1640
  }
1436
1641
 
1437
- llama_print_timings(ctx);
1642
+ llama_perf_context_print(ctx);
1438
1643
 
1439
1644
  llama_free(ctx);
1645
+
1646
+ ggml_threadpool_free(threadpool);
1440
1647
  }
1441
1648
 
1442
1649
  llama_free_model(lmodel);