@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
--- a/package/src/llama.cpp/examples/llama-bench/llama-bench.cpp
+++ b/package/src/llama.cpp/examples/llama-bench/llama-bench.cpp
@@ -16,15 +16,18 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include <thread>
 
 #include "ggml.h"
 #include "llama.h"
 #include "common.h"
-#include "ggml-cuda.h"
-#include "ggml-sycl.h"
 
-#ifdef GGML_USE_CANN
-#include "ggml-cann.h"
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
 #endif
 
 // utils
@@ -73,81 +76,38 @@ static T stdev(const std::vector<T> & v) {
 }
 
 static std::string get_cpu_info() {
-    std::string id;
-#ifdef __linux__
-    FILE * f = fopen("/proc/cpuinfo", "r");
-    if (f) {
-        char buf[1024];
-        while (fgets(buf, sizeof(buf), f)) {
-            if (strncmp(buf, "model name", 10) == 0) {
-                char * p = strchr(buf, ':');
-                if (p) {
-                    p++;
-                    while (std::isspace(*p)) {
-                        p++;
-                    }
-                    while (std::isspace(p[strlen(p) - 1])) {
-                        p[strlen(p) - 1] = '\0';
-                    }
-                    id = p;
-                    break;
-                }
-            }
+    std::vector<std::string> cpu_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto dev_type = ggml_backend_dev_type(dev);
+        if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+            cpu_list.push_back(ggml_backend_dev_description(dev));
         }
-        fclose(f);
     }
-#endif
-    // TODO: other platforms
-    return id;
+    return join(cpu_list, ", ");
 }
 
 static std::string get_gpu_info() {
-    std::string id;
-#ifdef GGML_USE_CUDA
-    int count = ggml_backend_cuda_get_device_count();
-    for (int i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
+    std::vector<std::string> gpu_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto dev_type = ggml_backend_dev_type(dev);
+        if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
+            gpu_list.push_back(ggml_backend_dev_description(dev));
         }
     }
-#endif
-#ifdef GGML_USE_SYCL
-    int count = ggml_backend_sycl_get_device_count();
-    for (int i = 0; i < count; i++) {
-        char buf[128];
-        ggml_sycl_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
-        }
-    }
-#endif
-#ifdef GGML_USE_CANN
-    uint32_t count = ggml_backend_cann_get_device_count();
-    for (uint32_t i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
-        }
-    }
-#endif
-    // TODO: other backends
-    return id;
+    return join(gpu_list, ", ");
 }
 
 // command line params
-enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
+enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};
 
 static const char * output_format_str(output_formats format) {
     switch (format) {
         case NONE: return "none";
         case CSV: return "csv";
         case JSON: return "json";
+        case JSONL: return "jsonl";
         case MARKDOWN: return "md";
         case SQL: return "sql";
         default: GGML_ABORT("invalid output format");
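Note: get_cpu_info() and get_gpu_info() above now walk the ggml backend-device registry instead of parsing /proc/cpuinfo and calling per-backend, compile-time device APIs. A minimal standalone sketch of that enumeration, assuming the refactored ggml-backend.h vendored in this release (the output formatting is illustrative, not from the diff):

    #include <cstdio>
    #include "ggml-backend.h"

    // list every registered device with its type, the same walk llama-bench
    // uses to build its cpu_info/gpu_info report fields
    static void list_devices(void) {
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev  = ggml_backend_dev_get(i);
            const char *       kind = "CPU";
            switch (ggml_backend_dev_type(dev)) {
                case GGML_BACKEND_DEVICE_TYPE_GPU:   kind = "GPU";   break;
                case GGML_BACKEND_DEVICE_TYPE_ACCEL: kind = "ACCEL"; break;
                default: break;
            }
            printf("%-5s %s: %s\n", kind, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
        }
    }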
@@ -161,6 +121,8 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
         format = CSV;
     } else if (s == "json") {
         format = JSON;
+    } else if (s == "jsonl") {
+        format = JSONL;
     } else if (s == "md") {
         format = MARKDOWN;
     } else if (s == "sql") {
@@ -196,6 +158,9 @@ struct cmd_params {
     std::vector<ggml_type> type_k;
     std::vector<ggml_type> type_v;
     std::vector<int> n_threads;
+    std::vector<std::string> cpu_mask;
+    std::vector<bool> cpu_strict;
+    std::vector<int> poll;
     std::vector<int> n_gpu_layers;
     std::vector<std::string> rpc_servers;
     std::vector<llama_split_mode> split_mode;
@@ -207,7 +172,10 @@ struct cmd_params {
     std::vector<bool> embeddings;
     ggml_numa_strategy numa;
     int reps;
+    ggml_sched_priority prio;
+    int delay;
     bool verbose;
+    bool progress;
     output_formats output_format;
     output_formats output_format_stderr;
 };
@@ -222,6 +190,9 @@ static const cmd_params cmd_params_defaults = {
     /* type_k */ {GGML_TYPE_F16},
     /* type_v */ {GGML_TYPE_F16},
     /* n_threads */ {cpu_get_num_math()},
+    /* cpu_mask */ {"0x0"},
+    /* cpu_strict */ {false},
+    /* poll */ {50},
     /* n_gpu_layers */ {99},
     /* rpc_servers */ {""},
     /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
@@ -233,7 +204,10 @@ static const cmd_params cmd_params_defaults = {
     /* embeddings */ {false},
     /* numa */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps */ 5,
+    /* prio */ GGML_SCHED_PRIO_NORMAL,
+    /* delay */ 0,
     /* verbose */ false,
+    /* progress */ false,
     /* output_format */ MARKDOWN,
     /* output_format_stderr */ NONE,
 };
@@ -243,29 +217,37 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("\n");
     printf("options:\n");
     printf(" -h, --help\n");
-    printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
-    printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
-    printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
-    printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
-    printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
-    printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
-    printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
-    printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-    printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
-    printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
-    printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
-    printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
-    printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
-    printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
-    printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
-    printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
-    printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
-    printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
-    printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
-    printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
-    printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+    printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
+    printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
+    printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
+    printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
+    printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
+    printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+    printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
+    printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
+    printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
+    printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    if (llama_supports_rpc()) {
+        printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
+    }
+    printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
+    printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
+    printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
+    printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
+    printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
+    printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
+    printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
+    printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
+    printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
+    printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
+    printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
+    printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
+    printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+    printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
     printf("\n");
     printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
 }
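Note: the new -C/--cpu-mask, --cpu-strict, --poll, --prio, --delay and --progress options tune the CPU threadpool and benchmark pacing, and -o/-oe gain the jsonl format. An illustrative invocation combining them (model path and mask value are placeholders, not from the diff):

    llama-bench -m model.gguf -t 8 -C 0xff --cpu-strict 1 --poll 50 --prio 2 --delay 5 -o jsonl --progress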
@@ -274,6 +256,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
     if (s == "f16") {
         return GGML_TYPE_F16;
     }
+    if (s == "bf16") {
+        return GGML_TYPE_BF16;
+    }
     if (s == "q8_0") {
         return GGML_TYPE_Q8_0;
     }
@@ -309,6 +294,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     params.output_format_stderr = cmd_params_defaults.output_format_stderr;
     params.reps = cmd_params_defaults.reps;
     params.numa = cmd_params_defaults.numa;
+    params.prio = cmd_params_defaults.prio;
+    params.delay = cmd_params_defaults.delay;
+    params.progress = cmd_params_defaults.progress;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -380,6 +368,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 }
                 types.push_back(gt);
             }
+            if (invalid_param) {
+                break;
+            }
             params.type_k.insert(params.type_k.end(), types.begin(), types.end());
         } else if (arg == "-ctv" || arg == "--cache-type-v") {
             if (++i >= argc) {
@@ -396,6 +387,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 }
                 types.push_back(gt);
             }
+            if (invalid_param) {
+                break;
+            }
             params.type_v.insert(params.type_v.end(), types.begin(), types.end());
         } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
@@ -404,6 +398,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<int>(argv[i], split_delim);
             params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
+        } else if (arg == "-C" || arg == "--cpu-mask") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], split_delim);
+            params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
+        } else if (arg == "--cpu-strict") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
+        } else if (arg == "--poll") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.poll.insert(params.poll.end(), p.begin(), p.end());
         } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -411,7 +426,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<int>(argv[i], split_delim);
             params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
-        } else if (arg == "-rpc" || arg == "--rpc") {
+        } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -438,6 +453,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 }
                 modes.push_back(mode);
             }
+            if (invalid_param) {
+                break;
+            }
             params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
         } else if (arg == "-mg" || arg == "--main-gpu") {
             if (++i >= argc) {
@@ -512,6 +530,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.reps = std::stoi(argv[i]);
+        } else if (arg == "--prio") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
+        } else if (arg == "--delay") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.delay = std::stoi(argv[i]);
         } else if (arg == "-o" || arg == "--output") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -526,6 +556,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
         } else if (arg == "-v" || arg == "--verbose") {
             params.verbose = true;
+        } else if (arg == "--progress") {
+            params.progress = true;
         } else {
             invalid_param = true;
             break;
@@ -556,6 +588,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
     if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
     if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
+    if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
+    if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
+    if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
 
     return params;
 }
@@ -569,6 +604,9 @@ struct cmd_params_instance {
     ggml_type type_k;
     ggml_type type_v;
     int n_threads;
+    std::string cpu_mask;
+    bool cpu_strict;
+    int poll;
     int n_gpu_layers;
     std::string rpc_servers;
     llama_split_mode split_mode;
@@ -638,7 +676,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & tv : params.type_v)
     for (const auto & nkvo : params.no_kv_offload)
     for (const auto & fa : params.flash_attn)
-    for (const auto & nt : params.n_threads) {
+    for (const auto & nt : params.n_threads)
+    for (const auto & cm : params.cpu_mask)
+    for (const auto & cs : params.cpu_strict)
+    for (const auto & pl : params.poll) {
         for (const auto & n_prompt : params.n_prompt) {
             if (n_prompt == 0) {
                 continue;
@@ -652,6 +693,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_k = */ tk,
                 /* .type_v = */ tv,
                 /* .n_threads = */ nt,
+                /* .cpu_mask = */ cm,
+                /* .cpu_strict = */ cs,
+                /* .poll = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .rpc_servers = */ rpc,
                 /* .split_mode = */ sm,
@@ -678,6 +722,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_k = */ tk,
                 /* .type_v = */ tv,
                 /* .n_threads = */ nt,
+                /* .cpu_mask = */ cm,
+                /* .cpu_strict = */ cs,
+                /* .poll = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .rpc_servers = */ rpc,
                 /* .split_mode = */ sm,
@@ -704,6 +751,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_k = */ tk,
                 /* .type_v = */ tv,
                 /* .n_threads = */ nt,
+                /* .cpu_mask = */ cm,
+                /* .cpu_strict = */ cs,
+                /* .poll = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .rpc_servers = */ rpc,
                 /* .split_mode = */ sm,
@@ -724,13 +774,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 struct test {
     static const std::string build_commit;
     static const int build_number;
-    static const bool cuda;
-    static const bool vulkan;
-    static const bool kompute;
-    static const bool metal;
-    static const bool sycl;
-    static const bool gpu_blas;
-    static const bool blas;
     static const std::string cpu_info;
     static const std::string gpu_info;
     std::string model_filename;
@@ -740,7 +783,9 @@ struct test {
     int n_batch;
     int n_ubatch;
     int n_threads;
-    bool has_rpc;
+    std::string cpu_mask;
+    bool cpu_strict;
+    int poll;
     ggml_type type_k;
     ggml_type type_v;
     int n_gpu_layers;
@@ -766,7 +811,9 @@ struct test {
         n_batch = inst.n_batch;
         n_ubatch = inst.n_ubatch;
         n_threads = inst.n_threads;
-        has_rpc = !inst.rpc_servers.empty();
+        cpu_mask = inst.cpu_mask;
+        cpu_strict = inst.cpu_strict;
+        poll = inst.poll;
         type_k = inst.type_k;
         type_v = inst.type_v;
         n_gpu_layers = inst.n_gpu_layers;
@@ -811,45 +858,31 @@ struct test {
     }
 
     static std::string get_backend() {
-        if (cuda) {
-            return GGML_CUDA_NAME;
-        }
-        if (vulkan) {
-            return "Vulkan";
-        }
-        if (kompute) {
-            return "Kompute";
-        }
-        if (metal) {
-            return "Metal";
-        }
-        if (sycl) {
-            return GGML_SYCL_NAME;
-        }
-        if (gpu_blas) {
-            return "GPU BLAS";
-        }
-        if (blas) {
-            return "BLAS";
+        std::vector<std::string> backends;
+        for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+            auto * reg = ggml_backend_reg_get(i);
+            std::string name = ggml_backend_reg_name(reg);
+            if (name != "CPU") {
+                backends.push_back(ggml_backend_reg_name(reg));
+            }
         }
-
-        return "CPU";
+        return backends.empty() ? "CPU" : join(backends, ",");
     }
 
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
             "build_commit", "build_number",
-            "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
-            "cpu_info", "gpu_info",
+            "cpu_info", "gpu_info", "backends",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_ubatch",
-            "n_threads", "type_k", "type_v",
+            "n_threads", "cpu_mask", "cpu_strict", "poll",
+            "type_k", "type_v",
             "n_gpu_layers", "split_mode",
             "main_gpu", "no_kv_offload", "flash_attn",
             "tensor_split", "use_mmap", "embeddings",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
-            "avg_ts", "stddev_ts"
+            "avg_ts", "stddev_ts",
         };
         return fields;
    }
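Note: test::get_backend() now reports every registered non-CPU backend by name rather than a single compile-time constant, so a multi-backend build shows up as e.g. "CUDA,RPC". A sketch of the registry walk it performs, assuming the vendored ggml-backend.h (the printf is illustrative):

    #include <cstdio>
    #include "ggml-backend.h"

    // print the names of all backends compiled into (or registered with) this build
    static void list_backends(void) {
        for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
            ggml_backend_reg_t reg = ggml_backend_reg_get(i);
            printf("backend %zu: %s\n", i, ggml_backend_reg_name(reg));
        }
    }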
@@ -858,15 +891,15 @@ struct test {
 
     static field_type get_field_type(const std::string & field) {
         if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
-            field == "n_threads" ||
+            field == "n_threads" || field == "poll" ||
             field == "model_size" || field == "model_n_params" ||
             field == "n_gpu_layers" || field == "main_gpu" ||
             field == "n_prompt" || field == "n_gen" ||
             field == "avg_ns" || field == "stddev_ns") {
             return INT;
         }
-        if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
-            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
+        if (field == "f16_kv" || field == "no_kv_offload" ||
+            field == "cpu_strict" ||
             field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
             return BOOL;
         }
@@ -894,12 +927,11 @@ struct test {
         }
         std::vector<std::string> values = {
             build_commit, std::to_string(build_number),
-            std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
-            cpu_info, gpu_info,
+            cpu_info, gpu_info, get_backend(),
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_ubatch),
-            std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
+            std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
+            ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
             std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
             tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
@@ -922,13 +954,6 @@ struct test {
 
 const std::string test::build_commit = LLAMA_COMMIT;
 const int test::build_number = LLAMA_BUILD_NUMBER;
-const bool test::cuda = !!ggml_cpu_has_cuda();
-const bool test::vulkan = !!ggml_cpu_has_vulkan();
-const bool test::kompute = !!ggml_cpu_has_kompute();
-const bool test::metal = !!ggml_cpu_has_metal();
-const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
-const bool test::blas = !!ggml_cpu_has_blas();
-const bool test::sycl = !!ggml_cpu_has_sycl();
 const std::string test::cpu_info = get_cpu_info();
 const std::string test::gpu_info = get_gpu_info();
 
@@ -967,37 +992,38 @@ struct csv_printer : public printer {
     }
 };
 
-struct json_printer : public printer {
-    bool first = true;
 
-    static std::string escape_json(const std::string & value) {
-        std::string escaped;
-        for (auto c : value) {
-            if (c == '"') {
-                escaped += "\\\"";
-            } else if (c == '\\') {
-                escaped += "\\\\";
-            } else if (c <= 0x1f) {
-                char buf[8];
-                snprintf(buf, sizeof(buf), "\\u%04x", c);
-                escaped += buf;
-            } else {
-                escaped += c;
-            }
+static std::string escape_json(const std::string & value) {
+    std::string escaped;
+    for (auto c : value) {
+        if (c == '"') {
+            escaped += "\\\"";
+        } else if (c == '\\') {
+            escaped += "\\\\";
+        } else if (c <= 0x1f) {
+            char buf[8];
+            snprintf(buf, sizeof(buf), "\\u%04x", c);
+            escaped += buf;
+        } else {
+            escaped += c;
         }
-        return escaped;
     }
+    return escaped;
+}
 
-    static std::string format_value(const std::string & field, const std::string & value) {
-        switch (test::get_field_type(field)) {
-            case test::STRING:
-                return "\"" + escape_json(value) + "\"";
-            case test::BOOL:
-                return value == "0" ? "false" : "true";
-            default:
-                return value;
-        }
+static std::string format_json_value(const std::string & field, const std::string & value) {
+    switch (test::get_field_type(field)) {
+        case test::STRING:
+            return "\"" + escape_json(value) + "\"";
+        case test::BOOL:
+            return value == "0" ? "false" : "true";
+        default:
+            return value;
     }
+}
+
+struct json_printer : public printer {
+    bool first = true;
 
     void print_header(const cmd_params & params) override {
         fprintf(fout, "[\n");
@@ -1007,7 +1033,7 @@ struct json_printer : public printer {
     void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
         assert(fields.size() == values.size());
         for (size_t i = 0; i < fields.size(); i++) {
-            fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
+            fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
         }
     }
 
@@ -1030,6 +1056,25 @@ struct json_printer : public printer {
     }
 };
 
+
+struct jsonl_printer : public printer {
+    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
+        assert(fields.size() == values.size());
+        for (size_t i = 0; i < fields.size(); i++) {
+            fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
+        }
+    }
+
+    void print_test(const test & t) override {
+        fprintf(fout, "{");
+        print_fields(test::get_fields(), t.get_values());
+        fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
+        fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
+        fprintf(fout, "}\n");
+        fflush(fout);
+    }
+};
+
 struct markdown_printer : public printer {
     std::vector<std::string> fields;
 
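Note: unlike json_printer, which wraps all results in one array, jsonl_printer emits one self-contained JSON object per line and flushes after each test, so partial output stays parseable if a run is interrupted. An abbreviated, made-up record for illustration (the real line carries every field from test::get_fields()):

    {"build_commit": "abc1234", "build_number": 0, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "avg_ts": 42.15, "samples_ns": [ 118837722, 115803547 ], "samples_ts": [ 41.9, 42.4 ]}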
@@ -1038,7 +1083,7 @@ struct markdown_printer : public printer {
             return -30;
         }
         if (field == "t/s") {
-            return 16;
+            return 20;
         }
         if (field == "size" || field == "params") {
             return 10;
@@ -1113,13 +1158,23 @@ struct markdown_printer : public printer {
         fields.emplace_back("size");
         fields.emplace_back("params");
         fields.emplace_back("backend");
-        bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
+        bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos ||
+                              test::get_backend().find("BLAS") != std::string::npos;
         if (!is_cpu_backend) {
             fields.emplace_back("n_gpu_layers");
         }
         if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
             fields.emplace_back("n_threads");
         }
+        if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
+            fields.emplace_back("cpu_mask");
+        }
+        if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
+            fields.emplace_back("cpu_strict");
+        }
+        if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
+            fields.emplace_back("poll");
+        }
         if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
             fields.emplace_back("n_batch");
         }
@@ -1194,9 +1249,6 @@ struct markdown_printer : public printer {
                 value = buf;
             } else if (field == "backend") {
                 value = test::get_backend();
-                if (t.has_rpc) {
-                    value += "+RPC";
-                }
             } else if (field == "test") {
                 if (t.n_prompt > 0 && t.n_gen == 0) {
                     snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
@@ -1269,7 +1321,7 @@ struct sql_printer : public printer {
     }
 };
 
-static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
+static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
     const llama_model * model = llama_get_model(ctx);
@@ -1285,14 +1337,14 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
         for (int i = 1; i < n_tokens; i++) {
             tokens[i] = std::rand() % n_vocab;
         }
-        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
         n_processed += n_tokens;
     }
 
     llama_synchronize(ctx);
 }
 
-static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
+static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
     const llama_model * model = llama_get_model(ctx);
@@ -1301,7 +1353,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads)
     llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
 
     for (int i = 0; i < n_gen; i++) {
-        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
+        llama_decode(ctx, llama_batch_get_one(&token, 1));
         llama_synchronize(ctx);
         token = std::rand() % n_vocab;
     }
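Note: both helpers above track the llama_batch_get_one() signature change in the vendored llama.cpp: the n_past and sequence-id arguments are gone, and the context now tracks positions itself. A minimal decode loop against the new two-argument form, assuming an already-initialized context (sampling elided):

    #include <vector>
    #include "llama.h"

    // feed a prompt in one batch, then decode n_gen single-token batches;
    // positions are maintained internally, so there is no n_past bookkeeping
    static void decode_sketch(llama_context * ctx, std::vector<llama_token> & prompt, int n_gen) {
        llama_decode(ctx, llama_batch_get_one(prompt.data(), (int32_t) prompt.size()));
        llama_token tok = prompt.back();
        for (int i = 0; i < n_gen; i++) {
            llama_decode(ctx, llama_batch_get_one(&tok, 1));
            llama_synchronize(ctx);
            // a real caller would sample the next token here instead of reusing tok
        }
    }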
@@ -1321,6 +1373,8 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
             return std::unique_ptr<printer>(new csv_printer());
         case JSON:
             return std::unique_ptr<printer>(new json_printer());
+        case JSONL:
+            return std::unique_ptr<printer>(new jsonl_printer());
         case MARKDOWN:
             return std::unique_ptr<printer>(new markdown_printer());
         case SQL:
@@ -1354,6 +1408,8 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);
 
+    set_process_priority(params.prio);
+
     // initialize printer
     std::unique_ptr<printer> p = create_printer(params.output_format);
     std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
@@ -1373,7 +1429,13 @@ int main(int argc, char ** argv) {
     llama_model * lmodel = nullptr;
     const cmd_params_instance * prev_inst = nullptr;
 
+    int params_idx = 0;
+    auto params_count = params_instances.size();
     for (const auto & inst : params_instances) {
+        params_idx ++;
+        if (params.progress) {
+            fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
+        }
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
             if (lmodel) {
@@ -1399,13 +1461,41 @@ int main(int argc, char ** argv) {
 
         llama_kv_cache_clear(ctx);
 
+        // cool off before the test
+        if (params.delay) {
+            std::this_thread::sleep_for(std::chrono::seconds(params.delay));
+        }
+
+        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
+        if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
+            fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
+            exit(1);
+        }
+        tpp.strict_cpu = t.cpu_strict;
+        tpp.poll = t.poll;
+        tpp.prio = params.prio;
+
+        struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+        if (!threadpool) {
+            fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+            exit(1);
+        }
+
+        llama_attach_threadpool(ctx, threadpool, NULL);
+
         // warmup run
         if (t.n_prompt > 0) {
+            if (params.progress) {
+                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
+            }
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
-            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
-            test_gen(ctx, 1, 0, t.n_threads);
+            if (params.progress) {
+                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
+            }
+            test_gen(ctx, 1, t.n_threads);
         }
 
         for (int i = 0; i < params.reps; i++) {
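Note: the block above is the core of the new threadpool plumbing: fill ggml_threadpool_params, pin threads via the parsed CPU mask, create the pool, and attach it to the context before the warmup and timed runs. A condensed sketch of the same lifecycle, assuming the vendored ggml/llama headers (error handling trimmed; the values are examples):

    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8); // 8 threads
    tpp.strict_cpu = true;                   // fail placement rather than migrate threads
    tpp.poll       = 50;                     // 0..100 polling level before threads sleep
    tpp.prio       = GGML_SCHED_PRIO_NORMAL;

    struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);
    llama_attach_threadpool(ctx, tp, /* threadpool_batch = */ NULL);

    // ... llama_decode() benchmark runs ...

    llama_free(ctx);          // release the context first, as main() does
    ggml_threadpool_free(tp); // then the pool it was using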
@@ -1414,10 +1504,16 @@ int main(int argc, char ** argv) {
             uint64_t t_start = get_time_ns();
 
             if (t.n_prompt > 0) {
-                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+                if (params.progress) {
+                    fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
+                }
+                test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
             }
             if (t.n_gen > 0) {
-                test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
+                if (params.progress) {
+                    fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
+                }
+                test_gen(ctx, t.n_gen, t.n_threads);
             }
 
             uint64_t t_ns = get_time_ns() - t_start;
@@ -1434,9 +1530,11 @@ int main(int argc, char ** argv) {
             fflush(p_err->fout);
         }
 
-        llama_print_timings(ctx);
+        llama_perf_context_print(ctx);
 
         llama_free(ctx);
+
+        ggml_threadpool_free(threadpool);
     }
 
     llama_free_model(lmodel);